1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  *	  heap access method code
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  *		relation_open	- open any relation by relation OID
16  *		relation_openrv - open any relation specified by a RangeVar
17  *		relation_close	- close any relation
18  *		heap_open		- open a heap relation by relation OID
19  *		heap_openrv		- open a heap relation specified by a RangeVar
20  *		heap_close		- (now just a macro for relation_close)
21  *		heap_beginscan	- begin relation scan
22  *		heap_rescan		- restart a relation scan
23  *		heap_endscan	- end relation scan
24  *		heap_getnext	- retrieve next tuple in scan
25  *		heap_fetch		- retrieve tuple with given tid
26  *		heap_insert		- insert tuple into a relation
27  *		heap_multi_insert - insert multiple tuples into a relation
28  *		heap_delete		- delete a tuple from a relation
29  *		heap_update		- replace a tuple in a relation with another tuple
30  *		heap_sync		- sync heap, for when no WAL has been written
31  *
32  * NOTES
33  *	  This file contains the heap_ routines which implement
34  *	  the POSTGRES heap access method used for all POSTGRES
35  *	  relations.
36  *
37  *-------------------------------------------------------------------------
38  */
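/*
 * Illustrative usage sketch (not part of the implementation): a typical
 * caller drives a sequential scan with the interface routines listed above.
 * The relation OID "relid" and the tuple-processing step are placeholders,
 * and error handling is omitted.
 *
 *		Relation	rel = heap_open(relid, AccessShareLock);
 *		HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
 *		HeapTuple	tuple;
 *
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... examine one visible tuple ...
 *		}
 *
 *		heap_endscan(scan);
 *		heap_close(rel, AccessShareLock);
 */
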
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "catalog/index.h"
60 #include "miscadmin.h"
61 #include "pgstat.h"
62 #include "port/atomics.h"
63 #include "storage/bufmgr.h"
64 #include "storage/freespace.h"
65 #include "storage/lmgr.h"
66 #include "storage/predicate.h"
67 #include "storage/procarray.h"
68 #include "storage/smgr.h"
69 #include "storage/spin.h"
70 #include "storage/standby.h"
71 #include "utils/datum.h"
72 #include "utils/inval.h"
73 #include "utils/lsyscache.h"
74 #include "utils/relcache.h"
75 #include "utils/snapmgr.h"
76 #include "utils/syscache.h"
77 #include "utils/tqual.h"
78 #include "utils/memutils.h"
79 #include "nodes/execnodes.h"
80 #include "executor/executor.h"
81 
82 /* GUC variable */
83 bool		synchronize_seqscans = true;
84 
85 
86 static HeapScanDesc heap_beginscan_internal(Relation relation,
87 						Snapshot snapshot,
88 						int nkeys, ScanKey key,
89 						ParallelHeapScanDesc parallel_scan,
90 						bool allow_strat,
91 						bool allow_sync,
92 						bool allow_pagemode,
93 						bool is_bitmapscan,
94 						bool is_samplescan,
95 						bool temp_snap);
96 static void heap_parallelscan_startblock_init(HeapScanDesc scan);
97 static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan);
98 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
99 					TransactionId xid, CommandId cid, int options);
100 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
101 				Buffer newbuf, HeapTuple oldtup,
102 				HeapTuple newtup, HeapTuple old_key_tup,
103 				bool all_visible_cleared, bool new_all_visible_cleared);
104 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
105 							 Bitmapset *interesting_cols,
106 							 HeapTuple oldtup, HeapTuple newtup);
107 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
108 					 LockTupleMode mode, LockWaitPolicy wait_policy,
109 					 bool *have_tuple_lock);
110 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
111 						  uint16 old_infomask2, TransactionId add_to_xmax,
112 						  LockTupleMode mode, bool is_update,
113 						  TransactionId *result_xmax, uint16 *result_infomask,
114 						  uint16 *result_infomask2);
115 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
116 						ItemPointer ctid, TransactionId xid,
117 						LockTupleMode mode);
118 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
119 					   uint16 *new_infomask2);
120 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
121 						uint16 t_infomask);
122 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
123 						LockTupleMode lockmode, bool *current_is_member);
124 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
125 				Relation rel, ItemPointer ctid, XLTW_Oper oper,
126 				int *remaining);
127 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
128 						   uint16 infomask, Relation rel, int *remaining);
129 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
130 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
131 					   bool *copy);
132 static bool ProjIndexIsUnchanged(Relation relation, HeapTuple oldtup, HeapTuple newtup);
133 
134 
135 /*
136  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
137  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
138  * update them).  This table (and the macros below) helps us determine the
139  * heavyweight lock mode and MultiXactStatus values to use for any particular
140  * tuple lock strength.
141  *
142  * Don't look at lockstatus/updstatus directly!  Use get_mxact_status_for_lock
143  * instead.
144  */
145 static const struct
146 {
147 	LOCKMODE	hwlock;
148 	int			lockstatus;
149 	int			updstatus;
150 }
151 
152 			tupleLockExtraInfo[MaxLockTupleMode + 1] =
153 {
154 	{							/* LockTupleKeyShare */
155 		AccessShareLock,
156 		MultiXactStatusForKeyShare,
157 		-1						/* KeyShare does not allow updating tuples */
158 	},
159 	{							/* LockTupleShare */
160 		RowShareLock,
161 		MultiXactStatusForShare,
162 		-1						/* Share does not allow updating tuples */
163 	},
164 	{							/* LockTupleNoKeyExclusive */
165 		ExclusiveLock,
166 		MultiXactStatusForNoKeyUpdate,
167 		MultiXactStatusNoKeyUpdate
168 	},
169 	{							/* LockTupleExclusive */
170 		AccessExclusiveLock,
171 		MultiXactStatusForUpdate,
172 		MultiXactStatusUpdate
173 	}
174 };
175 
176 /* Get the LOCKMODE for a given MultiXactStatus */
177 #define LOCKMODE_from_mxstatus(status) \
178 			(tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
179 
180 /*
181  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
182  * This is more readable than having every caller translate it to lock.h's
183  * LOCKMODE.
184  */
185 #define LockTupleTuplock(rel, tup, mode) \
186 	LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
187 #define UnlockTupleTuplock(rel, tup, mode) \
188 	UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
189 #define ConditionalLockTupleTuplock(rel, tup, mode) \
190 	ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
191 
192 /*
193  * This table maps each MultiXactStatus value to the corresponding tuple
194  * lock strength.
195  */
196 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
197 {
198 	LockTupleKeyShare,			/* ForKeyShare */
199 	LockTupleShare,				/* ForShare */
200 	LockTupleNoKeyExclusive,	/* ForNoKeyUpdate */
201 	LockTupleExclusive,			/* ForUpdate */
202 	LockTupleNoKeyExclusive,	/* NoKeyUpdate */
203 	LockTupleExclusive			/* Update */
204 };
205 
206 /* Get the LockTupleMode for a given MultiXactStatus */
207 #define TUPLOCK_from_mxstatus(status) \
208 			(MultiXactStatusLock[(status)])
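/*
 * Worked example of the two mappings above: MultiXactStatusForShare maps to
 * LockTupleShare via TUPLOCK_from_mxstatus(), so LOCKMODE_from_mxstatus()
 * yields RowShareLock, the heavyweight lock listed for LockTupleShare in
 * tupleLockExtraInfo.
 */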
209 
210 /* ----------------------------------------------------------------
211  *						 heap support routines
212  * ----------------------------------------------------------------
213  */
214 
215 /* ----------------
216  *		initscan - scan code common to heap_beginscan and heap_rescan
217  * ----------------
218  */
219 static void
220 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
221 {
222 	bool		allow_strat;
223 	bool		allow_sync;
224 
225 	/*
226 	 * Determine the number of blocks we have to scan.
227 	 *
228 	 * It is sufficient to do this once at scan start, since any tuples added
229 	 * while the scan is in progress will be invisible to my snapshot anyway.
230 	 * (That is not true when using a non-MVCC snapshot.  However, we couldn't
231 	 * guarantee to return tuples added after scan start anyway, since they
232 	 * might go into pages we already scanned.  To guarantee consistent
233 	 * results for a non-MVCC snapshot, the caller must hold some higher-level
234 	 * lock that ensures the interesting tuple(s) won't change.)
235 	 */
236 	if (scan->rs_parallel != NULL)
237 		scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
238 	else
239 		scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
240 
241 	/*
242 	 * If the table is large relative to NBuffers, use a bulk-read access
243 	 * strategy and enable synchronized scanning (see syncscan.c).  Although
244 	 * the thresholds for these features could be different, we make them the
245 	 * same so that there are only two behaviors to tune rather than four.
246 	 * (However, some callers need to be able to disable one or both of these
247 	 * behaviors, independently of the size of the table; also there is a GUC
248 	 * variable that can disable synchronized scanning.)
249 	 *
250 	 * Note that heap_parallelscan_initialize has a very similar test; if you
251 	 * change this, consider changing that one, too.
252 	 */
253 	if (!RelationUsesLocalBuffers(scan->rs_rd) &&
254 		scan->rs_nblocks > NBuffers / 4)
255 	{
256 		allow_strat = scan->rs_allow_strat;
257 		allow_sync = scan->rs_allow_sync;
258 	}
259 	else
260 		allow_strat = allow_sync = false;
261 
262 	if (allow_strat)
263 	{
264 		/* During a rescan, keep the previous strategy object. */
265 		if (scan->rs_strategy == NULL)
266 			scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
267 	}
268 	else
269 	{
270 		if (scan->rs_strategy != NULL)
271 			FreeAccessStrategy(scan->rs_strategy);
272 		scan->rs_strategy = NULL;
273 	}
274 
275 	if (scan->rs_parallel != NULL)
276 	{
277 		/* For parallel scan, believe whatever ParallelHeapScanDesc says. */
278 		scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
279 	}
280 	else if (keep_startblock)
281 	{
282 		/*
283 		 * When rescanning, we want to keep the previous startblock setting,
284 		 * so that rewinding a cursor doesn't generate surprising results.
285 		 * Reset the active syncscan setting, though.
286 		 */
287 		scan->rs_syncscan = (allow_sync && synchronize_seqscans);
288 	}
289 	else if (allow_sync && synchronize_seqscans)
290 	{
291 		scan->rs_syncscan = true;
292 		scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
293 	}
294 	else
295 	{
296 		scan->rs_syncscan = false;
297 		scan->rs_startblock = 0;
298 	}
299 
300 	scan->rs_numblocks = InvalidBlockNumber;
301 	scan->rs_inited = false;
302 	scan->rs_ctup.t_data = NULL;
303 	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
304 	scan->rs_cbuf = InvalidBuffer;
305 	scan->rs_cblock = InvalidBlockNumber;
306 
307 	/* page-at-a-time fields are always invalid when not rs_inited */
308 
309 	/*
310 	 * copy the scan key, if appropriate
311 	 */
312 	if (key != NULL)
313 		memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
314 
315 	/*
316 	 * Currently, we don't have a stats counter for bitmap heap scans (but the
317 	 * underlying bitmap index scans will be counted) or sample scans (we only
318 	 * update stats for tuple fetches there)
319 	 */
320 	if (!scan->rs_bitmapscan && !scan->rs_samplescan)
321 		pgstat_count_heap_scan(scan->rs_rd);
322 }
323 
324 /*
325  * heap_setscanlimits - restrict range of a heapscan
326  *
327  * startBlk is the page to start at
328  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
329  */
330 void
331 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
332 {
333 	Assert(!scan->rs_inited);	/* else too late to change */
334 	Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
335 
336 	/* Check startBlk is valid (but allow case of zero blocks...) */
337 	Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
338 
339 	scan->rs_startblock = startBlk;
340 	scan->rs_numblocks = numBlks;
341 }
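/*
 * Illustrative sketch (hypothetical caller): to scan only blocks 10..19, the
 * limits must be installed before the first heap_getnext call, and syncscan
 * must not have been chosen (hence allow_sync = false below):
 *
 *		scan = heap_beginscan_strat(rel, snapshot, 0, NULL, true, false);
 *		heap_setscanlimits(scan, 10, 10);
 */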
342 
343 /*
344  * heapgetpage - subroutine for heapgettup()
345  *
346  * This routine reads and pins the specified page of the relation.
347  * In page-at-a-time mode it performs additional work, namely determining
348  * which tuples on the page are visible.
349  */
350 void
351 heapgetpage(HeapScanDesc scan, BlockNumber page)
352 {
353 	Buffer		buffer;
354 	Snapshot	snapshot;
355 	Page		dp;
356 	int			lines;
357 	int			ntup;
358 	OffsetNumber lineoff;
359 	ItemId		lpp;
360 	bool		all_visible;
361 
362 	Assert(page < scan->rs_nblocks);
363 
364 	/* release previous scan buffer, if any */
365 	if (BufferIsValid(scan->rs_cbuf))
366 	{
367 		ReleaseBuffer(scan->rs_cbuf);
368 		scan->rs_cbuf = InvalidBuffer;
369 	}
370 
371 	/*
372 	 * Be sure to check for interrupts at least once per page.  Checks at
373 	 * higher code levels won't be able to stop a seqscan that encounters many
374 	 * pages' worth of consecutive dead tuples.
375 	 */
376 	CHECK_FOR_INTERRUPTS();
377 
378 	/* read page using selected strategy */
379 	scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
380 									   RBM_NORMAL, scan->rs_strategy);
381 	scan->rs_cblock = page;
382 
383 	if (!scan->rs_pageatatime)
384 		return;
385 
386 	buffer = scan->rs_cbuf;
387 	snapshot = scan->rs_snapshot;
388 
389 	/*
390 	 * Prune and repair fragmentation for the whole page, if possible.
391 	 */
392 	heap_page_prune_opt(scan->rs_rd, buffer);
393 
394 	/*
395 	 * We must hold share lock on the buffer content while examining tuple
396 	 * visibility.  Afterwards, however, the tuples we have found to be
397 	 * visible are guaranteed good as long as we hold the buffer pin.
398 	 */
399 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
400 
401 	dp = BufferGetPage(buffer);
402 	TestForOldSnapshot(snapshot, scan->rs_rd, dp);
403 	lines = PageGetMaxOffsetNumber(dp);
404 	ntup = 0;
405 
406 	/*
407 	 * If the all-visible flag indicates that all tuples on the page are
408 	 * visible to everyone, we can skip the per-tuple visibility tests.
409 	 *
410 	 * Note: In hot standby, a tuple that's already visible to all
411 	 * transactions in the master might still be invisible to a read-only
412 	 * transaction in the standby. We partly handle this problem by tracking
413 	 * the minimum xmin of visible tuples as the cut-off XID while marking a
414 	 * page all-visible on master and WAL log that along with the visibility
415 	 * map SET operation. In hot standby, we wait for (or abort) all
416 	 * transactions that potentially may not see one or more tuples on the
417 	 * page. That's how index-only scans work fine in hot standby. A crucial
418 	 * difference between index-only scans and heap scans is that the
419 	 * index-only scan completely relies on the visibility map, whereas a heap
420 	 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
421 	 * the page-level flag can be trusted in the same way, because it might
422 	 * get propagated somehow without being explicitly WAL-logged, e.g. via a
423 	 * full page write. Until we can prove that beyond doubt, let's check each
424 	 * tuple for visibility the hard way.
425 	 */
426 	all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
427 
428 	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
429 		 lineoff <= lines;
430 		 lineoff++, lpp++)
431 	{
432 		if (ItemIdIsNormal(lpp))
433 		{
434 			HeapTupleData loctup;
435 			bool		valid;
436 
437 			loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
438 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
439 			loctup.t_len = ItemIdGetLength(lpp);
440 			ItemPointerSet(&(loctup.t_self), page, lineoff);
441 
442 			if (all_visible)
443 				valid = true;
444 			else
445 				valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
446 
447 			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
448 											buffer, snapshot);
449 
450 			if (valid)
451 				scan->rs_vistuples[ntup++] = lineoff;
452 		}
453 	}
454 
455 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
456 
457 	Assert(ntup <= MaxHeapTuplesPerPage);
458 	scan->rs_ntuples = ntup;
459 }
460 
461 /* ----------------
462  *		heapgettup - fetch next heap tuple
463  *
464  *		Initialize the scan if not already done; then advance to the next
465  *		tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
466  *		or set scan->rs_ctup.t_data = NULL if no more tuples.
467  *
468  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
469  * by scan->rs_ctup".
470  *
471  * Note: the reason nkeys/key are passed separately, even though they are
472  * kept in the scan descriptor, is that the caller may not want us to check
473  * the scankeys.
474  *
475  * Note: when we fall off the end of the scan in either direction, we
476  * reset rs_inited.  This means that a further request with the same
477  * scan direction will restart the scan, which is a bit odd, but a
478  * request with the opposite scan direction will start a fresh scan
479  * in the proper direction.  The latter is required behavior for cursors,
480  * while the former case is generally undefined behavior in Postgres
481  * so we don't care too much.
482  * ----------------
483  */
484 static void
485 heapgettup(HeapScanDesc scan,
486 		   ScanDirection dir,
487 		   int nkeys,
488 		   ScanKey key)
489 {
490 	HeapTuple	tuple = &(scan->rs_ctup);
491 	Snapshot	snapshot = scan->rs_snapshot;
492 	bool		backward = ScanDirectionIsBackward(dir);
493 	BlockNumber page;
494 	bool		finished;
495 	Page		dp;
496 	int			lines;
497 	OffsetNumber lineoff;
498 	int			linesleft;
499 	ItemId		lpp;
500 
501 	/*
502 	 * calculate next starting lineoff, given scan direction
503 	 */
504 	if (ScanDirectionIsForward(dir))
505 	{
506 		if (!scan->rs_inited)
507 		{
508 			/*
509 			 * return null immediately if relation is empty
510 			 */
511 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
512 			{
513 				Assert(!BufferIsValid(scan->rs_cbuf));
514 				tuple->t_data = NULL;
515 				return;
516 			}
517 			if (scan->rs_parallel != NULL)
518 			{
519 				heap_parallelscan_startblock_init(scan);
520 
521 				page = heap_parallelscan_nextpage(scan);
522 
523 				/* Other processes might have already finished the scan. */
524 				if (page == InvalidBlockNumber)
525 				{
526 					Assert(!BufferIsValid(scan->rs_cbuf));
527 					tuple->t_data = NULL;
528 					return;
529 				}
530 			}
531 			else
532 				page = scan->rs_startblock; /* first page */
533 			heapgetpage(scan, page);
534 			lineoff = FirstOffsetNumber;	/* first offnum */
535 			scan->rs_inited = true;
536 		}
537 		else
538 		{
539 			/* continue from previously returned page/tuple */
540 			page = scan->rs_cblock; /* current page */
541 			lineoff =			/* next offnum */
542 				OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
543 		}
544 
545 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
546 
547 		dp = BufferGetPage(scan->rs_cbuf);
548 		TestForOldSnapshot(snapshot, scan->rs_rd, dp);
549 		lines = PageGetMaxOffsetNumber(dp);
550 		/* page and lineoff now reference the physically next tid */
551 
552 		linesleft = lines - lineoff + 1;
553 	}
554 	else if (backward)
555 	{
556 		/* backward parallel scan not supported */
557 		Assert(scan->rs_parallel == NULL);
558 
559 		if (!scan->rs_inited)
560 		{
561 			/*
562 			 * return null immediately if relation is empty
563 			 */
564 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
565 			{
566 				Assert(!BufferIsValid(scan->rs_cbuf));
567 				tuple->t_data = NULL;
568 				return;
569 			}
570 
571 			/*
572 			 * Disable reporting to syncscan logic in a backwards scan; it's
573 			 * not very likely anyone else is doing the same thing at the same
574 			 * time, and much more likely that we'll just bollix things for
575 			 * forward scanners.
576 			 */
577 			scan->rs_syncscan = false;
578 
579 			/*
580 			 * Start from last page of the scan.  Ensure we take into account
581 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
582 			 */
583 			if (scan->rs_numblocks != InvalidBlockNumber)
584 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
585 			else if (scan->rs_startblock > 0)
586 				page = scan->rs_startblock - 1;
587 			else
588 				page = scan->rs_nblocks - 1;
589 			heapgetpage(scan, page);
590 		}
591 		else
592 		{
593 			/* continue from previously returned page/tuple */
594 			page = scan->rs_cblock; /* current page */
595 		}
596 
597 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
598 
599 		dp = BufferGetPage(scan->rs_cbuf);
600 		TestForOldSnapshot(snapshot, scan->rs_rd, dp);
601 		lines = PageGetMaxOffsetNumber(dp);
602 
603 		if (!scan->rs_inited)
604 		{
605 			lineoff = lines;	/* final offnum */
606 			scan->rs_inited = true;
607 		}
608 		else
609 		{
610 			lineoff =			/* previous offnum */
611 				OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
612 		}
613 		/* page and lineoff now reference the physically previous tid */
614 
615 		linesleft = lineoff;
616 	}
617 	else
618 	{
619 		/*
620 		 * ``no movement'' scan direction: refetch prior tuple
621 		 */
622 		if (!scan->rs_inited)
623 		{
624 			Assert(!BufferIsValid(scan->rs_cbuf));
625 			tuple->t_data = NULL;
626 			return;
627 		}
628 
629 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
630 		if (page != scan->rs_cblock)
631 			heapgetpage(scan, page);
632 
633 		/* Since the tuple was previously fetched, needn't lock page here */
634 		dp = BufferGetPage(scan->rs_cbuf);
635 		TestForOldSnapshot(snapshot, scan->rs_rd, dp);
636 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
637 		lpp = PageGetItemId(dp, lineoff);
638 		Assert(ItemIdIsNormal(lpp));
639 
640 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
641 		tuple->t_len = ItemIdGetLength(lpp);
642 
643 		return;
644 	}
645 
646 	/*
647 	 * advance the scan until we find a qualifying tuple or run out of stuff
648 	 * to scan
649 	 */
650 	lpp = PageGetItemId(dp, lineoff);
651 	for (;;)
652 	{
653 		while (linesleft > 0)
654 		{
655 			if (ItemIdIsNormal(lpp))
656 			{
657 				bool		valid;
658 
659 				tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
660 				tuple->t_len = ItemIdGetLength(lpp);
661 				ItemPointerSet(&(tuple->t_self), page, lineoff);
662 
663 				/*
664 				 * if current tuple qualifies, return it.
665 				 */
666 				valid = HeapTupleSatisfiesVisibility(tuple,
667 													 snapshot,
668 													 scan->rs_cbuf);
669 
670 				CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
671 												scan->rs_cbuf, snapshot);
672 
673 				if (valid && key != NULL)
674 					HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
675 								nkeys, key, valid);
676 
677 				if (valid)
678 				{
679 					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
680 					return;
681 				}
682 			}
683 
684 			/*
685 			 * otherwise move to the next item on the page
686 			 */
687 			--linesleft;
688 			if (backward)
689 			{
690 				--lpp;			/* move back in this page's ItemId array */
691 				--lineoff;
692 			}
693 			else
694 			{
695 				++lpp;			/* move forward in this page's ItemId array */
696 				++lineoff;
697 			}
698 		}
699 
700 		/*
701 		 * if we get here, it means we've exhausted the items on this page and
702 		 * it's time to move to the next.
703 		 */
704 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
705 
706 		/*
707 		 * advance to next/prior page and detect end of scan
708 		 */
709 		if (backward)
710 		{
711 			finished = (page == scan->rs_startblock) ||
712 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
713 			if (page == 0)
714 				page = scan->rs_nblocks;
715 			page--;
716 		}
717 		else if (scan->rs_parallel != NULL)
718 		{
719 			page = heap_parallelscan_nextpage(scan);
720 			finished = (page == InvalidBlockNumber);
721 		}
722 		else
723 		{
724 			page++;
725 			if (page >= scan->rs_nblocks)
726 				page = 0;
727 			finished = (page == scan->rs_startblock) ||
728 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
729 
730 			/*
731 			 * Report our new scan position for synchronization purposes. We
732 			 * don't do that when moving backwards, however. That would just
733 			 * mess up any other forward-moving scanners.
734 			 *
735 			 * Note: we do this before checking for end of scan so that the
736 			 * final state of the position hint is back at the start of the
737 			 * rel.  That's not strictly necessary, but otherwise when you run
738 			 * the same query multiple times the starting position would shift
739 			 * a little bit backwards on every invocation, which is confusing.
740 			 * We don't guarantee any specific ordering in general, though.
741 			 */
742 			if (scan->rs_syncscan)
743 				ss_report_location(scan->rs_rd, page);
744 		}
745 
746 		/*
747 		 * return NULL if we've exhausted all the pages
748 		 */
749 		if (finished)
750 		{
751 			if (BufferIsValid(scan->rs_cbuf))
752 				ReleaseBuffer(scan->rs_cbuf);
753 			scan->rs_cbuf = InvalidBuffer;
754 			scan->rs_cblock = InvalidBlockNumber;
755 			tuple->t_data = NULL;
756 			scan->rs_inited = false;
757 			return;
758 		}
759 
760 		heapgetpage(scan, page);
761 
762 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
763 
764 		dp = BufferGetPage(scan->rs_cbuf);
765 		TestForOldSnapshot(snapshot, scan->rs_rd, dp);
766 		lines = PageGetMaxOffsetNumber((Page) dp);
767 		linesleft = lines;
768 		if (backward)
769 		{
770 			lineoff = lines;
771 			lpp = PageGetItemId(dp, lines);
772 		}
773 		else
774 		{
775 			lineoff = FirstOffsetNumber;
776 			lpp = PageGetItemId(dp, FirstOffsetNumber);
777 		}
778 	}
779 }
780 
781 /* ----------------
782  *		heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
783  *
784  *		Same API as heapgettup, but used in page-at-a-time mode
785  *
786  * The internal logic is much the same as heapgettup's too, but there are some
787  * differences: we do not take the buffer content lock (that only needs to
788  * happen inside heapgetpage), and we iterate through just the tuples listed
789  * in rs_vistuples[] rather than all tuples on the page.  Notice that
790  * lineindex is 0-based, where the corresponding loop variable lineoff in
791  * heapgettup is 1-based.
792  * ----------------
793  */
794 static void
795 heapgettup_pagemode(HeapScanDesc scan,
796 					ScanDirection dir,
797 					int nkeys,
798 					ScanKey key)
799 {
800 	HeapTuple	tuple = &(scan->rs_ctup);
801 	bool		backward = ScanDirectionIsBackward(dir);
802 	BlockNumber page;
803 	bool		finished;
804 	Page		dp;
805 	int			lines;
806 	int			lineindex;
807 	OffsetNumber lineoff;
808 	int			linesleft;
809 	ItemId		lpp;
810 
811 	/*
812 	 * calculate next starting lineindex, given scan direction
813 	 */
814 	if (ScanDirectionIsForward(dir))
815 	{
816 		if (!scan->rs_inited)
817 		{
818 			/*
819 			 * return null immediately if relation is empty
820 			 */
821 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
822 			{
823 				Assert(!BufferIsValid(scan->rs_cbuf));
824 				tuple->t_data = NULL;
825 				return;
826 			}
827 			if (scan->rs_parallel != NULL)
828 			{
829 				heap_parallelscan_startblock_init(scan);
830 
831 				page = heap_parallelscan_nextpage(scan);
832 
833 				/* Other processes might have already finished the scan. */
834 				if (page == InvalidBlockNumber)
835 				{
836 					Assert(!BufferIsValid(scan->rs_cbuf));
837 					tuple->t_data = NULL;
838 					return;
839 				}
840 			}
841 			else
842 				page = scan->rs_startblock; /* first page */
843 			heapgetpage(scan, page);
844 			lineindex = 0;
845 			scan->rs_inited = true;
846 		}
847 		else
848 		{
849 			/* continue from previously returned page/tuple */
850 			page = scan->rs_cblock; /* current page */
851 			lineindex = scan->rs_cindex + 1;
852 		}
853 
854 		dp = BufferGetPage(scan->rs_cbuf);
855 		TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
856 		lines = scan->rs_ntuples;
857 		/* page and lineindex now reference the next visible tid */
858 
859 		linesleft = lines - lineindex;
860 	}
861 	else if (backward)
862 	{
863 		/* backward parallel scan not supported */
864 		Assert(scan->rs_parallel == NULL);
865 
866 		if (!scan->rs_inited)
867 		{
868 			/*
869 			 * return null immediately if relation is empty
870 			 */
871 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
872 			{
873 				Assert(!BufferIsValid(scan->rs_cbuf));
874 				tuple->t_data = NULL;
875 				return;
876 			}
877 
878 			/*
879 			 * Disable reporting to syncscan logic in a backwards scan; it's
880 			 * not very likely anyone else is doing the same thing at the same
881 			 * time, and much more likely that we'll just bollix things for
882 			 * forward scanners.
883 			 */
884 			scan->rs_syncscan = false;
885 
886 			/*
887 			 * Start from last page of the scan.  Ensure we take into account
888 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
889 			 */
890 			if (scan->rs_numblocks != InvalidBlockNumber)
891 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
892 			else if (scan->rs_startblock > 0)
893 				page = scan->rs_startblock - 1;
894 			else
895 				page = scan->rs_nblocks - 1;
896 			heapgetpage(scan, page);
897 		}
898 		else
899 		{
900 			/* continue from previously returned page/tuple */
901 			page = scan->rs_cblock; /* current page */
902 		}
903 
904 		dp = BufferGetPage(scan->rs_cbuf);
905 		TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
906 		lines = scan->rs_ntuples;
907 
908 		if (!scan->rs_inited)
909 		{
910 			lineindex = lines - 1;
911 			scan->rs_inited = true;
912 		}
913 		else
914 		{
915 			lineindex = scan->rs_cindex - 1;
916 		}
917 		/* page and lineindex now reference the previous visible tid */
918 
919 		linesleft = lineindex + 1;
920 	}
921 	else
922 	{
923 		/*
924 		 * ``no movement'' scan direction: refetch prior tuple
925 		 */
926 		if (!scan->rs_inited)
927 		{
928 			Assert(!BufferIsValid(scan->rs_cbuf));
929 			tuple->t_data = NULL;
930 			return;
931 		}
932 
933 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
934 		if (page != scan->rs_cblock)
935 			heapgetpage(scan, page);
936 
937 		/* Since the tuple was previously fetched, needn't lock page here */
938 		dp = BufferGetPage(scan->rs_cbuf);
939 		TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
940 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
941 		lpp = PageGetItemId(dp, lineoff);
942 		Assert(ItemIdIsNormal(lpp));
943 
944 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
945 		tuple->t_len = ItemIdGetLength(lpp);
946 
947 		/* check that rs_cindex is in sync */
948 		Assert(scan->rs_cindex < scan->rs_ntuples);
949 		Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
950 
951 		return;
952 	}
953 
954 	/*
955 	 * advance the scan until we find a qualifying tuple or run out of stuff
956 	 * to scan
957 	 */
958 	for (;;)
959 	{
960 		while (linesleft > 0)
961 		{
962 			lineoff = scan->rs_vistuples[lineindex];
963 			lpp = PageGetItemId(dp, lineoff);
964 			Assert(ItemIdIsNormal(lpp));
965 
966 			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
967 			tuple->t_len = ItemIdGetLength(lpp);
968 			ItemPointerSet(&(tuple->t_self), page, lineoff);
969 
970 			/*
971 			 * if current tuple qualifies, return it.
972 			 */
973 			if (key != NULL)
974 			{
975 				bool		valid;
976 
977 				HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
978 							nkeys, key, valid);
979 				if (valid)
980 				{
981 					scan->rs_cindex = lineindex;
982 					return;
983 				}
984 			}
985 			else
986 			{
987 				scan->rs_cindex = lineindex;
988 				return;
989 			}
990 
991 			/*
992 			 * otherwise move to the next item on the page
993 			 */
994 			--linesleft;
995 			if (backward)
996 				--lineindex;
997 			else
998 				++lineindex;
999 		}
1000 
1001 		/*
1002 		 * if we get here, it means we've exhausted the items on this page and
1003 		 * it's time to move to the next.
1004 		 */
1005 		if (backward)
1006 		{
1007 			finished = (page == scan->rs_startblock) ||
1008 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1009 			if (page == 0)
1010 				page = scan->rs_nblocks;
1011 			page--;
1012 		}
1013 		else if (scan->rs_parallel != NULL)
1014 		{
1015 			page = heap_parallelscan_nextpage(scan);
1016 			finished = (page == InvalidBlockNumber);
1017 		}
1018 		else
1019 		{
1020 			page++;
1021 			if (page >= scan->rs_nblocks)
1022 				page = 0;
1023 			finished = (page == scan->rs_startblock) ||
1024 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1025 
1026 			/*
1027 			 * Report our new scan position for synchronization purposes. We
1028 			 * don't do that when moving backwards, however. That would just
1029 			 * mess up any other forward-moving scanners.
1030 			 *
1031 			 * Note: we do this before checking for end of scan so that the
1032 			 * final state of the position hint is back at the start of the
1033 			 * rel.  That's not strictly necessary, but otherwise when you run
1034 			 * the same query multiple times the starting position would shift
1035 			 * a little bit backwards on every invocation, which is confusing.
1036 			 * We don't guarantee any specific ordering in general, though.
1037 			 */
1038 			if (scan->rs_syncscan)
1039 				ss_report_location(scan->rs_rd, page);
1040 		}
1041 
1042 		/*
1043 		 * return NULL if we've exhausted all the pages
1044 		 */
1045 		if (finished)
1046 		{
1047 			if (BufferIsValid(scan->rs_cbuf))
1048 				ReleaseBuffer(scan->rs_cbuf);
1049 			scan->rs_cbuf = InvalidBuffer;
1050 			scan->rs_cblock = InvalidBlockNumber;
1051 			tuple->t_data = NULL;
1052 			scan->rs_inited = false;
1053 			return;
1054 		}
1055 
1056 		heapgetpage(scan, page);
1057 
1058 		dp = BufferGetPage(scan->rs_cbuf);
1059 		TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1060 		lines = scan->rs_ntuples;
1061 		linesleft = lines;
1062 		if (backward)
1063 			lineindex = lines - 1;
1064 		else
1065 			lineindex = 0;
1066 	}
1067 }
1068 
1069 
1070 #if defined(DISABLE_COMPLEX_MACRO)
1071 /*
1072  * This is formatted oddly so that the correspondence to the macro
1073  * definition in access/htup_details.h is maintained.
1074  */
1075 Datum
1076 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1077 			bool *isnull)
1078 {
1079 	return (
1080 			(attnum) > 0 ?
1081 			(
1082 			 (*(isnull) = false),
1083 			 HeapTupleNoNulls(tup) ?
1084 			 (
1085 			  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1086 			  (
1087 			   fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1088 						(char *) (tup)->t_data + (tup)->t_data->t_hoff +
1089 						TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1090 			   )
1091 			  :
1092 			  nocachegetattr((tup), (attnum), (tupleDesc))
1093 			  )
1094 			 :
1095 			 (
1096 			  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1097 			  (
1098 			   (*(isnull) = true),
1099 			   (Datum) NULL
1100 			   )
1101 			  :
1102 			  (
1103 			   nocachegetattr((tup), (attnum), (tupleDesc))
1104 			   )
1105 			  )
1106 			 )
1107 			:
1108 			(
1109 			 (Datum) NULL
1110 			 )
1111 		);
1112 }
1113 #endif							/* defined(DISABLE_COMPLEX_MACRO) */
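/*
 * Illustrative call (assuming "tupdesc" describes the tuple's layout):
 *
 *		bool	isnull;
 *		Datum	value = fastgetattr(tup, 1, tupdesc, &isnull);
 *
 * fetches the first attribute, setting isnull accordingly.
 */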
1114 
1115 
1116 /* ----------------------------------------------------------------
1117  *					 heap access method interface
1118  * ----------------------------------------------------------------
1119  */
1120 
1121 /* ----------------
1122  *		relation_open - open any relation by relation OID
1123  *
1124  *		If lockmode is not "NoLock", the specified kind of lock is
1125  *		obtained on the relation.  (Generally, NoLock should only be
1126  *		used if the caller knows it has some appropriate lock on the
1127  *		relation already.)
1128  *
1129  *		An error is raised if the relation does not exist.
1130  *
1131  *		NB: a "relation" is anything with a pg_class entry.  The caller is
1132  *		expected to check whether the relkind is something it can handle.
1133  * ----------------
1134  */
1135 Relation
1136 relation_open(Oid relationId, LOCKMODE lockmode)
1137 {
1138 	Relation	r;
1139 
1140 	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1141 
1142 	/* Get the lock before trying to open the relcache entry */
1143 	if (lockmode != NoLock)
1144 		LockRelationOid(relationId, lockmode);
1145 
1146 	/* The relcache does all the real work... */
1147 	r = RelationIdGetRelation(relationId);
1148 
1149 	if (!RelationIsValid(r))
1150 		elog(ERROR, "could not open relation with OID %u", relationId);
1151 
1152 	/* Make note that we've accessed a temporary relation */
1153 	if (RelationUsesLocalBuffers(r))
1154 		MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1155 
1156 	pgstat_initstats(r);
1157 
1158 	return r;
1159 }
1160 
1161 /* ----------------
1162  *		try_relation_open - open any relation by relation OID
1163  *
1164  *		Same as relation_open, except return NULL instead of failing
1165  *		if the relation does not exist.
1166  * ----------------
1167  */
1168 Relation
1169 try_relation_open(Oid relationId, LOCKMODE lockmode)
1170 {
1171 	Relation	r;
1172 
1173 	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1174 
1175 	/* Get the lock first */
1176 	if (lockmode != NoLock)
1177 		LockRelationOid(relationId, lockmode);
1178 
1179 	/*
1180 	 * Now that we have the lock, probe to see if the relation really exists
1181 	 * or not.
1182 	 */
1183 	if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1184 	{
1185 		/* Release useless lock */
1186 		if (lockmode != NoLock)
1187 			UnlockRelationOid(relationId, lockmode);
1188 
1189 		return NULL;
1190 	}
1191 
1192 	/* Should be safe to do a relcache load */
1193 	r = RelationIdGetRelation(relationId);
1194 
1195 	if (!RelationIsValid(r))
1196 		elog(ERROR, "could not open relation with OID %u", relationId);
1197 
1198 	/* Make note that we've accessed a temporary relation */
1199 	if (RelationUsesLocalBuffers(r))
1200 		MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1201 
1202 	pgstat_initstats(r);
1203 
1204 	return r;
1205 }
1206 
1207 /* ----------------
1208  *		relation_openrv - open any relation specified by a RangeVar
1209  *
1210  *		Same as relation_open, but the relation is specified by a RangeVar.
1211  * ----------------
1212  */
1213 Relation
1214 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1215 {
1216 	Oid			relOid;
1217 
1218 	/*
1219 	 * Check for shared-cache-inval messages before trying to open the
1220 	 * relation.  This is needed even if we already hold a lock on the
1221 	 * relation, because GRANT/REVOKE are executed without taking any lock on
1222 	 * the target relation, and we want to be sure we see current ACL
1223 	 * information.  We can skip this if asked for NoLock, on the assumption
1224 	 * that such a call is not the first one in the current command, and so we
1225 	 * should be reasonably up-to-date already.  (XXX this all could stand to
1226 	 * be redesigned, but for the moment we'll keep doing this like it's been
1227 	 * done historically.)
1228 	 */
1229 	if (lockmode != NoLock)
1230 		AcceptInvalidationMessages();
1231 
1232 	/* Look up and lock the appropriate relation using namespace search */
1233 	relOid = RangeVarGetRelid(relation, lockmode, false);
1234 
1235 	/* Let relation_open do the rest */
1236 	return relation_open(relOid, NoLock);
1237 }
1238 
1239 /* ----------------
1240  *		relation_openrv_extended - open any relation specified by a RangeVar
1241  *
1242  *		Same as relation_openrv, but with an additional missing_ok argument
1243  *		allowing a NULL return rather than an error if the relation is not
1244  *		found.  (Note that some other causes, such as permissions problems,
1245  *		will still result in an ereport.)
1246  * ----------------
1247  */
1248 Relation
1249 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1250 						 bool missing_ok)
1251 {
1252 	Oid			relOid;
1253 
1254 	/*
1255 	 * Check for shared-cache-inval messages before trying to open the
1256 	 * relation.  See comments in relation_openrv().
1257 	 */
1258 	if (lockmode != NoLock)
1259 		AcceptInvalidationMessages();
1260 
1261 	/* Look up and lock the appropriate relation using namespace search */
1262 	relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1263 
1264 	/* Return NULL on not-found */
1265 	if (!OidIsValid(relOid))
1266 		return NULL;
1267 
1268 	/* Let relation_open do the rest */
1269 	return relation_open(relOid, NoLock);
1270 }
1271 
1272 /* ----------------
1273  *		relation_close - close any relation
1274  *
1275  *		If lockmode is not "NoLock", we then release the specified lock.
1276  *
1277  *		Note that it is often sensible to hold a lock beyond relation_close;
1278  *		in that case, the lock is released automatically at xact end.
1279  * ----------------
1280  */
1281 void
1282 relation_close(Relation relation, LOCKMODE lockmode)
1283 {
1284 	LockRelId	relid = relation->rd_lockInfo.lockRelId;
1285 
1286 	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1287 
1288 	/* The relcache does the real work... */
1289 	RelationClose(relation);
1290 
1291 	if (lockmode != NoLock)
1292 		UnlockRelationId(&relid, lockmode);
1293 }
1294 
1295 
1296 /* ----------------
1297  *		heap_open - open a heap relation by relation OID
1298  *
1299  *		This is essentially relation_open plus a check that the relation
1300  *		is not an index or a composite type.  (The caller should also
1301  *		check that it's not a view or foreign table before assuming it has
1302  *		storage.)
1303  * ----------------
1304  */
1305 Relation
1306 heap_open(Oid relationId, LOCKMODE lockmode)
1307 {
1308 	Relation	r;
1309 
1310 	r = relation_open(relationId, lockmode);
1311 
1312 	if (r->rd_rel->relkind == RELKIND_INDEX ||
1313 		r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1314 		ereport(ERROR,
1315 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1316 				 errmsg("\"%s\" is an index",
1317 						RelationGetRelationName(r))));
1318 	else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1319 		ereport(ERROR,
1320 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1321 				 errmsg("\"%s\" is a composite type",
1322 						RelationGetRelationName(r))));
1323 
1324 	return r;
1325 }
1326 
1327 /* ----------------
1328  *		heap_openrv - open a heap relation specified
1329  *		by a RangeVar node
1330  *
1331  *		As above, but relation is specified by a RangeVar.
1332  * ----------------
1333  */
1334 Relation
1335 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1336 {
1337 	Relation	r;
1338 
1339 	r = relation_openrv(relation, lockmode);
1340 
1341 	if (r->rd_rel->relkind == RELKIND_INDEX ||
1342 		r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1343 		ereport(ERROR,
1344 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1345 				 errmsg("\"%s\" is an index",
1346 						RelationGetRelationName(r))));
1347 	else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1348 		ereport(ERROR,
1349 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1350 				 errmsg("\"%s\" is a composite type",
1351 						RelationGetRelationName(r))));
1352 
1353 	return r;
1354 }
1355 
1356 /* ----------------
1357  *		heap_openrv_extended - open a heap relation specified
1358  *		by a RangeVar node
1359  *
1360  *		As above, but optionally return NULL instead of failing for
1361  *		relation-not-found.
1362  * ----------------
1363  */
1364 Relation
1365 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1366 					 bool missing_ok)
1367 {
1368 	Relation	r;
1369 
1370 	r = relation_openrv_extended(relation, lockmode, missing_ok);
1371 
1372 	if (r)
1373 	{
1374 		if (r->rd_rel->relkind == RELKIND_INDEX ||
1375 			r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1376 			ereport(ERROR,
1377 					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1378 					 errmsg("\"%s\" is an index",
1379 							RelationGetRelationName(r))));
1380 		else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1381 			ereport(ERROR,
1382 					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1383 					 errmsg("\"%s\" is a composite type",
1384 							RelationGetRelationName(r))));
1385 	}
1386 
1387 	return r;
1388 }
1389 
1390 
1391 /* ----------------
1392  *		heap_beginscan	- begin relation scan
1393  *
1394  * heap_beginscan is the "standard" case.
1395  *
1396  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1397  *
1398  * heap_beginscan_strat offers an extended API that lets the caller control
1399  * whether a nondefault buffer access strategy can be used, and whether
1400  * syncscan can be chosen (possibly resulting in the scan not starting from
1401  * block zero).  Both of these default to true with plain heap_beginscan.
1402  *
1403  * heap_beginscan_bm is an alternative entry point for setting up a
1404  * HeapScanDesc for a bitmap heap scan.  Although that scan technology is
1405  * really quite unlike a standard seqscan, there is just enough commonality
1406  * to make it worth using the same data structure.
1407  *
1408  * heap_beginscan_sampling is an alternative entry point for setting up a
1409  * HeapScanDesc for a TABLESAMPLE scan.  As with bitmap scans, it's worth
1410  * using the same data structure although the behavior is rather different.
1411  * In addition to the options offered by heap_beginscan_strat, this call
1412  * also allows control of whether page-mode visibility checking is used.
1413  * ----------------
1414  */
1415 HeapScanDesc
1416 heap_beginscan(Relation relation, Snapshot snapshot,
1417 			   int nkeys, ScanKey key)
1418 {
1419 	return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1420 								   true, true, true, false, false, false);
1421 }
1422 
1423 HeapScanDesc
1424 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1425 {
1426 	Oid			relid = RelationGetRelid(relation);
1427 	Snapshot	snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1428 
1429 	return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1430 								   true, true, true, false, false, true);
1431 }
1432 
1433 HeapScanDesc
1434 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1435 					 int nkeys, ScanKey key,
1436 					 bool allow_strat, bool allow_sync)
1437 {
1438 	return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1439 								   allow_strat, allow_sync, true,
1440 								   false, false, false);
1441 }
1442 
1443 HeapScanDesc
1444 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1445 				  int nkeys, ScanKey key)
1446 {
1447 	return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1448 								   false, false, true, true, false, false);
1449 }
1450 
1451 HeapScanDesc
1452 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1453 						int nkeys, ScanKey key,
1454 						bool allow_strat, bool allow_sync, bool allow_pagemode)
1455 {
1456 	return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1457 								   allow_strat, allow_sync, allow_pagemode,
1458 								   false, true, false);
1459 }
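/*
 * Illustrative sketch: a caller that needs the scan to start at block zero
 * and to bypass the bulk-read strategy (for example, to read the table in
 * physical order) would disable both options via the extended entry point:
 *
 *		scan = heap_beginscan_strat(rel, snapshot, 0, NULL, false, false);
 *
 * passing allow_strat = false and allow_sync = false.
 */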
1460 
1461 static HeapScanDesc
1462 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1463 						int nkeys, ScanKey key,
1464 						ParallelHeapScanDesc parallel_scan,
1465 						bool allow_strat,
1466 						bool allow_sync,
1467 						bool allow_pagemode,
1468 						bool is_bitmapscan,
1469 						bool is_samplescan,
1470 						bool temp_snap)
1471 {
1472 	HeapScanDesc scan;
1473 
1474 	/*
1475 	 * increment relation ref count while scanning relation
1476 	 *
1477 	 * This is just to make really sure the relcache entry won't go away while
1478 	 * the scan has a pointer to it.  Caller should be holding the rel open
1479 	 * anyway, so this is redundant in all normal scenarios...
1480 	 */
1481 	RelationIncrementReferenceCount(relation);
1482 
1483 	/*
1484 	 * allocate and initialize scan descriptor
1485 	 */
1486 	scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1487 
1488 	scan->rs_rd = relation;
1489 	scan->rs_snapshot = snapshot;
1490 	scan->rs_nkeys = nkeys;
1491 	scan->rs_bitmapscan = is_bitmapscan;
1492 	scan->rs_samplescan = is_samplescan;
1493 	scan->rs_strategy = NULL;	/* set in initscan */
1494 	scan->rs_allow_strat = allow_strat;
1495 	scan->rs_allow_sync = allow_sync;
1496 	scan->rs_temp_snap = temp_snap;
1497 	scan->rs_parallel = parallel_scan;
1498 
1499 	/*
1500 	 * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1501 	 */
1502 	scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1503 
1504 	/*
1505 	 * For a seqscan in a serializable transaction, acquire a predicate lock
1506 	 * on the entire relation. This is required not only to lock all the
1507 	 * matching tuples, but also to conflict with new insertions into the
1508 	 * table. In an indexscan, we take page locks on the index pages covering
1509 	 * the range specified in the scan qual, but in a heap scan there is
1510 	 * nothing more fine-grained to lock. A bitmap scan is a different story,
1511 	 * there we have already scanned the index and locked the index pages
1512 	 * covering the predicate. But in that case we still have to lock any
1513 	 * matching heap tuples.
1514 	 */
1515 	if (!is_bitmapscan)
1516 		PredicateLockRelation(relation, snapshot);
1517 
1518 	/* we only need to set this up once */
1519 	scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1520 
1521 	/*
1522 	 * we do this here instead of in initscan() because heap_rescan also calls
1523 	 * initscan() and we don't want to allocate memory again
1524 	 */
1525 	if (nkeys > 0)
1526 		scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1527 	else
1528 		scan->rs_key = NULL;
1529 
1530 	initscan(scan, key, false);
1531 
1532 	return scan;
1533 }
1534 
1535 /* ----------------
1536  *		heap_rescan		- restart a relation scan
1537  * ----------------
1538  */
1539 void
1540 heap_rescan(HeapScanDesc scan,
1541 			ScanKey key)
1542 {
1543 	/*
1544 	 * unpin scan buffers
1545 	 */
1546 	if (BufferIsValid(scan->rs_cbuf))
1547 		ReleaseBuffer(scan->rs_cbuf);
1548 
1549 	/*
1550 	 * reinitialize scan descriptor
1551 	 */
1552 	initscan(scan, key, true);
1553 }
1554 
1555 /* ----------------
1556  *		heap_rescan_set_params	- restart a relation scan after changing params
1557  *
1558  * This call allows changing the buffer strategy, syncscan, and pagemode
1559  * options before starting a fresh scan.  Note that although the actual use
1560  * of syncscan might change (effectively, enabling or disabling reporting),
1561  * the previously selected startblock will be kept.
1562  * ----------------
1563  */
1564 void
1565 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1566 					   bool allow_strat, bool allow_sync, bool allow_pagemode)
1567 {
1568 	/* adjust parameters */
1569 	scan->rs_allow_strat = allow_strat;
1570 	scan->rs_allow_sync = allow_sync;
1571 	scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1572 	/* ... and rescan */
1573 	heap_rescan(scan, key);
1574 }
1575 
1576 /* ----------------
1577  *		heap_endscan	- end relation scan
1578  *
1579  *		See how to integrate with index scans.
1580  *		Check handling of reldesc caching.
1581  * ----------------
1582  */
1583 void
1584 heap_endscan(HeapScanDesc scan)
1585 {
1586 	/* Note: no locking manipulations needed */
1587 
1588 	/*
1589 	 * unpin scan buffers
1590 	 */
1591 	if (BufferIsValid(scan->rs_cbuf))
1592 		ReleaseBuffer(scan->rs_cbuf);
1593 
1594 	/*
1595 	 * decrement relation reference count and free scan descriptor storage
1596 	 */
1597 	RelationDecrementReferenceCount(scan->rs_rd);
1598 
1599 	if (scan->rs_key)
1600 		pfree(scan->rs_key);
1601 
1602 	if (scan->rs_strategy != NULL)
1603 		FreeAccessStrategy(scan->rs_strategy);
1604 
1605 	if (scan->rs_temp_snap)
1606 		UnregisterSnapshot(scan->rs_snapshot);
1607 
1608 	pfree(scan);
1609 }
1610 
1611 /* ----------------
1612  *		heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1613  *
1614  *		Sadly, this doesn't reduce to a constant, because the size required
1615  *		to serialize the snapshot can vary.
1616  * ----------------
1617  */
1618 Size
1619 heap_parallelscan_estimate(Snapshot snapshot)
1620 {
1621 	return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1622 					EstimateSnapshotSpace(snapshot));
1623 }
1624 
1625 /* ----------------
1626  *		heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1627  *
1628  *		The caller must provide as many bytes of shared memory as returned by
1629  *		heap_parallelscan_estimate.  Call this just once in the leader
1630  *		process; then, individual workers attach via heap_beginscan_parallel.
1631  * ----------------
1632  */
1633 void
1634 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1635 							 Snapshot snapshot)
1636 {
1637 	target->phs_relid = RelationGetRelid(relation);
1638 	target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1639 	/* compare phs_syncscan initialization to similar logic in initscan */
1640 	target->phs_syncscan = synchronize_seqscans &&
1641 		!RelationUsesLocalBuffers(relation) &&
1642 		target->phs_nblocks > NBuffers / 4;
1643 	SpinLockInit(&target->phs_mutex);
1644 	target->phs_startblock = InvalidBlockNumber;
1645 	pg_atomic_init_u64(&target->phs_nallocated, 0);
1646 	if (IsMVCCSnapshot(snapshot))
1647 	{
1648 		SerializeSnapshot(snapshot, target->phs_snapshot_data);
1649 		target->phs_snapshot_any = false;
1650 	}
1651 	else
1652 	{
1653 		Assert(snapshot == SnapshotAny);
1654 		target->phs_snapshot_any = true;
1655 	}
1656 }
1657 
1658 /* ----------------
1659  *		heap_parallelscan_reinitialize - reset a parallel scan
1660  *
1661  *		Call this in the leader process.  Caller is responsible for
1662  *		making sure that all workers have finished the scan beforehand.
1663  * ----------------
1664  */
1665 void
1666 heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
1667 {
1668 	pg_atomic_write_u64(&parallel_scan->phs_nallocated, 0);
1669 }
1670 
1671 /* ----------------
1672  *		heap_beginscan_parallel - join a parallel scan
1673  *
1674  *		Caller must hold a suitable lock on the correct relation.
1675  * ----------------
1676  */
1677 HeapScanDesc
1678 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1679 {
1680 	Snapshot	snapshot;
1681 
1682 	Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1683 
1684 	if (!parallel_scan->phs_snapshot_any)
1685 	{
1686 		/* Snapshot was serialized -- restore it */
1687 		snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1688 		RegisterSnapshot(snapshot);
1689 	}
1690 	else
1691 	{
1692 		/* SnapshotAny passed by caller (not serialized) */
1693 		snapshot = SnapshotAny;
1694 	}
1695 
1696 	return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1697 								   true, true, true, false, false,
1698 								   !parallel_scan->phs_snapshot_any);
1699 }
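
/*
 * Editor's sketch (not part of the backend): how a participating process
 * consumes a parallel scan set up as above.  Each participant joins with
 * heap_beginscan_parallel() and then drives the scan with heap_getnext()
 * exactly as for a non-parallel scan; page allocation is coordinated
 * through the shared ParallelHeapScanDesc.
 */
#ifdef NOT_USED
static void
example_parallel_worker_scan(Relation rel, ParallelHeapScanDesc pscan)
{
	HeapScanDesc scan = heap_beginscan_parallel(rel, pscan);
	HeapTuple	tuple;

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* process only the tuples handed to this participant */
	}

	heap_endscan(scan);
}
#endif							/* NOT_USED */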
1700 
1701 /* ----------------
1702  *		heap_parallelscan_startblock_init - find and set the scan's startblock
1703  *
1704  *		Determine where the parallel seq scan should start.  This function may
1705  *		be called many times, once by each parallel worker.  We must be careful
1706  *		only to set the startblock once.
1707  * ----------------
1708  */
1709 static void
1710 heap_parallelscan_startblock_init(HeapScanDesc scan)
1711 {
1712 	BlockNumber sync_startpage = InvalidBlockNumber;
1713 	ParallelHeapScanDesc parallel_scan;
1714 
1715 	Assert(scan->rs_parallel);
1716 	parallel_scan = scan->rs_parallel;
1717 
1718 retry:
1719 	/* Grab the spinlock. */
1720 	SpinLockAcquire(&parallel_scan->phs_mutex);
1721 
1722 	/*
1723 	 * If the scan's startblock has not yet been initialized, we must do so
1724 	 * now.  If this is not a synchronized scan, we just start at block 0, but
1725 	 * if it is a synchronized scan, we must get the starting position from
1726 	 * the synchronized scan machinery.  We can't hold the spinlock while
1727 	 * doing that, though, so release the spinlock, get the information we
1728 	 * need, and retry.  If nobody else has initialized the scan in the
1729 	 * meantime, we'll fill in the value we fetched on the second time
1730 	 * through.
1731 	 */
1732 	if (parallel_scan->phs_startblock == InvalidBlockNumber)
1733 	{
1734 		if (!parallel_scan->phs_syncscan)
1735 			parallel_scan->phs_startblock = 0;
1736 		else if (sync_startpage != InvalidBlockNumber)
1737 			parallel_scan->phs_startblock = sync_startpage;
1738 		else
1739 		{
1740 			SpinLockRelease(&parallel_scan->phs_mutex);
1741 			sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1742 			goto retry;
1743 		}
1744 	}
1745 	SpinLockRelease(&parallel_scan->phs_mutex);
1746 }
1747 
1748 /* ----------------
1749  *		heap_parallelscan_nextpage - get the next page to scan
1750  *
1751  *		Get the next page to scan.  Even if there are no pages left to scan,
1752  *		another backend could have grabbed a page to scan and not yet finished
1753  *		looking at it, so it doesn't follow that the scan is done when the
1754  *		first backend gets an InvalidBlockNumber return.
1755  * ----------------
1756  */
1757 static BlockNumber
1758 heap_parallelscan_nextpage(HeapScanDesc scan)
1759 {
1760 	BlockNumber page;
1761 	ParallelHeapScanDesc parallel_scan;
1762 	uint64		nallocated;
1763 
1764 	Assert(scan->rs_parallel);
1765 	parallel_scan = scan->rs_parallel;
1766 
1767 	/*
1768 	 * phs_nallocated tracks how many pages have been allocated to workers
1769 	 * already.  When phs_nallocated >= rs_nblocks, all blocks have been
1770 	 * allocated.
1771 	 *
1772 	 * Because we use an atomic fetch-and-add to fetch the current value, the
1773 	 * phs_nallocated counter can exceed rs_nblocks: workers keep incrementing
1774 	 * it when they try to allocate the next block even though all blocks
1775 	 * have already been allocated.  Because of that, the counter must be
1776 	 * 64 bits wide, to avoid wrapping around when rs_nblocks is close
1777 	 * to 2^32.
1778 	 *
1779 	 * The actual page to return is calculated by adding the counter to the
1780 	 * starting block number, modulo nblocks.
1781 	 */
1782 	nallocated = pg_atomic_fetch_add_u64(&parallel_scan->phs_nallocated, 1);
1783 	if (nallocated >= scan->rs_nblocks)
1784 		page = InvalidBlockNumber;	/* all blocks have been allocated */
1785 	else
1786 		page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks;
1787 
1788 	/*
1789 	 * Report scan location.  Normally, we report the current page number.
1790 	 * When we reach the end of the scan, though, we report the starting page,
1791 	 * not the ending page, just so the starting positions for later scans
1792 	 * don't slew backwards.  We only report the position at the end of the
1793 	 * scan once: subsequent callers will report nothing.
1794 	 */
1795 	if (scan->rs_syncscan)
1796 	{
1797 		if (page != InvalidBlockNumber)
1798 			ss_report_location(scan->rs_rd, page);
1799 		else if (nallocated == scan->rs_nblocks)
1800 			ss_report_location(scan->rs_rd, parallel_scan->phs_startblock);
1801 	}
1802 
1803 	return page;
1804 }
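
/*
 * Editor's note, a concrete example of the block-number arithmetic above:
 * with phs_startblock = 70 and rs_nblocks = 100, successive fetch-and-add
 * results 0, 1, 2, ... map to pages 70, 71, ..., 99, 0, 1, ..., 69, and
 * results of 100 or more (nallocated >= rs_nblocks) yield
 * InvalidBlockNumber.
 */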
1805 
1806 /* ----------------
1807  *		heap_update_snapshot
1808  *
1809  *		Update snapshot info in heap scan descriptor.
1810  * ----------------
1811  */
1812 void
1813 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
1814 {
1815 	Assert(IsMVCCSnapshot(snapshot));
1816 
1817 	RegisterSnapshot(snapshot);
1818 	scan->rs_snapshot = snapshot;
1819 	scan->rs_temp_snap = true;
1820 }
1821 
1822 /* ----------------
1823  *		heap_getnext	- retrieve next tuple in scan
1824  *
1825  *		Fix to work with index relations.
1826  *		We don't return the buffer anymore, but you can get it from the
1827  *		returned HeapTuple.
1828  * ----------------
1829  */
1830 
1831 #ifdef HEAPDEBUGALL
1832 #define HEAPDEBUG_1 \
1833 	elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1834 		 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1835 #define HEAPDEBUG_2 \
1836 	elog(DEBUG2, "heap_getnext returning EOS")
1837 #define HEAPDEBUG_3 \
1838 	elog(DEBUG2, "heap_getnext returning tuple")
1839 #else
1840 #define HEAPDEBUG_1
1841 #define HEAPDEBUG_2
1842 #define HEAPDEBUG_3
1843 #endif							/* !defined(HEAPDEBUGALL) */
1844 
1845 
1846 HeapTuple
1847 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1848 {
1849 	/* Note: no locking manipulations needed */
1850 
1851 	HEAPDEBUG_1;				/* heap_getnext( info ) */
1852 
1853 	if (scan->rs_pageatatime)
1854 		heapgettup_pagemode(scan, direction,
1855 							scan->rs_nkeys, scan->rs_key);
1856 	else
1857 		heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1858 
1859 	if (scan->rs_ctup.t_data == NULL)
1860 	{
1861 		HEAPDEBUG_2;			/* heap_getnext returning EOS */
1862 		return NULL;
1863 	}
1864 
1865 	/*
1866 	 * if we get here it means we have a new current scan tuple, so point to
1867 	 * the proper return buffer and return the tuple.
1868 	 */
1869 	HEAPDEBUG_3;				/* heap_getnext returning tuple */
1870 
1871 	pgstat_count_heap_getnext(scan->rs_rd);
1872 
1873 	return &(scan->rs_ctup);
1874 }
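
/*
 * Editor's sketch (not part of the backend): the canonical way to drive a
 * sequential scan with heap_getnext().  Assumes the caller has already
 * opened and locked the relation and that an active MVCC snapshot is
 * available; the returned tuple is only valid until the next
 * heap_getnext() call or heap_endscan().
 */
#ifdef NOT_USED
static void
example_sequential_scan(Relation rel)
{
	HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
	HeapTuple	tuple;

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* examine tuple->t_data / tuple->t_len here */
	}

	heap_endscan(scan);
}
#endif							/* NOT_USED */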
1875 
1876 /*
1877  *	heap_fetch		- retrieve tuple with given tid
1878  *
1879  * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
1880  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1881  * against the specified snapshot.
1882  *
1883  * If successful (tuple found and passes snapshot time qual), then *userbuf
1884  * is set to the buffer holding the tuple and true is returned.  The caller
1885  * must unpin the buffer when done with the tuple.
1886  *
1887  * If the tuple is not found (ie, item number references a deleted slot),
1888  * then tuple->t_data is set to NULL and false is returned.
1889  *
1890  * If the tuple is found but fails the time qual check, then false is returned
1891  * but tuple->t_data is left pointing to the tuple.
1892  *
1893  * keep_buf determines what is done with the buffer in the false-result cases.
1894  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1895  * and return it in *userbuf (so the caller must eventually unpin it); when
1896  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1897  *
1898  * stats_relation is the relation to charge the heap_fetch operation against
1899  * for statistical purposes.  (This could be the heap rel itself, an
1900  * associated index, or NULL to not count the fetch at all.)
1901  *
1902  * heap_fetch does not follow HOT chains: only the exact TID requested will
1903  * be fetched.
1904  *
1905  * It is somewhat inconsistent that we ereport() on invalid block number but
1906  * return false on invalid item number.  There are a couple of reasons though.
1907  * One is that the caller can relatively easily check the block number for
1908  * validity, but cannot check the item number without reading the page
1909  * himself.  Another is that when we are following a t_ctid link, we can be
1910  * reasonably confident that the page number is valid (since VACUUM shouldn't
1911  * truncate off the destination page without having killed the referencing
1912  * tuple first), but the item number might well not be good.
1913  */
1914 bool
1915 heap_fetch(Relation relation,
1916 		   Snapshot snapshot,
1917 		   HeapTuple tuple,
1918 		   Buffer *userbuf,
1919 		   bool keep_buf,
1920 		   Relation stats_relation)
1921 {
1922 	ItemPointer tid = &(tuple->t_self);
1923 	ItemId		lp;
1924 	Buffer		buffer;
1925 	Page		page;
1926 	OffsetNumber offnum;
1927 	bool		valid;
1928 
1929 	/*
1930 	 * Fetch and pin the appropriate page of the relation.
1931 	 */
1932 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1933 
1934 	/*
1935 	 * Need share lock on buffer to examine tuple commit status.
1936 	 */
1937 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
1938 	page = BufferGetPage(buffer);
1939 	TestForOldSnapshot(snapshot, relation, page);
1940 
1941 	/*
1942 	 * We'd better check for an out-of-range offnum in case the page has been
1943 	 * vacuumed since the TID was obtained.
1944 	 */
1945 	offnum = ItemPointerGetOffsetNumber(tid);
1946 	if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1947 	{
1948 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1949 		if (keep_buf)
1950 			*userbuf = buffer;
1951 		else
1952 		{
1953 			ReleaseBuffer(buffer);
1954 			*userbuf = InvalidBuffer;
1955 		}
1956 		tuple->t_data = NULL;
1957 		return false;
1958 	}
1959 
1960 	/*
1961 	 * get the item line pointer corresponding to the requested tid
1962 	 */
1963 	lp = PageGetItemId(page, offnum);
1964 
1965 	/*
1966 	 * Must check for deleted tuple.
1967 	 */
1968 	if (!ItemIdIsNormal(lp))
1969 	{
1970 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1971 		if (keep_buf)
1972 			*userbuf = buffer;
1973 		else
1974 		{
1975 			ReleaseBuffer(buffer);
1976 			*userbuf = InvalidBuffer;
1977 		}
1978 		tuple->t_data = NULL;
1979 		return false;
1980 	}
1981 
1982 	/*
1983 	 * fill in *tuple fields
1984 	 */
1985 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1986 	tuple->t_len = ItemIdGetLength(lp);
1987 	tuple->t_tableOid = RelationGetRelid(relation);
1988 
1989 	/*
1990 	 * check time qualification of tuple, then release lock
1991 	 */
1992 	valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1993 
1994 	if (valid)
1995 		PredicateLockTuple(relation, tuple, snapshot);
1996 
1997 	CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1998 
1999 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2000 
2001 	if (valid)
2002 	{
2003 		/*
2004 		 * All checks passed, so return the tuple as valid. Caller is now
2005 		 * responsible for releasing the buffer.
2006 		 */
2007 		*userbuf = buffer;
2008 
2009 		/* Count the successful fetch against appropriate rel, if any */
2010 		if (stats_relation != NULL)
2011 			pgstat_count_heap_fetch(stats_relation);
2012 
2013 		return true;
2014 	}
2015 
2016 	/* Tuple failed time qual, but maybe caller wants to see it anyway. */
2017 	if (keep_buf)
2018 		*userbuf = buffer;
2019 	else
2020 	{
2021 		ReleaseBuffer(buffer);
2022 		*userbuf = InvalidBuffer;
2023 	}
2024 
2025 	return false;
2026 }
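
/*
 * Editor's sketch (not part of the backend): fetching one tuple by TID with
 * heap_fetch().  "tid" is assumed to come from an index or a previous scan.
 * On success the caller owns a pin on the returned buffer and must release
 * it; with keep_buf = false nothing needs to be released on failure.
 */
#ifdef NOT_USED
static bool
example_fetch_by_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buf;

	tuple.t_self = *tid;
	if (!heap_fetch(rel, snapshot, &tuple, &buf, false, NULL))
		return false;			/* deleted slot, or failed the snapshot test */

	/* use tuple.t_data / tuple.t_len while the pin is held */
	ReleaseBuffer(buf);
	return true;
}
#endif							/* NOT_USED */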
2027 
2028 /*
2029  *	heap_hot_search_buffer	- search HOT chain for tuple satisfying snapshot
2030  *
2031  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
2032  * of a HOT chain), and buffer is the buffer holding this tuple.  We search
2033  * for the first chain member satisfying the given snapshot.  If one is
2034  * found, we update *tid to reference that tuple's offset number, and
2035  * return true.  If no match, return false without modifying *tid.
2036  *
2037  * heapTuple is a caller-supplied buffer.  When a match is found, we return
2038  * the tuple here, in addition to updating *tid.  If no match is found, the
2039  * contents of this buffer on return are undefined.
2040  *
2041  * If all_dead is not NULL, we check non-visible tuples to see if they are
2042  * globally dead; *all_dead is set true if all members of the HOT chain
2043  * are vacuumable, false if not.
2044  *
2045  * Unlike heap_fetch, the caller must already have pin and (at least) share
2046  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
2047  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
2048  */
2049 bool
2050 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
2051 					   Snapshot snapshot, HeapTuple heapTuple,
2052 					   bool *all_dead, bool first_call)
2053 {
2054 	Page		dp = (Page) BufferGetPage(buffer);
2055 	TransactionId prev_xmax = InvalidTransactionId;
2056 	BlockNumber blkno;
2057 	OffsetNumber offnum;
2058 	bool		at_chain_start;
2059 	bool		valid;
2060 	bool		skip;
2061 
2062 	/* If this is not the first call, previous call returned a (live!) tuple */
2063 	if (all_dead)
2064 		*all_dead = first_call;
2065 
2066 	blkno = ItemPointerGetBlockNumber(tid);
2067 	offnum = ItemPointerGetOffsetNumber(tid);
2068 	at_chain_start = first_call;
2069 	skip = !first_call;
2070 
2071 	Assert(TransactionIdIsValid(RecentGlobalXmin));
2072 	Assert(BufferGetBlockNumber(buffer) == blkno);
2073 
2074 	/* Scan through possible multiple members of HOT-chain */
2075 	for (;;)
2076 	{
2077 		ItemId		lp;
2078 
2079 		/* check for bogus TID */
2080 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2081 			break;
2082 
2083 		lp = PageGetItemId(dp, offnum);
2084 
2085 		/* check for unused, dead, or redirected items */
2086 		if (!ItemIdIsNormal(lp))
2087 		{
2088 			/* We should only see a redirect at start of chain */
2089 			if (ItemIdIsRedirected(lp) && at_chain_start)
2090 			{
2091 				/* Follow the redirect */
2092 				offnum = ItemIdGetRedirect(lp);
2093 				at_chain_start = false;
2094 				continue;
2095 			}
2096 			/* else must be end of chain */
2097 			break;
2098 		}
2099 
2100 		/*
2101 		 * Update heapTuple to point to the element of the HOT chain we're
2102 		 * currently investigating. Having t_self set correctly is important
2103 		 * because the SSI checks and the *Satisfies routine for historical
2104 		 * MVCC snapshots need the correct tid to decide about the visibility.
2105 		 */
2106 		heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2107 		heapTuple->t_len = ItemIdGetLength(lp);
2108 		heapTuple->t_tableOid = RelationGetRelid(relation);
2109 		ItemPointerSet(&heapTuple->t_self, blkno, offnum);
2110 
2111 		/*
2112 		 * Shouldn't see a HEAP_ONLY tuple at chain start.
2113 		 */
2114 		if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2115 			break;
2116 
2117 		/*
2118 		 * The xmin should match the previous xmax value, else chain is
2119 		 * broken.
2120 		 */
2121 		if (TransactionIdIsValid(prev_xmax) &&
2122 			!TransactionIdEquals(prev_xmax,
2123 								 HeapTupleHeaderGetXmin(heapTuple->t_data)))
2124 			break;
2125 
2126 		/*
2127 		 * When first_call is true (and thus, skip is initially false) we'll
2128 		 * return the first tuple we find.  But on later passes, heapTuple
2129 		 * will initially be pointing to the tuple we returned last time.
2130 		 * Returning it again would be incorrect (and would loop forever), so
2131 		 * we skip it and return the next match we find.
2132 		 */
2133 		if (!skip)
2134 		{
2135 			/* If it's visible per the snapshot, we must return it */
2136 			valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2137 			CheckForSerializableConflictOut(valid, relation, heapTuple,
2138 											buffer, snapshot);
2139 
2140 			if (valid)
2141 			{
2142 				ItemPointerSetOffsetNumber(tid, offnum);
2143 				PredicateLockTuple(relation, heapTuple, snapshot);
2144 				if (all_dead)
2145 					*all_dead = false;
2146 				return true;
2147 			}
2148 		}
2149 		skip = false;
2150 
2151 		/*
2152 		 * If we can't see it, maybe no one else can either.  At caller
2153 		 * request, check whether all chain members are dead to all
2154 		 * transactions.
2155 		 *
2156 		 * Note: if you change the criterion here for what is "dead", fix the
2157 		 * planner's get_actual_variable_range() function to match.
2158 		 */
2159 		if (all_dead && *all_dead &&
2160 			!HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2161 			*all_dead = false;
2162 
2163 		/*
2164 		 * Check to see if HOT chain continues past this tuple; if so fetch
2165 		 * the next offnum and loop around.
2166 		 */
2167 		if (HeapTupleIsHotUpdated(heapTuple))
2168 		{
2169 			Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2170 				   blkno);
2171 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2172 			at_chain_start = false;
2173 			prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2174 		}
2175 		else
2176 			break;				/* end of chain */
2177 	}
2178 
2179 	return false;
2180 }
2181 
2182 /*
2183  *	heap_hot_search		- search HOT chain for tuple satisfying snapshot
2184  *
2185  * This has the same API as heap_hot_search_buffer, except that the caller
2186  * does not provide the buffer containing the page, rather we access it
2187  * locally.
2188  */
2189 bool
2190 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2191 				bool *all_dead)
2192 {
2193 	bool		result;
2194 	Buffer		buffer;
2195 	HeapTupleData heapTuple;
2196 
2197 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2198 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
2199 	result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2200 									&heapTuple, all_dead, true);
2201 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2202 	ReleaseBuffer(buffer);
2203 	return result;
2204 }
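
/*
 * Editor's sketch (not part of the backend): typical index-side use of
 * heap_hot_search().  The TID of a HOT chain root comes from an index
 * entry; on success *tid is updated to the visible chain member, and when
 * nothing is visible, all_dead tells the caller whether the index entry
 * points at a chain that is entirely dead.
 */
#ifdef NOT_USED
static void
example_check_hot_chain(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	bool		all_dead;

	if (heap_hot_search(tid, rel, snapshot, &all_dead))
	{
		/* *tid now points at the visible member of the chain */
	}
	else if (all_dead)
	{
		/* every chain member is dead to everyone; the index entry is stale */
	}
}
#endif							/* NOT_USED */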
2205 
2206 /*
2207  *	heap_get_latest_tid -  get the latest tid of a specified tuple
2208  *
2209  * Actually, this gets the latest version that is visible according to
2210  * the passed snapshot.  You can pass SnapshotDirty to get the very latest,
2211  * possibly uncommitted version.
2212  *
2213  * *tid is both an input and an output parameter: it is updated to
2214  * show the latest version of the row.  Note that it will not be changed
2215  * if no version of the row passes the snapshot test.
2216  */
2217 void
2218 heap_get_latest_tid(Relation relation,
2219 					Snapshot snapshot,
2220 					ItemPointer tid)
2221 {
2222 	BlockNumber blk;
2223 	ItemPointerData ctid;
2224 	TransactionId priorXmax;
2225 
2226 	/* this is to avoid Assert failures on bad input */
2227 	if (!ItemPointerIsValid(tid))
2228 		return;
2229 
2230 	/*
2231 	 * Since this can be called with user-supplied TID, don't trust the input
2232 	 * too much.  (RelationGetNumberOfBlocks is an expensive check, so we
2233 	 * don't check t_ctid links again this way.  Note that it would not do to
2234 	 * call it just once and save the result, either.)
2235 	 */
2236 	blk = ItemPointerGetBlockNumber(tid);
2237 	if (blk >= RelationGetNumberOfBlocks(relation))
2238 		elog(ERROR, "block number %u is out of range for relation \"%s\"",
2239 			 blk, RelationGetRelationName(relation));
2240 
2241 	/*
2242 	 * Loop to chase down t_ctid links.  At top of loop, ctid is the tuple we
2243 	 * need to examine, and *tid is the TID we will return if ctid turns out
2244 	 * to be bogus.
2245 	 *
2246 	 * Note that we will loop until we reach the end of the t_ctid chain.
2247 	 * Depending on the snapshot passed, there might be at most one visible
2248 	 * version of the row, but we don't try to optimize for that.
2249 	 */
2250 	ctid = *tid;
2251 	priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
2252 	for (;;)
2253 	{
2254 		Buffer		buffer;
2255 		Page		page;
2256 		OffsetNumber offnum;
2257 		ItemId		lp;
2258 		HeapTupleData tp;
2259 		bool		valid;
2260 
2261 		/*
2262 		 * Read, pin, and lock the page.
2263 		 */
2264 		buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2265 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
2266 		page = BufferGetPage(buffer);
2267 		TestForOldSnapshot(snapshot, relation, page);
2268 
2269 		/*
2270 		 * Check for bogus item number.  This is not treated as an error
2271 		 * condition because it can happen while following a t_ctid link. We
2272 		 * just assume that the prior tid is OK and return it unchanged.
2273 		 */
2274 		offnum = ItemPointerGetOffsetNumber(&ctid);
2275 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2276 		{
2277 			UnlockReleaseBuffer(buffer);
2278 			break;
2279 		}
2280 		lp = PageGetItemId(page, offnum);
2281 		if (!ItemIdIsNormal(lp))
2282 		{
2283 			UnlockReleaseBuffer(buffer);
2284 			break;
2285 		}
2286 
2287 		/* OK to access the tuple */
2288 		tp.t_self = ctid;
2289 		tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2290 		tp.t_len = ItemIdGetLength(lp);
2291 		tp.t_tableOid = RelationGetRelid(relation);
2292 
2293 		/*
2294 		 * After following a t_ctid link, we might arrive at an unrelated
2295 		 * tuple.  Check for XMIN match.
2296 		 */
2297 		if (TransactionIdIsValid(priorXmax) &&
2298 			!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
2299 		{
2300 			UnlockReleaseBuffer(buffer);
2301 			break;
2302 		}
2303 
2304 		/*
2305 		 * Check time qualification of tuple; if visible, set it as the new
2306 		 * result candidate.
2307 		 */
2308 		valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2309 		CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2310 		if (valid)
2311 			*tid = ctid;
2312 
2313 		/*
2314 		 * If there's a valid t_ctid link, follow it, else we're done.
2315 		 */
2316 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2317 			HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2318 			HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
2319 			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2320 		{
2321 			UnlockReleaseBuffer(buffer);
2322 			break;
2323 		}
2324 
2325 		ctid = tp.t_data->t_ctid;
2326 		priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2327 		UnlockReleaseBuffer(buffer);
2328 	}							/* end of loop */
2329 }
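
/*
 * Editor's sketch (not part of the backend): chasing a row to its newest
 * visible version.  Passing a dirty snapshot (see the comment above) would
 * return the very latest, possibly uncommitted version; here the caller's
 * MVCC snapshot is used, and *tid is left unchanged if no version is
 * visible.
 */
#ifdef NOT_USED
static void
example_latest_version(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	/* *tid is updated in place to the newest visible version, if any */
	heap_get_latest_tid(rel, snapshot, tid);
}
#endif							/* NOT_USED */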
2330 
2331 
2332 /*
2333  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2334  *
2335  * This is called after we have waited for the XMAX transaction to terminate.
2336  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2337  * be set on exit.  If the transaction committed, we set the XMAX_COMMITTED
2338  * hint bit if possible --- but beware that that may not yet be possible,
2339  * if the transaction committed asynchronously.
2340  *
2341  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2342  * even if it commits.
2343  *
2344  * Hence callers should look only at XMAX_INVALID.
2345  *
2346  * Note this is not allowed for tuples whose xmax is a multixact.
2347  */
2348 static void
2349 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2350 {
2351 	Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2352 	Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2353 
2354 	if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2355 	{
2356 		if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2357 			TransactionIdDidCommit(xid))
2358 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2359 								 xid);
2360 		else
2361 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2362 								 InvalidTransactionId);
2363 	}
2364 }
2365 
2366 
2367 /*
2368  * GetBulkInsertState - prepare status object for a bulk insert
2369  */
2370 BulkInsertState
2371 GetBulkInsertState(void)
2372 {
2373 	BulkInsertState bistate;
2374 
2375 	bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2376 	bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2377 	bistate->current_buf = InvalidBuffer;
2378 	return bistate;
2379 }
2380 
2381 /*
2382  * FreeBulkInsertState - clean up after finishing a bulk insert
2383  */
2384 void
2385 FreeBulkInsertState(BulkInsertState bistate)
2386 {
2387 	if (bistate->current_buf != InvalidBuffer)
2388 		ReleaseBuffer(bistate->current_buf);
2389 	FreeAccessStrategy(bistate->strategy);
2390 	pfree(bistate);
2391 }
2392 
2393 /*
2394  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2395  */
2396 void
2397 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2398 {
2399 	if (bistate->current_buf != InvalidBuffer)
2400 		ReleaseBuffer(bistate->current_buf);
2401 	bistate->current_buf = InvalidBuffer;
2402 }
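
/*
 * Editor's sketch (not part of the backend): the bulk-insert pattern these
 * helpers support, as used by COPY-like callers.  The BulkInsertState keeps
 * the current target buffer pinned and uses a BAS_BULKWRITE strategy across
 * many heap_insert() calls; "tuples"/"ntuples" are assumed to be prepared
 * by the caller.
 */
#ifdef NOT_USED
static void
example_bulk_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);
}
#endif							/* NOT_USED */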
2403 
2404 
2405 /*
2406  *	heap_insert		- insert tuple into a heap
2407  *
2408  * The new tuple is stamped with current transaction ID and the specified
2409  * command ID.
2410  *
2411  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2412  * logged in WAL, even for a non-temp relation.  Safe usage of this behavior
2413  * requires that we arrange that all new tuples go into new pages not
2414  * containing any tuples from other transactions, and that the relation gets
2415  * fsync'd before commit.  (See also heap_sync() comments)
2416  *
2417  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2418  * RelationGetBufferForTuple, which see for more info.
2419  *
2420  * HEAP_INSERT_FROZEN should only be specified for inserts into
2421  * relfilenodes created during the current subtransaction and when
2422  * there are no prior snapshots or pre-existing portals open.
2423  * This causes rows to be frozen, which is an MVCC violation and
2424  * requires explicit options chosen by user.
2425  *
2426  * HEAP_INSERT_SPECULATIVE is used on so-called "speculative insertions",
2427  * which can be backed out afterwards without aborting the whole transaction.
2428  * Other sessions can wait for the speculative insertion to be confirmed,
2429  * turning it into a regular tuple, or aborted, as if it never existed.
2430  * Speculatively inserted tuples behave as "value locks" of short duration,
2431  * used to implement INSERT .. ON CONFLICT.
2432  *
2433  * HEAP_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
2434  * information for the tuple. This should solely be used during table rewrites
2435  * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
2436  * relation.
2437  *
2438  * Note that most of these options will be applied when inserting into the
2439  * heap's TOAST table, too, if the tuple requires any out-of-line data.  Only
2440  * HEAP_INSERT_SPECULATIVE is explicitly ignored, as the toast data does not
2441  * partake in speculative insertion.
2442  *
2443  * The BulkInsertState object (if any; bistate can be NULL for default
2444  * behavior) is also just passed through to RelationGetBufferForTuple.
2445  *
2446  * The return value is the OID assigned to the tuple (either here or by the
2447  * caller), or InvalidOid if no OID.  The header fields of *tup are updated
2448  * to match the stored tuple; in particular tup->t_self receives the actual
2449  * TID where the tuple was stored.  But note that any toasting of fields
2450  * within the tuple data is NOT reflected into *tup.
2451  */
2452 Oid
2453 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2454 			int options, BulkInsertState bistate)
2455 {
2456 	TransactionId xid = GetCurrentTransactionId();
2457 	HeapTuple	heaptup;
2458 	Buffer		buffer;
2459 	Buffer		vmbuffer = InvalidBuffer;
2460 	bool		all_visible_cleared = false;
2461 
2462 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2463 	Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
2464 		   RelationGetNumberOfAttributes(relation));
2465 
2466 	/*
2467 	 * Fill in tuple header fields, assign an OID, and toast the tuple if
2468 	 * necessary.
2469 	 *
2470 	 * Note: below this point, heaptup is the data we actually intend to store
2471 	 * into the relation; tup is the caller's original untoasted data.
2472 	 */
2473 	heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2474 
2475 	/*
2476 	 * Find buffer to insert this tuple into.  If the page is all visible,
2477 	 * this will also pin the requisite visibility map page.
2478 	 */
2479 	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2480 									   InvalidBuffer, options, bistate,
2481 									   &vmbuffer, NULL);
2482 
2483 	/*
2484 	 * We're about to do the actual insert -- but check for conflict first, to
2485 	 * avoid possibly having to roll back work we've just done.
2486 	 *
2487 	 * This is safe without a recheck as long as there is no possibility of
2488 	 * another process scanning the page between this check and the insert
2489 	 * being visible to the scan (i.e., an exclusive buffer content lock is
2490 	 * continuously held from this point until the tuple insert is visible).
2491 	 *
2492 	 * For a heap insert, we only need to check for table-level SSI locks. Our
2493 	 * new tuple can't possibly conflict with existing tuple locks, and heap
2494 	 * page locks are only consolidated versions of tuple locks; they do not
2495 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2496 	 * buffer when making the call, which makes for a faster check.
2497 	 */
2498 	CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2499 
2500 	/* NO EREPORT(ERROR) from here till changes are logged */
2501 	START_CRIT_SECTION();
2502 
2503 	RelationPutHeapTuple(relation, buffer, heaptup,
2504 						 (options & HEAP_INSERT_SPECULATIVE) != 0);
2505 
2506 	if (PageIsAllVisible(BufferGetPage(buffer)))
2507 	{
2508 		all_visible_cleared = true;
2509 		PageClearAllVisible(BufferGetPage(buffer));
2510 		visibilitymap_clear(relation,
2511 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
2512 							vmbuffer, VISIBILITYMAP_VALID_BITS);
2513 	}
2514 
2515 	/*
2516 	 * XXX Should we set PageSetPrunable on this page ?
2517 	 *
2518 	 * The inserting transaction may eventually abort thus making this tuple
2519 	 * DEAD and hence available for pruning. Though we don't want to optimize
2520 	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2521 	 * aborted tuple will never be pruned until next vacuum is triggered.
2522 	 *
2523 	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2524 	 */
2525 
2526 	MarkBufferDirty(buffer);
2527 
2528 	/* XLOG stuff */
2529 	if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2530 	{
2531 		xl_heap_insert xlrec;
2532 		xl_heap_header xlhdr;
2533 		XLogRecPtr	recptr;
2534 		Page		page = BufferGetPage(buffer);
2535 		uint8		info = XLOG_HEAP_INSERT;
2536 		int			bufflags = 0;
2537 
2538 		/*
2539 		 * If this is a catalog, we need to transmit combocids to properly
2540 		 * decode, so log that as well.
2541 		 */
2542 		if (RelationIsAccessibleInLogicalDecoding(relation))
2543 			log_heap_new_cid(relation, heaptup);
2544 
2545 		/*
2546 		 * If this is the first and only tuple on the page, we can reinit the
2547 		 * page instead of restoring the whole thing.  Set flag, and hide
2548 		 * buffer references from XLogInsert.
2549 		 */
2550 		if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2551 			PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2552 		{
2553 			info |= XLOG_HEAP_INIT_PAGE;
2554 			bufflags |= REGBUF_WILL_INIT;
2555 		}
2556 
2557 		xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2558 		xlrec.flags = 0;
2559 		if (all_visible_cleared)
2560 			xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2561 		if (options & HEAP_INSERT_SPECULATIVE)
2562 			xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2563 		Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2564 
2565 		/*
2566 		 * For logical decoding, we need the tuple even if we're doing a full
2567 		 * page write, so make sure it's included even if we take a full-page
2568 		 * image. (XXX We could alternatively store a pointer into the FPW).
2569 		 */
2570 		if (RelationIsLogicallyLogged(relation) &&
2571 			!(options & HEAP_INSERT_NO_LOGICAL))
2572 		{
2573 			xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2574 			bufflags |= REGBUF_KEEP_DATA;
2575 		}
2576 
2577 		XLogBeginInsert();
2578 		XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2579 
2580 		xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2581 		xlhdr.t_infomask = heaptup->t_data->t_infomask;
2582 		xlhdr.t_hoff = heaptup->t_data->t_hoff;
2583 
2584 		/*
2585 		 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2586 		 * write the whole page to the xlog, we don't need to store
2587 		 * xl_heap_header in the xlog.
2588 		 */
2589 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2590 		XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2591 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2592 		XLogRegisterBufData(0,
2593 							(char *) heaptup->t_data + SizeofHeapTupleHeader,
2594 							heaptup->t_len - SizeofHeapTupleHeader);
2595 
2596 		/* filtering by origin on a row level is much more efficient */
2597 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2598 
2599 		recptr = XLogInsert(RM_HEAP_ID, info);
2600 
2601 		PageSetLSN(page, recptr);
2602 	}
2603 
2604 	END_CRIT_SECTION();
2605 
2606 	UnlockReleaseBuffer(buffer);
2607 	if (vmbuffer != InvalidBuffer)
2608 		ReleaseBuffer(vmbuffer);
2609 
2610 	/*
2611 	 * If tuple is cachable, mark it for invalidation from the caches in case
2612 	 * we abort.  Note it is OK to do this after releasing the buffer, because
2613 	 * the heaptup data structure is all in local memory, not in the shared
2614 	 * buffer.
2615 	 */
2616 	CacheInvalidateHeapTuple(relation, heaptup, NULL);
2617 
2618 	/* Note: speculative insertions are counted too, even if aborted later */
2619 	pgstat_count_heap_insert(relation, 1);
2620 
2621 	/*
2622 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
2623 	 * back to the caller's image, too.
2624 	 */
2625 	if (heaptup != tup)
2626 	{
2627 		tup->t_self = heaptup->t_self;
2628 		heap_freetuple(heaptup);
2629 	}
2630 
2631 	return HeapTupleGetOid(tup);
2632 }
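
/*
 * Editor's sketch (not part of the backend): forming and inserting a single
 * tuple with heap_insert().  "values"/"isnull" are assumed to match the
 * relation's tuple descriptor; most in-core callers that modify catalogs
 * use simple_heap_insert() (below) instead.
 */
#ifdef NOT_USED
static void
example_insert_one(Relation rel, Datum *values, bool *isnull)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);

	heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);
	/* tup->t_self now holds the TID where the tuple was stored */
	heap_freetuple(tup);
}
#endif							/* NOT_USED */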
2633 
2634 /*
2635  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2636  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2637  * Returns a toasted version of the tuple if it was toasted, or the original
2638  * tuple if not. Note that in any case, the header fields are also set in
2639  * the original tuple.
2640  */
2641 static HeapTuple
2642 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2643 					CommandId cid, int options)
2644 {
2645 	/*
2646 	 * Parallel operations are required to be strictly read-only in a parallel
2647 	 * worker.  Parallel inserts are not safe even in the leader in the
2648 	 * general case, because group locking means that heavyweight locks for
2649 	 * relation extension or GIN page locks will not conflict between members
2650 	 * of a lock group, but we don't prohibit that case here because there are
2651 	 * useful special cases that we can safely allow, such as CREATE TABLE AS.
2652 	 */
2653 	if (IsParallelWorker())
2654 		ereport(ERROR,
2655 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2656 				 errmsg("cannot insert tuples in a parallel worker")));
2657 
2658 	if (relation->rd_rel->relhasoids)
2659 	{
2660 #ifdef NOT_USED
2661 		/* this is redundant with an Assert in HeapTupleSetOid */
2662 		Assert(tup->t_data->t_infomask & HEAP_HASOID);
2663 #endif
2664 
2665 		/*
2666 		 * If the object id of this tuple has already been assigned, trust the
2667 		 * caller.  There are a couple of ways this can happen.  At initial db
2668 		 * creation, the backend program sets oids for tuples. When we define
2669 		 * an index, we set the oid.  Finally, in the future, we may allow
2670 		 * users to set their own object ids in order to support a persistent
2671 		 * object store (objects need to contain pointers to one another).
2672 		 */
2673 		if (!OidIsValid(HeapTupleGetOid(tup)))
2674 			HeapTupleSetOid(tup, GetNewOid(relation));
2675 	}
2676 	else
2677 	{
2678 		/* check that there is no space for an OID */
2679 		Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2680 	}
2681 
2682 	tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2683 	tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2684 	tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2685 	HeapTupleHeaderSetXmin(tup->t_data, xid);
2686 	if (options & HEAP_INSERT_FROZEN)
2687 		HeapTupleHeaderSetXminFrozen(tup->t_data);
2688 
2689 	HeapTupleHeaderSetCmin(tup->t_data, cid);
2690 	HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2691 	tup->t_tableOid = RelationGetRelid(relation);
2692 
2693 	/*
2694 	 * If the new tuple is too big for storage or contains already toasted
2695 	 * out-of-line attributes from some other relation, invoke the toaster.
2696 	 */
2697 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
2698 		relation->rd_rel->relkind != RELKIND_MATVIEW)
2699 	{
2700 		/* toast table entries should never be recursively toasted */
2701 		Assert(!HeapTupleHasExternal(tup));
2702 		return tup;
2703 	}
2704 	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2705 		return toast_insert_or_update(relation, tup, NULL, options);
2706 	else
2707 		return tup;
2708 }
2709 
2710 /*
2711  *	heap_multi_insert	- insert multiple tuples into a heap
2712  *
2713  * This is like heap_insert(), but inserts multiple tuples in one operation.
2714  * That's faster than calling heap_insert() in a loop, because when multiple
2715  * tuples can be inserted on a single page, we can write just a single WAL
2716  * record covering all of them, and only need to lock/unlock the page once.
2717  *
2718  * Note: this leaks memory into the current memory context. You can create a
2719  * temporary context before calling this, if that's a problem.
2720  */
2721 void
2722 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2723 				  CommandId cid, int options, BulkInsertState bistate)
2724 {
2725 	TransactionId xid = GetCurrentTransactionId();
2726 	HeapTuple  *heaptuples;
2727 	int			i;
2728 	int			ndone;
2729 	PGAlignedBlock scratch;
2730 	Page		page;
2731 	bool		needwal;
2732 	Size		saveFreeSpace;
2733 	bool		need_tuple_data = RelationIsLogicallyLogged(relation);
2734 	bool		need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2735 
2736 	/* currently not needed (thus unsupported) for heap_multi_insert() */
2737 	AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2738 
2739 	needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2740 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2741 												   HEAP_DEFAULT_FILLFACTOR);
2742 
2743 	/* Toast and set header data in all the tuples */
2744 	heaptuples = palloc(ntuples * sizeof(HeapTuple));
2745 	for (i = 0; i < ntuples; i++)
2746 		heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2747 											xid, cid, options);
2748 
2749 	/*
2750 	 * We're about to do the actual inserts -- but check for conflict first,
2751 	 * to minimize the possibility of having to roll back work we've just
2752 	 * done.
2753 	 *
2754 	 * A check here does not definitively prevent a serialization anomaly;
2755 	 * that check MUST be done at least past the point of acquiring an
2756 	 * exclusive buffer content lock on every buffer that will be affected,
2757 	 * and MAY be done after all inserts are reflected in the buffers and
2758 	 * those locks are released; otherwise there is a race condition.  Since
2759 	 * multiple buffers can be locked and unlocked in the loop below, and it
2760 	 * would not be feasible to identify and lock all of those buffers before
2761 	 * the loop, we must do a final check at the end.
2762 	 *
2763 	 * The check here could be omitted with no loss of correctness; it is
2764 	 * present strictly as an optimization.
2765 	 *
2766 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2767 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2768 	 * page locks are only consolidated versions of tuple locks; they do not
2769 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2770 	 * buffer when making the call, which makes for a faster check.
2771 	 */
2772 	CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2773 
2774 	ndone = 0;
2775 	while (ndone < ntuples)
2776 	{
2777 		Buffer		buffer;
2778 		Buffer		vmbuffer = InvalidBuffer;
2779 		bool		all_visible_cleared = false;
2780 		int			nthispage;
2781 
2782 		CHECK_FOR_INTERRUPTS();
2783 
2784 		/*
2785 		 * Find buffer where at least the next tuple will fit.  If the page is
2786 		 * all-visible, this will also pin the requisite visibility map page.
2787 		 */
2788 		buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2789 										   InvalidBuffer, options, bistate,
2790 										   &vmbuffer, NULL);
2791 		page = BufferGetPage(buffer);
2792 
2793 		/* NO EREPORT(ERROR) from here till changes are logged */
2794 		START_CRIT_SECTION();
2795 
2796 		/*
2797 		 * RelationGetBufferForTuple has ensured that the first tuple fits.
2798 		 * Put that on the page, and then as many other tuples as fit.
2799 		 */
2800 		RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2801 		for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2802 		{
2803 			HeapTuple	heaptup = heaptuples[ndone + nthispage];
2804 
2805 			if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2806 				break;
2807 
2808 			RelationPutHeapTuple(relation, buffer, heaptup, false);
2809 
2810 			/*
2811 			 * We don't use heap_multi_insert for catalog tuples yet, but
2812 			 * better be prepared...
2813 			 */
2814 			if (needwal && need_cids)
2815 				log_heap_new_cid(relation, heaptup);
2816 		}
2817 
2818 		if (PageIsAllVisible(page))
2819 		{
2820 			all_visible_cleared = true;
2821 			PageClearAllVisible(page);
2822 			visibilitymap_clear(relation,
2823 								BufferGetBlockNumber(buffer),
2824 								vmbuffer, VISIBILITYMAP_VALID_BITS);
2825 		}
2826 
2827 		/*
2828 		 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2829 		 */
2830 
2831 		MarkBufferDirty(buffer);
2832 
2833 		/* XLOG stuff */
2834 		if (needwal)
2835 		{
2836 			XLogRecPtr	recptr;
2837 			xl_heap_multi_insert *xlrec;
2838 			uint8		info = XLOG_HEAP2_MULTI_INSERT;
2839 			char	   *tupledata;
2840 			int			totaldatalen;
2841 			char	   *scratchptr = scratch.data;
2842 			bool		init;
2843 			int			bufflags = 0;
2844 
2845 			/*
2846 			 * If the page was previously empty, we can reinit the page
2847 			 * instead of restoring the whole thing.
2848 			 */
2849 			init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2850 					PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2851 
2852 			/* allocate xl_heap_multi_insert struct from the scratch area */
2853 			xlrec = (xl_heap_multi_insert *) scratchptr;
2854 			scratchptr += SizeOfHeapMultiInsert;
2855 
2856 			/*
2857 			 * Allocate the offsets array, unless we're reinitializing the
2858 			 * page, in which case the tuples are stored in order starting at
2859 			 * FirstOffsetNumber and we don't need to store the offsets
2860 			 * explicitly.
2861 			 */
2862 			if (!init)
2863 				scratchptr += nthispage * sizeof(OffsetNumber);
2864 
2865 			/* the rest of the scratch space is used for tuple data */
2866 			tupledata = scratchptr;
2867 
2868 			xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2869 			xlrec->ntuples = nthispage;
2870 
2871 			/*
2872 			 * Write out an xl_multi_insert_tuple and the tuple data itself
2873 			 * for each tuple.
2874 			 */
2875 			for (i = 0; i < nthispage; i++)
2876 			{
2877 				HeapTuple	heaptup = heaptuples[ndone + i];
2878 				xl_multi_insert_tuple *tuphdr;
2879 				int			datalen;
2880 
2881 				if (!init)
2882 					xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2883 				/* xl_multi_insert_tuple needs two-byte alignment. */
2884 				tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2885 				scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2886 
2887 				tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2888 				tuphdr->t_infomask = heaptup->t_data->t_infomask;
2889 				tuphdr->t_hoff = heaptup->t_data->t_hoff;
2890 
2891 				/* write bitmap [+ padding] [+ oid] + data */
2892 				datalen = heaptup->t_len - SizeofHeapTupleHeader;
2893 				memcpy(scratchptr,
2894 					   (char *) heaptup->t_data + SizeofHeapTupleHeader,
2895 					   datalen);
2896 				tuphdr->datalen = datalen;
2897 				scratchptr += datalen;
2898 			}
2899 			totaldatalen = scratchptr - tupledata;
2900 			Assert((scratchptr - scratch.data) < BLCKSZ);
2901 
2902 			if (need_tuple_data)
2903 				xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2904 
2905 			/*
2906 			 * Signal that this is the last xl_heap_multi_insert record
2907 			 * emitted by this call to heap_multi_insert(). Needed for logical
2908 			 * decoding so it knows when to cleanup temporary data.
2909 			 */
2910 			if (ndone + nthispage == ntuples)
2911 				xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2912 
2913 			if (init)
2914 			{
2915 				info |= XLOG_HEAP_INIT_PAGE;
2916 				bufflags |= REGBUF_WILL_INIT;
2917 			}
2918 
2919 			/*
2920 			 * If we're doing logical decoding, include the new tuple data
2921 			 * even if we take a full-page image of the page.
2922 			 */
2923 			if (need_tuple_data)
2924 				bufflags |= REGBUF_KEEP_DATA;
2925 
2926 			XLogBeginInsert();
2927 			XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2928 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2929 
2930 			XLogRegisterBufData(0, tupledata, totaldatalen);
2931 
2932 			/* filtering by origin on a row level is much more efficient */
2933 			XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2934 
2935 			recptr = XLogInsert(RM_HEAP2_ID, info);
2936 
2937 			PageSetLSN(page, recptr);
2938 		}
2939 
2940 		END_CRIT_SECTION();
2941 
2942 		UnlockReleaseBuffer(buffer);
2943 		if (vmbuffer != InvalidBuffer)
2944 			ReleaseBuffer(vmbuffer);
2945 
2946 		ndone += nthispage;
2947 	}
2948 
2949 	/*
2950 	 * We're done with the actual inserts.  Check for conflicts again, to
2951 	 * ensure that all rw-conflicts in to these inserts are detected.  Without
2952 	 * this final check, a sequential scan of the heap may have locked the
2953 	 * table after the "before" check, missing one opportunity to detect the
2954 	 * conflict, and then scanned the table before the new tuples were there,
2955 	 * missing the other chance to detect the conflict.
2956 	 *
2957 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2958 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2959 	 * page locks are only consolidated versions of tuple locks; they do not
2960 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2961 	 * buffer when making the call.
2962 	 */
2963 	CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2964 
2965 	/*
2966 	 * If tuples are cachable, mark them for invalidation from the caches in
2967 	 * case we abort.  Note it is OK to do this after releasing the buffer,
2968 	 * because the heaptuples data structure is all in local memory, not in
2969 	 * the shared buffer.
2970 	 */
2971 	if (IsCatalogRelation(relation))
2972 	{
2973 		for (i = 0; i < ntuples; i++)
2974 			CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2975 	}
2976 
2977 	/*
2978 	 * Copy t_self fields back to the caller's original tuples. This does
2979 	 * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2980 	 * probably faster to always copy than check.
2981 	 */
2982 	for (i = 0; i < ntuples; i++)
2983 		tuples[i]->t_self = heaptuples[i]->t_self;
2984 
2985 	pgstat_count_heap_insert(relation, ntuples);
2986 }
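
/*
 * Editor's sketch (not part of the backend): batching inserts through
 * heap_multi_insert() instead of a heap_insert() loop, so that tuples
 * placed on the same page share one WAL record and one page lock.  The
 * caller is assumed to have prepared the tuple array (as COPY does) and may
 * pass a BulkInsertState for better buffer usage, or NULL for the default.
 */
#ifdef NOT_USED
static void
example_multi_insert(Relation rel, HeapTuple *tuples, int ntuples,
					 BulkInsertState bistate)
{
	heap_multi_insert(rel, tuples, ntuples, GetCurrentCommandId(true),
					  0, bistate);
	/* each tuples[i]->t_self has been updated to its stored TID */
}
#endif							/* NOT_USED */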
2987 
2988 /*
2989  *	simple_heap_insert - insert a tuple
2990  *
2991  * Currently, this routine differs from heap_insert only in supplying
2992  * a default command ID and not allowing access to the speedup options.
2993  *
2994  * This should be used rather than using heap_insert directly in most places
2995  * where we are modifying system catalogs.
2996  */
2997 Oid
2998 simple_heap_insert(Relation relation, HeapTuple tup)
2999 {
3000 	return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
3001 }
3002 
3003 /*
3004  * Given infomask/infomask2, compute the bits that must be saved in the
3005  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
3006  * xl_heap_lock_updated WAL records.
3007  *
3008  * See fix_infomask_from_infobits.
3009  */
3010 static uint8
3011 compute_infobits(uint16 infomask, uint16 infomask2)
3012 {
3013 	return
3014 		((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
3015 		((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
3016 		((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
3017 	/* note we ignore HEAP_XMAX_SHR_LOCK here */
3018 		((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
3019 		((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
3020 		 XLHL_KEYS_UPDATED : 0);
3021 }
3022 
3023 /*
3024  * Given two versions of the same t_infomask for a tuple, compare them and
3025  * return whether the relevant status for a tuple Xmax has changed.  This is
3026  * used after a buffer lock has been released and reacquired: we want to ensure
3027  * that the tuple state continues to be the same it was when we previously
3028  * examined it.
3029  *
3030  * Note the Xmax field itself must be compared separately.
3031  */
3032 static inline bool
3033 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
3034 {
3035 	const uint16 interesting =
3036 	HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
3037 
3038 	if ((new_infomask & interesting) != (old_infomask & interesting))
3039 		return true;
3040 
3041 	return false;
3042 }
3043 
3044 /*
3045  *	heap_delete - delete a tuple
3046  *
3047  * NB: do not call this directly unless you are prepared to deal with
3048  * concurrent-update conditions.  Use simple_heap_delete instead.
3049  *
3050  *	relation - table to be modified (caller must hold suitable lock)
3051  *	tid - TID of tuple to be deleted
3052  *	cid - delete command ID (used for visibility test, and stored into
3053  *		cmax if successful)
3054  *	crosscheck - if not InvalidSnapshot, also check tuple against this
3055  *	wait - true if should wait for any conflicting update to commit/abort
3056  *	hufd - output parameter, filled in failure cases (see below)
3057  *	changingPart - true iff the tuple is being moved to another partition
3058  *		table due to an update of the partition key. Otherwise, false.
3059  *
3060  * Normal, successful return value is HeapTupleMayBeUpdated, which
3061  * actually means we did delete it.  Failure return codes are
3062  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3063  * (the last only possible if wait == false).
3064  *
3065  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3066  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3067  * (the last only for HeapTupleSelfUpdated, since we
3068  * cannot obtain cmax from a combocid generated by another transaction).
3069  * See comments for struct HeapUpdateFailureData for additional info.
3070  */
3071 HTSU_Result
3072 heap_delete(Relation relation, ItemPointer tid,
3073 			CommandId cid, Snapshot crosscheck, bool wait,
3074 			HeapUpdateFailureData *hufd, bool changingPart)
3075 {
3076 	HTSU_Result result;
3077 	TransactionId xid = GetCurrentTransactionId();
3078 	ItemId		lp;
3079 	HeapTupleData tp;
3080 	Page		page;
3081 	BlockNumber block;
3082 	Buffer		buffer;
3083 	Buffer		vmbuffer = InvalidBuffer;
3084 	TransactionId new_xmax;
3085 	uint16		new_infomask,
3086 				new_infomask2;
3087 	bool		have_tuple_lock = false;
3088 	bool		iscombo;
3089 	bool		all_visible_cleared = false;
3090 	HeapTuple	old_key_tuple = NULL;	/* replica identity of the tuple */
3091 	bool		old_key_copied = false;
3092 
3093 	Assert(ItemPointerIsValid(tid));
3094 
3095 	/*
3096 	 * Forbid this during a parallel operation, lest it allocate a combocid.
3097 	 * Other workers might need that combocid for visibility checks, and we
3098 	 * have no provision for broadcasting it to them.
3099 	 */
3100 	if (IsInParallelMode())
3101 		ereport(ERROR,
3102 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3103 				 errmsg("cannot delete tuples during a parallel operation")));
3104 
3105 	block = ItemPointerGetBlockNumber(tid);
3106 	buffer = ReadBuffer(relation, block);
3107 	page = BufferGetPage(buffer);
3108 
3109 	/*
3110 	 * Before locking the buffer, pin the visibility map page if it appears to
3111 	 * be necessary.  Since we haven't got the lock yet, someone else might be
3112 	 * in the middle of changing this, so we'll need to recheck after we have
3113 	 * the lock.
3114 	 */
3115 	if (PageIsAllVisible(page))
3116 		visibilitymap_pin(relation, block, &vmbuffer);
3117 
3118 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3119 
3120 	/*
3121 	 * If we didn't pin the visibility map page and the page has become all
3122 	 * visible while we were busy locking the buffer, we'll have to unlock and
3123 	 * re-lock, to avoid holding the buffer lock across an I/O.  That's a bit
3124 	 * unfortunate, but hopefully shouldn't happen often.
3125 	 */
3126 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3127 	{
3128 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3129 		visibilitymap_pin(relation, block, &vmbuffer);
3130 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3131 	}
3132 
3133 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3134 	Assert(ItemIdIsNormal(lp));
3135 
3136 	tp.t_tableOid = RelationGetRelid(relation);
3137 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3138 	tp.t_len = ItemIdGetLength(lp);
3139 	tp.t_self = *tid;
3140 
3141 l1:
3142 	result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3143 
3144 	if (result == HeapTupleInvisible)
3145 	{
3146 		UnlockReleaseBuffer(buffer);
3147 		ereport(ERROR,
3148 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3149 				 errmsg("attempted to delete invisible tuple")));
3150 	}
3151 	else if (result == HeapTupleBeingUpdated && wait)
3152 	{
3153 		TransactionId xwait;
3154 		uint16		infomask;
3155 
3156 		/* must copy state data before unlocking buffer */
3157 		xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3158 		infomask = tp.t_data->t_infomask;
3159 
3160 		/*
3161 		 * Sleep until concurrent transaction ends -- except when there's a
3162 		 * single locker and it's our own transaction.  Note we don't care
3163 		 * which lock mode the locker has, because we need the strongest one.
3164 		 *
3165 		 * Before sleeping, we need to acquire tuple lock to establish our
3166 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
3167 		 * release us when we are next-in-line for the tuple.
3168 		 *
3169 		 * If we are forced to "start over" below, we keep the tuple lock;
3170 		 * this arranges that we stay at the head of the line while rechecking
3171 		 * tuple state.
3172 		 */
3173 		if (infomask & HEAP_XMAX_IS_MULTI)
3174 		{
3175 			bool		current_is_member = false;
3176 
3177 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3178 										LockTupleExclusive, &current_is_member))
3179 			{
3180 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3181 
3182 				/*
3183 				 * Acquire the lock, if necessary (but skip it when we're
3184 				 * requesting a lock and already have one; avoids deadlock).
3185 				 */
3186 				if (!current_is_member)
3187 					heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3188 										 LockWaitBlock, &have_tuple_lock);
3189 
3190 				/* wait for multixact */
3191 				MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3192 								relation, &(tp.t_self), XLTW_Delete,
3193 								NULL);
3194 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3195 
3196 				/*
3197 				 * If xwait had just locked the tuple then some other xact
3198 				 * could update this tuple before we get to this point.  Check
3199 				 * for xmax change, and start over if so.
3200 				 */
3201 				if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3202 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3203 										 xwait))
3204 					goto l1;
3205 			}
3206 
3207 			/*
3208 			 * You might think the multixact is necessarily done here, but not
3209 			 * so: it could have surviving members, namely our own xact or
3210 			 * other subxacts of this backend.  It is legal for us to delete
3211 			 * the tuple in either case, however (the latter case is
3212 			 * essentially a situation of upgrading our former shared lock to
3213 			 * exclusive).  We don't bother changing the on-disk hint bits
3214 			 * since we are about to overwrite the xmax altogether.
3215 			 */
3216 		}
3217 		else if (!TransactionIdIsCurrentTransactionId(xwait))
3218 		{
3219 			/*
3220 			 * Wait for regular transaction to end; but first, acquire tuple
3221 			 * lock.
3222 			 */
3223 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3224 			heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3225 								 LockWaitBlock, &have_tuple_lock);
3226 			XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3227 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3228 
3229 			/*
3230 			 * xwait is done, but if xwait had just locked the tuple then some
3231 			 * other xact could update this tuple before we get to this point.
3232 			 * Check for xmax change, and start over if so.
3233 			 */
3234 			if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3235 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3236 									 xwait))
3237 				goto l1;
3238 
3239 			/* Otherwise check if it committed or aborted */
3240 			UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3241 		}
3242 
3243 		/*
3244 		 * We may overwrite if previous xmax aborted, or if it committed but
3245 		 * only locked the tuple without updating it.
3246 		 */
3247 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3248 			HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3249 			HeapTupleHeaderIsOnlyLocked(tp.t_data))
3250 			result = HeapTupleMayBeUpdated;
3251 		else
3252 			result = HeapTupleUpdated;
3253 	}
3254 
3255 	if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3256 	{
3257 		/* Perform additional check for transaction-snapshot mode RI updates */
3258 		if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3259 			result = HeapTupleUpdated;
3260 	}
3261 
3262 	if (result != HeapTupleMayBeUpdated)
3263 	{
3264 		Assert(result == HeapTupleSelfUpdated ||
3265 			   result == HeapTupleUpdated ||
3266 			   result == HeapTupleBeingUpdated);
3267 		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3268 		hufd->ctid = tp.t_data->t_ctid;
3269 		hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3270 		if (result == HeapTupleSelfUpdated)
3271 			hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3272 		else
3273 			hufd->cmax = InvalidCommandId;
3274 		UnlockReleaseBuffer(buffer);
3275 		if (have_tuple_lock)
3276 			UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3277 		if (vmbuffer != InvalidBuffer)
3278 			ReleaseBuffer(vmbuffer);
3279 		return result;
3280 	}
3281 
3282 	/*
3283 	 * We're about to do the actual delete -- check for conflict first, to
3284 	 * avoid possibly having to roll back work we've just done.
3285 	 *
3286 	 * This is safe without a recheck as long as there is no possibility of
3287 	 * another process scanning the page between this check and the delete
3288 	 * being visible to the scan (i.e., an exclusive buffer content lock is
3289 	 * continuously held from this point until the tuple delete is visible).
3290 	 */
3291 	CheckForSerializableConflictIn(relation, &tp, buffer);
3292 
3293 	/* replace cid with a combo cid if necessary */
3294 	HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3295 
3296 	/*
3297 	 * Compute replica identity tuple before entering the critical section so
3298 	 * we don't PANIC upon a memory allocation failure.
3299 	 */
3300 	old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3301 
3302 	/*
3303 	 * If this is the first possibly-multixact-able operation in the current
3304 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3305 	 * certain that the transaction will never become a member of any older
3306 	 * MultiXactIds than that.  (We have to do this even if we end up just
3307 	 * using our own TransactionId below, since some other backend could
3308 	 * incorporate our XID into a MultiXact immediately afterwards.)
3309 	 */
3310 	MultiXactIdSetOldestMember();
3311 
3312 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3313 							  tp.t_data->t_infomask, tp.t_data->t_infomask2,
3314 							  xid, LockTupleExclusive, true,
3315 							  &new_xmax, &new_infomask, &new_infomask2);
3316 
3317 	START_CRIT_SECTION();
3318 
3319 	/*
3320 	 * If this transaction commits, the tuple will become DEAD sooner or
3321 	 * later.  Set flag that this page is a candidate for pruning once our xid
3322 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
3323 	 * the subsequent page pruning will be a no-op and the hint will be
3324 	 * cleared.
3325 	 */
3326 	PageSetPrunable(page, xid);
3327 
3328 	if (PageIsAllVisible(page))
3329 	{
3330 		all_visible_cleared = true;
3331 		PageClearAllVisible(page);
3332 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3333 							vmbuffer, VISIBILITYMAP_VALID_BITS);
3334 	}
3335 
3336 	/* store transaction information of xact deleting the tuple */
3337 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3338 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3339 	tp.t_data->t_infomask |= new_infomask;
3340 	tp.t_data->t_infomask2 |= new_infomask2;
3341 	HeapTupleHeaderClearHotUpdated(tp.t_data);
3342 	HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3343 	HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3344 	/* Make sure there is no forward chain link in t_ctid */
3345 	tp.t_data->t_ctid = tp.t_self;
3346 
3347 	/* Signal that this is actually a move into another partition */
3348 	if (changingPart)
3349 		HeapTupleHeaderSetMovedPartitions(tp.t_data);
3350 
3351 	MarkBufferDirty(buffer);
3352 
3353 	/*
3354 	 * XLOG stuff
3355 	 *
3356 	 * NB: heap_abort_speculative() uses the same xlog record and replay
3357 	 * routines.
3358 	 */
3359 	if (RelationNeedsWAL(relation))
3360 	{
3361 		xl_heap_delete xlrec;
3362 		xl_heap_header xlhdr;
3363 		XLogRecPtr	recptr;
3364 
3365 		/* For logical decoding we need combocids to properly decode the catalog */
3366 		if (RelationIsAccessibleInLogicalDecoding(relation))
3367 			log_heap_new_cid(relation, &tp);
3368 
3369 		xlrec.flags = 0;
3370 		if (all_visible_cleared)
3371 			xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
3372 		if (changingPart)
3373 			xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3374 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3375 											  tp.t_data->t_infomask2);
3376 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3377 		xlrec.xmax = new_xmax;
3378 
3379 		if (old_key_tuple != NULL)
3380 		{
3381 			if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3382 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3383 			else
3384 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3385 		}
3386 
3387 		XLogBeginInsert();
3388 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3389 
3390 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3391 
3392 		/*
3393 		 * Log replica identity of the deleted tuple if there is one
3394 		 */
3395 		if (old_key_tuple != NULL)
3396 		{
3397 			xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3398 			xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3399 			xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3400 
3401 			XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3402 			XLogRegisterData((char *) old_key_tuple->t_data
3403 							 + SizeofHeapTupleHeader,
3404 							 old_key_tuple->t_len
3405 							 - SizeofHeapTupleHeader);
3406 		}
3407 
3408 		/* filtering by origin on a row level is much more efficient */
3409 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3410 
3411 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3412 
3413 		PageSetLSN(page, recptr);
3414 	}
3415 
3416 	END_CRIT_SECTION();
3417 
3418 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3419 
3420 	if (vmbuffer != InvalidBuffer)
3421 		ReleaseBuffer(vmbuffer);
3422 
3423 	/*
3424 	 * If the tuple has toasted out-of-line attributes, we need to delete
3425 	 * those items too.  We have to do this before releasing the buffer
3426 	 * because we need to look at the contents of the tuple, but it's OK to
3427 	 * release the content lock on the buffer first.
3428 	 */
3429 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
3430 		relation->rd_rel->relkind != RELKIND_MATVIEW)
3431 	{
3432 		/* toast table entries should never be recursively toasted */
3433 		Assert(!HeapTupleHasExternal(&tp));
3434 	}
3435 	else if (HeapTupleHasExternal(&tp))
3436 		toast_delete(relation, &tp, false);
3437 
3438 	/*
3439 	 * Mark tuple for invalidation from system caches at next command
3440 	 * boundary. We have to do this before releasing the buffer because we
3441 	 * need to look at the contents of the tuple.
3442 	 */
3443 	CacheInvalidateHeapTuple(relation, &tp, NULL);
3444 
3445 	/* Now we can release the buffer */
3446 	ReleaseBuffer(buffer);
3447 
3448 	/*
3449 	 * Release the lmgr tuple lock, if we had it.
3450 	 */
3451 	if (have_tuple_lock)
3452 		UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3453 
3454 	pgstat_count_heap_delete(relation);
3455 
3456 	if (old_key_tuple != NULL && old_key_copied)
3457 		heap_freetuple(old_key_tuple);
3458 
3459 	return HeapTupleMayBeUpdated;
3460 }
3461 
3462 /*
3463  *	simple_heap_delete - delete a tuple
3464  *
3465  * This routine may be used to delete a tuple when concurrent updates of
3466  * the target tuple are not expected (for example, because we have a lock
3467  * on the relation associated with the tuple).  Any failure is reported
3468  * via ereport().
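 *
 * A minimal, hedged usage sketch ("rel" and "tuple" are hypothetical
 * caller-owned variables):
 *
 *		simple_heap_delete(rel, &tuple->t_self);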
3469  */
3470 void
3471 simple_heap_delete(Relation relation, ItemPointer tid)
3472 {
3473 	HTSU_Result result;
3474 	HeapUpdateFailureData hufd;
3475 
3476 	result = heap_delete(relation, tid,
3477 						 GetCurrentCommandId(true), InvalidSnapshot,
3478 						 true /* wait for commit */ ,
3479 						 &hufd, false /* changingPart */ );
3480 	switch (result)
3481 	{
3482 		case HeapTupleSelfUpdated:
3483 			/* Tuple was already updated in current command? */
3484 			elog(ERROR, "tuple already updated by self");
3485 			break;
3486 
3487 		case HeapTupleMayBeUpdated:
3488 			/* done successfully */
3489 			break;
3490 
3491 		case HeapTupleUpdated:
3492 			elog(ERROR, "tuple concurrently updated");
3493 			break;
3494 
3495 		default:
3496 			elog(ERROR, "unrecognized heap_delete status: %u", result);
3497 			break;
3498 	}
3499 }
3500 
3501 /*
3502  *	heap_update - replace a tuple
3503  *
3504  * NB: do not call this directly unless you are prepared to deal with
3505  * concurrent-update conditions.  Use simple_heap_update instead.
3506  *
3507  *	relation - table to be modified (caller must hold suitable lock)
3508  *	otid - TID of old tuple to be replaced
3509  *	newtup - newly constructed tuple data to store
3510  *	cid - update command ID (used for visibility test, and stored into
3511  *		cmax/cmin if successful)
3512  *	crosscheck - if not InvalidSnapshot, also check old tuple against this
3513  *	wait - true if should wait for any conflicting update to commit/abort
3514  *	hufd - output parameter, filled in failure cases (see below)
3515  *	lockmode - output parameter, filled with lock mode acquired on tuple
3516  *
3517  * Normal, successful return value is HeapTupleMayBeUpdated, which
3518  * actually means we *did* update it.  Failure return codes are
3519  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3520  * (the last only possible if wait == false).
3521  *
3522  * On success, the header fields of *newtup are updated to match the new
3523  * stored tuple; in particular, newtup->t_self is set to the TID where the
3524  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3525  * update was done.  However, any TOAST changes in the new tuple's
3526  * data are not reflected into *newtup.
3527  *
3528  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3529  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3530  * (the last only for HeapTupleSelfUpdated, since we
3531  * cannot obtain cmax from a combocid generated by another transaction).
3532  * See comments for struct HeapUpdateFailureData for additional info.
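 *
 * A hedged caller sketch, mirroring simple_heap_update() below
 * ("rel", "otid" and "newtup" are hypothetical caller-owned variables):
 *
 *		HeapUpdateFailureData hufd;
 *		LockTupleMode lockmode;
 *		HTSU_Result result;
 *
 *		result = heap_update(rel, otid, newtup, GetCurrentCommandId(true),
 *							 InvalidSnapshot, true, &hufd, &lockmode);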
3533  */
3534 HTSU_Result
3535 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3536 			CommandId cid, Snapshot crosscheck, bool wait,
3537 			HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3538 {
3539 	HTSU_Result result;
3540 	TransactionId xid = GetCurrentTransactionId();
3541 	Bitmapset  *hot_attrs;
3542 	Bitmapset  *proj_idx_attrs;
3543 	Bitmapset  *key_attrs;
3544 	Bitmapset  *id_attrs;
3545 	Bitmapset  *interesting_attrs;
3546 	Bitmapset  *modified_attrs;
3547 	ItemId		lp;
3548 	HeapTupleData oldtup;
3549 	HeapTuple	heaptup;
3550 	HeapTuple	old_key_tuple = NULL;
3551 	bool		old_key_copied = false;
3552 	Page		page;
3553 	BlockNumber block;
3554 	MultiXactStatus mxact_status;
3555 	Buffer		buffer,
3556 				newbuf,
3557 				vmbuffer = InvalidBuffer,
3558 				vmbuffer_new = InvalidBuffer;
3559 	bool		need_toast;
3560 	Size		newtupsize,
3561 				pagefree;
3562 	bool		have_tuple_lock = false;
3563 	bool		iscombo;
3564 	bool		use_hot_update = false;
3565 	bool		hot_attrs_checked = false;
3566 	bool		key_intact;
3567 	bool		all_visible_cleared = false;
3568 	bool		all_visible_cleared_new = false;
3569 	bool		checked_lockers;
3570 	bool		locker_remains;
3571 	TransactionId xmax_new_tuple,
3572 				xmax_old_tuple;
3573 	uint16		infomask_old_tuple,
3574 				infomask2_old_tuple,
3575 				infomask_new_tuple,
3576 				infomask2_new_tuple;
3577 
3578 	Assert(ItemPointerIsValid(otid));
3579 
3580 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3581 	Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3582 		   RelationGetNumberOfAttributes(relation));
3583 
3584 	/*
3585 	 * Forbid this during a parallel operation, lest it allocate a combocid.
3586 	 * Other workers might need that combocid for visibility checks, and we
3587 	 * have no provision for broadcasting it to them.
3588 	 */
3589 	if (IsInParallelMode())
3590 		ereport(ERROR,
3591 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3592 				 errmsg("cannot update tuples during a parallel operation")));
3593 
3594 	/*
3595 	 * Fetch the list of attributes to be checked for various operations.
3596 	 *
3597 	 * For HOT considerations, this is wasted effort if we fail to update or
3598 	 * have to put the new tuple on a different page.  But we must compute the
3599 	 * list before obtaining buffer lock --- in the worst case, if we are
3600 	 * doing an update on one of the relevant system catalogs, we could
3601 	 * deadlock if we try to fetch the list later.  In any case, the relcache
3602 	 * caches the data so this is usually pretty cheap.
3603 	 *
3604 	 * We also need columns used by the replica identity and columns that are
3605 	 * considered the "key" of rows in the table.
3606 	 *
3607 	 * Note that we get copies of each bitmap, so we need not worry about
3608 	 * relcache flush happening midway through.
3609 	 */
3610 	hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_HOT);
3611 	proj_idx_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_PROJ);
3612 	key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3613 	id_attrs = RelationGetIndexAttrBitmap(relation,
3614 										  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3615 	block = ItemPointerGetBlockNumber(otid);
3616 	buffer = ReadBuffer(relation, block);
3617 	page = BufferGetPage(buffer);
3618 
3619 	interesting_attrs = NULL;
3620 
3621 	/*
3622 	 * If the page is already full, there is hardly any chance of doing a HOT
3623 	 * update on this page. It might be wasteful effort to look for index
3624 	 * column updates only to later reject HOT updates for lack of space in
3625 	 * the same page. So we are conservative and only fetch hot_attrs if the
3626 	 * page is not already full. Since we are already holding a pin on the
3627 	 * buffer, there is no chance that the buffer can get cleaned up
3628 	 * concurrently and even if that was possible, in the worst case we lose a
3629 	 * chance to do a HOT update.
3630 	 */
3631 	if (!PageIsFull(page))
3632 	{
3633 		interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3634 		interesting_attrs = bms_add_members(interesting_attrs, proj_idx_attrs);
3635 		hot_attrs_checked = true;
3636 	}
3637 	interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3638 	interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3639 
3640 	/*
3641 	 * Before locking the buffer, pin the visibility map page if it appears to
3642 	 * be necessary.  Since we haven't got the lock yet, someone else might be
3643 	 * in the middle of changing this, so we'll need to recheck after we have
3644 	 * the lock.
3645 	 */
3646 	if (PageIsAllVisible(page))
3647 		visibilitymap_pin(relation, block, &vmbuffer);
3648 
3649 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3650 
3651 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3652 	Assert(ItemIdIsNormal(lp));
3653 
3654 	/*
3655 	 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3656 	 * properly.
3657 	 */
3658 	oldtup.t_tableOid = RelationGetRelid(relation);
3659 	oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3660 	oldtup.t_len = ItemIdGetLength(lp);
3661 	oldtup.t_self = *otid;
3662 
3663 	/* the new tuple is ready, except for this: */
3664 	newtup->t_tableOid = RelationGetRelid(relation);
3665 
3666 	/* Fill in OID for newtup */
3667 	if (relation->rd_rel->relhasoids)
3668 	{
3669 #ifdef NOT_USED
3670 		/* this is redundant with an Assert in HeapTupleSetOid */
3671 		Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3672 #endif
3673 		HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3674 	}
3675 	else
3676 	{
3677 		/* check that there is no space for an OID */
3678 		Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3679 	}
3680 
3681 	/* Determine columns modified by the update. */
3682 	modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3683 												  &oldtup, newtup);
3684 
3685 	/*
3686 	 * If we're not updating any "key" column, we can grab a weaker lock type.
3687 	 * This allows for more concurrency when we are running simultaneously
3688 	 * with foreign key checks.
3689 	 *
3690 	 * Note that if a column gets detoasted while executing the update, but
3691 	 * the value ends up being the same, this test will fail and we will use
3692 	 * the stronger lock.  This is acceptable; the important case to optimize
3693 	 * is updates that don't manipulate key columns, not those that
3694 	 * serendipitously arrive at the same key values.
3695 	 */
3696 	if (!bms_overlap(modified_attrs, key_attrs))
3697 	{
3698 		*lockmode = LockTupleNoKeyExclusive;
3699 		mxact_status = MultiXactStatusNoKeyUpdate;
3700 		key_intact = true;
3701 
3702 		/*
3703 		 * If this is the first possibly-multixact-able operation in the
3704 		 * current transaction, set my per-backend OldestMemberMXactId
3705 		 * setting. We can be certain that the transaction will never become a
3706 		 * member of any older MultiXactIds than that.  (We have to do this
3707 		 * even if we end up just using our own TransactionId below, since
3708 		 * some other backend could incorporate our XID into a MultiXact
3709 		 * immediately afterwards.)
3710 		 */
3711 		MultiXactIdSetOldestMember();
3712 	}
3713 	else
3714 	{
3715 		*lockmode = LockTupleExclusive;
3716 		mxact_status = MultiXactStatusUpdate;
3717 		key_intact = false;
3718 	}
3719 
3720 	/*
3721 	 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3722 	 * otid may very well point at newtup->t_self, which we will overwrite
3723 	 * with the new tuple's location, so there's great risk of confusion if we
3724 	 * use otid anymore.
3725 	 */
3726 
3727 l2:
3728 	checked_lockers = false;
3729 	locker_remains = false;
3730 	result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3731 
3732 	/* see below about the "no wait" case */
3733 	Assert(result != HeapTupleBeingUpdated || wait);
3734 
3735 	if (result == HeapTupleInvisible)
3736 	{
3737 		UnlockReleaseBuffer(buffer);
3738 		ereport(ERROR,
3739 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3740 				 errmsg("attempted to update invisible tuple")));
3741 	}
3742 	else if (result == HeapTupleBeingUpdated && wait)
3743 	{
3744 		TransactionId xwait;
3745 		uint16		infomask;
3746 		bool		can_continue = false;
3747 
3748 		/*
3749 		 * XXX note that we don't consider the "no wait" case here.  This
3750 		 * isn't a problem currently because no caller uses that case, but it
3751 		 * should be fixed if such a caller is introduced.  It wasn't a
3752 		 * problem previously because this code would always wait, but now
3753 		 * that some tuple locks do not conflict with one of the lock modes we
3754 		 * use, it is possible that this case is interesting to handle
3755 		 * specially.
3756 		 *
3757 		 * This may cause failures with third-party code that calls
3758 		 * heap_update directly.
3759 		 */
3760 
3761 		/* must copy state data before unlocking buffer */
3762 		xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3763 		infomask = oldtup.t_data->t_infomask;
3764 
3765 		/*
3766 		 * Now we have to do something about the existing locker.  If it's a
3767 		 * multi, sleep on it; we might be awakened before it is completely
3768 		 * gone (or even not sleep at all in some cases); we need to preserve
3769 		 * it as locker, unless it is gone completely.
3770 		 *
3771 		 * If it's not a multi, we need to check for sleeping conditions
3772 		 * before actually going to sleep.  If the update doesn't conflict
3773 		 * with the locks, we just continue without sleeping (but making sure
3774 		 * it is preserved).
3775 		 *
3776 		 * Before sleeping, we need to acquire tuple lock to establish our
3777 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
3778 		 * release us when we are next-in-line for the tuple.  Note we must
3779 		 * not acquire the tuple lock until we're sure we're going to sleep;
3780 		 * otherwise we're open for race conditions with other transactions
3781 		 * holding the tuple lock which sleep on us.
3782 		 *
3783 		 * If we are forced to "start over" below, we keep the tuple lock;
3784 		 * this arranges that we stay at the head of the line while rechecking
3785 		 * tuple state.
3786 		 */
3787 		if (infomask & HEAP_XMAX_IS_MULTI)
3788 		{
3789 			TransactionId update_xact;
3790 			int			remain;
3791 			bool		current_is_member = false;
3792 
3793 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3794 										*lockmode, &current_is_member))
3795 			{
3796 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3797 
3798 				/*
3799 				 * Acquire the lock, if necessary (but skip it when we're
3800 				 * requesting a lock and already have one; avoids deadlock).
3801 				 */
3802 				if (!current_is_member)
3803 					heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3804 										 LockWaitBlock, &have_tuple_lock);
3805 
3806 				/* wait for multixact */
3807 				MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3808 								relation, &oldtup.t_self, XLTW_Update,
3809 								&remain);
3810 				checked_lockers = true;
3811 				locker_remains = remain != 0;
3812 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3813 
3814 				/*
3815 				 * If xwait had just locked the tuple then some other xact
3816 				 * could update this tuple before we get to this point.  Check
3817 				 * for xmax change, and start over if so.
3818 				 */
3819 				if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3820 										  infomask) ||
3821 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3822 										 xwait))
3823 					goto l2;
3824 			}
3825 
3826 			/*
3827 			 * Note that the multixact may not be done by now.  It could have
3828 			 * surviving members; our own xact or other subxacts of this
3829 			 * backend, and also any other concurrent transaction that locked
3830 			 * the tuple with LockTupleKeyShare if we only got
3831 			 * LockTupleNoKeyExclusive.  If this is the case, we have to be
3832 			 * careful to mark the updated tuple with the surviving members in
3833 			 * Xmax.
3834 			 *
3835 			 * Note that there could have been another update in the
3836 			 * MultiXact. In that case, we need to check whether it committed
3837 			 * or aborted. If it aborted we are safe to update it again;
3838 			 * otherwise there is an update conflict, and we have to return
3839 			 * HeapTupleUpdated below.
3840 			 *
3841 			 * In the LockTupleExclusive case, we still need to preserve the
3842 			 * surviving members: those would include the tuple locks we had
3843 			 * before this one, which are important to keep in case this
3844 			 * subxact aborts.
3845 			 */
3846 			if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3847 				update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3848 			else
3849 				update_xact = InvalidTransactionId;
3850 
3851 			/*
3852 			 * There was no UPDATE in the MultiXact; or it aborted. No
3853 			 * TransactionIdIsInProgress() call needed here, since we called
3854 			 * MultiXactIdWait() above.
3855 			 */
3856 			if (!TransactionIdIsValid(update_xact) ||
3857 				TransactionIdDidAbort(update_xact))
3858 				can_continue = true;
3859 		}
3860 		else if (TransactionIdIsCurrentTransactionId(xwait))
3861 		{
3862 			/*
3863 			 * The only locker is ourselves; we can avoid grabbing the tuple
3864 			 * lock here, but must preserve our locking information.
3865 			 */
3866 			checked_lockers = true;
3867 			locker_remains = true;
3868 			can_continue = true;
3869 		}
3870 		else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3871 		{
3872 			/*
3873 			 * If it's just a key-share locker, and we're not changing the key
3874 			 * columns, we don't need to wait for it to end; but we need to
3875 			 * preserve it as locker.
3876 			 */
3877 			checked_lockers = true;
3878 			locker_remains = true;
3879 			can_continue = true;
3880 		}
3881 		else
3882 		{
3883 			/*
3884 			 * Wait for regular transaction to end; but first, acquire tuple
3885 			 * lock.
3886 			 */
3887 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3888 			heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3889 								 LockWaitBlock, &have_tuple_lock);
3890 			XactLockTableWait(xwait, relation, &oldtup.t_self,
3891 							  XLTW_Update);
3892 			checked_lockers = true;
3893 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3894 
3895 			/*
3896 			 * xwait is done, but if xwait had just locked the tuple then some
3897 			 * other xact could update this tuple before we get to this point.
3898 			 * Check for xmax change, and start over if so.
3899 			 */
3900 			if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3901 				!TransactionIdEquals(xwait,
3902 									 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3903 				goto l2;
3904 
3905 			/* Otherwise check if it committed or aborted */
3906 			UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3907 			if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3908 				can_continue = true;
3909 		}
3910 
3911 		result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3912 	}
3913 
3914 	if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3915 	{
3916 		/* Perform additional check for transaction-snapshot mode RI updates */
3917 		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3918 			result = HeapTupleUpdated;
3919 	}
3920 
3921 	if (result != HeapTupleMayBeUpdated)
3922 	{
3923 		Assert(result == HeapTupleSelfUpdated ||
3924 			   result == HeapTupleUpdated ||
3925 			   result == HeapTupleBeingUpdated);
3926 		Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3927 		hufd->ctid = oldtup.t_data->t_ctid;
3928 		hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3929 		if (result == HeapTupleSelfUpdated)
3930 			hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3931 		else
3932 			hufd->cmax = InvalidCommandId;
3933 		UnlockReleaseBuffer(buffer);
3934 		if (have_tuple_lock)
3935 			UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3936 		if (vmbuffer != InvalidBuffer)
3937 			ReleaseBuffer(vmbuffer);
3938 		bms_free(hot_attrs);
3939 		bms_free(proj_idx_attrs);
3940 		bms_free(key_attrs);
3941 		bms_free(id_attrs);
3942 		bms_free(modified_attrs);
3943 		bms_free(interesting_attrs);
3944 		return result;
3945 	}
3946 
3947 	/*
3948 	 * If we didn't pin the visibility map page and the page has become all
3949 	 * visible while we were busy locking the buffer, or during some
3950 	 * subsequent window during which we had it unlocked, we'll have to unlock
3951 	 * and re-lock, to avoid holding the buffer lock across an I/O.  That's a
3952 	 * bit unfortunate, especially since we'll now have to recheck whether the
3953 	 * tuple has been locked or updated under us, but hopefully it won't
3954 	 * happen very often.
3955 	 */
3956 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3957 	{
3958 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3959 		visibilitymap_pin(relation, block, &vmbuffer);
3960 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3961 		goto l2;
3962 	}
3963 
3964 	/* Fill in transaction status data */
3965 
3966 	/*
3967 	 * If the tuple we're updating is locked, we need to preserve the locking
3968 	 * info in the old tuple's Xmax.  Prepare a new Xmax value for this.
3969 	 */
3970 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3971 							  oldtup.t_data->t_infomask,
3972 							  oldtup.t_data->t_infomask2,
3973 							  xid, *lockmode, true,
3974 							  &xmax_old_tuple, &infomask_old_tuple,
3975 							  &infomask2_old_tuple);
3976 
3977 	/*
3978 	 * And also prepare an Xmax value for the new copy of the tuple.  If there
3979 	 * was no xmax previously, or there was one but all lockers are now gone,
3980 	 * then use InvalidXid; otherwise, get the xmax from the old tuple.  (In
3981 	 * rare cases that might also be InvalidXid and yet not have the
3982 	 * HEAP_XMAX_INVALID bit set; that's fine.)
3983 	 */
3984 	if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3985 		HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3986 		(checked_lockers && !locker_remains))
3987 		xmax_new_tuple = InvalidTransactionId;
3988 	else
3989 		xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3990 
3991 	if (!TransactionIdIsValid(xmax_new_tuple))
3992 	{
3993 		infomask_new_tuple = HEAP_XMAX_INVALID;
3994 		infomask2_new_tuple = 0;
3995 	}
3996 	else
3997 	{
3998 		/*
3999 		 * If we found a valid Xmax for the new tuple, then the infomask bits
4000 		 * to use on the new tuple depend on what was there on the old one.
4001 		 * Note that since we're doing an update, the only possibility is that
4002 		 * the lockers had FOR KEY SHARE lock.
4003 		 */
4004 		if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
4005 		{
4006 			GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
4007 								   &infomask2_new_tuple);
4008 		}
4009 		else
4010 		{
4011 			infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
4012 			infomask2_new_tuple = 0;
4013 		}
4014 	}
4015 
4016 	/*
4017 	 * Prepare the new tuple with the appropriate initial values of Xmin and
4018 	 * Xmax, as well as initial infomask bits as computed above.
4019 	 */
4020 	newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
4021 	newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
4022 	HeapTupleHeaderSetXmin(newtup->t_data, xid);
4023 	HeapTupleHeaderSetCmin(newtup->t_data, cid);
4024 	newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
4025 	newtup->t_data->t_infomask2 |= infomask2_new_tuple;
4026 	HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
4027 
4028 	/*
4029 	 * Replace cid with a combo cid if necessary.  Note that we already put
4030 	 * the plain cid into the new tuple.
4031 	 */
4032 	HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
4033 
4034 	/*
4035 	 * If the toaster needs to be activated, OR if the new tuple will not fit
4036 	 * on the same page as the old, then we need to release the content lock
4037 	 * (but not the pin!) on the old tuple's buffer while we are off doing
4038 	 * TOAST and/or table-file-extension work.  We must mark the old tuple to
4039 	 * show that it's locked, else other processes may try to update it
4040 	 * themselves.
4041 	 *
4042 	 * We need to invoke the toaster if there are already any out-of-line
4043 	 * toasted values present, or if the new tuple is over-threshold.
4044 	 */
4045 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
4046 		relation->rd_rel->relkind != RELKIND_MATVIEW)
4047 	{
4048 		/* toast table entries should never be recursively toasted */
4049 		Assert(!HeapTupleHasExternal(&oldtup));
4050 		Assert(!HeapTupleHasExternal(newtup));
4051 		need_toast = false;
4052 	}
4053 	else
4054 		need_toast = (HeapTupleHasExternal(&oldtup) ||
4055 					  HeapTupleHasExternal(newtup) ||
4056 					  newtup->t_len > TOAST_TUPLE_THRESHOLD);
4057 
4058 	pagefree = PageGetHeapFreeSpace(page);
4059 
4060 	newtupsize = MAXALIGN(newtup->t_len);
4061 
4062 	if (need_toast || newtupsize > pagefree)
4063 	{
4064 		TransactionId xmax_lock_old_tuple;
4065 		uint16		infomask_lock_old_tuple,
4066 					infomask2_lock_old_tuple;
4067 		bool		cleared_all_frozen = false;
4068 
4069 		/*
4070 		 * To prevent concurrent sessions from updating the tuple, we have to
4071 		 * temporarily mark it locked, while we release the page-level lock.
4072 		 *
4073 		 * To satisfy the rule that any xid potentially appearing in a buffer
4074 		 * written out to disk must first be WAL-logged, we unfortunately have to
4075 		 * WAL-log this temporary modification.  We can reuse xl_heap_lock for this
4076 		 * purpose.  If we crash/error before following through with the
4077 		 * actual update, xmax will be of an aborted transaction, allowing
4078 		 * other sessions to proceed.
4079 		 */
4080 
4081 		/*
4082 		 * Compute xmax / infomask appropriate for locking the tuple. This has
4083 		 * to be done separately from the combo that's going to be used for
4084 		 * updating, because the potentially created multixact would otherwise
4085 		 * be wrong.
4086 		 */
4087 		compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4088 								  oldtup.t_data->t_infomask,
4089 								  oldtup.t_data->t_infomask2,
4090 								  xid, *lockmode, false,
4091 								  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4092 								  &infomask2_lock_old_tuple);
4093 
4094 		Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4095 
4096 		START_CRIT_SECTION();
4097 
4098 		/* Clear obsolete visibility flags ... */
4099 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4100 		oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4101 		HeapTupleClearHotUpdated(&oldtup);
4102 		/* ... and store info about transaction updating this tuple */
4103 		Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4104 		HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4105 		oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4106 		oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4107 		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4108 
4109 		/* temporarily make it look not-updated, but locked */
4110 		oldtup.t_data->t_ctid = oldtup.t_self;
4111 
4112 		/*
4113 		 * Clear all-frozen bit on visibility map if needed. We could
4114 		 * immediately reset ALL_VISIBLE, but given that the WAL logging
4115 		 * overhead would be unchanged, that doesn't necessarily seem
4116 		 * worthwhile.
4117 		 */
4118 		if (PageIsAllVisible(BufferGetPage(buffer)) &&
4119 			visibilitymap_clear(relation, block, vmbuffer,
4120 								VISIBILITYMAP_ALL_FROZEN))
4121 			cleared_all_frozen = true;
4122 
4123 		MarkBufferDirty(buffer);
4124 
4125 		if (RelationNeedsWAL(relation))
4126 		{
4127 			xl_heap_lock xlrec;
4128 			XLogRecPtr	recptr;
4129 
4130 			XLogBeginInsert();
4131 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4132 
4133 			xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4134 			xlrec.locking_xid = xmax_lock_old_tuple;
4135 			xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4136 												  oldtup.t_data->t_infomask2);
4137 			xlrec.flags =
4138 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4139 			XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4140 			recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4141 			PageSetLSN(page, recptr);
4142 		}
4143 
4144 		END_CRIT_SECTION();
4145 
4146 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4147 
4148 		/*
4149 		 * Let the toaster do its thing, if needed.
4150 		 *
4151 		 * Note: below this point, heaptup is the data we actually intend to
4152 		 * store into the relation; newtup is the caller's original untoasted
4153 		 * data.
4154 		 */
4155 		if (need_toast)
4156 		{
4157 			/* Note we always use WAL and FSM during updates */
4158 			heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4159 			newtupsize = MAXALIGN(heaptup->t_len);
4160 		}
4161 		else
4162 			heaptup = newtup;
4163 
4164 		/*
4165 		 * Now, do we need a new page for the tuple, or not?  This is a bit
4166 		 * tricky since someone else could have added tuples to the page while
4167 		 * we weren't looking.  We have to recheck the available space after
4168 		 * reacquiring the buffer lock.  But don't bother to do that if the
4169 		 * former amount of free space is still not enough; it's unlikely
4170 		 * there's more free now than before.
4171 		 *
4172 		 * What's more, if we need to get a new page, we will need to acquire
4173 		 * buffer locks on both old and new pages.  To avoid deadlock against
4174 		 * some other backend trying to get the same two locks in the other
4175 		 * order, we must be consistent about the order we get the locks in.
4176 		 * We use the rule "lock the lower-numbered page of the relation
4177 		 * first".  To implement this, we must do RelationGetBufferForTuple
4178 		 * while not holding the lock on the old page, and we must rely on it
4179 		 * to get the locks on both pages in the correct order.
4180 		 */
4181 		if (newtupsize > pagefree)
4182 		{
4183 			/* Assume there's no chance to put heaptup on same page. */
4184 			newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4185 											   buffer, 0, NULL,
4186 											   &vmbuffer_new, &vmbuffer);
4187 		}
4188 		else
4189 		{
4190 			/* Re-acquire the lock on the old tuple's page. */
4191 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4192 			/* Re-check using the up-to-date free space */
4193 			pagefree = PageGetHeapFreeSpace(page);
4194 			if (newtupsize > pagefree)
4195 			{
4196 				/*
4197 				 * Rats, it doesn't fit anymore.  We must now unlock and
4198 				 * relock to avoid deadlock.  Fortunately, this path should
4199 				 * seldom be taken.
4200 				 */
4201 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4202 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4203 												   buffer, 0, NULL,
4204 												   &vmbuffer_new, &vmbuffer);
4205 			}
4206 			else
4207 			{
4208 				/* OK, it fits here, so we're done. */
4209 				newbuf = buffer;
4210 			}
4211 		}
4212 	}
4213 	else
4214 	{
4215 		/* No TOAST work needed, and it'll fit on same page */
4216 		newbuf = buffer;
4217 		heaptup = newtup;
4218 	}
4219 
4220 	/*
4221 	 * We're about to do the actual update -- check for conflict first, to
4222 	 * avoid possibly having to roll back work we've just done.
4223 	 *
4224 	 * This is safe without a recheck as long as there is no possibility of
4225 	 * another process scanning the pages between this check and the update
4226 	 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4227 	 * continuously held from this point until the tuple update is visible).
4228 	 *
4229 	 * For the new tuple the only check needed is at the relation level, but
4230 	 * since both tuples are in the same relation and the check for oldtup
4231 	 * will include checking the relation level, there is no benefit to a
4232 	 * separate check for the new tuple.
4233 	 */
4234 	CheckForSerializableConflictIn(relation, &oldtup, buffer);
4235 
4236 	/*
4237 	 * At this point newbuf and buffer are both pinned and locked, and newbuf
4238 	 * has enough space for the new tuple.  If they are the same buffer, only
4239 	 * one pin is held.
4240 	 */
4241 
4242 	if (newbuf == buffer)
4243 	{
4244 		/*
4245 		 * Since the new tuple is going into the same page, we might be able
4246 		 * to do a HOT update.  A HOT update is possible if none of the index
4247 		 * columns, nor columns used in projection functional indexes, have
4248 		 * changed.  If the page was already full, we may have skipped
4249 		 * checking for index columns, and also can't do a HOT update.
4250 		 */
4251 		if (hot_attrs_checked
4252 			&& !bms_overlap(modified_attrs, hot_attrs)
4253 			&& (!bms_overlap(modified_attrs, proj_idx_attrs)
4254 				|| ProjIndexIsUnchanged(relation, &oldtup, newtup)))
4255 		{
4256 			use_hot_update = true;
4257 		}
4258 	}
4259 	else
4260 	{
4261 		/* Set a hint that the old page could use prune/defrag */
4262 		PageSetFull(page);
4263 	}
4264 
4265 	/*
4266 	 * Compute replica identity tuple before entering the critical section so
4267 	 * we don't PANIC upon a memory allocation failure.
4268 	 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4269 	 * logged.
4270 	 */
4271 	old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4272 										   bms_overlap(modified_attrs, id_attrs),
4273 										   &old_key_copied);
4274 
4275 	/* NO EREPORT(ERROR) from here till changes are logged */
4276 	START_CRIT_SECTION();
4277 
4278 	/*
4279 	 * If this transaction commits, the old tuple will become DEAD sooner or
4280 	 * later.  Set flag that this page is a candidate for pruning once our xid
4281 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
4282 	 * the subsequent page pruning will be a no-op and the hint will be
4283 	 * cleared.
4284 	 *
4285 	 * XXX Should we set hint on newbuf as well?  If the transaction aborts,
4286 	 * there would be a prunable tuple in the newbuf; but for now we choose
4287 	 * not to optimize for aborts.  Note that heap_xlog_update must be kept in
4288 	 * sync if this decision changes.
4289 	 */
4290 	PageSetPrunable(page, xid);
4291 
4292 	if (use_hot_update)
4293 	{
4294 		/* Mark the old tuple as HOT-updated */
4295 		HeapTupleSetHotUpdated(&oldtup);
4296 		/* And mark the new tuple as heap-only */
4297 		HeapTupleSetHeapOnly(heaptup);
4298 		/* Mark the caller's copy too, in case different from heaptup */
4299 		HeapTupleSetHeapOnly(newtup);
4300 	}
4301 	else
4302 	{
4303 		/* Make sure tuples are correctly marked as not-HOT */
4304 		HeapTupleClearHotUpdated(&oldtup);
4305 		HeapTupleClearHeapOnly(heaptup);
4306 		HeapTupleClearHeapOnly(newtup);
4307 	}
4308 
4309 	RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4310 
4311 
4312 	/* Clear obsolete visibility flags, possibly set by ourselves above... */
4313 	oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4314 	oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4315 	/* ... and store info about transaction updating this tuple */
4316 	Assert(TransactionIdIsValid(xmax_old_tuple));
4317 	HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4318 	oldtup.t_data->t_infomask |= infomask_old_tuple;
4319 	oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4320 	HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4321 
4322 	/* record address of new tuple in t_ctid of old one */
4323 	oldtup.t_data->t_ctid = heaptup->t_self;
4324 
4325 	/* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4326 	if (PageIsAllVisible(BufferGetPage(buffer)))
4327 	{
4328 		all_visible_cleared = true;
4329 		PageClearAllVisible(BufferGetPage(buffer));
4330 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4331 							vmbuffer, VISIBILITYMAP_VALID_BITS);
4332 	}
4333 	if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4334 	{
4335 		all_visible_cleared_new = true;
4336 		PageClearAllVisible(BufferGetPage(newbuf));
4337 		visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4338 							vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4339 	}
4340 
4341 	if (newbuf != buffer)
4342 		MarkBufferDirty(newbuf);
4343 	MarkBufferDirty(buffer);
4344 
4345 	/* XLOG stuff */
4346 	if (RelationNeedsWAL(relation))
4347 	{
4348 		XLogRecPtr	recptr;
4349 
4350 		/*
4351 		 * For logical decoding we need combocids to properly decode the
4352 		 * catalog.
4353 		 */
4354 		if (RelationIsAccessibleInLogicalDecoding(relation))
4355 		{
4356 			log_heap_new_cid(relation, &oldtup);
4357 			log_heap_new_cid(relation, heaptup);
4358 		}
4359 
4360 		recptr = log_heap_update(relation, buffer,
4361 								 newbuf, &oldtup, heaptup,
4362 								 old_key_tuple,
4363 								 all_visible_cleared,
4364 								 all_visible_cleared_new);
4365 		if (newbuf != buffer)
4366 		{
4367 			PageSetLSN(BufferGetPage(newbuf), recptr);
4368 		}
4369 		PageSetLSN(BufferGetPage(buffer), recptr);
4370 	}
4371 
4372 	END_CRIT_SECTION();
4373 
4374 	if (newbuf != buffer)
4375 		LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4376 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4377 
4378 	/*
4379 	 * Mark old tuple for invalidation from system caches at next command
4380 	 * boundary, and mark the new tuple for invalidation in case we abort. We
4381 	 * have to do this before releasing the buffer because oldtup is in the
4382 	 * buffer.  (heaptup is all in local memory, but it's necessary to process
4383 	 * both tuple versions in one call to inval.c so we can avoid redundant
4384 	 * sinval messages.)
4385 	 */
4386 	CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4387 
4388 	/* Now we can release the buffer(s) */
4389 	if (newbuf != buffer)
4390 		ReleaseBuffer(newbuf);
4391 	ReleaseBuffer(buffer);
4392 	if (BufferIsValid(vmbuffer_new))
4393 		ReleaseBuffer(vmbuffer_new);
4394 	if (BufferIsValid(vmbuffer))
4395 		ReleaseBuffer(vmbuffer);
4396 
4397 	/*
4398 	 * Release the lmgr tuple lock, if we had it.
4399 	 */
4400 	if (have_tuple_lock)
4401 		UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4402 
4403 	pgstat_count_heap_update(relation, use_hot_update);
4404 
4405 	/*
4406 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
4407 	 * back to the caller's image, too.
4408 	 */
4409 	if (heaptup != newtup)
4410 	{
4411 		newtup->t_self = heaptup->t_self;
4412 		heap_freetuple(heaptup);
4413 	}
4414 
4415 	if (old_key_tuple != NULL && old_key_copied)
4416 		heap_freetuple(old_key_tuple);
4417 
4418 	bms_free(hot_attrs);
4419 	bms_free(proj_idx_attrs);
4420 	bms_free(key_attrs);
4421 	bms_free(id_attrs);
4422 	bms_free(modified_attrs);
4423 	bms_free(interesting_attrs);
4424 
4425 	return HeapTupleMayBeUpdated;
4426 }
4427 
4428 /*
4429  * Check if the specified attribute's value is the same in both given tuples.
4430  * Subroutine for HeapDetermineModifiedColumns.
4431  */
4432 static bool
4433 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4434 					   HeapTuple tup1, HeapTuple tup2)
4435 {
4436 	Datum		value1,
4437 				value2;
4438 	bool		isnull1,
4439 				isnull2;
4440 	Form_pg_attribute att;
4441 
4442 	/*
4443 	 * If it's a whole-tuple reference, say "not equal".  It's not really
4444 	 * worth supporting this case, since it could only succeed after a no-op
4445 	 * update, which is hardly a case worth optimizing for.
4446 	 */
4447 	if (attrnum == 0)
4448 		return false;
4449 
4450 	/*
4451 	 * Likewise, automatically say "not equal" for any system attribute other
4452 	 * than OID and tableOID; we cannot expect these to be consistent in a HOT
4453 	 * chain, or even to be set correctly yet in the new tuple.
4454 	 */
4455 	if (attrnum < 0)
4456 	{
4457 		if (attrnum != ObjectIdAttributeNumber &&
4458 			attrnum != TableOidAttributeNumber)
4459 			return false;
4460 	}
4461 
4462 	/*
4463 	 * Extract the corresponding values.  XXX this is pretty inefficient if
4464 	 * there are many indexed columns.  Should HeapDetermineModifiedColumns do
4465 	 * a single heap_deform_tuple call on each tuple, instead?	But that
4466 	 * doesn't work for system columns ...
4467 	 */
4468 	value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4469 	value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4470 
4471 	/*
4472 	 * If one value is NULL and other is not, then they are certainly not
4473 	 * equal
4474 	 */
4475 	if (isnull1 != isnull2)
4476 		return false;
4477 
4478 	/*
4479 	 * If both are NULL, they can be considered equal.
4480 	 */
4481 	if (isnull1)
4482 		return true;
4483 
4484 	/*
4485 	 * We do simple binary comparison of the two datums.  This may be overly
4486 	 * strict because there can be multiple binary representations for the
4487 	 * same logical value.  But we should be OK as long as there are no false
4488 	 * positives.  Using a type-specific equality operator is messy because
4489 	 * there could be multiple notions of equality in different operator
4490 	 * classes; furthermore, we cannot safely invoke user-defined functions
4491 	 * while holding exclusive buffer lock.
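	 *
	 * For example (purely illustrative): two numeric datums such as 0.5 and
	 * 0.50 compare as logically equal, yet their binary representations
	 * differ, so this test would conservatively report the column as
	 * modified.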
4492 	 */
4493 	if (attrnum <= 0)
4494 	{
4495 		/* The only allowed system columns are OIDs, so do this */
4496 		return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4497 	}
4498 	else
4499 	{
4500 		Assert(attrnum <= tupdesc->natts);
4501 		att = TupleDescAttr(tupdesc, attrnum - 1);
4502 		return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4503 	}
4504 }
4505 
4506 /*
4507  * Check whether the value is unchanged after update of a projection
4508  * functional index. Compare the new and old values of the indexed
4509  * expression to see if we are able to use a HOT update or not.
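 *
 * A hedged example: with an expression index on lower(name), an UPDATE that
 * changes name from 'Foo' to 'FOO' leaves lower(name) unchanged, so this
 * check by itself would not rule out a HOT update.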
4510  */
4511 static bool
4512 ProjIndexIsUnchanged(Relation relation, HeapTuple oldtup, HeapTuple newtup)
4513 {
4514 	ListCell   *l;
4515 	List	   *indexoidlist = RelationGetIndexList(relation);
4516 	EState	   *estate = CreateExecutorState();
4517 	ExprContext *econtext = GetPerTupleExprContext(estate);
4518 	TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(relation));
4519 	bool		equals = true;
4520 	Datum		old_values[INDEX_MAX_KEYS];
4521 	bool		old_isnull[INDEX_MAX_KEYS];
4522 	Datum		new_values[INDEX_MAX_KEYS];
4523 	bool		new_isnull[INDEX_MAX_KEYS];
4524 	int			indexno = 0;
4525 
4526 	econtext->ecxt_scantuple = slot;
4527 
4528 	foreach(l, indexoidlist)
4529 	{
4530 		if (bms_is_member(indexno, relation->rd_projidx))
4531 		{
4532 			Oid			indexOid = lfirst_oid(l);
4533 			Relation	indexDesc = index_open(indexOid, AccessShareLock);
4534 			IndexInfo  *indexInfo = BuildIndexInfo(indexDesc);
4535 			int			i;
4536 
4537 			ResetExprContext(econtext);
4538 			ExecStoreTuple(oldtup, slot, InvalidBuffer, false);
4539 			FormIndexDatum(indexInfo,
4540 						   slot,
4541 						   estate,
4542 						   old_values,
4543 						   old_isnull);
4544 
4545 			ExecStoreTuple(newtup, slot, InvalidBuffer, false);
4546 			FormIndexDatum(indexInfo,
4547 						   slot,
4548 						   estate,
4549 						   new_values,
4550 						   new_isnull);
4551 
4552 			for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
4553 			{
4554 				if (old_isnull[i] != new_isnull[i])
4555 				{
4556 					equals = false;
4557 					break;
4558 				}
4559 				else if (!old_isnull[i])
4560 				{
4561 					Form_pg_attribute att = TupleDescAttr(RelationGetDescr(indexDesc), i);
4562 
4563 					if (!datumIsEqual(old_values[i], new_values[i], att->attbyval, att->attlen))
4564 					{
4565 						equals = false;
4566 						break;
4567 					}
4568 				}
4569 			}
4570 			index_close(indexDesc, AccessShareLock);
4571 
4572 			if (!equals)
4573 			{
4574 				break;
4575 			}
4576 		}
4577 		indexno += 1;
4578 	}
4579 	ExecDropSingleTupleTableSlot(slot);
4580 	FreeExecutorState(estate);
4581 
4582 	return equals;
4583 }
4584 
4585 
4586 /*
4587  * Check which columns are being updated.
4588  *
4589  * Given an updated tuple, determine (and return into the output bitmapset),
4590  * from those listed as interesting, the set of columns that changed.
4591  *
4592  * The input bitmapset is destructively modified; that is OK since this is
4593  * invoked at most once in heap_update.
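 *
 * As the loop below shows, column numbers in both the input and the output
 * bitmapsets are offset by FirstLowInvalidHeapAttributeNumber so that
 * system attributes can be represented; e.g. attribute 1 is stored as
 * 1 - FirstLowInvalidHeapAttributeNumber.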
4594  */
4595 static Bitmapset *
4596 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4597 							 HeapTuple oldtup, HeapTuple newtup)
4598 {
4599 	int			attnum;
4600 	Bitmapset  *modified = NULL;
4601 
4602 	while ((attnum = bms_first_member(interesting_cols)) >= 0)
4603 	{
4604 		attnum += FirstLowInvalidHeapAttributeNumber;
4605 
4606 		if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4607 									attnum, oldtup, newtup))
4608 			modified = bms_add_member(modified,
4609 									  attnum - FirstLowInvalidHeapAttributeNumber);
4610 	}
4611 
4612 	return modified;
4613 }
4614 
4615 /*
4616  *	simple_heap_update - replace a tuple
4617  *
4618  * This routine may be used to update a tuple when concurrent updates of
4619  * the target tuple are not expected (for example, because we have a lock
4620  * on the relation associated with the tuple).  Any failure is reported
4621  * via ereport().
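 *
 * A minimal, hedged usage sketch ("rel", "otid" and "tup" are hypothetical
 * caller-owned variables):
 *
 *		simple_heap_update(rel, otid, tup);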
4622  */
4623 void
4624 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4625 {
4626 	HTSU_Result result;
4627 	HeapUpdateFailureData hufd;
4628 	LockTupleMode lockmode;
4629 
4630 	result = heap_update(relation, otid, tup,
4631 						 GetCurrentCommandId(true), InvalidSnapshot,
4632 						 true /* wait for commit */ ,
4633 						 &hufd, &lockmode);
4634 	switch (result)
4635 	{
4636 		case HeapTupleSelfUpdated:
4637 			/* Tuple was already updated in current command? */
4638 			elog(ERROR, "tuple already updated by self");
4639 			break;
4640 
4641 		case HeapTupleMayBeUpdated:
4642 			/* done successfully */
4643 			break;
4644 
4645 		case HeapTupleUpdated:
4646 			elog(ERROR, "tuple concurrently updated");
4647 			break;
4648 
4649 		default:
4650 			elog(ERROR, "unrecognized heap_update status: %u", result);
4651 			break;
4652 	}
4653 }
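
/*
 * Illustrative caller sketch ("my_rel_oid" and "newtup" are placeholders):
 * a typical caller already holds a lock strong enough to rule out concurrent
 * updates of the target row, and has built newtup as a modified copy that
 * retains the old tuple's t_self:
 *
 *		Relation	rel = heap_open(my_rel_oid, RowExclusiveLock);
 *
 *		simple_heap_update(rel, &newtup->t_self, newtup);
 *		... the caller must also insert index entries for newtup ...
 *		heap_close(rel, RowExclusiveLock);
 */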
4654 
4655 
4656 /*
4657  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4658  */
4659 static MultiXactStatus
4660 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4661 {
4662 	int			retval;
4663 
4664 	if (is_update)
4665 		retval = tupleLockExtraInfo[mode].updstatus;
4666 	else
4667 		retval = tupleLockExtraInfo[mode].lockstatus;
4668 
4669 	if (retval == -1)
4670 		elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4671 			 is_update ? "true" : "false");
4672 
4673 	return (MultiXactStatus) retval;
4674 }
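
/*
 * For example, with is_update = false, LockTupleExclusive maps to
 * MultiXactStatusForUpdate, while with is_update = true it maps to
 * MultiXactStatusUpdate; modes that can never accompany an update (such as
 * LockTupleKeyShare with is_update = true) have no valid status and are
 * rejected above.
 */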
4675 
4676 /*
4677  *	heap_lock_tuple - lock a tuple in shared or exclusive mode
4678  *
4679  * Note that this acquires a buffer pin, which the caller must release.
4680  *
4681  * Input parameters:
4682  *	relation: relation containing tuple (caller must hold suitable lock)
4683  *	tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4684  *	cid: current command ID (used for visibility test, and stored into
4685  *		tuple's cmax if lock is successful)
4686  *	mode: indicates if shared or exclusive tuple lock is desired
4687  *	wait_policy: what to do if tuple lock is not available
4688  *	follow_updates: if true, follow the update chain to also lock descendant
4689  *		tuples.
4690  *
4691  * Output parameters:
4692  *	*tuple: all fields filled in
4693  *	*buffer: set to buffer holding tuple (pinned but not locked at exit)
4694  *	*hufd: filled in failure cases (see below)
4695  *
4696  * Function result may be:
4697  *	HeapTupleMayBeUpdated: lock was successfully acquired
4698  *	HeapTupleInvisible: lock failed because tuple was never visible to us
4699  *	HeapTupleSelfUpdated: lock failed because tuple updated by self
4700  *	HeapTupleUpdated: lock failed because tuple updated by other xact
4701  *	HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4702  *
4703  * In the failure cases other than HeapTupleInvisible, the routine fills
4704  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4705  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4706  * since we cannot obtain cmax from a combocid generated by another
4707  * transaction).
4708  * See comments for struct HeapUpdateFailureData for additional info.
4709  *
4710  * See README.tuplock for a thorough explanation of this mechanism.
4711  */
4712 HTSU_Result
4713 heap_lock_tuple(Relation relation, HeapTuple tuple,
4714 				CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4715 				bool follow_updates,
4716 				Buffer *buffer, HeapUpdateFailureData *hufd)
4717 {
4718 	HTSU_Result result;
4719 	ItemPointer tid = &(tuple->t_self);
4720 	ItemId		lp;
4721 	Page		page;
4722 	Buffer		vmbuffer = InvalidBuffer;
4723 	BlockNumber block;
4724 	TransactionId xid,
4725 				xmax;
4726 	uint16		old_infomask,
4727 				new_infomask,
4728 				new_infomask2;
4729 	bool		first_time = true;
4730 	bool		skip_tuple_lock = false;
4731 	bool		have_tuple_lock = false;
4732 	bool		cleared_all_frozen = false;
4733 
4734 	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4735 	block = ItemPointerGetBlockNumber(tid);
4736 
4737 	/*
4738 	 * Before locking the buffer, pin the visibility map page if it appears to
4739 	 * be necessary.  Since we haven't got the lock yet, someone else might be
4740 	 * in the middle of changing this, so we'll need to recheck after we have
4741 	 * the lock.
4742 	 */
4743 	if (PageIsAllVisible(BufferGetPage(*buffer)))
4744 		visibilitymap_pin(relation, block, &vmbuffer);
4745 
4746 	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4747 
4748 	page = BufferGetPage(*buffer);
4749 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4750 	Assert(ItemIdIsNormal(lp));
4751 
4752 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4753 	tuple->t_len = ItemIdGetLength(lp);
4754 	tuple->t_tableOid = RelationGetRelid(relation);
4755 
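	/*
	 * l3 is the retry point: we come back here whenever the tuple's lock
	 * state may have changed while the buffer was unlocked (or when we must
	 * release the buffer lock to pin the visibility map), so that it can be
	 * re-evaluated from scratch.
	 */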
4756 l3:
4757 	result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4758 
4759 	if (result == HeapTupleInvisible)
4760 	{
4761 		/*
4762 		 * This is possible, but only when locking a tuple for ON CONFLICT
4763 		 * UPDATE.  We return this value here rather than throwing an error in
4764 		 * order to give that case the opportunity to throw a more specific
4765 		 * error.
4766 		 */
4767 		result = HeapTupleInvisible;
4768 		goto out_locked;
4769 	}
4770 	else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4771 	{
4772 		TransactionId xwait;
4773 		uint16		infomask;
4774 		uint16		infomask2;
4775 		bool		require_sleep;
4776 		ItemPointerData t_ctid;
4777 
4778 		/* must copy state data before unlocking buffer */
4779 		xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4780 		infomask = tuple->t_data->t_infomask;
4781 		infomask2 = tuple->t_data->t_infomask2;
4782 		ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4783 
4784 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4785 
4786 		/*
4787 		 * If any subtransaction of the current top transaction already holds
4788 		 * a lock as strong as or stronger than what we're requesting, we
4789 		 * effectively hold the desired lock already.  We *must* succeed
4790 		 * without trying to take the tuple lock, else we will deadlock
4791 		 * against anyone wanting to acquire a stronger lock.
4792 		 *
4793 		 * Note we only do this the first time we loop on the HTSU result;
4794 		 * there is no point in testing in subsequent passes, because
4795 		 * evidently our own transaction cannot have acquired a new lock after
4796 		 * the first time we checked.
4797 		 */
4798 		if (first_time)
4799 		{
4800 			first_time = false;
4801 
4802 			if (infomask & HEAP_XMAX_IS_MULTI)
4803 			{
4804 				int			i;
4805 				int			nmembers;
4806 				MultiXactMember *members;
4807 
4808 				/*
4809 				 * We don't need to allow old multixacts here; if that had
4810 				 * been the case, HeapTupleSatisfiesUpdate would have returned
4811 				 * MayBeUpdated and we wouldn't be here.
4812 				 */
4813 				nmembers =
4814 					GetMultiXactIdMembers(xwait, &members, false,
4815 										  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4816 
4817 				for (i = 0; i < nmembers; i++)
4818 				{
4819 					/* only consider members of our own transaction */
4820 					if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4821 						continue;
4822 
4823 					if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4824 					{
4825 						pfree(members);
4826 						result = HeapTupleMayBeUpdated;
4827 						goto out_unlocked;
4828 					}
4829 					else
4830 					{
4831 						/*
4832 						 * Disable acquisition of the heavyweight tuple lock.
4833 						 * Otherwise, when promoting a weaker lock, we might
4834 						 * deadlock with another locker that has acquired the
4835 						 * heavyweight tuple lock and is waiting for our
4836 						 * transaction to finish.
4837 						 *
4838 						 * Note that in this case we still need to wait for
4839 						 * the multixact if required, to avoid acquiring
4840 						 * conflicting locks.
4841 						 */
4842 						skip_tuple_lock = true;
4843 					}
4844 				}
4845 
4846 				if (members)
4847 					pfree(members);
4848 			}
4849 			else if (TransactionIdIsCurrentTransactionId(xwait))
4850 			{
4851 				switch (mode)
4852 				{
4853 					case LockTupleKeyShare:
4854 						Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4855 							   HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4856 							   HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4857 						result = HeapTupleMayBeUpdated;
4858 						goto out_unlocked;
4859 					case LockTupleShare:
4860 						if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4861 							HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4862 						{
4863 							result = HeapTupleMayBeUpdated;
4864 							goto out_unlocked;
4865 						}
4866 						break;
4867 					case LockTupleNoKeyExclusive:
4868 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4869 						{
4870 							result = HeapTupleMayBeUpdated;
4871 							goto out_unlocked;
4872 						}
4873 						break;
4874 					case LockTupleExclusive:
4875 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4876 							infomask2 & HEAP_KEYS_UPDATED)
4877 						{
4878 							result = HeapTupleMayBeUpdated;
4879 							goto out_unlocked;
4880 						}
4881 						break;
4882 				}
4883 			}
4884 		}
4885 
4886 		/*
4887 		 * Initially assume that we will have to wait for the locking
4888 		 * transaction(s) to finish.  We check various cases below in which
4889 		 * this can be turned off.
4890 		 */
4891 		require_sleep = true;
4892 		if (mode == LockTupleKeyShare)
4893 		{
4894 			/*
4895 			 * If we're requesting KeyShare, and there's no update present, we
4896 			 * don't need to wait.  Even if there is an update, we can still
4897 			 * continue if the key hasn't been modified.
4898 			 *
4899 			 * However, if there are updates, we need to walk the update chain
4900 			 * to mark future versions of the row as locked, too.  That way,
4901 			 * if somebody deletes that future version, we're protected
4902 			 * against the key going away.  This locking of future versions
4903 			 * could block momentarily, if a concurrent transaction is
4904 			 * deleting a key; or it could return a value to the effect that
4905 			 * the transaction deleting the key has already committed.  So we
4906 			 * do this before re-locking the buffer; otherwise this would be
4907 			 * prone to deadlocks.
4908 			 *
4909 			 * Note that the TID we're locking was grabbed before we unlocked
4910 			 * the buffer.  For it to change while we're not looking, the
4911 			 * other properties we're testing for below after re-locking the
4912 			 * buffer would also change, in which case we would restart this
4913 			 * loop above.
4914 			 */
4915 			if (!(infomask2 & HEAP_KEYS_UPDATED))
4916 			{
4917 				bool		updated;
4918 
4919 				updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4920 
4921 				/*
4922 				 * If there are updates, follow the update chain; bail out if
4923 				 * that cannot be done.
4924 				 */
4925 				if (follow_updates && updated)
4926 				{
4927 					HTSU_Result res;
4928 
4929 					res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4930 												  GetCurrentTransactionId(),
4931 												  mode);
4932 					if (res != HeapTupleMayBeUpdated)
4933 					{
4934 						result = res;
4935 						/* recovery code expects to have buffer lock held */
4936 						LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4937 						goto failed;
4938 					}
4939 				}
4940 
4941 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4942 
4943 				/*
4944 				 * Make sure it's still an appropriate lock, else start over.
4945 				 * Also, if it wasn't updated before we released the lock, but
4946 				 * is updated now, we start over too; the reason is that we
4947 				 * now need to follow the update chain to lock the new
4948 				 * versions.
4949 				 */
4950 				if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4951 					((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4952 					 !updated))
4953 					goto l3;
4954 
4955 				/* Things look okay, so we can skip sleeping */
4956 				require_sleep = false;
4957 
4958 				/*
4959 				 * Note we allow Xmax to change here; other updaters/lockers
4960 				 * could have modified it before we grabbed the buffer lock.
4961 				 * However, this is not a problem, because with the recheck we
4962 				 * just did we ensure that they still don't conflict with the
4963 				 * lock we want.
4964 				 */
4965 			}
4966 		}
4967 		else if (mode == LockTupleShare)
4968 		{
4969 			/*
4970 			 * If we're requesting Share, we can similarly avoid sleeping if
4971 			 * there's no update and no exclusive lock present.
4972 			 */
4973 			if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4974 				!HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4975 			{
4976 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4977 
4978 				/*
4979 				 * Make sure it's still an appropriate lock, else start over.
4980 				 * See above about allowing xmax to change.
4981 				 */
4982 				if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4983 					HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4984 					goto l3;
4985 				require_sleep = false;
4986 			}
4987 		}
4988 		else if (mode == LockTupleNoKeyExclusive)
4989 		{
4990 			/*
4991 			 * If we're requesting NoKeyExclusive, we might also be able to
4992 			 * avoid sleeping; just ensure that there is no conflicting lock
4993 			 * already acquired.
4994 			 */
4995 			if (infomask & HEAP_XMAX_IS_MULTI)
4996 			{
4997 				if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4998 											 mode, NULL))
4999 				{
5000 					/*
5001 					 * No conflict, but if the xmax changed under us in the
5002 					 * meantime, start over.
5003 					 */
5004 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5005 					if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5006 						!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5007 											 xwait))
5008 						goto l3;
5009 
5010 					/* otherwise, we're good */
5011 					require_sleep = false;
5012 				}
5013 			}
5014 			else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
5015 			{
5016 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5017 
5018 				/* if the xmax changed in the meantime, start over */
5019 				if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5020 					!TransactionIdEquals(
5021 										 HeapTupleHeaderGetRawXmax(tuple->t_data),
5022 										 xwait))
5023 					goto l3;
5024 				/* otherwise, we're good */
5025 				require_sleep = false;
5026 			}
5027 		}
5028 
5029 		/*
5030 		 * As a check independent from those above, we can also avoid sleeping
5031 		 * if the current transaction is the sole locker of the tuple.  Note
5032 		 * that the strength of the lock already held is irrelevant; this is
5033 		 * not about recording the lock in Xmax (which will be done regardless
5034 		 * of this optimization, below).  Also, note that the cases where we
5035 		 * hold a lock stronger than we are requesting are already handled
5036 		 * above by not doing anything.
5037 		 *
5038 		 * Note we only deal with the non-multixact case here; MultiXactIdWait
5039 		 * is well equipped to deal with this situation on its own.
5040 		 */
5041 		if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
5042 			TransactionIdIsCurrentTransactionId(xwait))
5043 		{
5044 			/* ... but if the xmax changed in the meantime, start over */
5045 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5046 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5047 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5048 									 xwait))
5049 				goto l3;
5050 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
5051 			require_sleep = false;
5052 		}
5053 
5054 		/*
5055 		 * Time to sleep on the other transaction/multixact, if necessary.
5056 		 *
5057 		 * If the other transaction is an update that's already committed,
5058 		 * then sleeping cannot possibly do any good: if we're required to
5059 		 * sleep, get out to raise an error instead.
5060 		 *
5061 		 * By here, we either have already acquired the buffer exclusive lock,
5062 		 * or we must wait for the locking transaction or multixact; so below
5063 		 * we ensure that we grab buffer lock after the sleep.
5064 		 */
5065 		if (require_sleep && result == HeapTupleUpdated)
5066 		{
5067 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5068 			goto failed;
5069 		}
5070 		else if (require_sleep)
5071 		{
5072 			/*
5073 			 * Acquire tuple lock to establish our priority for the tuple, or
5074 			 * die trying.  LockTuple will release us when we are next-in-line
5075 			 * for the tuple.  We must do this even if we are share-locking,
5076 			 * but not if we already have a weaker lock on the tuple.
5077 			 *
5078 			 * If we are forced to "start over" below, we keep the tuple lock;
5079 			 * this arranges that we stay at the head of the line while
5080 			 * rechecking tuple state.
5081 			 */
5082 			if (!skip_tuple_lock &&
5083 				!heap_acquire_tuplock(relation, tid, mode, wait_policy,
5084 									  &have_tuple_lock))
5085 			{
5086 				/*
5087 				 * This can only happen if wait_policy is Skip and the lock
5088 				 * couldn't be obtained.
5089 				 */
5090 				result = HeapTupleWouldBlock;
5091 				/* recovery code expects to have buffer lock held */
5092 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5093 				goto failed;
5094 			}
5095 
5096 			if (infomask & HEAP_XMAX_IS_MULTI)
5097 			{
5098 				MultiXactStatus status = get_mxact_status_for_lock(mode, false);
5099 
5100 				/* We only ever lock tuples, never update them */
5101 				if (status >= MultiXactStatusNoKeyUpdate)
5102 					elog(ERROR, "invalid lock mode in heap_lock_tuple");
5103 
5104 				/* wait for multixact to end, or die trying  */
5105 				switch (wait_policy)
5106 				{
5107 					case LockWaitBlock:
5108 						MultiXactIdWait((MultiXactId) xwait, status, infomask,
5109 										relation, &tuple->t_self, XLTW_Lock, NULL);
5110 						break;
5111 					case LockWaitSkip:
5112 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
5113 														status, infomask, relation,
5114 														NULL))
5115 						{
5116 							result = HeapTupleWouldBlock;
5117 							/* recovery code expects to have buffer lock held */
5118 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5119 							goto failed;
5120 						}
5121 						break;
5122 					case LockWaitError:
5123 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
5124 														status, infomask, relation,
5125 														NULL))
5126 							ereport(ERROR,
5127 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5128 									 errmsg("could not obtain lock on row in relation \"%s\"",
5129 											RelationGetRelationName(relation))));
5130 
5131 						break;
5132 				}
5133 
5134 				/*
5135 				 * Of course, the multixact might not be done here: if we're
5136 				 * requesting a light lock mode, other transactions with light
5137 				 * locks could still be alive, as well as locks owned by our
5138 				 * own xact or other subxacts of this backend.  We need to
5139 				 * preserve the surviving MultiXact members.  Note that it
5140 				 * isn't absolutely necessary in the latter case, but doing so
5141 				 * is simpler.
5142 				 */
5143 			}
5144 			else
5145 			{
5146 				/* wait for regular transaction to end, or die trying */
5147 				switch (wait_policy)
5148 				{
5149 					case LockWaitBlock:
5150 						XactLockTableWait(xwait, relation, &tuple->t_self,
5151 										  XLTW_Lock);
5152 						break;
5153 					case LockWaitSkip:
5154 						if (!ConditionalXactLockTableWait(xwait))
5155 						{
5156 							result = HeapTupleWouldBlock;
5157 							/* recovery code expects to have buffer lock held */
5158 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5159 							goto failed;
5160 						}
5161 						break;
5162 					case LockWaitError:
5163 						if (!ConditionalXactLockTableWait(xwait))
5164 							ereport(ERROR,
5165 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5166 									 errmsg("could not obtain lock on row in relation \"%s\"",
5167 											RelationGetRelationName(relation))));
5168 						break;
5169 				}
5170 			}
5171 
5172 			/* if there are updates, follow the update chain */
5173 			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
5174 			{
5175 				HTSU_Result res;
5176 
5177 				res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5178 											  GetCurrentTransactionId(),
5179 											  mode);
5180 				if (res != HeapTupleMayBeUpdated)
5181 				{
5182 					result = res;
5183 					/* recovery code expects to have buffer lock held */
5184 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5185 					goto failed;
5186 				}
5187 			}
5188 
5189 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5190 
5191 			/*
5192 			 * xwait is done, but if xwait had just locked the tuple then some
5193 			 * other xact could update this tuple before we get to this point.
5194 			 * Check for xmax change, and start over if so.
5195 			 */
5196 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5197 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5198 									 xwait))
5199 				goto l3;
5200 
5201 			if (!(infomask & HEAP_XMAX_IS_MULTI))
5202 			{
5203 				/*
5204 				 * Otherwise check if it committed or aborted.  Note we cannot
5205 				 * be here if the tuple was only locked by somebody who didn't
5206 				 * conflict with us; that would have been handled above.  So
5207 				 * that transaction must necessarily be gone by now.  But
5208 				 * don't check for this in the multixact case, because some
5209 				 * locker transactions might still be running.
5210 				 */
5211 				UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5212 			}
5213 		}
5214 
5215 		/* By here, we're certain that we hold buffer exclusive lock again */
5216 
5217 		/*
5218 		 * We may lock if previous xmax aborted, or if it committed but only
5219 		 * locked the tuple without updating it; or if we didn't have to wait
5220 		 * at all for whatever reason.
5221 		 */
5222 		if (!require_sleep ||
5223 			(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5224 			HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5225 			HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5226 			result = HeapTupleMayBeUpdated;
5227 		else
5228 			result = HeapTupleUpdated;
5229 	}
5230 
5231 failed:
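	/*
	 * At this point the buffer is exclusively locked again, whether we got
	 * here by falling through or via one of the gotos above.  On failure,
	 * report the details through *hufd before releasing the lock.
	 */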
5232 	if (result != HeapTupleMayBeUpdated)
5233 	{
5234 		Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5235 			   result == HeapTupleWouldBlock);
5236 		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5237 		hufd->ctid = tuple->t_data->t_ctid;
5238 		hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5239 		if (result == HeapTupleSelfUpdated)
5240 			hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5241 		else
5242 			hufd->cmax = InvalidCommandId;
5243 		goto out_locked;
5244 	}
5245 
5246 	/*
5247 	 * If we didn't pin the visibility map page and the page has become all
5248 	 * visible while we were busy locking the buffer, or during some
5249 	 * subsequent window during which we had it unlocked, we'll have to unlock
5250 	 * and re-lock, to avoid holding the buffer lock across I/O.  That's a bit
5251 	 * unfortunate, especially since we'll now have to recheck whether the
5252 	 * tuple has been locked or updated under us, but hopefully it won't
5253 	 * happen very often.
5254 	 */
5255 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5256 	{
5257 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5258 		visibilitymap_pin(relation, block, &vmbuffer);
5259 		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5260 		goto l3;
5261 	}
5262 
5263 	xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5264 	old_infomask = tuple->t_data->t_infomask;
5265 
5266 	/*
5267 	 * If this is the first possibly-multixact-able operation in the current
5268 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5269 	 * certain that the transaction will never become a member of any older
5270 	 * MultiXactIds than that.  (We have to do this even if we end up just
5271 	 * using our own TransactionId below, since some other backend could
5272 	 * incorporate our XID into a MultiXact immediately afterwards.)
5273 	 */
5274 	MultiXactIdSetOldestMember();
5275 
5276 	/*
5277 	 * Compute the new xmax and infomask to store into the tuple.  Note we do
5278 	 * not modify the tuple just yet, because that would leave it in the wrong
5279 	 * state if multixact.c elogs.
5280 	 */
5281 	compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5282 							  GetCurrentTransactionId(), mode, false,
5283 							  &xid, &new_infomask, &new_infomask2);
5284 
5285 	START_CRIT_SECTION();
5286 
5287 	/*
5288 	 * Store transaction information of xact locking the tuple.
5289 	 *
5290 	 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5291 	 * possibly generating a useless combo CID.  Moreover, if we're locking a
5292 	 * previously updated tuple, it's important to preserve the Cmax.
5293 	 *
5294 	 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5295 	 * we would break the HOT chain.
5296 	 */
5297 	tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5298 	tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5299 	tuple->t_data->t_infomask |= new_infomask;
5300 	tuple->t_data->t_infomask2 |= new_infomask2;
5301 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5302 		HeapTupleHeaderClearHotUpdated(tuple->t_data);
5303 	HeapTupleHeaderSetXmax(tuple->t_data, xid);
5304 
5305 	/*
5306 	 * Make sure there is no forward chain link in t_ctid.  Note that in the
5307 	 * cases where the tuple has been updated, we must not overwrite t_ctid,
5308 	 * because it was set by the updater.  Moreover, if the tuple has been
5309 	 * updated, we need to follow the update chain to lock the new versions of
5310 	 * the tuple as well.
5311 	 */
5312 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5313 		tuple->t_data->t_ctid = *tid;
5314 
5315 	/* Clear only the all-frozen bit on visibility map if needed */
5316 	if (PageIsAllVisible(page) &&
5317 		visibilitymap_clear(relation, block, vmbuffer,
5318 							VISIBILITYMAP_ALL_FROZEN))
5319 		cleared_all_frozen = true;
5320 
5321 
5322 	MarkBufferDirty(*buffer);
5323 
5324 	/*
5325 	 * XLOG stuff.  You might think that we don't need an XLOG record because
5326 	 * there is no state change worth restoring after a crash.  You would be
5327 	 * wrong however: we have just written either a TransactionId or a
5328 	 * MultiXactId that may never have been seen on disk before, and we need
5329 	 * to make sure that there are XLOG entries covering those ID numbers.
5330 	 * Else the same IDs might be re-used after a crash, which would be
5331 	 * disastrous if this page made it to disk before the crash.  Essentially
5332 	 * we have to enforce the WAL log-before-data rule even in this case.
5333 	 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5334 	 * entries for everything anyway.)
5335 	 */
5336 	if (RelationNeedsWAL(relation))
5337 	{
5338 		xl_heap_lock xlrec;
5339 		XLogRecPtr	recptr;
5340 
5341 		XLogBeginInsert();
5342 		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5343 
5344 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5345 		xlrec.locking_xid = xid;
5346 		xlrec.infobits_set = compute_infobits(new_infomask,
5347 											  tuple->t_data->t_infomask2);
5348 		xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5349 		XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5350 
5351 		/* we don't decode row locks atm, so no need to log the origin */
5352 
5353 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5354 
5355 		PageSetLSN(page, recptr);
5356 	}
5357 
5358 	END_CRIT_SECTION();
5359 
5360 	result = HeapTupleMayBeUpdated;
5361 
5362 out_locked:
5363 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5364 
5365 out_unlocked:
5366 	if (BufferIsValid(vmbuffer))
5367 		ReleaseBuffer(vmbuffer);
5368 
5369 	/*
5370 	 * Don't update the visibility map here. Locking a tuple doesn't change
5371 	 * visibility info.
5372 	 */
5373 
5374 	/*
5375 	 * Now that we have successfully marked the tuple as locked, we can
5376 	 * release the lmgr tuple lock, if we had it.
5377 	 */
5378 	if (have_tuple_lock)
5379 		UnlockTupleTuplock(relation, tid, mode);
5380 
5381 	return result;
5382 }
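
/*
 * Illustrative calling pattern for heap_lock_tuple (a sketch; variable names
 * such as "rel" and "target_tid" are placeholders):
 *
 *		HeapTupleData tuple;
 *		Buffer		buffer;
 *		HeapUpdateFailureData hufd;
 *		HTSU_Result res;
 *
 *		tuple.t_self = *target_tid;
 *		res = heap_lock_tuple(rel, &tuple, GetCurrentCommandId(true),
 *							  LockTupleExclusive, LockWaitBlock,
 *							  false, &buffer, &hufd);
 *		ReleaseBuffer(buffer);		(the pin is returned in every case)
 *		if (res != HeapTupleMayBeUpdated)
 *			... consult res and hufd to decide how to react ...
 */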
5383 
5384 /*
5385  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5386  * its normal, Xmax-based tuple lock.
5387  *
5388  * have_tuple_lock is an input and output parameter: on input, it indicates
5389  * whether the lock has previously been acquired (and this function does
5390  * nothing in that case).  If this function returns success, have_tuple_lock
5391  * has been flipped to true.
5392  *
5393  * Returns false if it was unable to obtain the lock; this can only happen if
5394  * wait_policy is Skip.
5395  */
5396 static bool
5397 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5398 					 LockWaitPolicy wait_policy, bool *have_tuple_lock)
5399 {
5400 	if (*have_tuple_lock)
5401 		return true;
5402 
5403 	switch (wait_policy)
5404 	{
5405 		case LockWaitBlock:
5406 			LockTupleTuplock(relation, tid, mode);
5407 			break;
5408 
5409 		case LockWaitSkip:
5410 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
5411 				return false;
5412 			break;
5413 
5414 		case LockWaitError:
5415 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
5416 				ereport(ERROR,
5417 						(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5418 						 errmsg("could not obtain lock on row in relation \"%s\"",
5419 								RelationGetRelationName(relation))));
5420 			break;
5421 	}
5422 	*have_tuple_lock = true;
5423 
5424 	return true;
5425 }
5426 
5427 /*
5428  * Given an original set of Xmax and infomask, and a transaction (identified by
5429  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5430  * corresponding infomasks to use on the tuple.
5431  *
5432  * Note that this might have side effects such as creating a new MultiXactId.
5433  *
5434  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5435  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5436  * but it was not running anymore. There is a race condition, which is that the
5437  * MultiXactId may have finished since then, but that uncommon case is handled
5438  * either here, or within MultiXactIdExpand.
5439  *
5440  * There is a similar race condition possible when the old xmax was a regular
5441  * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
5442  * window, but it's still possible to end up creating an unnecessary
5443  * MultiXactId.  Fortunately this is harmless.
5444  */
5445 static void
5446 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5447 						  uint16 old_infomask2, TransactionId add_to_xmax,
5448 						  LockTupleMode mode, bool is_update,
5449 						  TransactionId *result_xmax, uint16 *result_infomask,
5450 						  uint16 *result_infomask2)
5451 {
5452 	TransactionId new_xmax;
5453 	uint16		new_infomask,
5454 				new_infomask2;
5455 
5456 	Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5457 
5458 l5:
5459 	new_infomask = 0;
5460 	new_infomask2 = 0;
5461 	if (old_infomask & HEAP_XMAX_INVALID)
5462 	{
5463 		/*
5464 		 * No previous locker; we just insert our own TransactionId.
5465 		 *
5466 		 * Note that it's critical that this case be the first one checked,
5467 		 * because there are several blocks below that come back to this one
5468 		 * to implement certain optimizations; old_infomask might contain
5469 		 * other dirty bits in those cases, but we don't really care.
5470 		 */
5471 		if (is_update)
5472 		{
5473 			new_xmax = add_to_xmax;
5474 			if (mode == LockTupleExclusive)
5475 				new_infomask2 |= HEAP_KEYS_UPDATED;
5476 		}
5477 		else
5478 		{
5479 			new_infomask |= HEAP_XMAX_LOCK_ONLY;
5480 			switch (mode)
5481 			{
5482 				case LockTupleKeyShare:
5483 					new_xmax = add_to_xmax;
5484 					new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5485 					break;
5486 				case LockTupleShare:
5487 					new_xmax = add_to_xmax;
5488 					new_infomask |= HEAP_XMAX_SHR_LOCK;
5489 					break;
5490 				case LockTupleNoKeyExclusive:
5491 					new_xmax = add_to_xmax;
5492 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
5493 					break;
5494 				case LockTupleExclusive:
5495 					new_xmax = add_to_xmax;
5496 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
5497 					new_infomask2 |= HEAP_KEYS_UPDATED;
5498 					break;
5499 				default:
5500 					new_xmax = InvalidTransactionId;	/* silence compiler */
5501 					elog(ERROR, "invalid lock mode");
5502 			}
5503 		}
5504 	}
5505 	else if (old_infomask & HEAP_XMAX_IS_MULTI)
5506 	{
5507 		MultiXactStatus new_status;
5508 
5509 		/*
5510 		 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5511 		 * cross-check.
5512 		 */
5513 		Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5514 
5515 		/*
5516 		 * A multixact together with LOCK_ONLY set but neither lock bit set
5517 		 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5518 		 * anymore.  This check is critical for databases upgraded by
5519 		 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5520 		 * that such multis are never passed.
5521 		 */
5522 		if (HEAP_LOCKED_UPGRADED(old_infomask))
5523 		{
5524 			old_infomask &= ~HEAP_XMAX_IS_MULTI;
5525 			old_infomask |= HEAP_XMAX_INVALID;
5526 			goto l5;
5527 		}
5528 
5529 		/*
5530 		 * If the XMAX is already a MultiXactId, then we need to expand it to
5531 		 * include add_to_xmax; but if all the members were lockers and are
5532 		 * all gone, we can do away with the IS_MULTI bit and just set
5533 		 * add_to_xmax as the only locker/updater.  If all lockers are gone
5534 		 * and we have an updater that aborted, we can also do without a
5535 		 * multi.
5536 		 *
5537 		 * The cost of doing GetMultiXactIdMembers would be paid by
5538 		 * MultiXactIdExpand if we weren't to do this, so this check is not
5539 		 * incurring extra work anyhow.
5540 		 */
5541 		if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5542 		{
5543 			if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5544 				!TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5545 																old_infomask)))
5546 			{
5547 				/*
5548 				 * Reset these bits and restart; otherwise fall through to
5549 				 * create a new multi below.
5550 				 */
5551 				old_infomask &= ~HEAP_XMAX_IS_MULTI;
5552 				old_infomask |= HEAP_XMAX_INVALID;
5553 				goto l5;
5554 			}
5555 		}
5556 
5557 		new_status = get_mxact_status_for_lock(mode, is_update);
5558 
5559 		new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5560 									 new_status);
5561 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5562 	}
5563 	else if (old_infomask & HEAP_XMAX_COMMITTED)
5564 	{
5565 		/*
5566 		 * It's a committed update, so we need to preserve it as the updater
5567 		 * of the tuple.
5568 		 */
5569 		MultiXactStatus status;
5570 		MultiXactStatus new_status;
5571 
5572 		if (old_infomask2 & HEAP_KEYS_UPDATED)
5573 			status = MultiXactStatusUpdate;
5574 		else
5575 			status = MultiXactStatusNoKeyUpdate;
5576 
5577 		new_status = get_mxact_status_for_lock(mode, is_update);
5578 
5579 		/*
5580 		 * since it's not running, it's obviously impossible for the old
5581 		 * updater to be identical to the current one, so we need not check
5582 		 * for that case as we do in the block above.
5583 		 */
5584 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5585 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5586 	}
5587 	else if (TransactionIdIsInProgress(xmax))
5588 	{
5589 		/*
5590 		 * If the XMAX is a valid, in-progress TransactionId, then we need to
5591 		 * create a new MultiXactId that includes both the old locker or
5592 		 * updater and our own TransactionId.
5593 		 */
5594 		MultiXactStatus new_status;
5595 		MultiXactStatus old_status;
5596 		LockTupleMode old_mode;
5597 
5598 		if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5599 		{
5600 			if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5601 				old_status = MultiXactStatusForKeyShare;
5602 			else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5603 				old_status = MultiXactStatusForShare;
5604 			else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5605 			{
5606 				if (old_infomask2 & HEAP_KEYS_UPDATED)
5607 					old_status = MultiXactStatusForUpdate;
5608 				else
5609 					old_status = MultiXactStatusForNoKeyUpdate;
5610 			}
5611 			else
5612 			{
5613 				/*
5614 				 * LOCK_ONLY can be present alone only when a page has been
5615 				 * upgraded by pg_upgrade.  But in that case,
5616 				 * TransactionIdIsInProgress() should have returned false.  We
5617 				 * assume it's no longer locked in this case.
5618 				 */
5619 				elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5620 				old_infomask |= HEAP_XMAX_INVALID;
5621 				old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5622 				goto l5;
5623 			}
5624 		}
5625 		else
5626 		{
5627 			/* it's an update, but which kind? */
5628 			if (old_infomask2 & HEAP_KEYS_UPDATED)
5629 				old_status = MultiXactStatusUpdate;
5630 			else
5631 				old_status = MultiXactStatusNoKeyUpdate;
5632 		}
5633 
5634 		old_mode = TUPLOCK_from_mxstatus(old_status);
5635 
5636 		/*
5637 		 * If the lock to be acquired is for the same TransactionId as the
5638 		 * existing lock, there's an optimization possible: consider only the
5639 		 * strongest of both locks as the only one present, and restart.
5640 		 */
5641 		if (xmax == add_to_xmax)
5642 		{
5643 			/*
5644 			 * Note that it's not possible for the original tuple to be
5645 			 * updated: we wouldn't be here because the tuple would have been
5646 			 * invisible and we wouldn't try to update it.  As a subtlety,
5647 			 * this code can also run when traversing an update chain to lock
5648 			 * future versions of a tuple.  But we wouldn't be here either,
5649 			 * because the add_to_xmax would be different from the original
5650 			 * updater.
5651 			 */
5652 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5653 
5654 			/* acquire the strongest of both */
5655 			if (mode < old_mode)
5656 				mode = old_mode;
5657 			/* mustn't touch is_update */
5658 
5659 			old_infomask |= HEAP_XMAX_INVALID;
5660 			goto l5;
5661 		}
5662 
5663 		/* otherwise, just fall back to creating a new multixact */
5664 		new_status = get_mxact_status_for_lock(mode, is_update);
5665 		new_xmax = MultiXactIdCreate(xmax, old_status,
5666 									 add_to_xmax, new_status);
5667 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5668 	}
5669 	else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5670 			 TransactionIdDidCommit(xmax))
5671 	{
5672 		/*
5673 		 * It's a committed update, so we need to preserve it as the updater
5674 		 * of the tuple.
5675 		 */
5676 		MultiXactStatus status;
5677 		MultiXactStatus new_status;
5678 
5679 		if (old_infomask2 & HEAP_KEYS_UPDATED)
5680 			status = MultiXactStatusUpdate;
5681 		else
5682 			status = MultiXactStatusNoKeyUpdate;
5683 
5684 		new_status = get_mxact_status_for_lock(mode, is_update);
5685 
5686 		/*
5687 		 * since it's not running, it's obviously impossible for the old
5688 		 * updater to be identical to the current one, so we need not check
5689 		 * for that case as we do in the block above.
5690 		 */
5691 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5692 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5693 	}
5694 	else
5695 	{
5696 		/*
5697 		 * Can get here iff the locking/updating transaction was running when
5698 		 * the infomask was extracted from the tuple, but finished before
5699 		 * TransactionIdIsInProgress got to run.  Deal with it as if there was
5700 		 * no locker at all in the first place.
5701 		 */
5702 		old_infomask |= HEAP_XMAX_INVALID;
5703 		goto l5;
5704 	}
5705 
5706 	*result_infomask = new_infomask;
5707 	*result_infomask2 = new_infomask2;
5708 	*result_xmax = new_xmax;
5709 }
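
/*
 * As a concrete example of the logic above: if the old xmax is invalid (no
 * prior locker or updater) and we are merely locking in LockTupleShare mode,
 * the result is simply *result_xmax = add_to_xmax with HEAP_XMAX_LOCK_ONLY
 * and HEAP_XMAX_SHR_LOCK set; a MultiXactId is created (or expanded) only
 * when a second, distinct transaction has to be recorded alongside an
 * existing locker or updater.
 */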
5710 
5711 /*
5712  * Subroutine for heap_lock_updated_tuple_rec.
5713  *
5714  * Given a hypothetical multixact status held by the transaction identified
5715  * with the given xid, does the current transaction need to wait, fail, or can
5716  * it continue if it wanted to acquire a lock of the given mode?  "needwait"
5717  * is set to true if waiting is necessary; if it can continue, then
5718  * HeapTupleMayBeUpdated is returned.  If the lock is already held by the
5719  * current transaction, return HeapTupleSelfUpdated.  In case of a conflict
5720  * with another transaction, a different HeapTupleSatisfiesUpdate return code
5721  * is returned.
5722  *
5723  * The held status is said to be hypothetical because it might correspond to a
5724  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5725  * way for simplicity of API.
5726  */
5727 static HTSU_Result
5728 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5729 						   LockTupleMode mode, bool *needwait)
5730 {
5731 	MultiXactStatus wantedstatus;
5732 
5733 	*needwait = false;
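	/*
	 * Translate the requested tuple lock mode into the multixact status we
	 * would record for it, so that both sides can be compared as regular
	 * lock modes via DoLockModesConflict below.
	 */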
5734 	wantedstatus = get_mxact_status_for_lock(mode, false);
5735 
5736 	/*
5737 	 * Note: we *must* check TransactionIdIsInProgress before
5738 	 * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5739 	 * explanation.
5740 	 */
5741 	if (TransactionIdIsCurrentTransactionId(xid))
5742 	{
5743 		/*
5744 		 * The tuple has already been locked by our own transaction.  This is
5745 		 * very rare but can happen if multiple transactions are trying to
5746 		 * lock an ancient version of the same tuple.
5747 		 */
5748 		return HeapTupleSelfUpdated;
5749 	}
5750 	else if (TransactionIdIsInProgress(xid))
5751 	{
5752 		/*
5753 		 * If the locking transaction is running, what we do depends on
5754 		 * whether the lock modes conflict: if they do, then we must wait for
5755 		 * it to finish; otherwise we can fall through to lock this tuple
5756 		 * version without waiting.
5757 		 */
5758 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5759 								LOCKMODE_from_mxstatus(wantedstatus)))
5760 		{
5761 			*needwait = true;
5762 		}
5763 
5764 		/*
5765 		 * If we set needwait above, then this value doesn't matter;
5766 		 * otherwise, this value signals to caller that it's okay to proceed.
5767 		 */
5768 		return HeapTupleMayBeUpdated;
5769 	}
5770 	else if (TransactionIdDidAbort(xid))
5771 		return HeapTupleMayBeUpdated;
5772 	else if (TransactionIdDidCommit(xid))
5773 	{
5774 		/*
5775 		 * The other transaction committed.  If it was only a locker, then the
5776 		 * lock is completely gone now and we can return success; but if it
5777 		 * was an update, then what we do depends on whether the two lock
5778 		 * modes conflict.  If they conflict, then we must report error to
5779 		 * caller. But if they don't, we can fall through to allow the current
5780 		 * transaction to lock the tuple.
5781 		 *
5782 		 * Note: the reason we worry about ISUPDATE here is because as soon as
5783 		 * a transaction ends, all its locks are gone and meaningless, and
5784 		 * thus we can ignore them; whereas its updates persist.  In the
5785 		 * TransactionIdIsInProgress case, above, we don't need to check
5786 		 * because we know the lock is still "alive" and thus a conflict needs
5787 		 * always be checked.
5788 		 */
5789 		if (!ISUPDATE_from_mxstatus(status))
5790 			return HeapTupleMayBeUpdated;
5791 
5792 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5793 								LOCKMODE_from_mxstatus(wantedstatus)))
5794 			/* bummer */
5795 			return HeapTupleUpdated;
5796 
5797 		return HeapTupleMayBeUpdated;
5798 	}
5799 
5800 	/* Not in progress, not aborted, not committed -- must have crashed */
5801 	return HeapTupleMayBeUpdated;
5802 }
5803 
5804 
5805 /*
5806  * Recursive part of heap_lock_updated_tuple
5807  *
5808  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5809  * xid with the given mode; if this tuple is updated, recurse to lock the new
5810  * version as well.
5811  */
5812 static HTSU_Result
5813 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5814 							LockTupleMode mode)
5815 {
5816 	HTSU_Result result;
5817 	ItemPointerData tupid;
5818 	HeapTupleData mytup;
5819 	Buffer		buf;
5820 	uint16		new_infomask,
5821 				new_infomask2,
5822 				old_infomask,
5823 				old_infomask2;
5824 	TransactionId xmax,
5825 				new_xmax;
5826 	TransactionId priorXmax = InvalidTransactionId;
5827 	bool		cleared_all_frozen = false;
5828 	bool		pinned_desired_page;
5829 	Buffer		vmbuffer = InvalidBuffer;
5830 	BlockNumber block;
5831 
5832 	ItemPointerCopy(tid, &tupid);
5833 
5834 	for (;;)
5835 	{
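		/*
		 * Each pass of this loop locks one tuple version; at the bottom we
		 * advance tupid to the tuple's t_ctid and continue until the end of
		 * the update chain (or a version we have already dealt with) is
		 * reached.
		 */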
5836 		new_infomask = 0;
5837 		new_xmax = InvalidTransactionId;
5838 		block = ItemPointerGetBlockNumber(&tupid);
5839 		ItemPointerCopy(&tupid, &(mytup.t_self));
5840 
5841 		if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5842 		{
5843 			/*
5844 			 * if we fail to find the updated version of the tuple, it's
5845 			 * because it was vacuumed/pruned away after its creator
5846 			 * transaction aborted.  So behave as if we got to the end of the
5847 			 * chain, and there's no further tuple to lock: return success to
5848 			 * caller.
5849 			 */
5850 			result = HeapTupleMayBeUpdated;
5851 			goto out_unlocked;
5852 		}
5853 
5854 l4:
5855 		CHECK_FOR_INTERRUPTS();
5856 
5857 		/*
5858 		 * Before locking the buffer, pin the visibility map page if it
5859 		 * appears to be necessary.  Since we haven't got the lock yet,
5860 		 * someone else might be in the middle of changing this, so we'll need
5861 		 * to recheck after we have the lock.
5862 		 */
5863 		if (PageIsAllVisible(BufferGetPage(buf)))
5864 		{
5865 			visibilitymap_pin(rel, block, &vmbuffer);
5866 			pinned_desired_page = true;
5867 		}
5868 		else
5869 			pinned_desired_page = false;
5870 
5871 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5872 
5873 		/*
5874 		 * If we didn't pin the visibility map page and the page has become
5875 		 * all visible while we were busy locking the buffer, we'll have to
5876 		 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5877 		 * That's a bit unfortunate, but hopefully shouldn't happen often.
5878 		 *
5879 		 * Note: in some paths through this function, we will reach here
5880 		 * holding a pin on a vm page that may or may not be the one matching
5881 		 * this page.  If this page isn't all-visible, we won't use the vm
5882 		 * page, but we hold onto such a pin till the end of the function.
5883 		 */
5884 		if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5885 		{
5886 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5887 			visibilitymap_pin(rel, block, &vmbuffer);
5888 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5889 		}
5890 
5891 		/*
5892 		 * Check the tuple XMIN against prior XMAX, if any.  If we reached the
5893 		 * end of the chain, we're done, so return success.
5894 		 */
5895 		if (TransactionIdIsValid(priorXmax) &&
5896 			!TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5897 								 priorXmax))
5898 		{
5899 			result = HeapTupleMayBeUpdated;
5900 			goto out_locked;
5901 		}
5902 
5903 		/*
5904 		 * Also check Xmin: if this tuple was created by an aborted
5905 		 * (sub)transaction, then we already locked the last live one in the
5906 		 * chain, thus we're done, so return success.
5907 		 */
5908 		if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5909 		{
5910 			result = HeapTupleMayBeUpdated;
5911 			goto out_locked;
5912 		}
5913 
5914 		old_infomask = mytup.t_data->t_infomask;
5915 		old_infomask2 = mytup.t_data->t_infomask2;
5916 		xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5917 
5918 		/*
5919 		 * If this tuple version has been updated or locked by some concurrent
5920 		 * transaction(s), what we do depends on whether our lock mode
5921 		 * conflicts with what those other transactions hold, and also on the
5922 		 * status of them.
5923 		 */
5924 		if (!(old_infomask & HEAP_XMAX_INVALID))
5925 		{
5926 			TransactionId rawxmax;
5927 			bool		needwait;
5928 
5929 			rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5930 			if (old_infomask & HEAP_XMAX_IS_MULTI)
5931 			{
5932 				int			nmembers;
5933 				int			i;
5934 				MultiXactMember *members;
5935 
5936 				/*
5937 				 * We don't need a test for pg_upgrade'd tuples: this is only
5938 				 * applied to tuples after the first in an update chain.  Said
5939 				 * first tuple in the chain may well be locked-in-9.2-and-
5940 				 * pg_upgraded, but that one was already locked by our caller,
5941 				 * not us; and any subsequent ones cannot be because our
5942 				 * caller must necessarily have obtained a snapshot later than
5943 				 * the pg_upgrade itself.
5944 				 */
5945 				Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5946 
5947 				nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5948 												 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5949 				for (i = 0; i < nmembers; i++)
5950 				{
5951 					result = test_lockmode_for_conflict(members[i].status,
5952 														members[i].xid,
5953 														mode, &needwait);
5954 
5955 					/*
5956 					 * If the tuple was already locked by ourselves in a
5957 					 * previous iteration of this (say heap_lock_tuple was
5958 					 * forced to restart the locking loop because of a change
5959 					 * in xmax), then we hold the lock already on this tuple
5960 					 * version and we don't need to do anything; and this is
5961 					 * not an error condition either.  We just need to skip
5962 					 * this tuple and continue locking the next version in the
5963 					 * update chain.
5964 					 */
5965 					if (result == HeapTupleSelfUpdated)
5966 					{
5967 						pfree(members);
5968 						goto next;
5969 					}
5970 
5971 					if (needwait)
5972 					{
5973 						LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5974 						XactLockTableWait(members[i].xid, rel,
5975 										  &mytup.t_self,
5976 										  XLTW_LockUpdated);
5977 						pfree(members);
5978 						goto l4;
5979 					}
5980 					if (result != HeapTupleMayBeUpdated)
5981 					{
5982 						pfree(members);
5983 						goto out_locked;
5984 					}
5985 				}
5986 				if (members)
5987 					pfree(members);
5988 			}
5989 			else
5990 			{
5991 				MultiXactStatus status;
5992 
5993 				/*
5994 				 * For a non-multi Xmax, we first need to compute the
5995 				 * corresponding MultiXactStatus by using the infomask bits.
5996 				 */
5997 				if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5998 				{
5999 					if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
6000 						status = MultiXactStatusForKeyShare;
6001 					else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
6002 						status = MultiXactStatusForShare;
6003 					else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
6004 					{
6005 						if (old_infomask2 & HEAP_KEYS_UPDATED)
6006 							status = MultiXactStatusForUpdate;
6007 						else
6008 							status = MultiXactStatusForNoKeyUpdate;
6009 					}
6010 					else
6011 					{
6012 						/*
6013 						 * LOCK_ONLY present alone (a pg_upgraded tuple marked
6014 						 * as share-locked in the old cluster) shouldn't be
6015 						 * seen in the middle of an update chain.
6016 						 */
6017 						elog(ERROR, "invalid lock status in tuple");
6018 					}
6019 				}
6020 				else
6021 				{
6022 					/* it's an update, but which kind? */
6023 					if (old_infomask2 & HEAP_KEYS_UPDATED)
6024 						status = MultiXactStatusUpdate;
6025 					else
6026 						status = MultiXactStatusNoKeyUpdate;
6027 				}
6028 
6029 				result = test_lockmode_for_conflict(status, rawxmax, mode,
6030 													&needwait);
6031 
6032 				/*
6033 				 * If the tuple was already locked by ourselves in a previous
6034 				 * iteration of this (say heap_lock_tuple was forced to
6035 				 * restart the locking loop because of a change in xmax), then
6036 				 * we hold the lock already on this tuple version and we don't
6037 				 * need to do anything; and this is not an error condition
6038 				 * either.  We just need to skip this tuple and continue
6039 				 * locking the next version in the update chain.
6040 				 */
6041 				if (result == HeapTupleSelfUpdated)
6042 					goto next;
6043 
6044 				if (needwait)
6045 				{
6046 					LockBuffer(buf, BUFFER_LOCK_UNLOCK);
6047 					XactLockTableWait(rawxmax, rel, &mytup.t_self,
6048 									  XLTW_LockUpdated);
6049 					goto l4;
6050 				}
6051 				if (result != HeapTupleMayBeUpdated)
6052 				{
6053 					goto out_locked;
6054 				}
6055 			}
6056 		}
6057 
6058 		/* compute the new Xmax and infomask values for the tuple ... */
6059 		compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6060 								  xid, mode, false,
6061 								  &new_xmax, &new_infomask, &new_infomask2);
6062 
6063 		if (PageIsAllVisible(BufferGetPage(buf)) &&
6064 			visibilitymap_clear(rel, block, vmbuffer,
6065 								VISIBILITYMAP_ALL_FROZEN))
6066 			cleared_all_frozen = true;
6067 
6068 		START_CRIT_SECTION();
6069 
6070 		/* ... and set them */
6071 		HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6072 		mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6073 		mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6074 		mytup.t_data->t_infomask |= new_infomask;
6075 		mytup.t_data->t_infomask2 |= new_infomask2;
6076 
6077 		MarkBufferDirty(buf);
6078 
6079 		/* XLOG stuff */
6080 		if (RelationNeedsWAL(rel))
6081 		{
6082 			xl_heap_lock_updated xlrec;
6083 			XLogRecPtr	recptr;
6084 			Page		page = BufferGetPage(buf);
6085 
6086 			XLogBeginInsert();
6087 			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
6088 
6089 			xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6090 			xlrec.xmax = new_xmax;
6091 			xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
6092 			xlrec.flags =
6093 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
6094 
6095 			XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
6096 
6097 			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
6098 
6099 			PageSetLSN(page, recptr);
6100 		}
6101 
6102 		END_CRIT_SECTION();
6103 
6104 next:
6105 		/* if we find the end of the update chain, we're done. */
6106 		if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6107 			HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
6108 			ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6109 			HeapTupleHeaderIsOnlyLocked(mytup.t_data))
6110 		{
6111 			result = HeapTupleMayBeUpdated;
6112 			goto out_locked;
6113 		}
6114 
6115 		/* tail recursion: advance to the next version in the chain */
6116 		priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
6117 		ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6118 		UnlockReleaseBuffer(buf);
6119 	}
6120 
6121 	result = HeapTupleMayBeUpdated;
6122 
6123 out_locked:
6124 	UnlockReleaseBuffer(buf);
6125 
6126 out_unlocked:
6127 	if (vmbuffer != InvalidBuffer)
6128 		ReleaseBuffer(vmbuffer);
6129 
6130 	return result;
6131 }
6132 
6133 /*
6134  * heap_lock_updated_tuple
6135  *		Follow update chain when locking an updated tuple, acquiring locks (row
6136  *		marks) on the updated versions.
6137  *
6138  * The initial tuple is assumed to be already locked.
6139  *
6140  * This function doesn't check visibility; it just unconditionally marks the
6141  * tuple(s) as locked.  If any tuple in the update chain is being deleted
6142  * concurrently (or updated with the key being modified), sleep until the
6143  * transaction doing it is finished.
6144  *
6145  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6146  * when we have to wait for other transactions to release them, as opposed to
6147  * what heap_lock_tuple does.  The reason is that having more than one
6148  * transaction walking the chain is probably uncommon enough that risk of
6149  * starvation is not likely: one of the preconditions for being here is that
6150  * the snapshot in use predates the update that created this tuple (because we
6151  * started at an earlier version of the tuple), but at the same time such a
6152  * transaction cannot be using repeatable read or serializable isolation
6153  * levels, because that would lead to a serializability failure.
6154  */
6155 static HTSU_Result
6156 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
6157 						TransactionId xid, LockTupleMode mode)
6158 {
6159 	/*
6160 	 * If the tuple has not been updated, or has moved into another partition
6161 	 * (effectively a delete), stop here.
6162 	 */
6163 	if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
6164 		!ItemPointerEquals(&tuple->t_self, ctid))
6165 	{
6166 		/*
6167 		 * If this is the first possibly-multixact-able operation in the
6168 		 * current transaction, set my per-backend OldestMemberMXactId
6169 		 * setting. We can be certain that the transaction will never become a
6170 		 * member of any older MultiXactIds than that.  (We have to do this
6171 		 * even if we end up just using our own TransactionId below, since
6172 		 * some other backend could incorporate our XID into a MultiXact
6173 		 * immediately afterwards.)
6174 		 */
6175 		MultiXactIdSetOldestMember();
6176 
6177 		return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
6178 	}
6179 
6180 	/* nothing to lock */
6181 	return HeapTupleMayBeUpdated;
6182 }
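/*
 * Illustrative sketch (not part of the original source): a caller that has
 * just locked one version of a tuple and wants the remainder of the update
 * chain locked too would invoke heap_lock_updated_tuple roughly like this
 * (variable names here are only placeholders):
 *
 *		if (!ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid))
 *			result = heap_lock_updated_tuple(rel, &mytup,
 *											 &mytup.t_data->t_ctid,
 *											 GetCurrentTransactionId(), mode);
 *
 * The function repeats the "was it really updated?" check itself, so calling
 * it on a tuple that was never updated simply returns HeapTupleMayBeUpdated.
 */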
6183 
6184 /*
6185  *	heap_finish_speculative - mark speculative insertion as successful
6186  *
6187  * To successfully finish a speculative insertion we have to clear the
6188  * speculative token from the tuple.  To do so, the t_ctid field, which will
6189  * contain a speculative token value, is modified in place to point to the
6190  * tuple itself, which is characteristic of a newly inserted ordinary tuple.
6191  *
6192  * NB: It is not ok to commit without either finishing or aborting a
6193  * speculative insertion.  We could treat speculative tuples of committed
6194  * transactions implicitly as completed, but then we would have to be prepared
6195  * to deal with speculative tokens on committed tuples.  That wouldn't be
6196  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6197  * but clearing the token at completion isn't very expensive either.
6198  * An explicit confirmation WAL record also makes logical decoding simpler.
6199  */
6200 void
6201 heap_finish_speculative(Relation relation, HeapTuple tuple)
6202 {
6203 	Buffer		buffer;
6204 	Page		page;
6205 	OffsetNumber offnum;
6206 	ItemId		lp = NULL;
6207 	HeapTupleHeader htup;
6208 
6209 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6210 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6211 	page = (Page) BufferGetPage(buffer);
6212 
6213 	offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6214 	if (PageGetMaxOffsetNumber(page) >= offnum)
6215 		lp = PageGetItemId(page, offnum);
6216 
6217 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6218 		elog(ERROR, "invalid lp");
6219 
6220 	htup = (HeapTupleHeader) PageGetItem(page, lp);
6221 
6222 	/* SpecTokenOffsetNumber should be distinguishable from any real offset */
6223 	StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
6224 					 "invalid speculative token constant");
6225 
6226 	/* NO EREPORT(ERROR) from here till changes are logged */
6227 	START_CRIT_SECTION();
6228 
6229 	Assert(HeapTupleHeaderIsSpeculative(tuple->t_data));
6230 
6231 	MarkBufferDirty(buffer);
6232 
6233 	/*
6234 	 * Replace the speculative insertion token with a real t_ctid, pointing to
6235 	 * itself like it does on regular tuples.
6236 	 */
6237 	htup->t_ctid = tuple->t_self;
6238 
6239 	/* XLOG stuff */
6240 	if (RelationNeedsWAL(relation))
6241 	{
6242 		xl_heap_confirm xlrec;
6243 		XLogRecPtr	recptr;
6244 
6245 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6246 
6247 		XLogBeginInsert();
6248 
6249 		/* We want the same filtering on this as on a plain insert */
6250 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
6251 
6252 		XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
6253 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6254 
6255 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
6256 
6257 		PageSetLSN(page, recptr);
6258 	}
6259 
6260 	END_CRIT_SECTION();
6261 
6262 	UnlockReleaseBuffer(buffer);
6263 }
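/*
 * Illustrative sketch (not part of the original source): seen from the
 * caller's side (the executor's INSERT ... ON CONFLICT path), the
 * speculative-insertion protocol implemented by heap_finish_speculative
 * above and heap_abort_speculative below runs roughly as follows:
 *
 *		1. insert the tuple with a speculative token stored in t_ctid;
 *		2. re-check for a conflicting row;
 *		3. if no conflict was found:
 *				heap_finish_speculative(rel, tuple);	// token -> real ctid
 *		   otherwise:
 *				heap_abort_speculative(rel, tuple);		// mark tuple dead
 *
 * The exact call sites and surrounding locking are simplified here; see the
 * executor code for the authoritative sequence.
 */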
6264 
6265 /*
6266  *	heap_abort_speculative - kill a speculatively inserted tuple
6267  *
6268  * Marks a tuple that was speculatively inserted in the same command as dead,
6269  * by setting its xmin as invalid.  That makes it immediately appear as dead
6270  * to all transactions, including our own.  In particular, it makes
6271  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6272  * inserting a duplicate key value won't unnecessarily wait for our whole
6273  * transaction to finish (it'll just wait for our speculative insertion to
6274  * finish).
6275  *
6276  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6277  * that arise due to a mutual dependency that is not user visible.  By
6278  * definition, unprincipled deadlocks cannot be prevented by the user
6279  * reordering lock acquisition in client code, because the implementation level
6280  * lock acquisitions are not under the user's direct control.  If speculative
6281  * inserters did not take this precaution, then under high concurrency they
6282  * could deadlock with each other, which would not be acceptable.
6283  *
6284  * This is somewhat redundant with heap_delete, but we prefer to have a
6285  * dedicated routine with stripped down requirements.  Note that this is also
6286  * used to delete the TOAST tuples created during speculative insertion.
6287  *
6288  * This routine does not affect logical decoding as it only looks at
6289  * confirmation records.
6290  */
6291 void
6292 heap_abort_speculative(Relation relation, HeapTuple tuple)
6293 {
6294 	TransactionId xid = GetCurrentTransactionId();
6295 	ItemPointer tid = &(tuple->t_self);
6296 	ItemId		lp;
6297 	HeapTupleData tp;
6298 	Page		page;
6299 	BlockNumber block;
6300 	Buffer		buffer;
6301 	TransactionId prune_xid;
6302 
6303 	Assert(ItemPointerIsValid(tid));
6304 
6305 	block = ItemPointerGetBlockNumber(tid);
6306 	buffer = ReadBuffer(relation, block);
6307 	page = BufferGetPage(buffer);
6308 
6309 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6310 
6311 	/*
6312 	 * Page can't be all visible, we just inserted into it, and are still
6313 	 * running.
6314 	 */
6315 	Assert(!PageIsAllVisible(page));
6316 
6317 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
6318 	Assert(ItemIdIsNormal(lp));
6319 
6320 	tp.t_tableOid = RelationGetRelid(relation);
6321 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6322 	tp.t_len = ItemIdGetLength(lp);
6323 	tp.t_self = *tid;
6324 
6325 	/*
6326 	 * Sanity check that the tuple really is a speculatively inserted tuple,
6327 	 * inserted by us.
6328 	 */
6329 	if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6330 		elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6331 	if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6332 		elog(ERROR, "attempted to kill a non-speculative tuple");
6333 	Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
6334 
6335 	/*
6336 	 * No need to check for serializable conflicts here.  There is never a
6337 	 * need for a combocid, either.  No need to extract replica identity, or
6338 	 * do anything special with infomask bits.
6339 	 */
6340 
6341 	START_CRIT_SECTION();
6342 
6343 	/*
6344 	 * The tuple will become DEAD immediately.  Flag that this page is a
6345 	 * candidate for pruning by setting xmin to TransactionXmin. While not
6346 	 * immediately prunable, it is the oldest xid we can cheaply determine
6347 	 * that's safe against wraparound / being older than the table's
6348 	 * relfrozenxid.  To defend against the unlikely case of a new relation
6349 	 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6350 	 * if so (vacuum can't subsequently move relfrozenxid to beyond
6351 	 * TransactionXmin, so there's no race here).
6352 	 */
6353 	Assert(TransactionIdIsValid(TransactionXmin));
6354 	if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
6355 		prune_xid = relation->rd_rel->relfrozenxid;
6356 	else
6357 		prune_xid = TransactionXmin;
6358 	PageSetPrunable(page, prune_xid);
6359 
6360 	/* store transaction information of xact deleting the tuple */
6361 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
6362 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6363 
6364 	/*
6365 	 * Set the tuple header xmin to InvalidTransactionId.  This makes the
6366 	 * tuple immediately invisible to everyone.  (In particular, to any
6367 	 * transactions waiting on the speculative token, woken up later.)
6368 	 */
6369 	HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
6370 
6371 	/* Clear the speculative insertion token too */
6372 	tp.t_data->t_ctid = tp.t_self;
6373 
6374 	MarkBufferDirty(buffer);
6375 
6376 	/*
6377 	 * XLOG stuff
6378 	 *
6379 	 * The WAL records generated here match heap_delete().  The same recovery
6380 	 * routines are used.
6381 	 */
6382 	if (RelationNeedsWAL(relation))
6383 	{
6384 		xl_heap_delete xlrec;
6385 		XLogRecPtr	recptr;
6386 
6387 		xlrec.flags = XLH_DELETE_IS_SUPER;
6388 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6389 											  tp.t_data->t_infomask2);
6390 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
6391 		xlrec.xmax = xid;
6392 
6393 		XLogBeginInsert();
6394 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
6395 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6396 
6397 		/* No replica identity & replication origin logged */
6398 
6399 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
6400 
6401 		PageSetLSN(page, recptr);
6402 	}
6403 
6404 	END_CRIT_SECTION();
6405 
6406 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6407 
6408 	if (HeapTupleHasExternal(&tp))
6409 	{
6410 		Assert(!IsToastRelation(relation));
6411 		toast_delete(relation, &tp, true);
6412 	}
6413 
6414 	/*
6415 	 * Never need to mark tuple for invalidation, since catalogs don't support
6416 	 * speculative insertion
6417 	 */
6418 
6419 	/* Now we can release the buffer */
6420 	ReleaseBuffer(buffer);
6421 
6422 	/* count deletion, as we counted the insertion too */
6423 	pgstat_count_heap_delete(relation);
6424 }
6425 
6426 /*
6427  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
6428  *
6429  * Overwriting violates both MVCC and transactional safety, so the uses
6430  * of this function in Postgres are extremely limited.  Nonetheless we
6431  * find some places to use it.
6432  *
6433  * The tuple cannot change size, and therefore it's reasonable to assume
6434  * that its null bitmap (if any) doesn't change either.  So we just
6435  * overwrite the data portion of the tuple without touching the null
6436  * bitmap or any of the header fields.
6437  *
6438  * tuple is an in-memory tuple structure containing the data to be written
6439  * over the target tuple.  Also, tuple->t_self identifies the target tuple.
6440  */
6441 void
6442 heap_inplace_update(Relation relation, HeapTuple tuple)
6443 {
6444 	Buffer		buffer;
6445 	Page		page;
6446 	OffsetNumber offnum;
6447 	ItemId		lp = NULL;
6448 	HeapTupleHeader htup;
6449 	uint32		oldlen;
6450 	uint32		newlen;
6451 
6452 	/*
6453 	 * For now, parallel operations are required to be strictly read-only.
6454 	 * Unlike a regular update, this should never create a combo CID, so it
6455 	 * might be possible to relax this restriction, but not without more
6456 	 * thought and testing.  It's not clear that it would be useful, anyway.
6457 	 */
6458 	if (IsInParallelMode())
6459 		ereport(ERROR,
6460 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
6461 				 errmsg("cannot update tuples during a parallel operation")));
6462 
6463 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6464 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6465 	page = (Page) BufferGetPage(buffer);
6466 
6467 	offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6468 	if (PageGetMaxOffsetNumber(page) >= offnum)
6469 		lp = PageGetItemId(page, offnum);
6470 
6471 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6472 		elog(ERROR, "invalid lp");
6473 
6474 	htup = (HeapTupleHeader) PageGetItem(page, lp);
6475 
6476 	oldlen = ItemIdGetLength(lp) - htup->t_hoff;
6477 	newlen = tuple->t_len - tuple->t_data->t_hoff;
6478 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6479 		elog(ERROR, "wrong tuple length");
6480 
6481 	/* NO EREPORT(ERROR) from here till changes are logged */
6482 	START_CRIT_SECTION();
6483 
6484 	memcpy((char *) htup + htup->t_hoff,
6485 		   (char *) tuple->t_data + tuple->t_data->t_hoff,
6486 		   newlen);
6487 
6488 	MarkBufferDirty(buffer);
6489 
6490 	/* XLOG stuff */
6491 	if (RelationNeedsWAL(relation))
6492 	{
6493 		xl_heap_inplace xlrec;
6494 		XLogRecPtr	recptr;
6495 
6496 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6497 
6498 		XLogBeginInsert();
6499 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
6500 
6501 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6502 		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6503 
6504 		/* inplace updates aren't decoded atm, don't log the origin */
6505 
6506 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6507 
6508 		PageSetLSN(page, recptr);
6509 	}
6510 
6511 	END_CRIT_SECTION();
6512 
6513 	UnlockReleaseBuffer(buffer);
6514 
6515 	/*
6516 	 * Send out shared cache inval if necessary.  Note that because we only
6517 	 * pass the new version of the tuple, this mustn't be used for any
6518 	 * operations that could change catcache lookup keys.  But we aren't
6519 	 * bothering with index updates either, so that's true a fortiori.
6520 	 */
6521 	if (!IsBootstrapProcessingMode())
6522 		CacheInvalidateHeapTuple(relation, tuple, NULL);
6523 }
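/*
 * Illustrative sketch (not part of the original source): a classic user of
 * heap_inplace_update is VACUUM, which overwrites fixed-width statistics
 * fields of a pg_class row without changing the tuple's length or its
 * xmin/xmax.  Conceptually (names abbreviated):
 *
 *		ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
 *		pgcform = (Form_pg_class) GETSTRUCT(ctup);
 *		pgcform->relpages = num_pages;		// same-size, in-place change
 *		pgcform->reltuples = num_tuples;
 *		heap_inplace_update(pg_class_rel, ctup);
 *
 * Because the tuple length cannot change, the constraints described in the
 * comment above are satisfied.
 */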
6524 
6525 #define		FRM_NOOP				0x0001
6526 #define		FRM_INVALIDATE_XMAX		0x0002
6527 #define		FRM_RETURN_IS_XID		0x0004
6528 #define		FRM_RETURN_IS_MULTI		0x0008
6529 #define		FRM_MARK_COMMITTED		0x0010
6530 
6531 /*
6532  * FreezeMultiXactId
6533  *		Determine what to do during freezing when a tuple is marked by a
6534  *		MultiXactId.
6535  *
6536  * NB -- this might have the side-effect of creating a new MultiXactId!
6537  *
6538  * "flags" is an output value; it's used to tell caller what to do on return.
6539  * Possible flags are:
6540  * FRM_NOOP
6541  *		don't do anything -- keep existing Xmax
6542  * FRM_INVALIDATE_XMAX
6543  *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6544  * FRM_RETURN_IS_XID
6545  *		The Xid return value is a single update Xid to set as xmax.
6546  * FRM_MARK_COMMITTED
6547  *		Xmax can be marked as HEAP_XMAX_COMMITTED
6548  * FRM_RETURN_IS_MULTI
6549  *		The return value is a new MultiXactId to set as new Xmax.
6550  *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6551  */
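/*
 * Illustrative sketch (not part of the original source): the caller is
 * expected to dispatch on the returned flags roughly the way
 * heap_prepare_freeze_tuple below does:
 *
 *		newxmax = FreezeMultiXactId(xid, t_infomask, relfrozenxid,
 *									relminmxid, cutoff_xid, cutoff_multi,
 *									&flags);
 *		if (flags & FRM_INVALIDATE_XMAX)
 *			-> xmax is removable; mark it HEAP_XMAX_INVALID
 *		else if (flags & FRM_RETURN_IS_XID)
 *			-> store newxmax as a plain updater Xid
 *			   (plus HEAP_XMAX_COMMITTED if FRM_MARK_COMMITTED is also set)
 *		else if (flags & FRM_RETURN_IS_MULTI)
 *			-> store newxmax as a new MultiXactId and recompute the hint
 *			   bits with GetMultiXactIdHintBits
 *		else (FRM_NOOP)
 *			-> keep the existing Xmax untouched
 */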
6552 static TransactionId
6553 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6554 				  TransactionId relfrozenxid, TransactionId relminmxid,
6555 				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
6556 				  uint16 *flags)
6557 {
6558 	TransactionId xid = InvalidTransactionId;
6559 	int			i;
6560 	MultiXactMember *members;
6561 	int			nmembers;
6562 	bool		need_replace;
6563 	int			nnewmembers;
6564 	MultiXactMember *newmembers;
6565 	bool		has_lockers;
6566 	TransactionId update_xid;
6567 	bool		update_committed;
6568 
6569 	*flags = 0;
6570 
6571 	/* We should only be called in Multis */
6572 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6573 
6574 	if (!MultiXactIdIsValid(multi) ||
6575 		HEAP_LOCKED_UPGRADED(t_infomask))
6576 	{
6577 		/* Ensure infomask bits are appropriately set/reset */
6578 		*flags |= FRM_INVALIDATE_XMAX;
6579 		return InvalidTransactionId;
6580 	}
6581 	else if (MultiXactIdPrecedes(multi, relminmxid))
6582 		ereport(ERROR,
6583 				(errcode(ERRCODE_DATA_CORRUPTED),
6584 				 errmsg_internal("found multixact %u from before relminmxid %u",
6585 								 multi, relminmxid)));
6586 	else if (MultiXactIdPrecedes(multi, cutoff_multi))
6587 	{
6588 		/*
6589 		 * This old multi cannot possibly have members still running, but
6590 		 * verify just in case.  If it was a locker only, it can be removed
6591 		 * without any further consideration; but if it contained an update,
6592 		 * we might need to preserve it.
6593 		 */
6594 		if (MultiXactIdIsRunning(multi,
6595 								 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6596 			ereport(ERROR,
6597 					(errcode(ERRCODE_DATA_CORRUPTED),
6598 					 errmsg_internal("multixact %u from before cutoff %u found to be still running",
6599 									 multi, cutoff_multi)));
6600 
6601 		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6602 		{
6603 			*flags |= FRM_INVALIDATE_XMAX;
6604 			xid = InvalidTransactionId; /* not strictly necessary */
6605 		}
6606 		else
6607 		{
6608 			/* replace multi by update xid */
6609 			xid = MultiXactIdGetUpdateXid(multi, t_infomask);
6610 
6611 			/* wasn't only a lock, xid needs to be valid */
6612 			Assert(TransactionIdIsValid(xid));
6613 
6614 			if (TransactionIdPrecedes(xid, relfrozenxid))
6615 				ereport(ERROR,
6616 						(errcode(ERRCODE_DATA_CORRUPTED),
6617 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
6618 										 xid, relfrozenxid)));
6619 
6620 			/*
6621 			 * If the xid is older than the cutoff, it has to have aborted,
6622 			 * otherwise the tuple would have gotten pruned away.
6623 			 */
6624 			if (TransactionIdPrecedes(xid, cutoff_xid))
6625 			{
6626 				if (TransactionIdDidCommit(xid))
6627 					ereport(ERROR,
6628 							(errcode(ERRCODE_DATA_CORRUPTED),
6629 							 errmsg_internal("cannot freeze committed update xid %u", xid)));
6630 				*flags |= FRM_INVALIDATE_XMAX;
6631 				xid = InvalidTransactionId; /* not strictly necessary */
6632 			}
6633 			else
6634 			{
6635 				*flags |= FRM_RETURN_IS_XID;
6636 			}
6637 		}
6638 
6639 		return xid;
6640 	}
6641 
6642 	/*
6643 	 * This multixact might or might not have members still running, but
6644 	 * we know it's valid and is newer than the cutoff point for multis.
6645 	 * However, some member(s) of it may be below the cutoff for Xids, so we
6646 	 * need to walk the whole members array to figure out what to do, if
6647 	 * anything.
6648 	 */
6649 
6650 	nmembers =
6651 		GetMultiXactIdMembers(multi, &members, false,
6652 							  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6653 	if (nmembers <= 0)
6654 	{
6655 		/* Nothing worth keeping */
6656 		*flags |= FRM_INVALIDATE_XMAX;
6657 		return InvalidTransactionId;
6658 	}
6659 
6660 	/* is there anything older than the cutoff? */
6661 	need_replace = false;
6662 	for (i = 0; i < nmembers; i++)
6663 	{
6664 		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6665 		{
6666 			need_replace = true;
6667 			break;
6668 		}
6669 	}
6670 
6671 	/*
6672 	 * In the simplest case, there is no member older than the cutoff; we can
6673 	 * keep the existing MultiXactId as is.
6674 	 */
6675 	if (!need_replace)
6676 	{
6677 		*flags |= FRM_NOOP;
6678 		pfree(members);
6679 		return InvalidTransactionId;
6680 	}
6681 
6682 	/*
6683 	 * If the multi needs to be updated, figure out which members we need to
6684 	 * keep.
6685 	 */
6686 	nnewmembers = 0;
6687 	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6688 	has_lockers = false;
6689 	update_xid = InvalidTransactionId;
6690 	update_committed = false;
6691 
6692 	for (i = 0; i < nmembers; i++)
6693 	{
6694 		/*
6695 		 * Determine whether to keep this member or ignore it.
6696 		 */
6697 		if (ISUPDATE_from_mxstatus(members[i].status))
6698 		{
6699 			TransactionId xid = members[i].xid;
6700 
6701 			Assert(TransactionIdIsValid(xid));
6702 			if (TransactionIdPrecedes(xid, relfrozenxid))
6703 				ereport(ERROR,
6704 						(errcode(ERRCODE_DATA_CORRUPTED),
6705 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
6706 										 xid, relfrozenxid)));
6707 
6708 			/*
6709 			 * It's an update; should we keep it?  If the transaction is known
6710 			 * aborted or crashed then it's okay to ignore it, otherwise not.
6711 			 * Note that an updater older than cutoff_xid cannot possibly be
6712 			 * committed, because HeapTupleSatisfiesVacuum would have returned
6713 			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
6714 			 *
6715 			 * As with all tuple visibility routines, it's critical to test
6716 			 * TransactionIdIsInProgress before TransactionIdDidCommit,
6717 			 * because of race conditions explained in detail in tqual.c.
6718 			 */
6719 			if (TransactionIdIsCurrentTransactionId(xid) ||
6720 				TransactionIdIsInProgress(xid))
6721 			{
6722 				Assert(!TransactionIdIsValid(update_xid));
6723 				update_xid = xid;
6724 			}
6725 			else if (TransactionIdDidCommit(xid))
6726 			{
6727 				/*
6728 				 * The transaction committed, so we can tell caller to set
6729 				 * HEAP_XMAX_COMMITTED.  (We can only do this because we know
6730 				 * the transaction is not running.)
6731 				 */
6732 				Assert(!TransactionIdIsValid(update_xid));
6733 				update_committed = true;
6734 				update_xid = xid;
6735 			}
6736 			else
6737 			{
6738 				/*
6739 				 * Not in progress, not committed -- must be aborted or
6740 				 * crashed; we can ignore it.
6741 				 */
6742 			}
6743 
6744 			/*
6745 			 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6746 			 * update Xid cannot possibly be older than the xid cutoff. The
6747 			 * presence of such a tuple would cause corruption, so be paranoid
6748 			 * and check.
6749 			 */
6750 			if (TransactionIdIsValid(update_xid) &&
6751 				TransactionIdPrecedes(update_xid, cutoff_xid))
6752 				ereport(ERROR,
6753 						(errcode(ERRCODE_DATA_CORRUPTED),
6754 						 errmsg_internal("found update xid %u from before xid cutoff %u",
6755 										 update_xid, cutoff_xid)));
6756 
6757 			/*
6758 			 * If we determined that it's an Xid corresponding to an update
6759 			 * that must be retained, additionally add it to the list of
6760 			 * members of the new Multi, in case we end up using that.  (We
6761 			 * might still decide to use only an update Xid and not a multi,
6762 			 * but it's easier to maintain the list as we walk the old members
6763 			 * list.)
6764 			 */
6765 			if (TransactionIdIsValid(update_xid))
6766 				newmembers[nnewmembers++] = members[i];
6767 		}
6768 		else
6769 		{
6770 			/* We only keep lockers if they are still running */
6771 			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6772 				TransactionIdIsInProgress(members[i].xid))
6773 			{
6774 				/* running locker cannot possibly be older than the cutoff */
6775 				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6776 				newmembers[nnewmembers++] = members[i];
6777 				has_lockers = true;
6778 			}
6779 		}
6780 	}
6781 
6782 	pfree(members);
6783 
6784 	if (nnewmembers == 0)
6785 	{
6786 		/* nothing worth keeping!? Tell caller to remove the whole thing */
6787 		*flags |= FRM_INVALIDATE_XMAX;
6788 		xid = InvalidTransactionId;
6789 	}
6790 	else if (TransactionIdIsValid(update_xid) && !has_lockers)
6791 	{
6792 		/*
6793 		 * If there's a single member and it's an update, pass it back alone
6794 		 * without creating a new Multi.  (XXX we could do this when there's a
6795 		 * single remaining locker, too, but that would complicate the API too
6796 		 * much; moreover, the case with the single updater is more
6797 		 * interesting, because those are longer-lived.)
6798 		 */
6799 		Assert(nnewmembers == 1);
6800 		*flags |= FRM_RETURN_IS_XID;
6801 		if (update_committed)
6802 			*flags |= FRM_MARK_COMMITTED;
6803 		xid = update_xid;
6804 	}
6805 	else
6806 	{
6807 		/*
6808 		 * Create a new multixact with the surviving members of the previous
6809 		 * one, to set as new Xmax in the tuple.
6810 		 */
6811 		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6812 		*flags |= FRM_RETURN_IS_MULTI;
6813 	}
6814 
6815 	pfree(newmembers);
6816 
6817 	return xid;
6818 }
6819 
6820 /*
6821  * heap_prepare_freeze_tuple
6822  *
6823  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6824  * are older than the specified cutoff XID and cutoff MultiXactId.  If so,
6825  * set up enough state (in the *frz output argument) to later execute and
6826  * WAL-log what we would need to do, and return true.  Return false if nothing
6827  * is to be changed.  In addition, set *totally_frozen_p to true if the tuple
6828  * will be totally frozen after these operations are performed and false if
6829  * more freezing will eventually be required.
6830  *
6831  * Caller is responsible for setting the offset field, if appropriate.
6832  *
6833  * It is assumed that the caller has checked the tuple with
6834  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6835  * (else we should be removing the tuple, not freezing it).
6836  *
6837  * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6838  * XID older than it could neither be running nor seen as running by any
6839  * open transaction.  This ensures that the replacement will not change
6840  * anyone's idea of the tuple state.
6841  * Similarly, cutoff_multi must be less than or equal to the smallest
6842  * MultiXactId used by any transaction currently open.
6843  *
6844  * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6845  * that buffer.
6846  *
6847  * NB: It is not enough to set hint bits to indicate something is
6848  * committed/invalid -- they might not be set on a standby, or after crash
6849  * recovery.  We really need to remove old xids.
6850  */
6851 bool
6852 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6853 						  TransactionId relfrozenxid, TransactionId relminmxid,
6854 						  TransactionId cutoff_xid, TransactionId cutoff_multi,
6855 						  xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6856 {
6857 	bool		changed = false;
6858 	bool		xmax_already_frozen = false;
6859 	bool		xmin_frozen;
6860 	bool		freeze_xmax;
6861 	TransactionId xid;
6862 
6863 	frz->frzflags = 0;
6864 	frz->t_infomask2 = tuple->t_infomask2;
6865 	frz->t_infomask = tuple->t_infomask;
6866 	frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6867 
6868 	/* Process xmin */
6869 	xid = HeapTupleHeaderGetXmin(tuple);
6870 	xmin_frozen = ((xid == FrozenTransactionId) ||
6871 				   HeapTupleHeaderXminFrozen(tuple));
6872 	if (TransactionIdIsNormal(xid))
6873 	{
6874 		if (TransactionIdPrecedes(xid, relfrozenxid))
6875 			ereport(ERROR,
6876 					(errcode(ERRCODE_DATA_CORRUPTED),
6877 					 errmsg_internal("found xmin %u from before relfrozenxid %u",
6878 									 xid, relfrozenxid)));
6879 
6880 		if (TransactionIdPrecedes(xid, cutoff_xid))
6881 		{
6882 			if (!TransactionIdDidCommit(xid))
6883 				ereport(ERROR,
6884 						(errcode(ERRCODE_DATA_CORRUPTED),
6885 						 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6886 										 xid, cutoff_xid)));
6887 
6888 			frz->t_infomask |= HEAP_XMIN_FROZEN;
6889 			changed = true;
6890 			xmin_frozen = true;
6891 		}
6892 	}
6893 
6894 	/*
6895 	 * Process xmax.  To thoroughly examine the current Xmax value we need to
6896 	 * resolve a MultiXactId to its member Xids, in case some of them are
6897 	 * below the given cutoff for Xids.  In that case, those values might need
6898 	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
6899 	 * it out --- if there's a live updater Xid, it needs to be kept.
6900 	 *
6901 	 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6902 	 */
6903 	xid = HeapTupleHeaderGetRawXmax(tuple);
6904 
6905 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6906 	{
6907 		TransactionId newxmax;
6908 		uint16		flags;
6909 
6910 		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6911 									relfrozenxid, relminmxid,
6912 									cutoff_xid, cutoff_multi, &flags);
6913 
6914 		freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6915 
6916 		if (flags & FRM_RETURN_IS_XID)
6917 		{
6918 			/*
6919 			 * NB -- some of these transformations are only valid because we
6920 			 * know the return Xid is a tuple updater (i.e. not merely a
6921 			 * locker.) Also note that the only reason we don't explicitly
6922 			 * worry about HEAP_KEYS_UPDATED is that it lives in
6923 			 * t_infomask2 rather than t_infomask.
6924 			 */
6925 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6926 			frz->xmax = newxmax;
6927 			if (flags & FRM_MARK_COMMITTED)
6928 				frz->t_infomask |= HEAP_XMAX_COMMITTED;
6929 			changed = true;
6930 		}
6931 		else if (flags & FRM_RETURN_IS_MULTI)
6932 		{
6933 			uint16		newbits;
6934 			uint16		newbits2;
6935 
6936 			/*
6937 			 * We can't use GetMultiXactIdHintBits directly on the new multi
6938 			 * here; that routine initializes the masks to all zeroes, which
6939 			 * would lose other bits we need.  Doing it this way ensures all
6940 			 * unrelated bits remain untouched.
6941 			 */
6942 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6943 			frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6944 			GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6945 			frz->t_infomask |= newbits;
6946 			frz->t_infomask2 |= newbits2;
6947 
6948 			frz->xmax = newxmax;
6949 
6950 			changed = true;
6951 		}
6952 	}
6953 	else if (TransactionIdIsNormal(xid))
6954 	{
6955 		if (TransactionIdPrecedes(xid, relfrozenxid))
6956 			ereport(ERROR,
6957 					(errcode(ERRCODE_DATA_CORRUPTED),
6958 					 errmsg_internal("found xmax %u from before relfrozenxid %u",
6959 									 xid, relfrozenxid)));
6960 
6961 		if (TransactionIdPrecedes(xid, cutoff_xid))
6962 		{
6963 			/*
6964 			 * If we freeze xmax, make absolutely sure that it's not an XID
6965 			 * that is important.  (Note, a lock-only xmax can be removed
6966 			 * independent of committedness, since a committed lock holder has
6967 			 * released the lock).
6968 			 */
6969 			if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6970 				TransactionIdDidCommit(xid))
6971 				ereport(ERROR,
6972 						(errcode(ERRCODE_DATA_CORRUPTED),
6973 						 errmsg_internal("cannot freeze committed xmax %u",
6974 										 xid)));
6975 			freeze_xmax = true;
6976 		}
6977 		else
6978 			freeze_xmax = false;
6979 	}
6980 	else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6981 			 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6982 	{
6983 		freeze_xmax = false;
6984 		xmax_already_frozen = true;
6985 	}
6986 	else
6987 		ereport(ERROR,
6988 				(errcode(ERRCODE_DATA_CORRUPTED),
6989 				 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6990 								 xid, tuple->t_infomask)));
6991 
6992 	if (freeze_xmax)
6993 	{
6994 		Assert(!xmax_already_frozen);
6995 
6996 		frz->xmax = InvalidTransactionId;
6997 
6998 		/*
6999 		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7000 		 * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
7001 		 * Also get rid of the HEAP_KEYS_UPDATED bit.
7002 		 */
7003 		frz->t_infomask &= ~HEAP_XMAX_BITS;
7004 		frz->t_infomask |= HEAP_XMAX_INVALID;
7005 		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7006 		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7007 		changed = true;
7008 	}
7009 
7010 	/*
7011 	 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
7012 	 * we support having MOVED_OFF/MOVED_IN tuples in the database.
7013 	 */
7014 	if (tuple->t_infomask & HEAP_MOVED)
7015 	{
7016 		xid = HeapTupleHeaderGetXvac(tuple);
7017 
7018 		/*
7019 		 * For Xvac, we ignore the cutoff_xid and just always perform the
7020 		 * freeze operation.  The oldest release in which such a value can
7021 		 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
7022 		 * was removed in PostgreSQL 9.0.  Note that if we were to respect
7023 		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
7024 		 * when we skipped freezing on that basis.
7025 		 */
7026 		if (TransactionIdIsNormal(xid))
7027 		{
7028 			/*
7029 			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
7030 			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
7031 			 * xvac transaction succeeded.
7032 			 */
7033 			if (tuple->t_infomask & HEAP_MOVED_OFF)
7034 				frz->frzflags |= XLH_INVALID_XVAC;
7035 			else
7036 				frz->frzflags |= XLH_FREEZE_XVAC;
7037 
7038 			/*
7039 			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
7040 			 * will already be set here, but there's a small chance not.
7041 			 */
7042 			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
7043 			frz->t_infomask |= HEAP_XMIN_COMMITTED;
7044 			changed = true;
7045 		}
7046 	}
7047 
7048 	*totally_frozen_p = (xmin_frozen &&
7049 						 (freeze_xmax || xmax_already_frozen));
7050 	return changed;
7051 }
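/*
 * Illustrative sketch (not part of the original source): a WAL-logging
 * caller such as lazy VACUUM typically batches the freeze work for a page,
 * roughly like this (fetch_tuple_at() is a placeholder for looking up the
 * tuple header at the recorded offset):
 *
 *		nfrozen = 0;
 *		for each tuple on the page:
 *			if (heap_prepare_freeze_tuple(tupleheader, relfrozenxid,
 *										  relminmxid, cutoff_xid, cutoff_multi,
 *										  &frozen[nfrozen], &totally_frozen))
 *				frozen[nfrozen++].offset = offnum;
 *
 *		if (nfrozen > 0)
 *		{
 *			START_CRIT_SECTION();
 *			for (i = 0; i < nfrozen; i++)
 *				heap_execute_freeze_tuple(fetch_tuple_at(frozen[i].offset),
 *										  &frozen[i]);
 *			MarkBufferDirty(buf);
 *			if (RelationNeedsWAL(rel))
 *				log_heap_freeze(rel, buf, cutoff_xid, frozen, nfrozen);
 *			END_CRIT_SECTION();
 *		}
 *
 * heap_freeze_tuple below shows the simpler, non-WAL-logged combination.
 */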
7052 
7053 /*
7054  * heap_execute_freeze_tuple
7055  *		Execute the prepared freezing of a tuple.
7056  *
7057  * Caller is responsible for ensuring that no other backend can access the
7058  * storage underlying this tuple, either by holding an exclusive lock on the
7059  * buffer containing it (which is what lazy VACUUM does), or by having it be
7060  * in private storage (which is what CLUSTER and friends do).
7061  *
7062  * Note: it might seem we could make the changes without exclusive lock, since
7063  * TransactionId read/write is assumed atomic anyway.  However there is a race
7064  * condition: someone who just fetched an old XID that we overwrite here could
7065  * conceivably not finish checking the XID against pg_xact before we finish
7066  * the VACUUM and perhaps truncate off the part of pg_xact he needs.  Getting
7067  * exclusive lock ensures no other backend is in process of checking the
7068  * tuple status.  Also, getting exclusive lock makes it safe to adjust the
7069  * infomask bits.
7070  *
7071  * NB: All code in here must be safe to execute during crash recovery!
7072  */
7073 void
7074 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
7075 {
7076 	HeapTupleHeaderSetXmax(tuple, frz->xmax);
7077 
7078 	if (frz->frzflags & XLH_FREEZE_XVAC)
7079 		HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
7080 
7081 	if (frz->frzflags & XLH_INVALID_XVAC)
7082 		HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
7083 
7084 	tuple->t_infomask = frz->t_infomask;
7085 	tuple->t_infomask2 = frz->t_infomask2;
7086 }
7087 
7088 /*
7089  * heap_freeze_tuple
7090  *		Freeze tuple in place, without WAL logging.
7091  *
7092  * Useful for callers like CLUSTER that perform their own WAL logging.
7093  */
7094 bool
7095 heap_freeze_tuple(HeapTupleHeader tuple,
7096 				  TransactionId relfrozenxid, TransactionId relminmxid,
7097 				  TransactionId cutoff_xid, TransactionId cutoff_multi)
7098 {
7099 	xl_heap_freeze_tuple frz;
7100 	bool		do_freeze;
7101 	bool		tuple_totally_frozen;
7102 
7103 	do_freeze = heap_prepare_freeze_tuple(tuple,
7104 										  relfrozenxid, relminmxid,
7105 										  cutoff_xid, cutoff_multi,
7106 										  &frz, &tuple_totally_frozen);
7107 
7108 	/*
7109 	 * Note that because this is not a WAL-logged operation, we don't need to
7110 	 * fill in the offset in the freeze record.
7111 	 */
7112 
7113 	if (do_freeze)
7114 		heap_execute_freeze_tuple(tuple, &frz);
7115 	return do_freeze;
7116 }
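/*
 * Illustrative note (not part of the original source): CLUSTER and VACUUM
 * FULL reach heap_freeze_tuple through the table-rewrite machinery, which
 * calls it on each tuple copied into the new heap, approximately:
 *
 *		heap_freeze_tuple(newtup->t_data, relfrozenxid, relminmxid,
 *						  freeze_xid, freeze_multi);
 *
 * (Argument names are placeholders.)  Any WAL logging then happens as part
 * of writing the new heap, not inside this call, which is why the offset
 * field of the freeze record is left unset here.
 */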
7117 
7118 /*
7119  * For a given MultiXactId, return the hint bits that should be set in the
7120  * tuple's infomask.
7121  *
7122  * Normally this should be called for a multixact that was just created, and
7123  * so is on our local cache, so the GetMembers call is fast.
7124  */
7125 static void
7126 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
7127 					   uint16 *new_infomask2)
7128 {
7129 	int			nmembers;
7130 	MultiXactMember *members;
7131 	int			i;
7132 	uint16		bits = HEAP_XMAX_IS_MULTI;
7133 	uint16		bits2 = 0;
7134 	bool		has_update = false;
7135 	LockTupleMode strongest = LockTupleKeyShare;
7136 
7137 	/*
7138 	 * We only use this in multis we just created, so they cannot be values
7139 	 * pre-pg_upgrade.
7140 	 */
7141 	nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7142 
7143 	for (i = 0; i < nmembers; i++)
7144 	{
7145 		LockTupleMode mode;
7146 
7147 		/*
7148 		 * Remember the strongest lock mode held by any member of the
7149 		 * multixact.
7150 		 */
7151 		mode = TUPLOCK_from_mxstatus(members[i].status);
7152 		if (mode > strongest)
7153 			strongest = mode;
7154 
7155 		/* See what other bits we need */
7156 		switch (members[i].status)
7157 		{
7158 			case MultiXactStatusForKeyShare:
7159 			case MultiXactStatusForShare:
7160 			case MultiXactStatusForNoKeyUpdate:
7161 				break;
7162 
7163 			case MultiXactStatusForUpdate:
7164 				bits2 |= HEAP_KEYS_UPDATED;
7165 				break;
7166 
7167 			case MultiXactStatusNoKeyUpdate:
7168 				has_update = true;
7169 				break;
7170 
7171 			case MultiXactStatusUpdate:
7172 				bits2 |= HEAP_KEYS_UPDATED;
7173 				has_update = true;
7174 				break;
7175 		}
7176 	}
7177 
7178 	if (strongest == LockTupleExclusive ||
7179 		strongest == LockTupleNoKeyExclusive)
7180 		bits |= HEAP_XMAX_EXCL_LOCK;
7181 	else if (strongest == LockTupleShare)
7182 		bits |= HEAP_XMAX_SHR_LOCK;
7183 	else if (strongest == LockTupleKeyShare)
7184 		bits |= HEAP_XMAX_KEYSHR_LOCK;
7185 
7186 	if (!has_update)
7187 		bits |= HEAP_XMAX_LOCK_ONLY;
7188 
7189 	if (nmembers > 0)
7190 		pfree(members);
7191 
7192 	*new_infomask = bits;
7193 	*new_infomask2 = bits2;
7194 }
7195 
7196 /*
7197  * MultiXactIdGetUpdateXid
7198  *
7199  * Given a multixact Xmax and corresponding infomask, which does not have the
7200  * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7201  * transaction.
7202  *
7203  * Caller is expected to check the status of the updating transaction, if
7204  * necessary.
7205  */
7206 static TransactionId
7207 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
7208 {
7209 	TransactionId update_xact = InvalidTransactionId;
7210 	MultiXactMember *members;
7211 	int			nmembers;
7212 
7213 	Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7214 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7215 
7216 	/*
7217 	 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7218 	 * pre-pg_upgrade.
7219 	 */
7220 	nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7221 
7222 	if (nmembers > 0)
7223 	{
7224 		int			i;
7225 
7226 		for (i = 0; i < nmembers; i++)
7227 		{
7228 			/* Ignore lockers */
7229 			if (!ISUPDATE_from_mxstatus(members[i].status))
7230 				continue;
7231 
7232 			/* there can be at most one updater */
7233 			Assert(update_xact == InvalidTransactionId);
7234 			update_xact = members[i].xid;
7235 #ifndef USE_ASSERT_CHECKING
7236 
7237 			/*
7238 			 * in an assert-enabled build, walk the whole array to ensure
7239 			 * there's no other updater.
7240 			 */
7241 			break;
7242 #endif
7243 		}
7244 
7245 		pfree(members);
7246 	}
7247 
7248 	return update_xact;
7249 }
7250 
7251 /*
7252  * HeapTupleGetUpdateXid
7253  *		As above, but use a HeapTupleHeader
7254  *
7255  * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7256  * checking the hint bits.
7257  */
7258 TransactionId
7259 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
7260 {
7261 	return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
7262 								   tuple->t_infomask);
7263 }
7264 
7265 /*
7266  * Does the given multixact conflict with the current transaction grabbing a
7267  * tuple lock of the given strength?
7268  *
7269  * The passed infomask pairs up with the given multixact in the tuple header.
7270  *
7271  * If current_is_member is not NULL, it is set to 'true' if the current
7272  * transaction is a member of the given multixact.
7273  */
7274 static bool
7275 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
7276 						LockTupleMode lockmode, bool *current_is_member)
7277 {
7278 	int			nmembers;
7279 	MultiXactMember *members;
7280 	bool		result = false;
7281 	LOCKMODE	wanted = tupleLockExtraInfo[lockmode].hwlock;
7282 
7283 	if (HEAP_LOCKED_UPGRADED(infomask))
7284 		return false;
7285 
7286 	nmembers = GetMultiXactIdMembers(multi, &members, false,
7287 									 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7288 	if (nmembers >= 0)
7289 	{
7290 		int			i;
7291 
7292 		for (i = 0; i < nmembers; i++)
7293 		{
7294 			TransactionId memxid;
7295 			LOCKMODE	memlockmode;
7296 
7297 			if (result && (current_is_member == NULL || *current_is_member))
7298 				break;
7299 
7300 			memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7301 
7302 			/* ignore members from current xact (but track their presence) */
7303 			memxid = members[i].xid;
7304 			if (TransactionIdIsCurrentTransactionId(memxid))
7305 			{
7306 				if (current_is_member != NULL)
7307 					*current_is_member = true;
7308 				continue;
7309 			}
7310 			else if (result)
7311 				continue;
7312 
7313 			/* ignore members that don't conflict with the lock we want */
7314 			if (!DoLockModesConflict(memlockmode, wanted))
7315 				continue;
7316 
7317 			if (ISUPDATE_from_mxstatus(members[i].status))
7318 			{
7319 				/* ignore aborted updaters */
7320 				if (TransactionIdDidAbort(memxid))
7321 					continue;
7322 			}
7323 			else
7324 			{
7325 				/* ignore lockers-only that are no longer in progress */
7326 				if (!TransactionIdIsInProgress(memxid))
7327 					continue;
7328 			}
7329 
7330 			/*
7331 			 * Whatever remains are either live lockers that conflict with our
7332 			 * wanted lock, or updaters that are not aborted.  Those conflict
7333 			 * with what we want.  Set up to return true, but keep going to
7334 			 * look for the current transaction among the multixact members,
7335 			 * if needed.
7336 			 */
7337 			result = true;
7338 		}
7339 		pfree(members);
7340 	}
7341 
7342 	return result;
7343 }
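/*
 * Illustrative sketch (not part of the original source): callers such as
 * heap_update and heap_lock_tuple use DoesMultiXactIdConflict to avoid
 * sleeping on a multixact xmax when none of its members actually conflict,
 * along these lines (argument lists abbreviated):
 *
 *		if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
 *									lockmode, &current_is_member))
 *			MultiXactIdWait((MultiXactId) xwait, status, infomask,
 *							relation, &tuple->t_self, XLTW_Lock, &remain);
 *		else
 *			// no conflicting member; proceed without waiting
 */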
7344 
7345 /*
7346  * Do_MultiXactIdWait
7347  *		Actual implementation for the two functions below.
7348  *
7349  * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7350  * needed to ensure we only sleep on conflicting members, and the infomask is
7351  * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7352  * indicates whether to use conditional lock acquisition, to allow callers to
7353  * fail if lock is unavailable.  'rel', 'ctid' and 'oper' are used to set up
7354  * context information for error messages.  'remaining', if not NULL, receives
7355  * the number of members that are still running, including any (non-aborted)
7356  * subtransactions of our own transaction.
7357  *
7358  * We do this by sleeping on each member using XactLockTableWait.  Any
7359  * members that belong to the current backend are *not* waited for, however;
7360  * this would not merely be useless but would lead to Assert failure inside
7361  * XactLockTableWait.  By the time this returns, it is certain that all
7362  * transactions *of other backends* that were members of the MultiXactId
7363  * that conflict with the requested status are dead (and no new ones can have
7364  * been added, since it is not legal to add members to an existing
7365  * MultiXactId).
7366  *
7367  * But by the time we finish sleeping, someone else may have changed the Xmax
7368  * of the containing tuple, so the caller needs to iterate on us somehow.
7369  *
7370  * Note that in case we return false, the number of remaining members is
7371  * not to be trusted.
7372  */
7373 static bool
7374 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7375 				   uint16 infomask, bool nowait,
7376 				   Relation rel, ItemPointer ctid, XLTW_Oper oper,
7377 				   int *remaining)
7378 {
7379 	bool		result = true;
7380 	MultiXactMember *members;
7381 	int			nmembers;
7382 	int			remain = 0;
7383 
7384 	/* for pre-pg_upgrade tuples, no need to sleep at all */
7385 	nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7386 		GetMultiXactIdMembers(multi, &members, false,
7387 							  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7388 
7389 	if (nmembers >= 0)
7390 	{
7391 		int			i;
7392 
7393 		for (i = 0; i < nmembers; i++)
7394 		{
7395 			TransactionId memxid = members[i].xid;
7396 			MultiXactStatus memstatus = members[i].status;
7397 
7398 			if (TransactionIdIsCurrentTransactionId(memxid))
7399 			{
7400 				remain++;
7401 				continue;
7402 			}
7403 
7404 			if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
7405 									 LOCKMODE_from_mxstatus(status)))
7406 			{
7407 				if (remaining && TransactionIdIsInProgress(memxid))
7408 					remain++;
7409 				continue;
7410 			}
7411 
7412 			/*
7413 			 * This member conflicts with our multi, so we have to sleep (or
7414 			 * return failure, if asked to avoid waiting.)
7415 			 *
7416 			 * Note that we don't set up an error context callback ourselves,
7417 			 * but instead we pass the info down to XactLockTableWait.  This
7418 			 * might seem a bit wasteful because the context is set up and
7419 			 * torn down for each member of the multixact, but in reality it
7420 			 * should be barely noticeable, and it avoids duplicate code.
7421 			 */
7422 			if (nowait)
7423 			{
7424 				result = ConditionalXactLockTableWait(memxid);
7425 				if (!result)
7426 					break;
7427 			}
7428 			else
7429 				XactLockTableWait(memxid, rel, ctid, oper);
7430 		}
7431 
7432 		pfree(members);
7433 	}
7434 
7435 	if (remaining)
7436 		*remaining = remain;
7437 
7438 	return result;
7439 }
7440 
7441 /*
7442  * MultiXactIdWait
7443  *		Sleep on a MultiXactId.
7444  *
7445  * By the time we finish sleeping, someone else may have changed the Xmax
7446  * of the containing tuple, so the caller needs to iterate on us somehow.
7447  *
7448  * We return (in *remaining, if not NULL) the number of members that are still
7449  * running, including any (non-aborted) subtransactions of our own transaction.
7450  */
7451 static void
7452 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7453 				Relation rel, ItemPointer ctid, XLTW_Oper oper,
7454 				int *remaining)
7455 {
7456 	(void) Do_MultiXactIdWait(multi, status, infomask, false,
7457 							  rel, ctid, oper, remaining);
7458 }
7459 
7460 /*
7461  * ConditionalMultiXactIdWait
7462  *		As above, but only lock if we can get the lock without blocking.
7463  *
7464  * By the time we finish sleeping, someone else may have changed the Xmax
7465  * of the containing tuple, so the caller needs to iterate on us somehow.
7466  *
7467  * If the multixact is now all gone, return true.  Returns false if some
7468  * transactions might still be running.
7469  *
7470  * We return (in *remaining, if not NULL) the number of members that are still
7471  * running, including any (non-aborted) subtransactions of our own transaction.
7472  */
7473 static bool
7474 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7475 						   uint16 infomask, Relation rel, int *remaining)
7476 {
7477 	return Do_MultiXactIdWait(multi, status, infomask, true,
7478 							  rel, NULL, XLTW_None, remaining);
7479 }
7480 
7481 /*
7482  * heap_tuple_needs_eventual_freeze
7483  *
7484  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7485  * will eventually require freezing.  Similar to heap_tuple_needs_freeze,
7486  * but there's no cutoff, since we're trying to figure out whether freezing
7487  * will ever be needed, not whether it's needed now.
7488  */
7489 bool
7490 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7491 {
7492 	TransactionId xid;
7493 
7494 	/*
7495 	 * If xmin is a normal transaction ID, this tuple is definitely not
7496 	 * frozen.
7497 	 */
7498 	xid = HeapTupleHeaderGetXmin(tuple);
7499 	if (TransactionIdIsNormal(xid))
7500 		return true;
7501 
7502 	/*
7503 	 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7504 	 */
7505 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7506 	{
7507 		MultiXactId multi;
7508 
7509 		multi = HeapTupleHeaderGetRawXmax(tuple);
7510 		if (MultiXactIdIsValid(multi))
7511 			return true;
7512 	}
7513 	else
7514 	{
7515 		xid = HeapTupleHeaderGetRawXmax(tuple);
7516 		if (TransactionIdIsNormal(xid))
7517 			return true;
7518 	}
7519 
7520 	if (tuple->t_infomask & HEAP_MOVED)
7521 	{
7522 		xid = HeapTupleHeaderGetXvac(tuple);
7523 		if (TransactionIdIsNormal(xid))
7524 			return true;
7525 	}
7526 
7527 	return false;
7528 }
7529 
7530 /*
7531  * heap_tuple_needs_freeze
7532  *
7533  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7534  * are older than the specified cutoff XID or MultiXactId.  If so, return true.
7535  *
7536  * It doesn't matter whether the tuple is alive or dead, we are checking
7537  * to see if a tuple needs to be removed or frozen to avoid wraparound.
7538  *
7539  * NB: Cannot rely on hint bits here, they might not be set after a crash or
7540  * on a standby.
7541  */
7542 bool
7543 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
7544 						MultiXactId cutoff_multi, Buffer buf)
7545 {
7546 	TransactionId xid;
7547 
7548 	xid = HeapTupleHeaderGetXmin(tuple);
7549 	if (TransactionIdIsNormal(xid) &&
7550 		TransactionIdPrecedes(xid, cutoff_xid))
7551 		return true;
7552 
7553 	/*
7554 	 * The considerations for multixacts are complicated; look at
7555 	 * heap_prepare_freeze_tuple for justifications.  This routine had better
7556 	 * be in sync with that one!
7557 	 */
7558 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7559 	{
7560 		MultiXactId multi;
7561 
7562 		multi = HeapTupleHeaderGetRawXmax(tuple);
7563 		if (!MultiXactIdIsValid(multi))
7564 		{
7565 			/* no xmax set, ignore */
7566 			;
7567 		}
7568 		else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7569 			return true;
7570 		else if (MultiXactIdPrecedes(multi, cutoff_multi))
7571 			return true;
7572 		else
7573 		{
7574 			MultiXactMember *members;
7575 			int			nmembers;
7576 			int			i;
7577 
7578 			/* need to check whether any member of the mxact is too old */
7579 
7580 			nmembers = GetMultiXactIdMembers(multi, &members, false,
7581 											 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7582 
7583 			for (i = 0; i < nmembers; i++)
7584 			{
7585 				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
7586 				{
7587 					pfree(members);
7588 					return true;
7589 				}
7590 			}
7591 			if (nmembers > 0)
7592 				pfree(members);
7593 		}
7594 	}
7595 	else
7596 	{
7597 		xid = HeapTupleHeaderGetRawXmax(tuple);
7598 		if (TransactionIdIsNormal(xid) &&
7599 			TransactionIdPrecedes(xid, cutoff_xid))
7600 			return true;
7601 	}
7602 
7603 	if (tuple->t_infomask & HEAP_MOVED)
7604 	{
7605 		xid = HeapTupleHeaderGetXvac(tuple);
7606 		if (TransactionIdIsNormal(xid) &&
7607 			TransactionIdPrecedes(xid, cutoff_xid))
7608 			return true;
7609 	}
7610 
7611 	return false;
7612 }
7613 
7614 /*
7615  * If 'tuple' contains any visible XID greater than latestRemovedXid,
7616  * ratchet forwards latestRemovedXid to the greatest one found.
7617  * This is used as the basis for generating Hot Standby conflicts, so
7618  * if a tuple was never visible then removing it should not conflict
7619  * with queries.
7620  */
7621 void
7622 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
7623 									   TransactionId *latestRemovedXid)
7624 {
7625 	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7626 	TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7627 	TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7628 
7629 	if (tuple->t_infomask & HEAP_MOVED)
7630 	{
7631 		if (TransactionIdPrecedes(*latestRemovedXid, xvac))
7632 			*latestRemovedXid = xvac;
7633 	}
7634 
7635 	/*
7636 	 * Ignore tuples inserted by an aborted transaction or if the tuple was
7637 	 * updated/deleted by the inserting transaction.
7638 	 *
7639 	 * Look for a committed hint bit, or if no xmin bit is set, check clog.
7640 	 * This needs to work on both master and standby, where it is used to
7641 	 * assess btree delete records.
7642 	 */
7643 	if (HeapTupleHeaderXminCommitted(tuple) ||
7644 		(!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7645 	{
7646 		if (xmax != xmin &&
7647 			TransactionIdFollows(xmax, *latestRemovedXid))
7648 			*latestRemovedXid = xmax;
7649 	}
7650 
7651 	/* *latestRemovedXid may still be invalid at end */
7652 }
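/*
 * Illustrative sketch (not part of the original source): code that removes
 * tuples (heap pruning, for instance) folds each victim into a running
 * maximum and then includes that horizon in the WAL record it emits:
 *
 *		TransactionId latestRemovedXid = InvalidTransactionId;
 *
 *		for each tuple about to be removed:
 *			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
 *		...
 *		log_heap_clean(rel, buffer, redirected, nredirected,
 *					   nowdead, ndead, nowunused, nunused,
 *					   latestRemovedXid);
 *
 * During Hot Standby replay, that XID is what recovery uses to cancel or
 * wait out queries that might still see the removed tuples (see the
 * log_heap_clean comments below).
 */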
7653 
7654 /*
7655  * Perform XLogInsert to register a heap cleanup info message. These
7656  * messages are sent once per VACUUM and are required because
7657  * of the phasing of removal operations during a lazy VACUUM.
7658  * See comments for vacuum_log_cleanup_info().
7659  */
7660 XLogRecPtr
7661 log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7662 {
7663 	xl_heap_cleanup_info xlrec;
7664 	XLogRecPtr	recptr;
7665 
7666 	xlrec.node = rnode;
7667 	xlrec.latestRemovedXid = latestRemovedXid;
7668 
7669 	XLogBeginInsert();
7670 	XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7671 
7672 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7673 
7674 	return recptr;
7675 }
7676 
7677 /*
7678  * Perform XLogInsert for a heap-clean operation.  Caller must already
7679  * have modified the buffer and marked it dirty.
7680  *
7681  * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7682  * zero-based tuple indexes.  Now they are one-based like other uses
7683  * of OffsetNumber.
7684  *
7685  * We also include latestRemovedXid, which is the greatest XID present in
7686  * the removed tuples. That allows recovery processing to cancel or wait
7687  * for long standby queries that can still see these tuples.
7688  */
7689 XLogRecPtr
7690 log_heap_clean(Relation reln, Buffer buffer,
7691 			   OffsetNumber *redirected, int nredirected,
7692 			   OffsetNumber *nowdead, int ndead,
7693 			   OffsetNumber *nowunused, int nunused,
7694 			   TransactionId latestRemovedXid)
7695 {
7696 	xl_heap_clean xlrec;
7697 	XLogRecPtr	recptr;
7698 
7699 	/* Caller should not call me on a non-WAL-logged relation */
7700 	Assert(RelationNeedsWAL(reln));
7701 
7702 	xlrec.latestRemovedXid = latestRemovedXid;
7703 	xlrec.nredirected = nredirected;
7704 	xlrec.ndead = ndead;
7705 
7706 	XLogBeginInsert();
7707 	XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7708 
7709 	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7710 
7711 	/*
7712 	 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7713 	 * that they are.  When XLogInsert stores the whole buffer, the offset
7714 	 * arrays need not be stored too.  Note that even if all three arrays are
7715 	 * empty, we want to expose the buffer as a candidate for whole-page
7716 	 * storage, since this record type implies a defragmentation operation
7717 	 * even if no item pointers changed state.
7718 	 */
7719 	if (nredirected > 0)
7720 		XLogRegisterBufData(0, (char *) redirected,
7721 							nredirected * sizeof(OffsetNumber) * 2);
7722 
7723 	if (ndead > 0)
7724 		XLogRegisterBufData(0, (char *) nowdead,
7725 							ndead * sizeof(OffsetNumber));
7726 
7727 	if (nunused > 0)
7728 		XLogRegisterBufData(0, (char *) nowunused,
7729 							nunused * sizeof(OffsetNumber));
7730 
7731 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7732 
7733 	return recptr;
7734 }
7735 
7736 /*
7737  * Perform XLogInsert for a heap-freeze operation.  Caller must have already
7738  * modified the buffer and marked it dirty.
7739  */
7740 XLogRecPtr
7741 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7742 				xl_heap_freeze_tuple *tuples, int ntuples)
7743 {
7744 	xl_heap_freeze_page xlrec;
7745 	XLogRecPtr	recptr;
7746 
7747 	/* Caller should not call me on a non-WAL-logged relation */
7748 	Assert(RelationNeedsWAL(reln));
7749 	/* nor when there are no tuples to freeze */
7750 	Assert(ntuples > 0);
7751 
7752 	xlrec.cutoff_xid = cutoff_xid;
7753 	xlrec.ntuples = ntuples;
7754 
7755 	XLogBeginInsert();
7756 	XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7757 
7758 	/*
7759 	 * The freeze plan array is not actually in the buffer, but pretend that
7760 	 * it is.  When XLogInsert stores the whole buffer, the freeze plan need
7761 	 * not be stored too.
7762 	 */
7763 	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7764 	XLogRegisterBufData(0, (char *) tuples,
7765 						ntuples * sizeof(xl_heap_freeze_tuple));
7766 
7767 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7768 
7769 	return recptr;
7770 }
7771 
7772 /*
7773  * Perform XLogInsert for a heap-visible operation.  'block' is the block
7774  * being marked all-visible, and vm_buffer is the buffer containing the
7775  * corresponding visibility map block.  Both should have already been modified
7776  * and dirtied.
7777  *
7778  * If checksums are enabled, we also generate a full-page image of
7779  * heap_buffer, if necessary.
7780  */
7781 XLogRecPtr
7782 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7783 				 TransactionId cutoff_xid, uint8 vmflags)
7784 {
7785 	xl_heap_visible xlrec;
7786 	XLogRecPtr	recptr;
7787 	uint8		flags;
7788 
7789 	Assert(BufferIsValid(heap_buffer));
7790 	Assert(BufferIsValid(vm_buffer));
7791 
7792 	xlrec.cutoff_xid = cutoff_xid;
7793 	xlrec.flags = vmflags;
7794 	XLogBeginInsert();
7795 	XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7796 
7797 	XLogRegisterBuffer(0, vm_buffer, 0);
7798 
7799 	flags = REGBUF_STANDARD;
7800 	if (!XLogHintBitIsNeeded())
7801 		flags |= REGBUF_NO_IMAGE;
7802 	XLogRegisterBuffer(1, heap_buffer, flags);
7803 
7804 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7805 
7806 	return recptr;
7807 }
7808 
7809 /*
7810  * Perform XLogInsert for a heap-update operation.  Caller must already
7811  * have modified the buffer(s) and marked them dirty.
7812  */
7813 static XLogRecPtr
7814 log_heap_update(Relation reln, Buffer oldbuf,
7815 				Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7816 				HeapTuple old_key_tuple,
7817 				bool all_visible_cleared, bool new_all_visible_cleared)
7818 {
7819 	xl_heap_update xlrec;
7820 	xl_heap_header xlhdr;
7821 	xl_heap_header xlhdr_idx;
7822 	uint8		info;
7823 	uint16		prefix_suffix[2];
7824 	uint16		prefixlen = 0,
7825 				suffixlen = 0;
7826 	XLogRecPtr	recptr;
7827 	Page		page = BufferGetPage(newbuf);
7828 	bool		need_tuple_data = RelationIsLogicallyLogged(reln);
7829 	bool		init;
7830 	int			bufflags;
7831 
7832 	/* Caller should not call me on a non-WAL-logged relation */
7833 	Assert(RelationNeedsWAL(reln));
7834 
7835 	XLogBeginInsert();
7836 
7837 	if (HeapTupleIsHeapOnly(newtup))
7838 		info = XLOG_HEAP_HOT_UPDATE;
7839 	else
7840 		info = XLOG_HEAP_UPDATE;
7841 
7842 	/*
7843 	 * If the old and new tuple are on the same page, we only need to log the
7844 	 * parts of the new tuple that were changed.  That saves on the amount of
7845 	 * WAL we need to write.  Currently, we just count any unchanged bytes in
7846 	 * the beginning and end of the tuple.  That's quick to check, and
7847 	 * perfectly covers the common case that only one field is updated.
7848 	 *
7849 	 * We could do this even if the old and new tuple are on different pages,
7850 	 * but only if we don't make a full-page image of the old page, which is
7851 	 * difficult to know in advance.  Also, if the old tuple is corrupt for
7852 	 * some reason, it would allow the corruption to propagate to the new
7853 	 * page, so it seems best to avoid that.  Under the general assumption that most
7854 	 * updates tend to create the new tuple version on the same page, there
7855 	 * isn't much to be gained by doing this across pages anyway.
7856 	 *
7857 	 * Skip this if we're taking a full-page image of the new page, as we
7858 	 * don't include the new tuple in the WAL record in that case.  Also
7859 	 * disable if wal_level='logical', as logical decoding needs to be able to
7860 	 * read the new tuple in whole from the WAL record alone.
7861 	 */
7862 	if (oldbuf == newbuf && !need_tuple_data &&
7863 		!XLogCheckBufferNeedsBackup(newbuf))
7864 	{
7865 		char	   *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7866 		char	   *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7867 		int			oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7868 		int			newlen = newtup->t_len - newtup->t_data->t_hoff;
7869 
7870 		/* Check for common prefix between old and new tuple */
7871 		for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7872 		{
7873 			if (newp[prefixlen] != oldp[prefixlen])
7874 				break;
7875 		}
7876 
7877 		/*
7878 		 * Storing the length of the prefix takes 2 bytes, so we need to save
7879 		 * at least 3 bytes or there's no point.
7880 		 */
7881 		if (prefixlen < 3)
7882 			prefixlen = 0;
7883 
7884 		/* Same for suffix */
7885 		for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7886 		{
7887 			if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7888 				break;
7889 		}
7890 		if (suffixlen < 3)
7891 			suffixlen = 0;
7892 	}
7893 
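	/*
	 * Worked example with made-up data: if the old tuple data were
	 * "AAAABBBBCCCC" and the new data "AAAAXXXXCCCC", prefixlen and suffixlen
	 * would both come out as 4, so only the tuple header, the two length
	 * words and "XXXX" need to be logged rather than the whole new tuple.
	 */
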
7894 	/* Prepare main WAL data chain */
7895 	xlrec.flags = 0;
7896 	if (all_visible_cleared)
7897 		xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7898 	if (new_all_visible_cleared)
7899 		xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7900 	if (prefixlen > 0)
7901 		xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7902 	if (suffixlen > 0)
7903 		xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7904 	if (need_tuple_data)
7905 	{
7906 		xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7907 		if (old_key_tuple)
7908 		{
7909 			if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7910 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7911 			else
7912 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7913 		}
7914 	}
7915 
7916 	/* If the new tuple is the first and only tuple on the page... */
7917 	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7918 		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7919 	{
7920 		info |= XLOG_HEAP_INIT_PAGE;
7921 		init = true;
7922 	}
7923 	else
7924 		init = false;
7925 
7926 	/* Prepare WAL data for the old page */
7927 	xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7928 	xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7929 	xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7930 											  oldtup->t_data->t_infomask2);
7931 
7932 	/* Prepare WAL data for the new page */
7933 	xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7934 	xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7935 
7936 	bufflags = REGBUF_STANDARD;
7937 	if (init)
7938 		bufflags |= REGBUF_WILL_INIT;
7939 	if (need_tuple_data)
7940 		bufflags |= REGBUF_KEEP_DATA;
7941 
7942 	XLogRegisterBuffer(0, newbuf, bufflags);
7943 	if (oldbuf != newbuf)
7944 		XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7945 
7946 	XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7947 
7948 	/*
7949 	 * Prepare WAL data for the new tuple.
7950 	 */
7951 	if (prefixlen > 0 || suffixlen > 0)
7952 	{
7953 		if (prefixlen > 0 && suffixlen > 0)
7954 		{
7955 			prefix_suffix[0] = prefixlen;
7956 			prefix_suffix[1] = suffixlen;
7957 			XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7958 		}
7959 		else if (prefixlen > 0)
7960 		{
7961 			XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7962 		}
7963 		else
7964 		{
7965 			XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7966 		}
7967 	}
7968 
7969 	xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7970 	xlhdr.t_infomask = newtup->t_data->t_infomask;
7971 	xlhdr.t_hoff = newtup->t_data->t_hoff;
7972 	Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7973 
7974 	/*
7975 	 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7976 	 *
7977 	 * The 'data' doesn't include the common prefix or suffix.
7978 	 */
7979 	XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7980 	if (prefixlen == 0)
7981 	{
7982 		XLogRegisterBufData(0,
7983 							((char *) newtup->t_data) + SizeofHeapTupleHeader,
7984 							newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7985 	}
7986 	else
7987 	{
7988 		/*
7989 		 * Have to write the null bitmap and data after the common prefix as
7990 		 * two separate rdata entries.
7991 		 */
7992 		/* bitmap [+ padding] [+ oid] */
7993 		if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7994 		{
7995 			XLogRegisterBufData(0,
7996 								((char *) newtup->t_data) + SizeofHeapTupleHeader,
7997 								newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7998 		}
7999 
8000 		/* data after common prefix */
8001 		XLogRegisterBufData(0,
8002 							((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8003 							newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8004 	}
8005 
8006 	/* We need to log a tuple identity */
8007 	if (need_tuple_data && old_key_tuple)
8008 	{
8009 		/* don't really need this, but it's more comfy to decode */
8010 		xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8011 		xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8012 		xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8013 
8014 		XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8015 
8016 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8017 		XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8018 						 old_key_tuple->t_len - SizeofHeapTupleHeader);
8019 	}
8020 
8021 	/* filtering by origin on a row level is much more efficient */
8022 	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
8023 
8024 	recptr = XLogInsert(RM_HEAP_ID, info);
8025 
8026 	return recptr;
8027 }
8028 
8029 /*
8030  * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8031  *
8032  * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8033  * tuples.
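 *
 * The record carries the tuple's cmin/cmax (and combo CID, if any) so that
 * logical decoding can work out command-id visibility for catalog rows; it
 * does nothing during ordinary redo (see heap2_redo).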
8034  */
8035 static XLogRecPtr
8036 log_heap_new_cid(Relation relation, HeapTuple tup)
8037 {
8038 	xl_heap_new_cid xlrec;
8039 
8040 	XLogRecPtr	recptr;
8041 	HeapTupleHeader hdr = tup->t_data;
8042 
8043 	Assert(ItemPointerIsValid(&tup->t_self));
8044 	Assert(tup->t_tableOid != InvalidOid);
8045 
8046 	xlrec.top_xid = GetTopTransactionId();
8047 	xlrec.target_node = relation->rd_node;
8048 	xlrec.target_tid = tup->t_self;
8049 
8050 	/*
8051 	 * If the tuple got inserted & deleted in the same TX, we definitely have
8052 	 * a combocid, so set both cmin and cmax.
8053 	 */
8054 	if (hdr->t_infomask & HEAP_COMBOCID)
8055 	{
8056 		Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8057 		Assert(!HeapTupleHeaderXminInvalid(hdr));
8058 		xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8059 		xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8060 		xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
8061 	}
8062 	/* No combocid, so only cmin or cmax can be set by this TX */
8063 	else
8064 	{
8065 		/*
8066 		 * Tuple inserted.
8067 		 *
8068 		 * We need to check for LOCK ONLY because multixacts might be
8069 		 * transferred to the new tuple in case of FOR KEY SHARE updates in
8070 		 * which case there will be an xmax, although the tuple just got
8071 		 * inserted.
8072 		 */
8073 		if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8074 			HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
8075 		{
8076 			xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8077 			xlrec.cmax = InvalidCommandId;
8078 		}
8079 		/* Tuple from a different tx updated or deleted. */
8080 		else
8081 		{
8082 			xlrec.cmin = InvalidCommandId;
8083 			xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8084 
8085 		}
8086 		xlrec.combocid = InvalidCommandId;
8087 	}
8088 
8089 	/*
8090 	 * Note that we don't need to register the buffer here, because this
8091 	 * operation does not modify the page. The insert/update/delete that
8092 	 * called us certainly did, but that's WAL-logged separately.
8093 	 */
8094 	XLogBeginInsert();
8095 	XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8096 
8097 	/* will be looked at irrespective of origin */
8098 
8099 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8100 
8101 	return recptr;
8102 }
8103 
8104 /*
8105  * Build a heap tuple representing the configured REPLICA IDENTITY of the old
8106  * tuple in an UPDATE or DELETE.
8107  *
8108  * Returns NULL if there's no need to log an identity or if there's no
8109  * suitable key defined on the relation.
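 *
 * On return, *copy is true when the result is a newly built or flattened
 * tuple that the caller is expected to free, and false when the passed-in
 * tuple is returned unchanged.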
8110  */
8111 static HeapTuple
8112 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool *copy)
8113 {
8114 	TupleDesc	desc = RelationGetDescr(relation);
8115 	Oid			replidindex;
8116 	Relation	idx_rel;
8117 	char		replident = relation->rd_rel->relreplident;
8118 	HeapTuple	key_tuple = NULL;
8119 	bool		nulls[MaxHeapAttributeNumber];
8120 	Datum		values[MaxHeapAttributeNumber];
8121 	int			natt;
8122 
8123 	*copy = false;
8124 
8125 	if (!RelationIsLogicallyLogged(relation))
8126 		return NULL;
8127 
8128 	if (replident == REPLICA_IDENTITY_NOTHING)
8129 		return NULL;
8130 
8131 	if (replident == REPLICA_IDENTITY_FULL)
8132 	{
8133 		/*
8134 		 * When logging the entire old tuple, it very well could contain
8135 		 * toasted columns. If so, force them to be inlined.
8136 		 */
8137 		if (HeapTupleHasExternal(tp))
8138 		{
8139 			*copy = true;
8140 			tp = toast_flatten_tuple(tp, RelationGetDescr(relation));
8141 		}
8142 		return tp;
8143 	}
8144 
8145 	/* if the key hasn't changed and we're only logging the key, we're done */
8146 	if (!key_changed)
8147 		return NULL;
8148 
8149 	/* find the replica identity index */
8150 	replidindex = RelationGetReplicaIndex(relation);
8151 	if (!OidIsValid(replidindex))
8152 	{
8153 		elog(DEBUG4, "could not find configured replica identity for table \"%s\"",
8154 			 RelationGetRelationName(relation));
8155 		return NULL;
8156 	}
8157 
8158 	idx_rel = RelationIdGetRelation(replidindex);
8159 
8160 	if (!RelationIsValid(idx_rel))
8161 		elog(ERROR, "could not open relation with OID %u", replidindex);
8162 
8163 	/* deform tuple, so we have fast access to columns */
8164 	heap_deform_tuple(tp, desc, values, nulls);
8165 
8166 	/* set all columns to NULL, regardless of whether they actually are */
8167 	memset(nulls, 1, sizeof(nulls));
8168 
8169 	/*
8170 	 * Now set all columns contained in the index to NOT NULL; they cannot
8171 	 * currently be NULL.
8172 	 */
8173 	for (natt = 0; natt < IndexRelationGetNumberOfKeyAttributes(idx_rel); natt++)
8174 	{
8175 		int			attno = idx_rel->rd_index->indkey.values[natt];
8176 
8177 		if (attno < 0)
8178 		{
8179 			/*
8180 			 * The OID column can appear in an index definition, but that's
8181 			 * OK, because we always copy the OID if present (see below).
8182 			 * Other system columns may not.
8183 			 */
8184 			if (attno == ObjectIdAttributeNumber)
8185 				continue;
8186 			elog(ERROR, "system column in index");
8187 		}
8188 		nulls[attno - 1] = false;
8189 	}
8190 
8191 	key_tuple = heap_form_tuple(desc, values, nulls);
8192 	*copy = true;
8193 	RelationClose(idx_rel);
8194 
8195 	/*
8196 	 * Always copy oids if the table has them, even if not included in the
8197 	 * index. The space in the logged tuple is used anyway, so there's little
8198 	 * point in not including the information.
8199 	 */
8200 	if (relation->rd_rel->relhasoids)
8201 		HeapTupleSetOid(key_tuple, HeapTupleGetOid(tp));
8202 
8203 	/*
8204 	 * If the tuple, which by here only contains indexed columns, still has
8205 	 * toasted columns, force them to be inlined. This is somewhat unlikely,
8206 	 * since there are limits on the size of indexed columns, so we don't
8207 	 * duplicate toast_flatten_tuple()'s functionality in the above loop over
8208 	 * the indexed columns, even if it would be more efficient.
8209 	 */
8210 	if (HeapTupleHasExternal(key_tuple))
8211 	{
8212 		HeapTuple	oldtup = key_tuple;
8213 
8214 		key_tuple = toast_flatten_tuple(oldtup, RelationGetDescr(relation));
8215 		heap_freetuple(oldtup);
8216 	}
8217 
8218 	return key_tuple;
8219 }
8220 
8221 /*
8222  * Handles CLEANUP_INFO
8223  */
8224 static void
8225 heap_xlog_cleanup_info(XLogReaderState *record)
8226 {
8227 	xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
8228 
8229 	if (InHotStandby)
8230 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
8231 
8232 	/*
8233 	 * Actual operation is a no-op. Record type exists to provide a means for
8234 	 * conflict processing to occur before we begin index vacuum actions.  See
8235 	 * vacuumlazy.c and also the comments in btvacuumpage().
8236 	 */
8237 
8238 	/* Backup blocks are not used in cleanup_info records */
8239 	Assert(!XLogRecHasAnyBlockRefs(record));
8240 }
8241 
8242 /*
8243  * Handles XLOG_HEAP2_CLEAN record type
8244  */
8245 static void
8246 heap_xlog_clean(XLogReaderState *record)
8247 {
8248 	XLogRecPtr	lsn = record->EndRecPtr;
8249 	xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
8250 	Buffer		buffer;
8251 	RelFileNode rnode;
8252 	BlockNumber blkno;
8253 	XLogRedoAction action;
8254 
8255 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8256 
8257 	/*
8258 	 * We're about to remove tuples. In Hot Standby mode, ensure that there are
8259 	 * no queries running for which the removed tuples are still visible.
8260 	 *
8261 	 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
8262 	 * conflict on the records that cause MVCC failures for user queries. If
8263 	 * latestRemovedXid is invalid, skip conflict processing.
8264 	 */
8265 	if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
8266 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
8267 
8268 	/*
8269 	 * If we have a full-page image, restore it (using a cleanup lock) and
8270 	 * we're done.
8271 	 */
8272 	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
8273 										   &buffer);
8274 	if (action == BLK_NEEDS_REDO)
8275 	{
8276 		Page		page = (Page) BufferGetPage(buffer);
8277 		OffsetNumber *end;
8278 		OffsetNumber *redirected;
8279 		OffsetNumber *nowdead;
8280 		OffsetNumber *nowunused;
8281 		int			nredirected;
8282 		int			ndead;
8283 		int			nunused;
8284 		Size		datalen;
8285 
8286 		redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8287 
8288 		nredirected = xlrec->nredirected;
8289 		ndead = xlrec->ndead;
8290 		end = (OffsetNumber *) ((char *) redirected + datalen);
8291 		nowdead = redirected + (nredirected * 2);
8292 		nowunused = nowdead + ndead;
8293 		nunused = (end - nowunused);
8294 		Assert(nunused >= 0);
8295 
8296 		/* Update all item pointers per the record, and repair fragmentation */
8297 		heap_page_prune_execute(buffer,
8298 								redirected, nredirected,
8299 								nowdead, ndead,
8300 								nowunused, nunused);
8301 
8302 		/*
8303 		 * Note: we don't worry about updating the page's prunability hints.
8304 		 * At worst this will cause an extra prune cycle to occur soon.
8305 		 */
8306 
8307 		PageSetLSN(page, lsn);
8308 		MarkBufferDirty(buffer);
8309 	}
8310 
8311 	if (BufferIsValid(buffer))
8312 	{
8313 		Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8314 
8315 		UnlockReleaseBuffer(buffer);
8316 
8317 		/*
8318 		 * After cleaning records from a page, it's useful to update the FSM
8319 		 * about it, as that may cause the page to become a target for
8320 		 * insertions later even if vacuum decides not to visit it (which is
8321 		 * possible if it gets marked all-visible).
8322 		 *
8323 		 * Do this regardless of a full-page image being applied, since the
8324 		 * FSM data is not in the page anyway.
8325 		 */
8326 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8327 	}
8328 }
8329 
8330 /*
8331  * Replay XLOG_HEAP2_VISIBLE record.
8332  *
8333  * The critical integrity requirement here is that we must never end up with
8334  * a situation where the visibility map bit is set, and the page-level
8335  * PD_ALL_VISIBLE bit is clear.  If that were to occur, then a subsequent
8336  * page modification would fail to clear the visibility map bit.
8337  */
8338 static void
8339 heap_xlog_visible(XLogReaderState *record)
8340 {
8341 	XLogRecPtr	lsn = record->EndRecPtr;
8342 	xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8343 	Buffer		vmbuffer = InvalidBuffer;
8344 	Buffer		buffer;
8345 	Page		page;
8346 	RelFileNode rnode;
8347 	BlockNumber blkno;
8348 	XLogRedoAction action;
8349 
8350 	XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
8351 
8352 	/*
8353 	 * If there are any Hot Standby transactions running that have an xmin
8354 	 * horizon old enough that this page isn't all-visible for them, they
8355 	 * might incorrectly decide that an index-only scan can skip a heap fetch.
8356 	 *
8357 	 * NB: It might be better to throw some kind of "soft" conflict here that
8358 	 * forces any index-only scan that is in flight to perform heap fetches,
8359 	 * rather than killing the transaction outright.
8360 	 */
8361 	if (InHotStandby)
8362 		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
8363 
8364 	/*
8365 	 * Read the heap page, if it still exists.  If the heap file has been
8366 	 * dropped or truncated later in recovery, we don't need to update the
8367 	 * page, but we'd better still update the visibility map.
8368 	 */
8369 	action = XLogReadBufferForRedo(record, 1, &buffer);
8370 	if (action == BLK_NEEDS_REDO)
8371 	{
8372 		/*
8373 		 * We don't bump the LSN of the heap page when setting the visibility
8374 		 * map bit (unless checksums or wal_log_hints is enabled, in which
8375 		 * case we must), because that would generate an unworkable volume of
8376 		 * full-page writes.  This exposes us to torn page hazards, but since
8377 		 * we're not inspecting the existing page contents in any way, we
8378 		 * don't care.
8379 		 *
8380 		 * However, all operations that clear the visibility map bit *do* bump
8381 		 * the LSN, and those operations will only be replayed if the XLOG LSN
8382 		 * follows the page LSN.  Thus, if the page LSN has advanced past our
8383 		 * XLOG record's LSN, we mustn't mark the page all-visible, because
8384 		 * the subsequent update won't be replayed to clear the flag.
8385 		 */
8386 		page = BufferGetPage(buffer);
8387 
8388 		PageSetAllVisible(page);
8389 
8390 		MarkBufferDirty(buffer);
8391 	}
8392 	else if (action == BLK_RESTORED)
8393 	{
8394 		/*
8395 		 * If heap block was backed up, we already restored it and there's
8396 		 * nothing more to do. (This can only happen with checksums or
8397 		 * wal_log_hints enabled.)
8398 		 */
8399 	}
8400 
8401 	if (BufferIsValid(buffer))
8402 	{
8403 		Size		space = PageGetFreeSpace(BufferGetPage(buffer));
8404 
8405 		UnlockReleaseBuffer(buffer);
8406 
8407 		/*
8408 		 * Since FSM is not WAL-logged and only updated heuristically, it
8409 		 * easily becomes stale in standbys.  If the standby is later promoted
8410 		 * and runs VACUUM, it will skip updating individual free space
8411 		 * figures for pages that became all-visible (or all-frozen, depending
8412 		 * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
8413 		 * propagates too optimistic free space values to upper FSM layers;
8414 		 * later inserters try to use such pages only to find out that they
8415 		 * are unusable.  This can cause long stalls when there are many such
8416 		 * pages.
8417 		 *
8418 		 * Forestall those problems by updating FSM's idea about a page that
8419 		 * is becoming all-visible or all-frozen.
8420 		 *
8421 		 * Do this regardless of a full-page image being applied, since the
8422 		 * FSM data is not in the page anyway.
8423 		 */
8424 		if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
8425 			XLogRecordPageWithFreeSpace(rnode, blkno, space);
8426 	}
8427 
8428 	/*
8429 	 * Even if we skipped the heap page update due to the LSN interlock, it's
8430 	 * still safe to update the visibility map.  Any WAL record that clears
8431 	 * the visibility map bit does so before checking the page LSN, so any
8432 	 * bits that need to be cleared will still be cleared.
8433 	 */
8434 	if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
8435 									  &vmbuffer) == BLK_NEEDS_REDO)
8436 	{
8437 		Page		vmpage = BufferGetPage(vmbuffer);
8438 		Relation	reln;
8439 
8440 		/* initialize the page if it was read as zeros */
8441 		if (PageIsNew(vmpage))
8442 			PageInit(vmpage, BLCKSZ, 0);
8443 
8444 		/*
8445 		 * XLogReadBufferForRedoExtended locked the buffer. But
8446 		 * visibilitymap_set will handle locking itself.
8447 		 */
8448 		LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
8449 
8450 		reln = CreateFakeRelcacheEntry(rnode);
8451 		visibilitymap_pin(reln, blkno, &vmbuffer);
8452 
8453 		/*
8454 		 * Don't set the bit if replay has already passed this point.
8455 		 *
8456 		 * It might be safe to do this unconditionally; if replay has passed
8457 		 * this point, we'll replay at least as far this time as we did
8458 		 * before, and if this bit needs to be cleared, the record responsible
8459 		 * for doing so would be replayed again and would clear it.  For right
8460 		 * now, out of an abundance of conservatism, we use the same test here
8461 		 * we did for the heap page.  If this results in a dropped bit, no
8462 		 * real harm is done; and the next VACUUM will fix it.
8463 		 */
8464 		if (lsn > PageGetLSN(vmpage))
8465 			visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
8466 							  xlrec->cutoff_xid, xlrec->flags);
8467 
8468 		ReleaseBuffer(vmbuffer);
8469 		FreeFakeRelcacheEntry(reln);
8470 	}
8471 	else if (BufferIsValid(vmbuffer))
8472 		UnlockReleaseBuffer(vmbuffer);
8473 }
8474 
8475 /*
8476  * Replay XLOG_HEAP2_FREEZE_PAGE records
8477  */
8478 static void
8479 heap_xlog_freeze_page(XLogReaderState *record)
8480 {
8481 	XLogRecPtr	lsn = record->EndRecPtr;
8482 	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
8483 	TransactionId cutoff_xid = xlrec->cutoff_xid;
8484 	Buffer		buffer;
8485 	int			ntup;
8486 
8487 	/*
8488 	 * In Hot Standby mode, ensure that there are no queries running which still
8489 	 * consider the frozen xids as running.
8490 	 */
8491 	if (InHotStandby)
8492 	{
8493 		RelFileNode rnode;
8494 		TransactionId latestRemovedXid = cutoff_xid;
8495 
8496 		TransactionIdRetreat(latestRemovedXid);
8497 
8498 		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
8499 		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
8500 	}
8501 
8502 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8503 	{
8504 		Page		page = BufferGetPage(buffer);
8505 		xl_heap_freeze_tuple *tuples;
8506 
8507 		tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
8508 
8509 		/* now execute freeze plan for each frozen tuple */
8510 		for (ntup = 0; ntup < xlrec->ntuples; ntup++)
8511 		{
8512 			xl_heap_freeze_tuple *xlrec_tp;
8513 			ItemId		lp;
8514 			HeapTupleHeader tuple;
8515 
8516 			xlrec_tp = &tuples[ntup];
8517 			lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8518 			tuple = (HeapTupleHeader) PageGetItem(page, lp);
8519 
8520 			heap_execute_freeze_tuple(tuple, xlrec_tp);
8521 		}
8522 
8523 		PageSetLSN(page, lsn);
8524 		MarkBufferDirty(buffer);
8525 	}
8526 	if (BufferIsValid(buffer))
8527 		UnlockReleaseBuffer(buffer);
8528 }
8529 
8530 /*
8531  * Given an "infobits" field from an XLog record, set the correct bits in the
8532  * given infomask and infomask2 for the tuple touched by the record.
8533  *
8534  * (This is the reverse of compute_infobits).
8535  */
8536 static void
8537 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8538 {
8539 	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8540 				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8541 	*infomask2 &= ~HEAP_KEYS_UPDATED;
8542 
8543 	if (infobits & XLHL_XMAX_IS_MULTI)
8544 		*infomask |= HEAP_XMAX_IS_MULTI;
8545 	if (infobits & XLHL_XMAX_LOCK_ONLY)
8546 		*infomask |= HEAP_XMAX_LOCK_ONLY;
8547 	if (infobits & XLHL_XMAX_EXCL_LOCK)
8548 		*infomask |= HEAP_XMAX_EXCL_LOCK;
8549 	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
8550 	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8551 		*infomask |= HEAP_XMAX_KEYSHR_LOCK;
8552 
8553 	if (infobits & XLHL_KEYS_UPDATED)
8554 		*infomask2 |= HEAP_KEYS_UPDATED;
8555 }
8556 
8557 static void
8558 heap_xlog_delete(XLogReaderState *record)
8559 {
8560 	XLogRecPtr	lsn = record->EndRecPtr;
8561 	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8562 	Buffer		buffer;
8563 	Page		page;
8564 	ItemId		lp = NULL;
8565 	HeapTupleHeader htup;
8566 	BlockNumber blkno;
8567 	RelFileNode target_node;
8568 	ItemPointerData target_tid;
8569 
8570 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8571 	ItemPointerSetBlockNumber(&target_tid, blkno);
8572 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8573 
8574 	/*
8575 	 * The visibility map may need to be fixed even if the heap page is
8576 	 * already up-to-date.
8577 	 */
8578 	if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8579 	{
8580 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8581 		Buffer		vmbuffer = InvalidBuffer;
8582 
8583 		visibilitymap_pin(reln, blkno, &vmbuffer);
8584 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8585 		ReleaseBuffer(vmbuffer);
8586 		FreeFakeRelcacheEntry(reln);
8587 	}
8588 
8589 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8590 	{
8591 		page = BufferGetPage(buffer);
8592 
8593 		if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8594 			lp = PageGetItemId(page, xlrec->offnum);
8595 
8596 		if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8597 			elog(PANIC, "invalid lp");
8598 
8599 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8600 
8601 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8602 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8603 		HeapTupleHeaderClearHotUpdated(htup);
8604 		fix_infomask_from_infobits(xlrec->infobits_set,
8605 								   &htup->t_infomask, &htup->t_infomask2);
8606 		if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8607 			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8608 		else
8609 			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8610 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8611 
8612 		/* Mark the page as a candidate for pruning */
8613 		PageSetPrunable(page, XLogRecGetXid(record));
8614 
8615 		if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8616 			PageClearAllVisible(page);
8617 
8618 		/* Make sure t_ctid is set correctly */
8619 		if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8620 			HeapTupleHeaderSetMovedPartitions(htup);
8621 		else
8622 			htup->t_ctid = target_tid;
8623 		PageSetLSN(page, lsn);
8624 		MarkBufferDirty(buffer);
8625 	}
8626 	if (BufferIsValid(buffer))
8627 		UnlockReleaseBuffer(buffer);
8628 }
8629 
8630 static void
8631 heap_xlog_insert(XLogReaderState *record)
8632 {
8633 	XLogRecPtr	lsn = record->EndRecPtr;
8634 	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8635 	Buffer		buffer;
8636 	Page		page;
8637 	union
8638 	{
8639 		HeapTupleHeaderData hdr;
8640 		char		data[MaxHeapTupleSize];
8641 	}			tbuf;
8642 	HeapTupleHeader htup;
8643 	xl_heap_header xlhdr;
8644 	uint32		newlen;
8645 	Size		freespace = 0;
8646 	RelFileNode target_node;
8647 	BlockNumber blkno;
8648 	ItemPointerData target_tid;
8649 	XLogRedoAction action;
8650 
8651 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8652 	ItemPointerSetBlockNumber(&target_tid, blkno);
8653 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8654 
8655 	/*
8656 	 * The visibility map may need to be fixed even if the heap page is
8657 	 * already up-to-date.
8658 	 */
8659 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8660 	{
8661 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8662 		Buffer		vmbuffer = InvalidBuffer;
8663 
8664 		visibilitymap_pin(reln, blkno, &vmbuffer);
8665 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8666 		ReleaseBuffer(vmbuffer);
8667 		FreeFakeRelcacheEntry(reln);
8668 	}
8669 
8670 	/*
8671 	 * If we inserted the first and only tuple on the page, re-initialize the
8672 	 * page from scratch.
8673 	 */
8674 	if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8675 	{
8676 		buffer = XLogInitBufferForRedo(record, 0);
8677 		page = BufferGetPage(buffer);
8678 		PageInit(page, BufferGetPageSize(buffer), 0);
8679 		action = BLK_NEEDS_REDO;
8680 	}
8681 	else
8682 		action = XLogReadBufferForRedo(record, 0, &buffer);
8683 	if (action == BLK_NEEDS_REDO)
8684 	{
8685 		Size		datalen;
8686 		char	   *data;
8687 
8688 		page = BufferGetPage(buffer);
8689 
8690 		if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8691 			elog(PANIC, "invalid max offset number");
8692 
8693 		data = XLogRecGetBlockData(record, 0, &datalen);
8694 
8695 		newlen = datalen - SizeOfHeapHeader;
8696 		Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8697 		memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8698 		data += SizeOfHeapHeader;
8699 
8700 		htup = &tbuf.hdr;
8701 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8702 		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8703 		memcpy((char *) htup + SizeofHeapTupleHeader,
8704 			   data,
8705 			   newlen);
8706 		newlen += SizeofHeapTupleHeader;
8707 		htup->t_infomask2 = xlhdr.t_infomask2;
8708 		htup->t_infomask = xlhdr.t_infomask;
8709 		htup->t_hoff = xlhdr.t_hoff;
8710 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8711 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
8712 		htup->t_ctid = target_tid;
8713 
8714 		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8715 						true, true) == InvalidOffsetNumber)
8716 			elog(PANIC, "failed to add tuple");
8717 
8718 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8719 
8720 		PageSetLSN(page, lsn);
8721 
8722 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8723 			PageClearAllVisible(page);
8724 
8725 		MarkBufferDirty(buffer);
8726 	}
8727 	if (BufferIsValid(buffer))
8728 		UnlockReleaseBuffer(buffer);
8729 
8730 	/*
8731 	 * If the page is running low on free space, update the FSM as well.
8732 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8733 	 * better than that without knowing the fill-factor for the table.
8734 	 *
8735 	 * XXX: Don't do this if the page was restored from full page image. We
8736 	 * don't bother to update the FSM in that case, it doesn't need to be
8737 	 * totally accurate anyway.
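	 *
	 * (With the default 8 kB BLCKSZ, "low" works out to roughly 1.6 kB of
	 * remaining free space.)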
8738 	 */
8739 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8740 		XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8741 }
8742 
8743 /*
8744  * Handles MULTI_INSERT record type.
8745  */
8746 static void
8747 heap_xlog_multi_insert(XLogReaderState *record)
8748 {
8749 	XLogRecPtr	lsn = record->EndRecPtr;
8750 	xl_heap_multi_insert *xlrec;
8751 	RelFileNode rnode;
8752 	BlockNumber blkno;
8753 	Buffer		buffer;
8754 	Page		page;
8755 	union
8756 	{
8757 		HeapTupleHeaderData hdr;
8758 		char		data[MaxHeapTupleSize];
8759 	}			tbuf;
8760 	HeapTupleHeader htup;
8761 	uint32		newlen;
8762 	Size		freespace = 0;
8763 	int			i;
8764 	bool		isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8765 	XLogRedoAction action;
8766 
8767 	/*
8768 	 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8769 	 * required.
8770 	 */
8771 	xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8772 
8773 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8774 
8775 	/*
8776 	 * The visibility map may need to be fixed even if the heap page is
8777 	 * already up-to-date.
8778 	 */
8779 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8780 	{
8781 		Relation	reln = CreateFakeRelcacheEntry(rnode);
8782 		Buffer		vmbuffer = InvalidBuffer;
8783 
8784 		visibilitymap_pin(reln, blkno, &vmbuffer);
8785 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8786 		ReleaseBuffer(vmbuffer);
8787 		FreeFakeRelcacheEntry(reln);
8788 	}
8789 
8790 	if (isinit)
8791 	{
8792 		buffer = XLogInitBufferForRedo(record, 0);
8793 		page = BufferGetPage(buffer);
8794 		PageInit(page, BufferGetPageSize(buffer), 0);
8795 		action = BLK_NEEDS_REDO;
8796 	}
8797 	else
8798 		action = XLogReadBufferForRedo(record, 0, &buffer);
8799 	if (action == BLK_NEEDS_REDO)
8800 	{
8801 		char	   *tupdata;
8802 		char	   *endptr;
8803 		Size		len;
8804 
8805 		/* Tuples are stored as block data */
8806 		tupdata = XLogRecGetBlockData(record, 0, &len);
8807 		endptr = tupdata + len;
8808 
8809 		page = (Page) BufferGetPage(buffer);
8810 
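		/*
		 * The block data is a back-to-back sequence of SHORTALIGN'd
		 * xl_multi_insert_tuple headers, each immediately followed by its
		 * tuple data; the loop walks it until endptr.
		 */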
8811 		for (i = 0; i < xlrec->ntuples; i++)
8812 		{
8813 			OffsetNumber offnum;
8814 			xl_multi_insert_tuple *xlhdr;
8815 
8816 			/*
8817 			 * If we're reinitializing the page, the tuples are stored in
8818 			 * order from FirstOffsetNumber. Otherwise there's an array of
8819 			 * offsets in the WAL record, and the tuples come after that.
8820 			 */
8821 			if (isinit)
8822 				offnum = FirstOffsetNumber + i;
8823 			else
8824 				offnum = xlrec->offsets[i];
8825 			if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8826 				elog(PANIC, "invalid max offset number");
8827 
8828 			xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8829 			tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8830 
8831 			newlen = xlhdr->datalen;
8832 			Assert(newlen <= MaxHeapTupleSize);
8833 			htup = &tbuf.hdr;
8834 			MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8835 			/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8836 			memcpy((char *) htup + SizeofHeapTupleHeader,
8837 				   (char *) tupdata,
8838 				   newlen);
8839 			tupdata += newlen;
8840 
8841 			newlen += SizeofHeapTupleHeader;
8842 			htup->t_infomask2 = xlhdr->t_infomask2;
8843 			htup->t_infomask = xlhdr->t_infomask;
8844 			htup->t_hoff = xlhdr->t_hoff;
8845 			HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8846 			HeapTupleHeaderSetCmin(htup, FirstCommandId);
8847 			ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8848 			ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8849 
8850 			offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8851 			if (offnum == InvalidOffsetNumber)
8852 				elog(PANIC, "failed to add tuple");
8853 		}
8854 		if (tupdata != endptr)
8855 			elog(PANIC, "total tuple length mismatch");
8856 
8857 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8858 
8859 		PageSetLSN(page, lsn);
8860 
8861 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8862 			PageClearAllVisible(page);
8863 
8864 		MarkBufferDirty(buffer);
8865 	}
8866 	if (BufferIsValid(buffer))
8867 		UnlockReleaseBuffer(buffer);
8868 
8869 	/*
8870 	 * If the page is running low on free space, update the FSM as well.
8871 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8872 	 * better than that without knowing the fill-factor for the table.
8873 	 *
8874 	 * XXX: Don't do this if the page was restored from full page image. We
8875 	 * don't bother to update the FSM in that case, it doesn't need to be
8876 	 * totally accurate anyway.
8877 	 */
8878 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8879 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8880 }
8881 
8882 /*
8883  * Handles UPDATE and HOT_UPDATE
8884  */
8885 static void
8886 heap_xlog_update(XLogReaderState *record, bool hot_update)
8887 {
8888 	XLogRecPtr	lsn = record->EndRecPtr;
8889 	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8890 	RelFileNode rnode;
8891 	BlockNumber oldblk;
8892 	BlockNumber newblk;
8893 	ItemPointerData newtid;
8894 	Buffer		obuffer,
8895 				nbuffer;
8896 	Page		page;
8897 	OffsetNumber offnum;
8898 	ItemId		lp = NULL;
8899 	HeapTupleData oldtup;
8900 	HeapTupleHeader htup;
8901 	uint16		prefixlen = 0,
8902 				suffixlen = 0;
8903 	char	   *newp;
8904 	union
8905 	{
8906 		HeapTupleHeaderData hdr;
8907 		char		data[MaxHeapTupleSize];
8908 	}			tbuf;
8909 	xl_heap_header xlhdr;
8910 	uint32		newlen;
8911 	Size		freespace = 0;
8912 	XLogRedoAction oldaction;
8913 	XLogRedoAction newaction;
8914 
8915 	/* initialize to keep the compiler quiet */
8916 	oldtup.t_data = NULL;
8917 	oldtup.t_len = 0;
8918 
8919 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8920 	if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8921 	{
8922 		/* HOT updates are never done across pages */
8923 		Assert(!hot_update);
8924 	}
8925 	else
8926 		oldblk = newblk;
8927 
8928 	ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8929 
8930 	/*
8931 	 * The visibility map may need to be fixed even if the heap page is
8932 	 * already up-to-date.
8933 	 */
8934 	if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8935 	{
8936 		Relation	reln = CreateFakeRelcacheEntry(rnode);
8937 		Buffer		vmbuffer = InvalidBuffer;
8938 
8939 		visibilitymap_pin(reln, oldblk, &vmbuffer);
8940 		visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8941 		ReleaseBuffer(vmbuffer);
8942 		FreeFakeRelcacheEntry(reln);
8943 	}
8944 
8945 	/*
8946 	 * In normal operation, it is important to lock the two pages in
8947 	 * page-number order, to avoid possible deadlocks against other update
8948 	 * operations going the other way.  However, during WAL replay there can
8949 	 * be no other update happening, so we don't need to worry about that. But
8950 	 * we *do* need to worry that we don't expose an inconsistent state to Hot
8951 	 * Standby queries --- so the original page can't be unlocked before we've
8952 	 * added the new tuple to the new page.
8953 	 */
8954 
8955 	/* Deal with old tuple version */
8956 	oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8957 									  &obuffer);
8958 	if (oldaction == BLK_NEEDS_REDO)
8959 	{
8960 		page = BufferGetPage(obuffer);
8961 		offnum = xlrec->old_offnum;
8962 		if (PageGetMaxOffsetNumber(page) >= offnum)
8963 			lp = PageGetItemId(page, offnum);
8964 
8965 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8966 			elog(PANIC, "invalid lp");
8967 
8968 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8969 
8970 		oldtup.t_data = htup;
8971 		oldtup.t_len = ItemIdGetLength(lp);
8972 
8973 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8974 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8975 		if (hot_update)
8976 			HeapTupleHeaderSetHotUpdated(htup);
8977 		else
8978 			HeapTupleHeaderClearHotUpdated(htup);
8979 		fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8980 								   &htup->t_infomask2);
8981 		HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8982 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8983 		/* Set forward chain link in t_ctid */
8984 		htup->t_ctid = newtid;
8985 
8986 		/* Mark the page as a candidate for pruning */
8987 		PageSetPrunable(page, XLogRecGetXid(record));
8988 
8989 		if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8990 			PageClearAllVisible(page);
8991 
8992 		PageSetLSN(page, lsn);
8993 		MarkBufferDirty(obuffer);
8994 	}
8995 
8996 	/*
8997 	 * Read the page the new tuple goes into, if different from old.
8998 	 */
8999 	if (oldblk == newblk)
9000 	{
9001 		nbuffer = obuffer;
9002 		newaction = oldaction;
9003 	}
9004 	else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9005 	{
9006 		nbuffer = XLogInitBufferForRedo(record, 0);
9007 		page = (Page) BufferGetPage(nbuffer);
9008 		PageInit(page, BufferGetPageSize(nbuffer), 0);
9009 		newaction = BLK_NEEDS_REDO;
9010 	}
9011 	else
9012 		newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
9013 
9014 	/*
9015 	 * The visibility map may need to be fixed even if the heap page is
9016 	 * already up-to-date.
9017 	 */
9018 	if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9019 	{
9020 		Relation	reln = CreateFakeRelcacheEntry(rnode);
9021 		Buffer		vmbuffer = InvalidBuffer;
9022 
9023 		visibilitymap_pin(reln, newblk, &vmbuffer);
9024 		visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9025 		ReleaseBuffer(vmbuffer);
9026 		FreeFakeRelcacheEntry(reln);
9027 	}
9028 
9029 	/* Deal with new tuple */
9030 	if (newaction == BLK_NEEDS_REDO)
9031 	{
9032 		char	   *recdata;
9033 		char	   *recdata_end;
9034 		Size		datalen;
9035 		Size		tuplen;
9036 
9037 		recdata = XLogRecGetBlockData(record, 0, &datalen);
9038 		recdata_end = recdata + datalen;
9039 
9040 		page = BufferGetPage(nbuffer);
9041 
9042 		offnum = xlrec->new_offnum;
9043 		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9044 			elog(PANIC, "invalid max offset number");
9045 
9046 		if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
9047 		{
9048 			Assert(newblk == oldblk);
9049 			memcpy(&prefixlen, recdata, sizeof(uint16));
9050 			recdata += sizeof(uint16);
9051 		}
9052 		if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
9053 		{
9054 			Assert(newblk == oldblk);
9055 			memcpy(&suffixlen, recdata, sizeof(uint16));
9056 			recdata += sizeof(uint16);
9057 		}
9058 
9059 		memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
9060 		recdata += SizeOfHeapHeader;
9061 
9062 		tuplen = recdata_end - recdata;
9063 		Assert(tuplen <= MaxHeapTupleSize);
9064 
9065 		htup = &tbuf.hdr;
9066 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9067 
9068 		/*
9069 		 * Reconstruct the new tuple using the prefix and/or suffix from the
9070 		 * old tuple, and the data stored in the WAL record.
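		 *
		 * Layout sketch: the record supplies the null bitmap and padding
		 * (xlhdr.t_hoff - SizeofHeapTupleHeader bytes) and the middle portion
		 * of the data; the prefix and suffix, when present, are copied from
		 * the old tuple on the same page.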
9071 		 */
9072 		newp = (char *) htup + SizeofHeapTupleHeader;
9073 		if (prefixlen > 0)
9074 		{
9075 			int			len;
9076 
9077 			/* copy bitmap [+ padding] [+ oid] from WAL record */
9078 			len = xlhdr.t_hoff - SizeofHeapTupleHeader;
9079 			memcpy(newp, recdata, len);
9080 			recdata += len;
9081 			newp += len;
9082 
9083 			/* copy prefix from old tuple */
9084 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
9085 			newp += prefixlen;
9086 
9087 			/* copy new tuple data from WAL record */
9088 			len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
9089 			memcpy(newp, recdata, len);
9090 			recdata += len;
9091 			newp += len;
9092 		}
9093 		else
9094 		{
9095 			/*
9096 			 * copy bitmap [+ padding] [+ oid] + data from record, all in one
9097 			 * go
9098 			 */
9099 			memcpy(newp, recdata, tuplen);
9100 			recdata += tuplen;
9101 			newp += tuplen;
9102 		}
9103 		Assert(recdata == recdata_end);
9104 
9105 		/* copy suffix from old tuple */
9106 		if (suffixlen > 0)
9107 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
9108 
9109 		newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
9110 		htup->t_infomask2 = xlhdr.t_infomask2;
9111 		htup->t_infomask = xlhdr.t_infomask;
9112 		htup->t_hoff = xlhdr.t_hoff;
9113 
9114 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9115 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
9116 		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
9117 		/* Make sure there is no forward chain link in t_ctid */
9118 		htup->t_ctid = newtid;
9119 
9120 		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9121 		if (offnum == InvalidOffsetNumber)
9122 			elog(PANIC, "failed to add tuple");
9123 
9124 		if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9125 			PageClearAllVisible(page);
9126 
9127 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9128 
9129 		PageSetLSN(page, lsn);
9130 		MarkBufferDirty(nbuffer);
9131 	}
9132 
9133 	if (BufferIsValid(nbuffer) && nbuffer != obuffer)
9134 		UnlockReleaseBuffer(nbuffer);
9135 	if (BufferIsValid(obuffer))
9136 		UnlockReleaseBuffer(obuffer);
9137 
9138 	/*
9139 	 * If the new page is running low on free space, update the FSM as well.
9140 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9141 	 * better than that without knowing the fill-factor for the table.
9142 	 *
9143 	 * However, don't update the FSM on HOT updates, because after crash
9144 	 * recovery, either the old or the new tuple will certainly be dead and
9145 	 * prunable. After pruning, the page will have roughly as much free space
9146 	 * as it did before the update, assuming the new tuple is about the same
9147 	 * size as the old one.
9148 	 *
9149 	 * XXX: Don't do this if the page was restored from full page image. We
9150 	 * don't bother to update the FSM in that case, it doesn't need to be
9151 	 * totally accurate anyway.
9152 	 */
9153 	if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
9154 		XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
9155 }
9156 
9157 static void
9158 heap_xlog_confirm(XLogReaderState *record)
9159 {
9160 	XLogRecPtr	lsn = record->EndRecPtr;
9161 	xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
9162 	Buffer		buffer;
9163 	Page		page;
9164 	OffsetNumber offnum;
9165 	ItemId		lp = NULL;
9166 	HeapTupleHeader htup;
9167 
9168 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9169 	{
9170 		page = BufferGetPage(buffer);
9171 
9172 		offnum = xlrec->offnum;
9173 		if (PageGetMaxOffsetNumber(page) >= offnum)
9174 			lp = PageGetItemId(page, offnum);
9175 
9176 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9177 			elog(PANIC, "invalid lp");
9178 
9179 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9180 
9181 		/*
9182 		 * Confirm tuple as actually inserted
9183 		 */
9184 		ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
9185 
9186 		PageSetLSN(page, lsn);
9187 		MarkBufferDirty(buffer);
9188 	}
9189 	if (BufferIsValid(buffer))
9190 		UnlockReleaseBuffer(buffer);
9191 }
9192 
9193 static void
9194 heap_xlog_lock(XLogReaderState *record)
9195 {
9196 	XLogRecPtr	lsn = record->EndRecPtr;
9197 	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
9198 	Buffer		buffer;
9199 	Page		page;
9200 	OffsetNumber offnum;
9201 	ItemId		lp = NULL;
9202 	HeapTupleHeader htup;
9203 
9204 	/*
9205 	 * The visibility map may need to be fixed even if the heap page is
9206 	 * already up-to-date.
9207 	 */
9208 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9209 	{
9210 		RelFileNode rnode;
9211 		Buffer		vmbuffer = InvalidBuffer;
9212 		BlockNumber block;
9213 		Relation	reln;
9214 
9215 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9216 		reln = CreateFakeRelcacheEntry(rnode);
9217 
9218 		visibilitymap_pin(reln, block, &vmbuffer);
9219 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9220 
9221 		ReleaseBuffer(vmbuffer);
9222 		FreeFakeRelcacheEntry(reln);
9223 	}
9224 
9225 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9226 	{
9227 		page = (Page) BufferGetPage(buffer);
9228 
9229 		offnum = xlrec->offnum;
9230 		if (PageGetMaxOffsetNumber(page) >= offnum)
9231 			lp = PageGetItemId(page, offnum);
9232 
9233 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9234 			elog(PANIC, "invalid lp");
9235 
9236 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9237 
9238 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9239 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9240 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9241 								   &htup->t_infomask2);
9242 
9243 		/*
9244 		 * Clear relevant update flags, but only if the modified infomask says
9245 		 * there's no update.
9246 		 */
9247 		if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
9248 		{
9249 			HeapTupleHeaderClearHotUpdated(htup);
9250 			/* Make sure there is no forward chain link in t_ctid */
9251 			ItemPointerSet(&htup->t_ctid,
9252 						   BufferGetBlockNumber(buffer),
9253 						   offnum);
9254 		}
9255 		HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
9256 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9257 		PageSetLSN(page, lsn);
9258 		MarkBufferDirty(buffer);
9259 	}
9260 	if (BufferIsValid(buffer))
9261 		UnlockReleaseBuffer(buffer);
9262 }
9263 
9264 static void
9265 heap_xlog_lock_updated(XLogReaderState *record)
9266 {
9267 	XLogRecPtr	lsn = record->EndRecPtr;
9268 	xl_heap_lock_updated *xlrec;
9269 	Buffer		buffer;
9270 	Page		page;
9271 	OffsetNumber offnum;
9272 	ItemId		lp = NULL;
9273 	HeapTupleHeader htup;
9274 
9275 	xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
9276 
9277 	/*
9278 	 * The visibility map may need to be fixed even if the heap page is
9279 	 * already up-to-date.
9280 	 */
9281 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9282 	{
9283 		RelFileNode rnode;
9284 		Buffer		vmbuffer = InvalidBuffer;
9285 		BlockNumber block;
9286 		Relation	reln;
9287 
9288 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9289 		reln = CreateFakeRelcacheEntry(rnode);
9290 
9291 		visibilitymap_pin(reln, block, &vmbuffer);
9292 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9293 
9294 		ReleaseBuffer(vmbuffer);
9295 		FreeFakeRelcacheEntry(reln);
9296 	}
9297 
9298 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9299 	{
9300 		page = BufferGetPage(buffer);
9301 
9302 		offnum = xlrec->offnum;
9303 		if (PageGetMaxOffsetNumber(page) >= offnum)
9304 			lp = PageGetItemId(page, offnum);
9305 
9306 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9307 			elog(PANIC, "invalid lp");
9308 
9309 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9310 
9311 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9312 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9313 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9314 								   &htup->t_infomask2);
9315 		HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9316 
9317 		PageSetLSN(page, lsn);
9318 		MarkBufferDirty(buffer);
9319 	}
9320 	if (BufferIsValid(buffer))
9321 		UnlockReleaseBuffer(buffer);
9322 }
9323 
9324 static void
9325 heap_xlog_inplace(XLogReaderState *record)
9326 {
9327 	XLogRecPtr	lsn = record->EndRecPtr;
9328 	xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9329 	Buffer		buffer;
9330 	Page		page;
9331 	OffsetNumber offnum;
9332 	ItemId		lp = NULL;
9333 	HeapTupleHeader htup;
9334 	uint32		oldlen;
9335 	Size		newlen;
9336 
9337 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9338 	{
9339 		char	   *newtup = XLogRecGetBlockData(record, 0, &newlen);
9340 
9341 		page = BufferGetPage(buffer);
9342 
9343 		offnum = xlrec->offnum;
9344 		if (PageGetMaxOffsetNumber(page) >= offnum)
9345 			lp = PageGetItemId(page, offnum);
9346 
9347 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9348 			elog(PANIC, "invalid lp");
9349 
9350 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9351 
9352 		oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9353 		if (oldlen != newlen)
9354 			elog(PANIC, "wrong tuple length");
9355 
9356 		memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9357 
9358 		PageSetLSN(page, lsn);
9359 		MarkBufferDirty(buffer);
9360 	}
9361 	if (BufferIsValid(buffer))
9362 		UnlockReleaseBuffer(buffer);
9363 }

void
heap_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/*
	 * These operations don't overwrite MVCC data, so no conflict processing
	 * is required.  The ones in the heap2 rmgr do.
	 */

	switch (info & XLOG_HEAP_OPMASK)
	{
		case XLOG_HEAP_INSERT:
			heap_xlog_insert(record);
			break;
		case XLOG_HEAP_DELETE:
			heap_xlog_delete(record);
			break;
		case XLOG_HEAP_UPDATE:
			heap_xlog_update(record, false);
			break;
		case XLOG_HEAP_TRUNCATE:

			/*
			 * TRUNCATE is a no-op because the actions are already logged as
			 * SMGR WAL records.  The TRUNCATE WAL record only exists for
			 * logical decoding.
			 */
			break;
		case XLOG_HEAP_HOT_UPDATE:
			heap_xlog_update(record, true);
			break;
		case XLOG_HEAP_CONFIRM:
			heap_xlog_confirm(record);
			break;
		case XLOG_HEAP_LOCK:
			heap_xlog_lock(record);
			break;
		case XLOG_HEAP_INPLACE:
			heap_xlog_inplace(record);
			break;
		default:
			elog(PANIC, "heap_redo: unknown op code %u", info);
	}
}

void
heap2_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info & XLOG_HEAP_OPMASK)
	{
		case XLOG_HEAP2_CLEAN:
			heap_xlog_clean(record);
			break;
		case XLOG_HEAP2_FREEZE_PAGE:
			heap_xlog_freeze_page(record);
			break;
		case XLOG_HEAP2_CLEANUP_INFO:
			heap_xlog_cleanup_info(record);
			break;
		case XLOG_HEAP2_VISIBLE:
			heap_xlog_visible(record);
			break;
		case XLOG_HEAP2_MULTI_INSERT:
			heap_xlog_multi_insert(record);
			break;
		case XLOG_HEAP2_LOCK_UPDATED:
			heap_xlog_lock_updated(record);
			break;
		case XLOG_HEAP2_NEW_CID:

			/*
			 * Nothing to do on a real replay; this record type is only used
			 * during logical decoding.
			 */
			break;
		case XLOG_HEAP2_REWRITE:
			heap_xlog_logical_rewrite(record);
			break;
		default:
			elog(PANIC, "heap2_redo: unknown op code %u", info);
	}
}
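
#ifdef HEAPAM_REDO_EXAMPLE
/*
 * Illustrative sketch only (not part of the heap AM): the HEAPAM_REDO_EXAMPLE
 * guard and this helper are hypothetical, added here purely to show how a WAL
 * record reaches heap_redo() or heap2_redo().  In the real server the startup
 * process looks the record's rmgr id up in the resource-manager table and
 * calls the registered rm_redo callback; this sketch assumes the caller has
 * already read a record whose rmgr id is RM_HEAP_ID or RM_HEAP2_ID.
 */
static void
example_dispatch_heap_record(XLogReaderState *record)
{
	RmgrId		rmid = XLogRecGetRmid(record);

	if (rmid == RM_HEAP_ID)
		heap_redo(record);
	else if (rmid == RM_HEAP2_ID)
		heap2_redo(record);
	else
		elog(ERROR, "unexpected rmgr id %u in heap redo example", rmid);
}
#endif							/* HEAPAM_REDO_EXAMPLE */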

/*
 *	heap_sync		- sync a heap, for use when no WAL has been written
 *
 * This forces the heap contents (including the TOAST heap, if any) down to
 * disk.  If we skipped using WAL, and WAL is otherwise needed, we must force
 * the relation down to disk before it's safe to commit the transaction.
 * This requires writing out any dirty buffers and then doing a forced fsync.
 *
 * Indexes are not touched.  (Currently, index operations associated with
 * the commands that use this are WAL-logged and so do not need fsync.
 * That behavior might change someday, but in any case it's likely that
 * any fsync decisions required would be per-index and hence not appropriate
 * to be done here.)
 */
void
heap_sync(Relation rel)
{
	/* non-WAL-logged tables never need fsync */
	if (!RelationNeedsWAL(rel))
		return;

	/* main heap */
	FlushRelationBuffers(rel);
	/* FlushRelationBuffers will have opened rd_smgr */
	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);

	/* FSM is not critical, don't bother syncing it */

	/* toast heap, if any */
	if (OidIsValid(rel->rd_rel->reltoastrelid))
	{
		Relation	toastrel;

		toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
		FlushRelationBuffers(toastrel);
		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
		heap_close(toastrel, AccessShareLock);
	}
}
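
#ifdef HEAPAM_SYNC_EXAMPLE
/*
 * Illustrative sketch only: the HEAPAM_SYNC_EXAMPLE guard and this helper
 * are hypothetical, showing the caller-side pattern heap_sync() exists for.
 * It assumes the caller has verified that the relation was created (or
 * truncated) in the current transaction, which is what makes skipping WAL
 * legitimate in the first place; bulk-loading code paths use this pattern.
 */
static void
example_bulk_insert_then_sync(Relation rel, HeapTuple *tuples, int ntuples)
{
	int			options = 0;
	int			i;

	/* Skip WAL only when it is not otherwise required (wal_level=minimal) */
	if (!XLogIsNeeded())
		options |= HEAP_INSERT_SKIP_WAL;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], GetCurrentCommandId(true),
					options, NULL);

	/*
	 * If WAL was skipped, the data exists only in shared buffers and the
	 * relation file; force it to disk before the transaction commits.
	 */
	if (options & HEAP_INSERT_SKIP_WAL)
		heap_sync(rel);
}
#endif							/* HEAPAM_SYNC_EXAMPLE */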

/*
 * Mask a heap page before performing consistency checks on it.
 */
void
heap_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	OffsetNumber off;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
	{
		ItemId		iid = PageGetItemId(page, off);
		char	   *page_item;

		page_item = (char *) (page + ItemIdGetOffset(iid));

		if (ItemIdIsNormal(iid))
		{
			HeapTupleHeader page_htup = (HeapTupleHeader) page_item;

			/*
			 * If xmin of a tuple is not yet frozen, we should ignore
			 * differences in hint bits, since they can be set without
			 * emitting WAL.
			 */
			if (!HeapTupleHeaderXminFrozen(page_htup))
				page_htup->t_infomask &= ~HEAP_XACT_MASK;
			else
			{
				/* We still need to mask the xmax hint bits. */
				page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
				page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
			}

			/*
			 * During replay, we set the command id to FirstCommandId.
			 * Hence, mask it.  See heap_xlog_insert() for details.
			 */
			page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;

			/*
			 * For a speculative tuple, heap_insert() does not set ctid in
			 * the caller-passed heap tuple itself, leaving the ctid field to
			 * contain a speculative token value - a per-backend
			 * monotonically increasing identifier.  Besides, it does not
			 * WAL-log ctid under any circumstances.
			 *
			 * During redo, heap_xlog_insert() sets t_ctid to the current
			 * block number and self offset number.  It doesn't care about
			 * any speculative insertions on the master.  Hence, we set
			 * t_ctid to the current block number and self offset number to
			 * ignore any inconsistency.
			 */
			if (HeapTupleHeaderIsSpeculative(page_htup))
				ItemPointerSet(&page_htup->t_ctid, blkno, off);

			/*
			 * NB: Not ignoring ctid changes due to the tuple having moved
			 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
			 * important information that needs to be in-sync between primary
			 * and standby, and thus is WAL-logged.
			 */
		}

		/*
		 * Ignore any padding bytes after the tuple, when the length of the
		 * item is not MAXALIGNed.
		 */
		if (ItemIdHasStorage(iid))
		{
			int			len = ItemIdGetLength(iid);
			int			padlen = MAXALIGN(len) - len;

			if (padlen > 0)
				memset(page_item + len, MASK_MARKER, padlen);
		}
	}
}
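
#ifdef HEAPAM_MASK_EXAMPLE
/*
 * Illustrative sketch only: the HEAPAM_MASK_EXAMPLE guard and this helper
 * are hypothetical.  They show the kind of comparison heap_mask() is meant
 * to support: mask a full-page image taken from a WAL record and the page
 * produced by redo, then compare the two, so that fields allowed to differ
 * (hint bits, unused space, command ids, ...) do not raise false alarms.
 * The real machinery is the wal_consistency_checking code, which reaches
 * heap_mask() through the rmgr's rm_mask callback.
 */
static bool
example_pages_match_after_masking(const char *primary_image,
								  const char *replayed_page,
								  BlockNumber blkno)
{
	char	   *masked_primary = palloc(BLCKSZ);
	char	   *masked_replay = palloc(BLCKSZ);
	bool		match;

	memcpy(masked_primary, primary_image, BLCKSZ);
	memcpy(masked_replay, replayed_page, BLCKSZ);

	heap_mask(masked_primary, blkno);
	heap_mask(masked_replay, blkno);

	match = (memcmp(masked_primary, masked_replay, BLCKSZ) == 0);

	pfree(masked_primary);
	pfree(masked_replay);

	return match;
}
#endif							/* HEAPAM_MASK_EXAMPLE */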