/*-------------------------------------------------------------------------
 *
 * heapam.c
 *    heap access method code
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/heap/heapam.c
 *
 *
 * INTERFACE ROUTINES
 *      heap_beginscan    - begin relation scan
 *      heap_rescan       - restart a relation scan
 *      heap_endscan      - end relation scan
 *      heap_getnext      - retrieve next tuple in scan
 *      heap_fetch        - retrieve tuple with given tid
 *      heap_insert       - insert tuple into a relation
 *      heap_multi_insert - insert multiple tuples into a relation
 *      heap_delete       - delete a tuple from a relation
 *      heap_update       - replace a tuple in a relation with another tuple
 *
 * NOTES
 *    This file contains the heap_ routines which implement
 *    the POSTGRES heap access method used for all POSTGRES
 *    relations.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/sysattr.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "storage/standby.h"
#include "utils/datum.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/spccache.h"


static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
                                     TransactionId xid, CommandId cid, int options);
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
                                  Buffer newbuf, HeapTuple oldtup,
                                  HeapTuple newtup, HeapTuple old_key_tuple,
                                  bool all_visible_cleared, bool new_all_visible_cleared);
static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
                                               Bitmapset *interesting_cols,
                                               HeapTuple oldtup, HeapTuple newtup);
static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
                                 LockTupleMode mode, LockWaitPolicy wait_policy,
                                 bool *have_tuple_lock);
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
                                      uint16 old_infomask2, TransactionId add_to_xmax,
                                      LockTupleMode mode, bool is_update,
                                      TransactionId *result_xmax, uint16 *result_infomask,
                                      uint16 *result_infomask2);
static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
                                         ItemPointer ctid, TransactionId xid,
                                         LockTupleMode mode);
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
                                   uint16 *new_infomask2);
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
                                             uint16 t_infomask);
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
                                    LockTupleMode lockmode, bool *current_is_member);
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
                            Relation rel, ItemPointer ctid, XLTW_Oper oper,
                            int *remaining);
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
                                       uint16 infomask, Relation rel, int *remaining);
static void index_delete_sort(TM_IndexDeleteOp *delstate);
static int  bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
                                        bool *copy);


/*
 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
 * update them).  This table (and the macros below) helps us determine the
 * heavyweight lock mode and MultiXactStatus values to use for any particular
 * tuple lock strength.
 *
 * Don't look at lockstatus/updstatus directly!  Use get_mxact_status_for_lock
 * instead.
 */
static const struct
{
    LOCKMODE    hwlock;
    int         lockstatus;
    int         updstatus;
} tupleLockExtraInfo[MaxLockTupleMode + 1] =
{
    {                           /* LockTupleKeyShare */
        AccessShareLock,
        MultiXactStatusForKeyShare,
        -1                      /* KeyShare does not allow updating tuples */
    },
    {                           /* LockTupleShare */
        RowShareLock,
        MultiXactStatusForShare,
        -1                      /* Share does not allow updating tuples */
    },
    {                           /* LockTupleNoKeyExclusive */
        ExclusiveLock,
        MultiXactStatusForNoKeyUpdate,
        MultiXactStatusNoKeyUpdate
    },
    {                           /* LockTupleExclusive */
        AccessExclusiveLock,
        MultiXactStatusForUpdate,
        MultiXactStatusUpdate
    }
};

/* Get the LOCKMODE for a given MultiXactStatus */
#define LOCKMODE_from_mxstatus(status) \
    (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)

/*
 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
 * This is more readable than having every caller translate it to lock.h's
 * LOCKMODE.
 */
#define LockTupleTuplock(rel, tup, mode) \
    LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
#define UnlockTupleTuplock(rel, tup, mode) \
    UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
#define ConditionalLockTupleTuplock(rel, tup, mode) \
    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
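
/*
 * For example (illustrative only, not a call site in this file): taking a
 * SHARE-strength tuple lock via these macros,
 *
 *     LockTupleTuplock(rel, &tup->t_self, LockTupleShare);
 *
 * resolves through tupleLockExtraInfo to
 * LockTuple(rel, &tup->t_self, RowShareLock).
 */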

#ifdef USE_PREFETCH
/*
 * heap_index_delete_tuples and index_delete_prefetch_buffer use this
 * structure to coordinate prefetching activity
 */
typedef struct
{
    BlockNumber cur_hblkno;
    int         next_item;
    int         ndeltids;
    TM_IndexDelete *deltids;
} IndexDeletePrefetchState;
#endif

/* heap_index_delete_tuples bottom-up index deletion costing constants */
#define BOTTOMUP_MAX_NBLOCKS        6
#define BOTTOMUP_TOLERANCE_NBLOCKS  3

/*
 * heap_index_delete_tuples uses this when determining which heap blocks it
 * must visit to help its bottom-up index deletion caller
 */
typedef struct IndexDeleteCounts
{
    int16       npromisingtids; /* Number of "promising" TIDs in group */
    int16       ntids;          /* Number of TIDs in group */
    int16       ifirsttid;      /* Offset to group's first deltid */
} IndexDeleteCounts;

/*
 * This table maps each MultiXactStatus value to the tuple lock strength it
 * represents.
 */
static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
{
    LockTupleKeyShare,          /* ForKeyShare */
    LockTupleShare,             /* ForShare */
    LockTupleNoKeyExclusive,    /* ForNoKeyUpdate */
    LockTupleExclusive,         /* ForUpdate */
    LockTupleNoKeyExclusive,    /* NoKeyUpdate */
    LockTupleExclusive          /* Update */
};

/* Get the LockTupleMode for a given MultiXactStatus */
#define TUPLOCK_from_mxstatus(status) \
    (MultiXactStatusLock[(status)])
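
/*
 * For instance (illustrative only): TUPLOCK_from_mxstatus(MultiXactStatusForShare)
 * yields LockTupleShare, and LOCKMODE_from_mxstatus(MultiXactStatusForShare)
 * therefore resolves to RowShareLock via tupleLockExtraInfo above.
 */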

/* ----------------------------------------------------------------
 *                       heap support routines
 * ----------------------------------------------------------------
 */

/* ----------------
 *      initscan - scan code common to heap_beginscan and heap_rescan
 * ----------------
 */
static void
initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
{
    ParallelBlockTableScanDesc bpscan = NULL;
    bool        allow_strat;
    bool        allow_sync;

    /*
     * Determine the number of blocks we have to scan.
     *
     * It is sufficient to do this once at scan start, since any tuples added
     * while the scan is in progress will be invisible to my snapshot anyway.
     * (That is not true when using a non-MVCC snapshot.  However, we couldn't
     * guarantee to return tuples added after scan start anyway, since they
     * might go into pages we already scanned.  To guarantee consistent
     * results for a non-MVCC snapshot, the caller must hold some higher-level
     * lock that ensures the interesting tuple(s) won't change.)
     */
    if (scan->rs_base.rs_parallel != NULL)
    {
        bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
        scan->rs_nblocks = bpscan->phs_nblocks;
    }
    else
        scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);

    /*
     * If the table is large relative to NBuffers, use a bulk-read access
     * strategy and enable synchronized scanning (see syncscan.c).  Although
     * the thresholds for these features could be different, we make them the
     * same so that there are only two behaviors to tune rather than four.
     * (However, some callers need to be able to disable one or both of these
     * behaviors, independently of the size of the table; also there is a GUC
     * variable that can disable synchronized scanning.)
     *
     * Note that table_block_parallelscan_initialize has a very similar test;
     * if you change this, consider changing that one, too.
     */
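    /*
     * Illustrative arithmetic (assumes the default 8kB block size): with
     * shared_buffers = 16384 buffers (128MB), NBuffers / 4 = 4096, so any
     * table larger than 4096 blocks (32MB) takes the branch below.
     */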
    if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
        scan->rs_nblocks > NBuffers / 4)
    {
        allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
        allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
    }
    else
        allow_strat = allow_sync = false;

    if (allow_strat)
    {
        /* During a rescan, keep the previous strategy object. */
        if (scan->rs_strategy == NULL)
            scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
    }
    else
    {
        if (scan->rs_strategy != NULL)
            FreeAccessStrategy(scan->rs_strategy);
        scan->rs_strategy = NULL;
    }

    if (scan->rs_base.rs_parallel != NULL)
    {
        /* For parallel scan, believe whatever ParallelTableScanDesc says. */
        if (scan->rs_base.rs_parallel->phs_syncscan)
            scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
        else
            scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
    }
    else if (keep_startblock)
    {
        /*
         * When rescanning, we want to keep the previous startblock setting,
         * so that rewinding a cursor doesn't generate surprising results.
         * Reset the active syncscan setting, though.
         */
        if (allow_sync && synchronize_seqscans)
            scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
        else
            scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
    }
    else if (allow_sync && synchronize_seqscans)
    {
        scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
        scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
    }
    else
    {
        scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
        scan->rs_startblock = 0;
    }

    scan->rs_numblocks = InvalidBlockNumber;
    scan->rs_inited = false;
    scan->rs_ctup.t_data = NULL;
    ItemPointerSetInvalid(&scan->rs_ctup.t_self);
    scan->rs_cbuf = InvalidBuffer;
    scan->rs_cblock = InvalidBlockNumber;

    /* page-at-a-time fields are always invalid when not rs_inited */

    /*
     * copy the scan key, if appropriate
     */
    if (key != NULL)
        memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));

    /*
     * Currently, we only have a stats counter for sequential heap scans (but
     * e.g. for bitmap scans the underlying bitmap index scans will be counted,
     * and for sample scans we update stats for tuple fetches).
     */
    if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
        pgstat_count_heap_scan(scan->rs_base.rs_rd);
}

/*
 * heap_setscanlimits - restrict range of a heapscan
 *
 * startBlk is the page to start at
 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
 */
void
heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;

    Assert(!scan->rs_inited);   /* else too late to change */
    /* else rs_startblock is significant */
    Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));

    /* Check startBlk is valid (but allow case of zero blocks...) */
    Assert(startBlk == 0 || startBlk < scan->rs_nblocks);

    scan->rs_startblock = startBlk;
    scan->rs_numblocks = numBlks;
}
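
/*
 * Example (illustrative sketch, not part of this file): restricting a fresh
 * scan to blocks 10..14.  "rel" and "snapshot" are assumed to be supplied by
 * the caller; note the flags omit SO_ALLOW_SYNC, per the assert above.
 *
 *     TableScanDesc sscan = heap_beginscan(rel, snapshot, 0, NULL,
 *                                          NULL, SO_TYPE_SEQSCAN);
 *     heap_setscanlimits(sscan, 10, 5);
 */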

/*
 * heapgetpage - subroutine for heapgettup()
 *
 * This routine reads and pins the specified page of the relation.
 * In page-at-a-time mode it performs additional work, namely determining
 * which tuples on the page are visible.
 */
void
heapgetpage(TableScanDesc sscan, BlockNumber page)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;
    Buffer      buffer;
    Snapshot    snapshot;
    Page        dp;
    int         lines;
    int         ntup;
    OffsetNumber lineoff;
    ItemId      lpp;
    bool        all_visible;

    Assert(page < scan->rs_nblocks);

    /* release previous scan buffer, if any */
    if (BufferIsValid(scan->rs_cbuf))
    {
        ReleaseBuffer(scan->rs_cbuf);
        scan->rs_cbuf = InvalidBuffer;
    }

    /*
     * Be sure to check for interrupts at least once per page.  Checks at
     * higher code levels won't be able to stop a seqscan that encounters many
     * pages' worth of consecutive dead tuples.
     */
    CHECK_FOR_INTERRUPTS();

    /* read page using selected strategy */
    scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
                                       RBM_NORMAL, scan->rs_strategy);
    scan->rs_cblock = page;

    if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
        return;

    buffer = scan->rs_cbuf;
    snapshot = scan->rs_base.rs_snapshot;

    /*
     * Prune and repair fragmentation for the whole page, if possible.
     */
    heap_page_prune_opt(scan->rs_base.rs_rd, buffer);

    /*
     * We must hold share lock on the buffer content while examining tuple
     * visibility.  Afterwards, however, the tuples we have found to be
     * visible are guaranteed good as long as we hold the buffer pin.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    dp = BufferGetPage(buffer);
    TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
    lines = PageGetMaxOffsetNumber(dp);
    ntup = 0;
    /*
     * If the all-visible flag indicates that all tuples on the page are
     * visible to everyone, we can skip the per-tuple visibility tests.
     *
     * Note: In hot standby, a tuple that's already visible to all
     * transactions on the primary might still be invisible to a read-only
     * transaction in the standby.  We partly handle this problem by tracking
     * the minimum xmin of visible tuples as the cut-off XID while marking a
     * page all-visible on the primary, and WAL-logging that along with the
     * visibility map SET operation.  In hot standby, we wait for (or abort)
     * all transactions that potentially may not see one or more tuples on
     * the page.  That's how index-only scans work fine in hot standby.  A
     * crucial difference between index-only scans and heap scans is that the
     * index-only scan relies completely on the visibility map, whereas a heap
     * scan looks at the page-level PD_ALL_VISIBLE flag.  We are not sure if
     * the page-level flag can be trusted in the same way, because it might
     * get propagated somehow without being explicitly WAL-logged, e.g. via a
     * full page write.  Until we can prove that beyond doubt, let's check
     * each tuple for visibility the hard way.
     */
    all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;

    for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
         lineoff <= lines;
         lineoff++, lpp++)
    {
        if (ItemIdIsNormal(lpp))
        {
            HeapTupleData loctup;
            bool        valid;

            loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
            loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
            loctup.t_len = ItemIdGetLength(lpp);
            ItemPointerSet(&(loctup.t_self), page, lineoff);

            if (all_visible)
                valid = true;
            else
                valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);

            HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
                                                &loctup, buffer, snapshot);

            if (valid)
                scan->rs_vistuples[ntup++] = lineoff;
        }
    }

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    Assert(ntup <= MaxHeapTuplesPerPage);
    scan->rs_ntuples = ntup;
}
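
/*
 * Illustrative only: after a page-at-a-time call, a caller holding the pin
 * can walk the visible tuples without re-checking visibility, e.g.
 *
 *     heapgetpage(sscan, blkno);
 *     for (int i = 0; i < scan->rs_ntuples; i++)
 *     {
 *         OffsetNumber lineoff = scan->rs_vistuples[i];
 *         ...
 *     }
 *
 * (This is how heapgettup_pagemode below consumes rs_vistuples[].)
 */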

/* ----------------
 *      heapgettup - fetch next heap tuple
 *
 *      Initialize the scan if not already done; then advance to the next
 *      tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
 *      or set scan->rs_ctup.t_data = NULL if no more tuples.
 *
 *      dir == NoMovementScanDirection means "re-fetch the tuple indicated
 *      by scan->rs_ctup".
 *
 * Note: the reason nkeys/key are passed separately, even though they are
 * kept in the scan descriptor, is that the caller may not want us to check
 * the scankeys.
 *
 * Note: when we fall off the end of the scan in either direction, we
 * reset rs_inited.  This means that a further request with the same
 * scan direction will restart the scan, which is a bit odd, but a
 * request with the opposite scan direction will start a fresh scan
 * in the proper direction.  The latter is required behavior for cursors,
 * while the former case is generally undefined behavior in Postgres
 * so we don't care too much.
 * ----------------
 */
static void
heapgettup(HeapScanDesc scan,
           ScanDirection dir,
           int nkeys,
           ScanKey key)
{
    HeapTuple   tuple = &(scan->rs_ctup);
    Snapshot    snapshot = scan->rs_base.rs_snapshot;
    bool        backward = ScanDirectionIsBackward(dir);
    BlockNumber page;
    bool        finished;
    Page        dp;
    int         lines;
    OffsetNumber lineoff;
    int         linesleft;
    ItemId      lpp;

    /*
     * calculate next starting lineoff, given scan direction
     */
    if (ScanDirectionIsForward(dir))
    {
        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }
            if (scan->rs_base.rs_parallel != NULL)
            {
                ParallelBlockTableScanDesc pbscan =
                    (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
                ParallelBlockTableScanWorker pbscanwork =
                    scan->rs_parallelworkerdata;

                table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
                                                         pbscanwork, pbscan);

                page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
                                                         pbscanwork, pbscan);

                /* Other processes might have already finished the scan. */
                if (page == InvalidBlockNumber)
                {
                    Assert(!BufferIsValid(scan->rs_cbuf));
                    tuple->t_data = NULL;
                    return;
                }
            }
            else
                page = scan->rs_startblock; /* first page */
            heapgetpage((TableScanDesc) scan, page);
            lineoff = FirstOffsetNumber;    /* first offnum */
            scan->rs_inited = true;
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
            lineoff =           /* next offnum */
                OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
        }

        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
        lines = PageGetMaxOffsetNumber(dp);
        /* page and lineoff now reference the physically next tid */

        linesleft = lines - lineoff + 1;
    }
    else if (backward)
    {
        /* backward parallel scan not supported */
        Assert(scan->rs_base.rs_parallel == NULL);

        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }

            /*
             * Disable reporting to syncscan logic in a backwards scan; it's
             * not very likely anyone else is doing the same thing at the same
             * time, and much more likely that we'll just bollix things for
             * forward scanners.
             */
            scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;

            /*
             * Start from last page of the scan.  Ensure we take into account
             * rs_numblocks if it's been adjusted by heap_setscanlimits().
             */
            if (scan->rs_numblocks != InvalidBlockNumber)
                page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
            else if (scan->rs_startblock > 0)
                page = scan->rs_startblock - 1;
            else
                page = scan->rs_nblocks - 1;
            heapgetpage((TableScanDesc) scan, page);
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
        }

        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
        lines = PageGetMaxOffsetNumber(dp);

        if (!scan->rs_inited)
        {
            lineoff = lines;    /* final offnum */
            scan->rs_inited = true;
        }
        else
        {
            /*
             * The previously returned tuple may have been vacuumed since the
             * previous scan when we use a non-MVCC snapshot, so we must
             * re-establish the lineoff <= PageGetMaxOffsetNumber(dp)
             * invariant
             */
            lineoff =           /* previous offnum */
                Min(lines,
                    OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self))));
        }
        /* page and lineoff now reference the physically previous tid */

        linesleft = lineoff;
    }
    else
    {
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        if (!scan->rs_inited)
        {
            Assert(!BufferIsValid(scan->rs_cbuf));
            tuple->t_data = NULL;
            return;
        }

        page = ItemPointerGetBlockNumber(&(tuple->t_self));
        if (page != scan->rs_cblock)
            heapgetpage((TableScanDesc) scan, page);

        /* Since the tuple was previously fetched, needn't lock page here */
        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
        lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
        lpp = PageGetItemId(dp, lineoff);
        Assert(ItemIdIsNormal(lpp));

        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);

        return;
    }

    /*
     * advance the scan until we find a qualifying tuple or run out of stuff
     * to scan
     */
    lpp = PageGetItemId(dp, lineoff);
    for (;;)
    {
        /*
         * Only continue scanning the page while we have lines left.
         *
         * Note that this protects us from accessing line pointers past
         * PageGetMaxOffsetNumber(); both for forward scans when we resume the
         * table scan, and for when we start scanning a new page.
         */
        while (linesleft > 0)
        {
            if (ItemIdIsNormal(lpp))
            {
                bool        valid;

                tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
                tuple->t_len = ItemIdGetLength(lpp);
                ItemPointerSet(&(tuple->t_self), page, lineoff);

                /*
                 * if current tuple qualifies, return it.
                 */
                valid = HeapTupleSatisfiesVisibility(tuple,
                                                     snapshot,
                                                     scan->rs_cbuf);

                HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
                                                    tuple, scan->rs_cbuf,
                                                    snapshot);

                if (valid && key != NULL)
                    HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
                                nkeys, key, valid);

                if (valid)
                {
                    LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
                    return;
                }
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft;
            if (backward)
            {
                --lpp;          /* move back in this page's ItemId array */
                --lineoff;
            }
            else
            {
                ++lpp;          /* move forward in this page's ItemId array */
                ++lineoff;
            }
        }

        /*
         * if we get here, it means we've exhausted the items on this page and
         * it's time to move to the next.
         */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

        /*
         * advance to next/prior page and detect end of scan
         */
        if (backward)
        {
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
            if (page == 0)
                page = scan->rs_nblocks;
            page--;
        }
        else if (scan->rs_base.rs_parallel != NULL)
        {
            ParallelBlockTableScanDesc pbscan =
                (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
            ParallelBlockTableScanWorker pbscanwork =
                scan->rs_parallelworkerdata;

            page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
                                                     pbscanwork, pbscan);
            finished = (page == InvalidBlockNumber);
        }
        else
        {
            page++;
            if (page >= scan->rs_nblocks)
                page = 0;
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);

            /*
             * Report our new scan position for synchronization purposes.  We
             * don't do that when moving backwards, however.  That would just
             * mess up any other forward-moving scanners.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
                ss_report_location(scan->rs_base.rs_rd, page);
        }

        /*
         * return NULL if we've exhausted all the pages
         */
        if (finished)
        {
            if (BufferIsValid(scan->rs_cbuf))
                ReleaseBuffer(scan->rs_cbuf);
            scan->rs_cbuf = InvalidBuffer;
            scan->rs_cblock = InvalidBlockNumber;
            tuple->t_data = NULL;
            scan->rs_inited = false;
            return;
        }

        heapgetpage((TableScanDesc) scan, page);

        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
        lines = PageGetMaxOffsetNumber((Page) dp);
        linesleft = lines;
        if (backward)
        {
            lineoff = lines;
            lpp = PageGetItemId(dp, lines);
        }
        else
        {
            lineoff = FirstOffsetNumber;
            lpp = PageGetItemId(dp, FirstOffsetNumber);
        }
    }
}

/* ----------------
 *      heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
 *
 *      Same API as heapgettup, but used in page-at-a-time mode
 *
 * The internal logic is much the same as heapgettup's too, but there are some
 * differences: we do not take the buffer content lock (that only needs to
 * happen inside heapgetpage), and we iterate through just the tuples listed
 * in rs_vistuples[] rather than all tuples on the page.  Notice that
 * lineindex is 0-based, where the corresponding loop variable lineoff in
 * heapgettup is 1-based.
 * ----------------
 */
static void
heapgettup_pagemode(HeapScanDesc scan,
                    ScanDirection dir,
                    int nkeys,
                    ScanKey key)
{
    HeapTuple   tuple = &(scan->rs_ctup);
    bool        backward = ScanDirectionIsBackward(dir);
    BlockNumber page;
    bool        finished;
    Page        dp;
    int         lines;
    int         lineindex;
    OffsetNumber lineoff;
    int         linesleft;
    ItemId      lpp;

    /*
     * calculate next starting lineindex, given scan direction
     */
    if (ScanDirectionIsForward(dir))
    {
        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }
            if (scan->rs_base.rs_parallel != NULL)
            {
                ParallelBlockTableScanDesc pbscan =
                    (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
                ParallelBlockTableScanWorker pbscanwork =
                    scan->rs_parallelworkerdata;

                table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
                                                         pbscanwork, pbscan);

                page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
                                                         pbscanwork, pbscan);

                /* Other processes might have already finished the scan. */
                if (page == InvalidBlockNumber)
                {
                    Assert(!BufferIsValid(scan->rs_cbuf));
                    tuple->t_data = NULL;
                    return;
                }
            }
            else
                page = scan->rs_startblock; /* first page */
            heapgetpage((TableScanDesc) scan, page);
            lineindex = 0;
            scan->rs_inited = true;
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
            lineindex = scan->rs_cindex + 1;
        }

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
        lines = scan->rs_ntuples;
        /* page and lineindex now reference the next visible tid */

        linesleft = lines - lineindex;
    }
    else if (backward)
    {
        /* backward parallel scan not supported */
        Assert(scan->rs_base.rs_parallel == NULL);

        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }

            /*
             * Disable reporting to syncscan logic in a backwards scan; it's
             * not very likely anyone else is doing the same thing at the same
             * time, and much more likely that we'll just bollix things for
             * forward scanners.
             */
            scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;

            /*
             * Start from last page of the scan.  Ensure we take into account
             * rs_numblocks if it's been adjusted by heap_setscanlimits().
             */
            if (scan->rs_numblocks != InvalidBlockNumber)
                page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
            else if (scan->rs_startblock > 0)
                page = scan->rs_startblock - 1;
            else
                page = scan->rs_nblocks - 1;
            heapgetpage((TableScanDesc) scan, page);
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
        }

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
        lines = scan->rs_ntuples;

        if (!scan->rs_inited)
        {
            lineindex = lines - 1;
            scan->rs_inited = true;
        }
        else
        {
            lineindex = scan->rs_cindex - 1;
        }
        /* page and lineindex now reference the previous visible tid */

        linesleft = lineindex + 1;
    }
    else
    {
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        if (!scan->rs_inited)
        {
            Assert(!BufferIsValid(scan->rs_cbuf));
            tuple->t_data = NULL;
            return;
        }

        page = ItemPointerGetBlockNumber(&(tuple->t_self));
        if (page != scan->rs_cblock)
            heapgetpage((TableScanDesc) scan, page);

        /* Since the tuple was previously fetched, needn't lock page here */
        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
        lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
        lpp = PageGetItemId(dp, lineoff);
        Assert(ItemIdIsNormal(lpp));

        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);

        /* check that rs_cindex is in sync */
        Assert(scan->rs_cindex < scan->rs_ntuples);
        Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);

        return;
    }

    /*
     * advance the scan until we find a qualifying tuple or run out of stuff
     * to scan
     */
    for (;;)
    {
        while (linesleft > 0)
        {
            lineoff = scan->rs_vistuples[lineindex];
            lpp = PageGetItemId(dp, lineoff);
            Assert(ItemIdIsNormal(lpp));

            tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
            tuple->t_len = ItemIdGetLength(lpp);
            ItemPointerSet(&(tuple->t_self), page, lineoff);

            /*
             * if current tuple qualifies, return it.
             */
            if (key != NULL)
            {
                bool        valid;

                HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
                            nkeys, key, valid);
                if (valid)
                {
                    scan->rs_cindex = lineindex;
                    return;
                }
            }
            else
            {
                scan->rs_cindex = lineindex;
                return;
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft;
            if (backward)
                --lineindex;
            else
                ++lineindex;
        }

        /*
         * if we get here, it means we've exhausted the items on this page and
         * it's time to move to the next.
         */
        if (backward)
        {
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
            if (page == 0)
                page = scan->rs_nblocks;
            page--;
        }
        else if (scan->rs_base.rs_parallel != NULL)
        {
            ParallelBlockTableScanDesc pbscan =
                (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
            ParallelBlockTableScanWorker pbscanwork =
                scan->rs_parallelworkerdata;

            page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
                                                     pbscanwork, pbscan);
            finished = (page == InvalidBlockNumber);
        }
        else
        {
            page++;
            if (page >= scan->rs_nblocks)
                page = 0;
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);

            /*
             * Report our new scan position for synchronization purposes.  We
             * don't do that when moving backwards, however.  That would just
             * mess up any other forward-moving scanners.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
                ss_report_location(scan->rs_base.rs_rd, page);
        }

        /*
         * return NULL if we've exhausted all the pages
         */
        if (finished)
        {
            if (BufferIsValid(scan->rs_cbuf))
                ReleaseBuffer(scan->rs_cbuf);
            scan->rs_cbuf = InvalidBuffer;
            scan->rs_cblock = InvalidBlockNumber;
            tuple->t_data = NULL;
            scan->rs_inited = false;
            return;
        }

        heapgetpage((TableScanDesc) scan, page);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
        lines = scan->rs_ntuples;
        linesleft = lines;
        if (backward)
            lineindex = lines - 1;
        else
            lineindex = 0;
    }
}


#if defined(DISABLE_COMPLEX_MACRO)
/*
 * This is formatted so oddly so that the correspondence to the macro
 * definition in access/htup_details.h is maintained.
 */
Datum
fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
            bool *isnull)
{
    return (
            (attnum) > 0 ?
            (
             (*(isnull) = false),
             HeapTupleNoNulls(tup) ?
             (
              TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
              (
               fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
                        (char *) (tup)->t_data + (tup)->t_data->t_hoff +
                        TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
               )
              :
              nocachegetattr((tup), (attnum), (tupleDesc))
              )
             :
             (
              att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
              (
               (*(isnull) = true),
               (Datum) NULL
               )
              :
              (
               nocachegetattr((tup), (attnum), (tupleDesc))
               )
              )
             )
            :
            (
             (Datum) NULL
             )
        );
}
#endif                          /* defined(DISABLE_COMPLEX_MACRO) */


/* ----------------------------------------------------------------
 *                   heap access method interface
 * ----------------------------------------------------------------
 */


TableScanDesc
heap_beginscan(Relation relation, Snapshot snapshot,
               int nkeys, ScanKey key,
               ParallelTableScanDesc parallel_scan,
               uint32 flags)
{
    HeapScanDesc scan;

    /*
     * increment relation ref count while scanning relation
     *
     * This is just to make really sure the relcache entry won't go away while
     * the scan has a pointer to it.  Caller should be holding the rel open
     * anyway, so this is redundant in all normal scenarios...
     */
    RelationIncrementReferenceCount(relation);

    /*
     * allocate and initialize scan descriptor
     */
    scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));

    scan->rs_base.rs_rd = relation;
    scan->rs_base.rs_snapshot = snapshot;
    scan->rs_base.rs_nkeys = nkeys;
    scan->rs_base.rs_flags = flags;
    scan->rs_base.rs_parallel = parallel_scan;
    scan->rs_strategy = NULL;   /* set in initscan */

    /*
     * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
     */
    if (!(snapshot && IsMVCCSnapshot(snapshot)))
        scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;

    /*
     * For seqscan and sample scans in a serializable transaction, acquire a
     * predicate lock on the entire relation.  This is required not only to
     * lock all the matching tuples, but also to conflict with new insertions
     * into the table.  In an indexscan, we take page locks on the index pages
     * covering the range specified in the scan qual, but in a heap scan there
     * is nothing more fine-grained to lock.  A bitmap scan is a different
     * story, there we have already scanned the index and locked the index
     * pages covering the predicate.  But in that case we still have to lock
     * any matching heap tuples.  For sample scan we could optimize the
     * locking to be at least page-level granularity, but we'd need to add
     * per-tuple locking for that.
     */
    if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
    {
        /*
         * Ensure a missing snapshot is noticed reliably, even if the
         * isolation mode means predicate locking isn't performed (and
         * therefore the snapshot isn't used here).
         */
        Assert(snapshot);
        PredicateLockRelation(relation, snapshot);
    }

    /* we only need to set this up once */
    scan->rs_ctup.t_tableOid = RelationGetRelid(relation);

    /*
     * Allocate memory to keep track of page allocation for parallel workers
     * when doing a parallel scan.
     */
    if (parallel_scan != NULL)
        scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
    else
        scan->rs_parallelworkerdata = NULL;

    /*
     * we do this here instead of in initscan() because heap_rescan also calls
     * initscan() and we don't want to allocate memory again
     */
    if (nkeys > 0)
        scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
    else
        scan->rs_base.rs_key = NULL;

    initscan(scan, key, false);

    return (TableScanDesc) scan;
}
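
/*
 * Example (illustrative sketch, not part of this file): a minimal
 * sequential-scan loop.  Real callers normally go through the table AM
 * wrapper table_beginscan(), which passes these same flags; "rel" and
 * "snapshot" are assumed to be supplied by the caller.
 *
 *     TableScanDesc sscan;
 *     HeapTuple     tuple;
 *
 *     sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
 *                            SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *                            SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);
 *     while ((tuple = heap_getnext(sscan, ForwardScanDirection)) != NULL)
 *     {
 *         ... tuple is only valid until the next heap_getnext() call ...
 *     }
 *     heap_endscan(sscan);
 */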

void
heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
            bool allow_strat, bool allow_sync, bool allow_pagemode)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;

    if (set_params)
    {
        if (allow_strat)
            scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
        else
            scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;

        if (allow_sync)
            scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
        else
            scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;

        if (allow_pagemode && scan->rs_base.rs_snapshot &&
            IsMVCCSnapshot(scan->rs_base.rs_snapshot))
            scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
        else
            scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
    }

    /*
     * unpin scan buffers
     */
    if (BufferIsValid(scan->rs_cbuf))
        ReleaseBuffer(scan->rs_cbuf);

    /*
     * reinitialize scan descriptor
     */
    initscan(scan, key, true);
}

void
heap_endscan(TableScanDesc sscan)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;

    /* Note: no locking manipulations needed */

    /*
     * unpin scan buffers
     */
    if (BufferIsValid(scan->rs_cbuf))
        ReleaseBuffer(scan->rs_cbuf);

    /*
     * decrement relation reference count and free scan descriptor storage
     */
    RelationDecrementReferenceCount(scan->rs_base.rs_rd);

    if (scan->rs_base.rs_key)
        pfree(scan->rs_base.rs_key);

    if (scan->rs_strategy != NULL)
        FreeAccessStrategy(scan->rs_strategy);

    if (scan->rs_parallelworkerdata != NULL)
        pfree(scan->rs_parallelworkerdata);

    if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
        UnregisterSnapshot(scan->rs_base.rs_snapshot);

    pfree(scan);
}

HeapTuple
heap_getnext(TableScanDesc sscan, ScanDirection direction)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;

    /*
     * This is still widely used directly, without going through table AM, so
     * add a safety check.  It's possible we should, at a later point,
     * downgrade this to an assert.  The reason for checking the AM routine,
     * rather than the AM oid, is that this allows writing regression tests
     * that create another AM reusing the heap handler.
     */
    if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg_internal("only heap AM is supported")));

    /*
     * We don't expect direct calls to heap_getnext with valid CheckXidAlive
     * for catalog or regular tables.  See detailed comments in xact.c where
     * these variables are declared.  Normally we have such a check at tableam
     * level API but this is called from many places so we need to ensure it
     * here.
     */
    if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
        elog(ERROR, "unexpected heap_getnext call during logical decoding");

    /* Note: no locking manipulations needed */

    if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
        heapgettup_pagemode(scan, direction,
                            scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
    else
        heapgettup(scan, direction,
                   scan->rs_base.rs_nkeys, scan->rs_base.rs_key);

    if (scan->rs_ctup.t_data == NULL)
        return NULL;

    /*
     * if we get here it means we have a new current scan tuple, so point to
     * the proper return buffer and return the tuple.
     */

    pgstat_count_heap_getnext(scan->rs_base.rs_rd);

    return &scan->rs_ctup;
}

bool
heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;

    /* Note: no locking manipulations needed */

    if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
        heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
    else
        heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);

    if (scan->rs_ctup.t_data == NULL)
    {
        ExecClearTuple(slot);
        return false;
    }

    /*
     * if we get here it means we have a new current scan tuple, so point to
     * the proper return buffer and return the tuple.
     */

    pgstat_count_heap_getnext(scan->rs_base.rs_rd);

    ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
                             scan->rs_cbuf);
    return true;
}
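
/*
 * Example (illustrative sketch, not part of this file): the slot-based
 * variant pairs with executor slots created for this relation:
 *
 *     TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *     while (heap_getnextslot(sscan, ForwardScanDirection, slot))
 *     {
 *         ... slot holds a buffer heap tuple; it is replaced next call ...
 *     }
 *     ExecDropSingleTupleTableSlot(slot);
 */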

void
heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
                  ItemPointer maxtid)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;
    BlockNumber startBlk;
    BlockNumber numBlks;
    ItemPointerData highestItem;
    ItemPointerData lowestItem;

    /*
     * For relations without any pages, we can simply leave the TID range
     * unset.  There will be no tuples to scan, therefore no tuples outside
     * the given TID range.
     */
    if (scan->rs_nblocks == 0)
        return;

    /*
     * Set up some ItemPointers which point to the first and last possible
     * tuples in the heap.
     */
    ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
    ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);

    /*
     * If the given maximum TID is below the highest possible TID in the
     * relation, then restrict the range to that, otherwise we scan to the end
     * of the relation.
     */
    if (ItemPointerCompare(maxtid, &highestItem) < 0)
        ItemPointerCopy(maxtid, &highestItem);

    /*
     * If the given minimum TID is above the lowest possible TID in the
     * relation, then restrict the range to only scan for TIDs above that.
     */
    if (ItemPointerCompare(mintid, &lowestItem) > 0)
        ItemPointerCopy(mintid, &lowestItem);

    /*
     * Check for an empty range and protect against would-be negative results
     * from the numBlks calculation below.
     */
    if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
    {
        /* Set an empty range of blocks to scan */
        heap_setscanlimits(sscan, 0, 0);
        return;
    }

    /*
     * Calculate the first block and the number of blocks we must scan.  We
     * could be more aggressive here and perform some more validation to try
     * and further narrow the scope of blocks to scan by checking if the
     * lowestItem has an offset above MaxOffsetNumber.  In this case, we could
     * advance startBlk by one.  Likewise, if highestItem has an offset of 0
     * we could scan one fewer blocks.  However, such an optimization does not
     * seem worth troubling over, currently.
     */
    startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);

    numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
        ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;

    /* Set the start block and number of blocks to scan */
    heap_setscanlimits(sscan, startBlk, numBlks);

    /* Finally, set the TID range in sscan */
    ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
    ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
}
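
/*
 * Example (illustrative sketch, not part of this file): limiting a scan to
 * TIDs in blocks 0..9.  Real callers arrive here via
 * table_beginscan_tidrange(); "sscan" and "slot" are assumed set up already.
 *
 *     ItemPointerData mintid, maxtid;
 *
 *     ItemPointerSet(&mintid, 0, 1);
 *     ItemPointerSet(&maxtid, 9, MaxOffsetNumber);
 *     heap_set_tidrange(sscan, &mintid, &maxtid);
 *     while (heap_getnextslot_tidrange(sscan, ForwardScanDirection, slot))
 *     {
 *         ... only tuples with mintid <= t_self <= maxtid are returned ...
 *     }
 */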

bool
heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
                          TupleTableSlot *slot)
{
    HeapScanDesc scan = (HeapScanDesc) sscan;
    ItemPointer mintid = &sscan->rs_mintid;
    ItemPointer maxtid = &sscan->rs_maxtid;

    /* Note: no locking manipulations needed */
    for (;;)
    {
        if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
            heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
        else
            heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);

        if (scan->rs_ctup.t_data == NULL)
        {
            ExecClearTuple(slot);
            return false;
        }

        /*
         * heap_set_tidrange will have used heap_setscanlimits to limit the
         * range of pages we scan to only ones that can contain the TID range
         * we're scanning for.  Here we must filter out any tuples from these
         * pages that are outside that range.
         */
        if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
        {
            ExecClearTuple(slot);

            /*
             * When scanning backwards, the TIDs will be in descending order.
             * Future tuples in this direction will be lower still, so we can
             * just return false to indicate there will be no more tuples.
             */
            if (ScanDirectionIsBackward(direction))
                return false;

            continue;
        }

        /*
         * Likewise for the final page, we must filter out TIDs greater than
         * maxtid.
         */
        if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
        {
            ExecClearTuple(slot);

            /*
             * When scanning forward, the TIDs will be in ascending order.
             * Future tuples in this direction will be higher still, so we can
             * just return false to indicate there will be no more tuples.
             */
            if (ScanDirectionIsForward(direction))
                return false;
            continue;
        }

        break;
    }

    /*
     * if we get here it means we have a new current scan tuple, so point to
     * the proper return buffer and return the tuple.
     */
    pgstat_count_heap_getnext(scan->rs_base.rs_rd);

    ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
    return true;
}

/*
 * heap_fetch - retrieve tuple with given tid
 *
 * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
 * the tuple, fill in the remaining fields of *tuple, and check the tuple
 * against the specified snapshot.
 *
 * If successful (tuple found and passes snapshot time qual), then *userbuf
 * is set to the buffer holding the tuple and true is returned.  The caller
 * must unpin the buffer when done with the tuple.
 *
 * If the tuple is not found (ie, item number references a deleted slot),
 * then tuple->t_data is set to NULL and false is returned.
 *
 * If the tuple is found but fails the time qual check, then false is returned
 * but tuple->t_data is left pointing to the tuple.
 *
 * heap_fetch does not follow HOT chains: only the exact TID requested will
 * be fetched.
 *
 * It is somewhat inconsistent that we ereport() on an invalid block number
 * but return false on an invalid item number.  There are a couple of reasons
 * though.  One is that the caller can relatively easily check the block
 * number for validity, but cannot check the item number without reading the
 * page directly.  Another is that when we are following a t_ctid link, we
 * can be reasonably confident that the page number is valid (since VACUUM
 * shouldn't truncate off the destination page without having killed the
 * referencing tuple first), but the item number might well not be good.
 */
bool
heap_fetch(Relation relation,
           Snapshot snapshot,
           HeapTuple tuple,
           Buffer *userbuf)
{
    ItemPointer tid = &(tuple->t_self);
    ItemId      lp;
    Buffer      buffer;
    Page        page;
    OffsetNumber offnum;
    bool        valid;

    /*
     * Fetch and pin the appropriate page of the relation.
     */
    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));

    /*
     * Need share lock on buffer to examine tuple commit status.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buffer);
    TestForOldSnapshot(snapshot, relation, page);

    /*
     * We'd better check for an out-of-range offnum, in case the item has been
     * removed by VACUUM since the TID was obtained.
     */
    offnum = ItemPointerGetOffsetNumber(tid);
    if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        *userbuf = InvalidBuffer;
        tuple->t_data = NULL;
        return false;
    }

    /*
     * get the item line pointer corresponding to the requested tid
     */
    lp = PageGetItemId(page, offnum);

    /*
     * Must check for deleted tuple.
     */
    if (!ItemIdIsNormal(lp))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        *userbuf = InvalidBuffer;
        tuple->t_data = NULL;
        return false;
    }

    /*
     * fill in *tuple fields
     */
    tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
    tuple->t_len = ItemIdGetLength(lp);
    tuple->t_tableOid = RelationGetRelid(relation);

    /*
     * check tuple visibility, then release lock
     */
    valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);

    if (valid)
        PredicateLockTID(relation, &(tuple->t_self), snapshot,
                         HeapTupleHeaderGetXmin(tuple->t_data));

    HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    if (valid)
    {
        /*
         * All checks passed, so return the tuple as valid.  Caller is now
         * responsible for releasing the buffer.
         */
        *userbuf = buffer;

        return true;
    }

    /* Tuple failed time qual */
    ReleaseBuffer(buffer);
    *userbuf = InvalidBuffer;

    return false;
}
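
/*
 * Example (illustrative sketch, not part of this file): fetching one tuple
 * by TID under the caller's snapshot; "rel", "snapshot" and "tid" are
 * assumed to be supplied by the caller.
 *
 *     HeapTupleData tuple;
 *     Buffer        userbuf;
 *
 *     tuple.t_self = *tid;
 *     if (heap_fetch(rel, snapshot, &tuple, &userbuf))
 *     {
 *         ... use tuple.t_data while holding the buffer pin ...
 *         ReleaseBuffer(userbuf);
 *     }
 */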

/*
 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
 *
 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
 * of a HOT chain), and buffer is the buffer holding this tuple.  We search
 * for the first chain member satisfying the given snapshot.  If one is
 * found, we update *tid to reference that tuple's offset number, and
 * return true.  If no match, return false without modifying *tid.
 *
 * heapTuple is a caller-supplied buffer.  When a match is found, we return
 * the tuple here, in addition to updating *tid.  If no match is found, the
 * contents of this buffer on return are undefined.
 *
 * If all_dead is not NULL, we check non-visible tuples to see if they are
 * globally dead; *all_dead is set true if all members of the HOT chain
 * are vacuumable, false if not.
 *
 * Unlike heap_fetch, the caller must already have pin and (at least) share
 * lock on the buffer; it is still pinned/locked at exit.  Also unlike
 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
 */
bool
heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
                       Snapshot snapshot, HeapTuple heapTuple,
                       bool *all_dead, bool first_call)
{
    Page        dp = (Page) BufferGetPage(buffer);
    TransactionId prev_xmax = InvalidTransactionId;
    BlockNumber blkno;
    OffsetNumber offnum;
    bool        at_chain_start;
    bool        valid;
    bool        skip;
    GlobalVisState *vistest = NULL;

    /* If this is not the first call, previous call returned a (live!) tuple */
    if (all_dead)
        *all_dead = first_call;

    blkno = ItemPointerGetBlockNumber(tid);
    offnum = ItemPointerGetOffsetNumber(tid);
    at_chain_start = first_call;
    skip = !first_call;

    /* XXX: we should assert that a snapshot is pushed or registered */
    Assert(TransactionIdIsValid(RecentXmin));
    Assert(BufferGetBlockNumber(buffer) == blkno);

1736 /* Scan through possible multiple members of HOT-chain */
1737 for (;;)
1738 {
1739 ItemId lp;
1740
1741 /* check for bogus TID */
1742 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1743 break;
1744
1745 lp = PageGetItemId(dp, offnum);
1746
1747 /* check for unused, dead, or redirected items */
1748 if (!ItemIdIsNormal(lp))
1749 {
1750 /* We should only see a redirect at start of chain */
1751 if (ItemIdIsRedirected(lp) && at_chain_start)
1752 {
1753 /* Follow the redirect */
1754 offnum = ItemIdGetRedirect(lp);
1755 at_chain_start = false;
1756 continue;
1757 }
1758 /* else must be end of chain */
1759 break;
1760 }
1761
1762 /*
1763 * Update heapTuple to point to the element of the HOT chain we're
1764 * currently investigating. Having t_self set correctly is important
1765 * because the SSI checks and the *Satisfies routine for historical
1766 * MVCC snapshots need the correct tid to decide about the visibility.
1767 */
1768 heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1769 heapTuple->t_len = ItemIdGetLength(lp);
1770 heapTuple->t_tableOid = RelationGetRelid(relation);
1771 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1772
1773 /*
1774 * Shouldn't see a HEAP_ONLY tuple at chain start.
1775 */
1776 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1777 break;
1778
1779 /*
1780 * The xmin should match the previous xmax value, else chain is
1781 * broken.
1782 */
1783 if (TransactionIdIsValid(prev_xmax) &&
1784 !TransactionIdEquals(prev_xmax,
1785 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1786 break;
1787
1788 /*
1789 * When first_call is true (and thus, skip is initially false) we'll
1790 * return the first tuple we find. But on later passes, heapTuple
1791 * will initially be pointing to the tuple we returned last time.
1792 * Returning it again would be incorrect (and would loop forever), so
1793 * we skip it and return the next match we find.
1794 */
1795 if (!skip)
1796 {
1797 /* If it's visible per the snapshot, we must return it */
1798 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1799 HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1800 buffer, snapshot);
1801
1802 if (valid)
1803 {
1804 ItemPointerSetOffsetNumber(tid, offnum);
1805 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1806 HeapTupleHeaderGetXmin(heapTuple->t_data));
1807 if (all_dead)
1808 *all_dead = false;
1809 return true;
1810 }
1811 }
1812 skip = false;
1813
1814 /*
1815 * If we can't see it, maybe no one else can either. At caller
1816 * request, check whether all chain members are dead to all
1817 * transactions.
1818 *
1819 * Note: if you change the criterion here for what is "dead", fix the
1820 * planner's get_actual_variable_range() function to match.
1821 */
1822 if (all_dead && *all_dead)
1823 {
1824 if (!vistest)
1825 vistest = GlobalVisTestFor(relation);
1826
1827 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1828 *all_dead = false;
1829 }
1830
1831 /*
1832 * Check to see if HOT chain continues past this tuple; if so fetch
1833 * the next offnum and loop around.
1834 */
1835 if (HeapTupleIsHotUpdated(heapTuple))
1836 {
1837 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1838 blkno);
1839 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1840 at_chain_start = false;
1841 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1842 }
1843 else
1844 break; /* end of chain */
1845 }
1846
1847 return false;
1848 }
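
/*
 * Illustrative caller sketch (not part of the original source; names are
 * schematic): an index fetch pins the buffer, takes at least a share lock,
 * and then walks the HOT chain, roughly as heapam_index_fetch_tuple() does:
 *
 *		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 *		found = heap_hot_search_buffer(tid, relation, buffer, snapshot,
 *									   &mytuple, &all_dead, first_call);
 *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *
 * Passing first_call = false on later calls makes the search resume past
 * the tuple returned previously, which is how "fetch the next match from
 * the same HOT chain" is implemented.
 */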
1849
1850 /*
1851 * heap_get_latest_tid - get the latest tid of a specified tuple
1852 *
1853 * Actually, this gets the latest version that is visible according to the
1854 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1855 * possibly uncommitted version.
1856 *
1857 * *tid is both an input and an output parameter: it is updated to
1858 * show the latest version of the row. Note that it will not be changed
1859 * if no version of the row passes the snapshot test.
1860 */
1861 void
1862 heap_get_latest_tid(TableScanDesc sscan,
1863 ItemPointer tid)
1864 {
1865 Relation relation = sscan->rs_rd;
1866 Snapshot snapshot = sscan->rs_snapshot;
1867 ItemPointerData ctid;
1868 TransactionId priorXmax;
1869
1870 /*
1871 * table_tuple_get_latest_tid() verified that the passed-in tid is valid.
1872 * We assume that t_ctid links are valid as well; there shouldn't be
1873 * invalid ones in the table.
1874 */
1875 Assert(ItemPointerIsValid(tid));
1876
1877 /*
1878 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1879 * need to examine, and *tid is the TID we will return if ctid turns out
1880 * to be bogus.
1881 *
1882 * Note that we will loop until we reach the end of the t_ctid chain.
1883 * Depending on the snapshot passed, there might be at most one visible
1884 * version of the row, but we don't try to optimize for that.
1885 */
1886 ctid = *tid;
1887 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1888 for (;;)
1889 {
1890 Buffer buffer;
1891 Page page;
1892 OffsetNumber offnum;
1893 ItemId lp;
1894 HeapTupleData tp;
1895 bool valid;
1896
1897 /*
1898 * Read, pin, and lock the page.
1899 */
1900 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1901 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1902 page = BufferGetPage(buffer);
1903 TestForOldSnapshot(snapshot, relation, page);
1904
1905 /*
1906 * Check for bogus item number. This is not treated as an error
1907 * condition because it can happen while following a t_ctid link. We
1908 * just assume that the prior tid is OK and return it unchanged.
1909 */
1910 offnum = ItemPointerGetOffsetNumber(&ctid);
1911 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1912 {
1913 UnlockReleaseBuffer(buffer);
1914 break;
1915 }
1916 lp = PageGetItemId(page, offnum);
1917 if (!ItemIdIsNormal(lp))
1918 {
1919 UnlockReleaseBuffer(buffer);
1920 break;
1921 }
1922
1923 /* OK to access the tuple */
1924 tp.t_self = ctid;
1925 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1926 tp.t_len = ItemIdGetLength(lp);
1927 tp.t_tableOid = RelationGetRelid(relation);
1928
1929 /*
1930 * After following a t_ctid link, we might arrive at an unrelated
1931 * tuple. Check for XMIN match.
1932 */
1933 if (TransactionIdIsValid(priorXmax) &&
1934 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1935 {
1936 UnlockReleaseBuffer(buffer);
1937 break;
1938 }
1939
1940 /*
1941 * Check tuple visibility; if visible, set it as the new result
1942 * candidate.
1943 */
1944 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1945 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1946 if (valid)
1947 *tid = ctid;
1948
1949 /*
1950 * If there's a valid t_ctid link, follow it, else we're done.
1951 */
1952 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1953 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1954 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1955 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1956 {
1957 UnlockReleaseBuffer(buffer);
1958 break;
1959 }
1960
1961 ctid = tp.t_data->t_ctid;
1962 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1963 UnlockReleaseBuffer(buffer);
1964 } /* end of loop */
1965 }
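
/*
 * Worked example (illustrative): suppose a row's version chain is
 * (0,1) -> (0,5) -> (1,2), linked by t_ctid, and the scan's snapshot can
 * see only the (0,5) version. The loop above walks the whole chain,
 * records (0,5) in *tid when it passes the visibility test, and leaves
 * *tid untouched for the invisible (1,2) version, so the caller gets back
 * the newest version visible to the snapshot.
 */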
1966
1967
1968 /*
1969 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1970 *
1971 * This is called after we have waited for the XMAX transaction to terminate.
1972 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1973 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1974 * hint bit if possible --- but beware that that may not yet be possible,
1975 * if the transaction committed asynchronously.
1976 *
1977 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1978 * even if it commits.
1979 *
1980 * Hence callers should look only at XMAX_INVALID.
1981 *
1982 * Note this is not allowed for tuples whose xmax is a multixact.
1983 */
1984 static void
1985 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1986 {
1987 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1988 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1989
1990 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1991 {
1992 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1993 TransactionIdDidCommit(xid))
1994 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1995 xid);
1996 else
1997 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1998 InvalidTransactionId);
1999 }
2000 }
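
/*
 * Typical use (see the wait paths in heap_delete/heap_update below): after
 * XactLockTableWait() returns,
 *
 *		UpdateXmaxHintBits(tp.t_data, buffer, xwait);
 *		if (tp.t_data->t_infomask & HEAP_XMAX_INVALID)
 *			... xmax is aborted or was a locker only; ok to overwrite ...
 *
 * i.e., per the comment above, callers branch only on XMAX_INVALID.
 */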
2001
2002
2003 /*
2004 * GetBulkInsertState - prepare status object for a bulk insert
2005 */
2006 BulkInsertState
2007 GetBulkInsertState(void)
2008 {
2009 BulkInsertState bistate;
2010
2011 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2012 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2013 bistate->current_buf = InvalidBuffer;
2014 return bistate;
2015 }
2016
2017 /*
2018 * FreeBulkInsertState - clean up after finishing a bulk insert
2019 */
2020 void
2021 FreeBulkInsertState(BulkInsertState bistate)
2022 {
2023 if (bistate->current_buf != InvalidBuffer)
2024 ReleaseBuffer(bistate->current_buf);
2025 FreeAccessStrategy(bistate->strategy);
2026 pfree(bistate);
2027 }
2028
2029 /*
2030 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2031 */
2032 void
2033 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2034 {
2035 if (bistate->current_buf != InvalidBuffer)
2036 ReleaseBuffer(bistate->current_buf);
2037 bistate->current_buf = InvalidBuffer;
2038 }
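
/*
 * Usage sketch for the BulkInsertState API (illustrative; COPY FROM is the
 * main in-core user, normally going through table_tuple_insert()):
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *
 *		for (i = 0; i < ntuples; i++)
 *			heap_insert(rel, tuples[i], GetCurrentCommandId(true),
 *						0, bistate);
 *
 *		FreeBulkInsertState(bistate);
 *
 * The BAS_BULKWRITE strategy keeps a bulk load from trashing the whole
 * shared buffer pool, and the pinned current_buf avoids repeated pin/unpin
 * cycles on the target page.
 */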
2039
2040
2041 /*
2042 * heap_insert - insert tuple into a heap
2043 *
2044 * The new tuple is stamped with current transaction ID and the specified
2045 * command ID.
2046 *
2047 * See table_tuple_insert for comments about most of the input flags, except
2048 * that this routine directly takes a tuple rather than a slot.
2049 *
2050 * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
2051 * options; additionally there is HEAP_INSERT_SPECULATIVE, which is used to
2052 * implement table_tuple_insert_speculative().
2053 *
2054 * On return the header fields of *tup are updated to match the stored tuple;
2055 * in particular tup->t_self receives the actual TID where the tuple was
2056 * stored. But note that any toasting of fields within the tuple data is NOT
2057 * reflected into *tup.
2058 */
2059 void
2060 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2061 int options, BulkInsertState bistate)
2062 {
2063 TransactionId xid = GetCurrentTransactionId();
2064 HeapTuple heaptup;
2065 Buffer buffer;
2066 Buffer vmbuffer = InvalidBuffer;
2067 bool all_visible_cleared = false;
2068
2069 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2070 Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
2071 RelationGetNumberOfAttributes(relation));
2072
2073 /*
2074 * Fill in tuple header fields and toast the tuple if necessary.
2075 *
2076 * Note: below this point, heaptup is the data we actually intend to store
2077 * into the relation; tup is the caller's original untoasted data.
2078 */
2079 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2080
2081 /*
2082 * Find buffer to insert this tuple into. If the page is all visible,
2083 * this will also pin the requisite visibility map page.
2084 */
2085 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2086 InvalidBuffer, options, bistate,
2087 &vmbuffer, NULL);
2088
2089 /*
2090 * We're about to do the actual insert -- but check for conflict first, to
2091 * avoid possibly having to roll back work we've just done.
2092 *
2093 * This is safe without a recheck as long as there is no possibility of
2094 * another process scanning the page between this check and the insert
2095 * being visible to the scan (i.e., an exclusive buffer content lock is
2096 * continuously held from this point until the tuple insert is visible).
2097 *
2098 * For a heap insert, we only need to check for table-level SSI locks. Our
2099 * new tuple can't possibly conflict with existing tuple locks, and heap
2100 * page locks are only consolidated versions of tuple locks; they do not
2101 * lock "gaps" as index page locks do. So we don't need to specify a
2102 * buffer when making the call, which makes for a faster check.
2103 */
2104 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2105
2106 /* NO EREPORT(ERROR) from here till changes are logged */
2107 START_CRIT_SECTION();
2108
2109 RelationPutHeapTuple(relation, buffer, heaptup,
2110 (options & HEAP_INSERT_SPECULATIVE) != 0);
2111
2112 if (PageIsAllVisible(BufferGetPage(buffer)))
2113 {
2114 all_visible_cleared = true;
2115 PageClearAllVisible(BufferGetPage(buffer));
2116 visibilitymap_clear(relation,
2117 ItemPointerGetBlockNumber(&(heaptup->t_self)),
2118 vmbuffer, VISIBILITYMAP_VALID_BITS);
2119 }
2120
2121 /*
2122 * XXX Should we set PageSetPrunable on this page ?
2123 *
2124 * The inserting transaction may eventually abort, thus making this tuple
2125 * DEAD and hence available for pruning. Though we don't want to optimize
2126 * for aborts, if no other tuple on this page is UPDATEd/DELETEd, the
2127 * aborted tuple will never be pruned until the next vacuum is triggered.
2128 *
2129 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2130 */
2131
2132 MarkBufferDirty(buffer);
2133
2134 /* XLOG stuff */
2135 if (RelationNeedsWAL(relation))
2136 {
2137 xl_heap_insert xlrec;
2138 xl_heap_header xlhdr;
2139 XLogRecPtr recptr;
2140 Page page = BufferGetPage(buffer);
2141 uint8 info = XLOG_HEAP_INSERT;
2142 int bufflags = 0;
2143
2144 /*
2145 * If this is a catalog, we need to transmit combo CIDs to properly
2146 * decode, so log that as well.
2147 */
2148 if (RelationIsAccessibleInLogicalDecoding(relation))
2149 log_heap_new_cid(relation, heaptup);
2150
2151 /*
2152 * If this is the first and only tuple on the page, we can reinit the
2153 * page instead of restoring the whole thing. Set the flag, and hide
2154 * buffer references from XLogInsert.
2155 */
2156 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2157 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2158 {
2159 info |= XLOG_HEAP_INIT_PAGE;
2160 bufflags |= REGBUF_WILL_INIT;
2161 }
2162
2163 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2164 xlrec.flags = 0;
2165 if (all_visible_cleared)
2166 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2167 if (options & HEAP_INSERT_SPECULATIVE)
2168 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2169 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2170
2171 /*
2172 * For logical decoding, we need the tuple even if we're doing a full
2173 * page write, so make sure it's included even if we take a full-page
2174 * image. (XXX We could alternatively store a pointer into the FPW).
2175 */
2176 if (RelationIsLogicallyLogged(relation) &&
2177 !(options & HEAP_INSERT_NO_LOGICAL))
2178 {
2179 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2180 bufflags |= REGBUF_KEEP_DATA;
2181
2182 if (IsToastRelation(relation))
2183 xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
2184 }
2185
2186 XLogBeginInsert();
2187 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2188
2189 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2190 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2191 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2192
2193 /*
2194 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2195 * write the whole page to the xlog, we don't need to store
2196 * xl_heap_header in the xlog.
2197 */
2198 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2199 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2200 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2201 XLogRegisterBufData(0,
2202 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2203 heaptup->t_len - SizeofHeapTupleHeader);
2204
2205 /* filtering by origin on a row level is much more efficient */
2206 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2207
2208 recptr = XLogInsert(RM_HEAP_ID, info);
2209
2210 PageSetLSN(page, recptr);
2211 }
2212
2213 END_CRIT_SECTION();
2214
2215 UnlockReleaseBuffer(buffer);
2216 if (vmbuffer != InvalidBuffer)
2217 ReleaseBuffer(vmbuffer);
2218
2219 /*
2220 * If tuple is cachable, mark it for invalidation from the caches in case
2221 * we abort. Note it is OK to do this after releasing the buffer, because
2222 * the heaptup data structure is all in local memory, not in the shared
2223 * buffer.
2224 */
2225 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2226
2227 /* Note: speculative insertions are counted too, even if aborted later */
2228 pgstat_count_heap_insert(relation, 1);
2229
2230 /*
2231 * If heaptup is a private copy, release it. Don't forget to copy t_self
2232 * back to the caller's image, too.
2233 */
2234 if (heaptup != tup)
2235 {
2236 tup->t_self = heaptup->t_self;
2237 heap_freetuple(heaptup);
2238 }
2239 }
2240
2241 /*
2242 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2243 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2244 * version of the tuple if it was toasted, or the original tuple if not. Note
2245 * that in any case, the header fields are also set in the original tuple.
2246 */
2247 static HeapTuple
2248 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2249 CommandId cid, int options)
2250 {
2251 /*
2252 * To allow parallel inserts, we need to ensure that they are safe to be
2253 * performed in workers. We have the infrastructure to allow parallel
2254 * inserts in general except for the cases where inserts generate a new
2255 * CommandId (e.g., inserts into a table having a foreign key column).
2256 */
2257 if (IsParallelWorker())
2258 ereport(ERROR,
2259 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2260 errmsg("cannot insert tuples in a parallel worker")));
2261
2262 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2263 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2264 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2265 HeapTupleHeaderSetXmin(tup->t_data, xid);
2266 if (options & HEAP_INSERT_FROZEN)
2267 HeapTupleHeaderSetXminFrozen(tup->t_data);
2268
2269 HeapTupleHeaderSetCmin(tup->t_data, cid);
2270 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2271 tup->t_tableOid = RelationGetRelid(relation);
2272
2273 /*
2274 * If the new tuple is too big for storage or contains already toasted
2275 * out-of-line attributes from some other relation, invoke the toaster.
2276 */
2277 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2278 relation->rd_rel->relkind != RELKIND_MATVIEW)
2279 {
2280 /* toast table entries should never be recursively toasted */
2281 Assert(!HeapTupleHasExternal(tup));
2282 return tup;
2283 }
2284 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2285 return heap_toast_insert_or_update(relation, tup, NULL, options);
2286 else
2287 return tup;
2288 }
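
/*
 * Note on the threshold above: TOAST_TUPLE_THRESHOLD is derived from
 * TOAST_TUPLES_PER_PAGE (4), i.e. roughly BLCKSZ/4 - about 2 kB with the
 * default 8 kB block size - so a tuple is toasted when it could not fit
 * four-to-a-page, or when it already carries out-of-line values from some
 * other relation.
 */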
2289
2290 /*
2291 * heap_multi_insert - insert multiple tuples into a heap
2292 *
2293 * This is like heap_insert(), but inserts multiple tuples in one operation.
2294 * That's faster than calling heap_insert() in a loop, because when multiple
2295 * tuples can be inserted on a single page, we can write just a single WAL
2296 * record covering all of them, and only need to lock/unlock the page once.
2297 *
2298 * Note: this leaks memory into the current memory context. You can create a
2299 * temporary context before calling this, if that's a problem.
2300 */
2301 void
2302 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2303 CommandId cid, int options, BulkInsertState bistate)
2304 {
2305 TransactionId xid = GetCurrentTransactionId();
2306 HeapTuple *heaptuples;
2307 int i;
2308 int ndone;
2309 PGAlignedBlock scratch;
2310 Page page;
2311 Buffer vmbuffer = InvalidBuffer;
2312 bool needwal;
2313 Size saveFreeSpace;
2314 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2315 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2316
2317 /* currently not needed (thus unsupported) for heap_multi_insert() */
2318 AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2319
2320 needwal = RelationNeedsWAL(relation);
2321 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2322 HEAP_DEFAULT_FILLFACTOR);
2323
2324 /* Toast and set header data in all the slots */
2325 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2326 for (i = 0; i < ntuples; i++)
2327 {
2328 HeapTuple tuple;
2329
2330 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2331 slots[i]->tts_tableOid = RelationGetRelid(relation);
2332 tuple->t_tableOid = slots[i]->tts_tableOid;
2333 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2334 options);
2335 }
2336
2337 /*
2338 * We're about to do the actual inserts -- but check for conflict first,
2339 * to minimize the possibility of having to roll back work we've just
2340 * done.
2341 *
2342 * A check here does not definitively prevent a serialization anomaly;
2343 * that check MUST be done at least past the point of acquiring an
2344 * exclusive buffer content lock on every buffer that will be affected,
2345 * and MAY be done after all inserts are reflected in the buffers and
2346 * those locks are released; otherwise there is a race condition. Since
2347 * multiple buffers can be locked and unlocked in the loop below, and it
2348 * would not be feasible to identify and lock all of those buffers before
2349 * the loop, we must do a final check at the end.
2350 *
2351 * The check here could be omitted with no loss of correctness; it is
2352 * present strictly as an optimization.
2353 *
2354 * For heap inserts, we only need to check for table-level SSI locks. Our
2355 * new tuples can't possibly conflict with existing tuple locks, and heap
2356 * page locks are only consolidated versions of tuple locks; they do not
2357 * lock "gaps" as index page locks do. So we don't need to specify a
2358 * buffer when making the call, which makes for a faster check.
2359 */
2360 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2361
2362 ndone = 0;
2363 while (ndone < ntuples)
2364 {
2365 Buffer buffer;
2366 bool starting_with_empty_page;
2367 bool all_visible_cleared = false;
2368 bool all_frozen_set = false;
2369 int nthispage;
2370
2371 CHECK_FOR_INTERRUPTS();
2372
2373 /*
2374 * Find buffer where at least the next tuple will fit. If the page is
2375 * all-visible, this will also pin the requisite visibility map page.
2376 *
2377 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2378 * empty page. See all_frozen_set below.
2379 */
2380 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2381 InvalidBuffer, options, bistate,
2382 &vmbuffer, NULL);
2383 page = BufferGetPage(buffer);
2384
2385 starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2386
2387 if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2388 all_frozen_set = true;
2389
2390 /* NO EREPORT(ERROR) from here till changes are logged */
2391 START_CRIT_SECTION();
2392
2393 /*
2394 * RelationGetBufferForTuple has ensured that the first tuple fits.
2395 * Put that on the page, and then as many other tuples as fit.
2396 */
2397 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2398
2399 /*
2400 * For logical decoding we need combo CIDs to properly decode the
2401 * catalog.
2402 */
2403 if (needwal && need_cids)
2404 log_heap_new_cid(relation, heaptuples[ndone]);
2405
2406 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2407 {
2408 HeapTuple heaptup = heaptuples[ndone + nthispage];
2409
2410 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2411 break;
2412
2413 RelationPutHeapTuple(relation, buffer, heaptup, false);
2414
2415 /*
2416 * For logical decoding we need combo CIDs to properly decode the
2417 * catalog.
2418 */
2419 if (needwal && need_cids)
2420 log_heap_new_cid(relation, heaptup);
2421 }
2422
2423 /*
2424 * If the page is all-visible, we need to clear that, unless we're only
2425 * going to add further frozen rows to it.
2426 *
2427 * If we're only adding already frozen rows to a previously empty
2428 * page, mark it as all-visible.
2429 */
2430 if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2431 {
2432 all_visible_cleared = true;
2433 PageClearAllVisible(page);
2434 visibilitymap_clear(relation,
2435 BufferGetBlockNumber(buffer),
2436 vmbuffer, VISIBILITYMAP_VALID_BITS);
2437 }
2438 else if (all_frozen_set)
2439 PageSetAllVisible(page);
2440
2441 /*
2442 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2443 */
2444
2445 MarkBufferDirty(buffer);
2446
2447 /* XLOG stuff */
2448 if (needwal)
2449 {
2450 XLogRecPtr recptr;
2451 xl_heap_multi_insert *xlrec;
2452 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2453 char *tupledata;
2454 int totaldatalen;
2455 char *scratchptr = scratch.data;
2456 bool init;
2457 int bufflags = 0;
2458
2459 /*
2460 * If the page was previously empty, we can reinit the page
2461 * instead of restoring the whole thing.
2462 */
2463 init = starting_with_empty_page;
2464
2465 /* allocate xl_heap_multi_insert struct from the scratch area */
2466 xlrec = (xl_heap_multi_insert *) scratchptr;
2467 scratchptr += SizeOfHeapMultiInsert;
2468
2469 /*
2470 * Allocate the offsets array, unless we're reinitializing the page;
2471 * in that case the tuples are stored in order starting at
2472 * FirstOffsetNumber and we don't need to store the offsets
2473 * explicitly.
2474 */
2475 if (!init)
2476 scratchptr += nthispage * sizeof(OffsetNumber);
2477
2478 /* the rest of the scratch space is used for tuple data */
2479 tupledata = scratchptr;
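
/*
 * Layout of the scratch area as assembled below (offsets[] present only
 * when !init), shown here as an orientation aid:
 *
 *    xl_heap_multi_insert | offsets[0..n-1] | hdr0 data0 | hdr1 data1 | ...
 *    ^ scratch.data                           ^ tupledata
 *
 * where each hdrN is a SHORTALIGN'd xl_multi_insert_tuple.
 */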
2480
2481 /* check that the mutually exclusive flags are not both set */
2482 Assert(!(all_visible_cleared && all_frozen_set));
2483
2484 xlrec->flags = 0;
2485 if (all_visible_cleared)
2486 xlrec->flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2487 if (all_frozen_set)
2488 xlrec->flags |= XLH_INSERT_ALL_FROZEN_SET;
2489
2490 xlrec->ntuples = nthispage;
2491
2492 /*
2493 * Write out an xl_multi_insert_tuple and the tuple data itself
2494 * for each tuple.
2495 */
2496 for (i = 0; i < nthispage; i++)
2497 {
2498 HeapTuple heaptup = heaptuples[ndone + i];
2499 xl_multi_insert_tuple *tuphdr;
2500 int datalen;
2501
2502 if (!init)
2503 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2504 /* xl_multi_insert_tuple needs two-byte alignment. */
2505 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2506 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2507
2508 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2509 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2510 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2511
2512 /* write bitmap [+ padding] [+ oid] + data */
2513 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2514 memcpy(scratchptr,
2515 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2516 datalen);
2517 tuphdr->datalen = datalen;
2518 scratchptr += datalen;
2519 }
2520 totaldatalen = scratchptr - tupledata;
2521 Assert((scratchptr - scratch.data) < BLCKSZ);
2522
2523 if (need_tuple_data)
2524 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2525
2526 /*
2527 * Signal that this is the last xl_heap_multi_insert record
2528 * emitted by this call to heap_multi_insert(). Needed for logical
2529 * decoding so it knows when to clean up temporary data.
2530 */
2531 if (ndone + nthispage == ntuples)
2532 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2533
2534 if (init)
2535 {
2536 info |= XLOG_HEAP_INIT_PAGE;
2537 bufflags |= REGBUF_WILL_INIT;
2538 }
2539
2540 /*
2541 * If we're doing logical decoding, include the new tuple data
2542 * even if we take a full-page image of the page.
2543 */
2544 if (need_tuple_data)
2545 bufflags |= REGBUF_KEEP_DATA;
2546
2547 XLogBeginInsert();
2548 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2549 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2550
2551 XLogRegisterBufData(0, tupledata, totaldatalen);
2552
2553 /* filtering by origin on a row level is much more efficient */
2554 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2555
2556 recptr = XLogInsert(RM_HEAP2_ID, info);
2557
2558 PageSetLSN(page, recptr);
2559 }
2560
2561 END_CRIT_SECTION();
2562
2563 /*
2564 * If we've frozen everything on the page, update the visibilitymap.
2565 * We're already holding pin on the vmbuffer.
2566 */
2567 if (all_frozen_set)
2568 {
2569 Assert(PageIsAllVisible(page));
2570 Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2571
2572 /*
2573 * It's fine to use InvalidTransactionId here - this is only used
2574 * when HEAP_INSERT_FROZEN is specified, which intentionally
2575 * violates visibility rules.
2576 */
2577 visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2578 InvalidXLogRecPtr, vmbuffer,
2579 InvalidTransactionId,
2580 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
2581 }
2582
2583 UnlockReleaseBuffer(buffer);
2584 ndone += nthispage;
2585
2586 /*
2587 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2588 * likely that we'll insert into subsequent heap pages that are covered
2589 * by the same vm page.
2590 */
2591 }
2592
2593 /* We're done with inserting all tuples, so release the last vmbuffer. */
2594 if (vmbuffer != InvalidBuffer)
2595 ReleaseBuffer(vmbuffer);
2596
2597 /*
2598 * We're done with the actual inserts. Check for conflicts again, to
2599 * ensure that all rw-conflicts in to these inserts are detected. Without
2600 * this final check, a sequential scan of the heap may have locked the
2601 * table after the "before" check, missing one opportunity to detect the
2602 * conflict, and then scanned the table before the new tuples were there,
2603 * missing the other chance to detect the conflict.
2604 *
2605 * For heap inserts, we only need to check for table-level SSI locks. Our
2606 * new tuples can't possibly conflict with existing tuple locks, and heap
2607 * page locks are only consolidated versions of tuple locks; they do not
2608 * lock "gaps" as index page locks do. So we don't need to specify a
2609 * buffer when making the call.
2610 */
2611 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2612
2613 /*
2614 * If tuples are cachable, mark them for invalidation from the caches in
2615 * case we abort. Note it is OK to do this after releasing the buffer,
2616 * because the heaptuples data structure is all in local memory, not in
2617 * the shared buffer.
2618 */
2619 if (IsCatalogRelation(relation))
2620 {
2621 for (i = 0; i < ntuples; i++)
2622 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2623 }
2624
2625 /* copy t_self fields back to the caller's slots */
2626 for (i = 0; i < ntuples; i++)
2627 slots[i]->tts_tid = heaptuples[i]->t_self;
2628
2629 pgstat_count_heap_insert(relation, ntuples);
2630 }
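
/*
 * Illustrative caller sketch: COPY FROM accumulates filled slots and
 * flushes them in one call (normally via table_multi_insert(); see
 * CopyMultiInsertBufferFlush()):
 *
 *		heap_multi_insert(rel, slots, nused, mycid, ti_options, bistate);
 *
 * pairing naturally with the BulkInsertState sketch shown earlier.
 */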
2631
2632 /*
2633 * simple_heap_insert - insert a tuple
2634 *
2635 * Currently, this routine differs from heap_insert only in supplying
2636 * a default command ID and not allowing access to the speedup options.
2637 *
2638 * This should be used rather than using heap_insert directly in most places
2639 * where we are modifying system catalogs.
2640 */
2641 void
2642 simple_heap_insert(Relation relation, HeapTuple tup)
2643 {
2644 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2645 }
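
/*
 * Catalog code normally reaches this via CatalogTupleInsert(), which also
 * updates the catalog's indexes; an illustrative call site (names are
 * schematic, not from this file):
 *
 *		tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
 *		CatalogTupleInsert(rel, tup);
 *		heap_freetuple(tup);
 */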
2646
2647 /*
2648 * Given infomask/infomask2, compute the bits that must be saved in the
2649 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2650 * xl_heap_lock_updated WAL records.
2651 *
2652 * See fix_infomask_from_infobits.
2653 */
2654 static uint8
2655 compute_infobits(uint16 infomask, uint16 infomask2)
2656 {
2657 return
2658 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2659 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2660 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2661 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2662 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2663 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2664 XLHL_KEYS_UPDATED : 0);
2665 }
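
/*
 * Example: for a tuple whose xmax is a plain updater with HEAP_KEYS_UPDATED
 * set, this returns just XLHL_KEYS_UPDATED; at redo time
 * fix_infomask_from_infobits() maps the XLHL_ bits back onto
 * t_infomask/t_infomask2.
 */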
2666
2667 /*
2668 * Given two versions of the same t_infomask for a tuple, compare them and
2669 * return whether the relevant status for a tuple Xmax has changed. This is
2670 * used after a buffer lock has been released and reacquired: we want to ensure
2671 * that the tuple state continues to be the same it was when we previously
2672 * examined it.
2673 *
2674 * Note the Xmax field itself must be compared separately.
2675 */
2676 static inline bool
2677 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2678 {
2679 const uint16 interesting =
2680 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2681
2682 if ((new_infomask & interesting) != (old_infomask & interesting))
2683 return true;
2684
2685 return false;
2686 }
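
/*
 * For example, if while we slept a single key-share locker was joined by
 * another locker (turning the xmax into a multixact), HEAP_XMAX_IS_MULTI
 * now differs, and the callers below restart at their l1/l2 labels.
 */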
2687
2688 /*
2689 * heap_delete - delete a tuple
2690 *
2691 * See table_tuple_delete() for an explanation of the parameters, except that
2692 * this routine directly takes a tuple rather than a slot.
2693 *
2694 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2695 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2696 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2697 * generated by another transaction).
2698 */
2699 TM_Result
2700 heap_delete(Relation relation, ItemPointer tid,
2701 CommandId cid, Snapshot crosscheck, bool wait,
2702 TM_FailureData *tmfd, bool changingPart)
2703 {
2704 TM_Result result;
2705 TransactionId xid = GetCurrentTransactionId();
2706 ItemId lp;
2707 HeapTupleData tp;
2708 Page page;
2709 BlockNumber block;
2710 Buffer buffer;
2711 Buffer vmbuffer = InvalidBuffer;
2712 TransactionId new_xmax;
2713 uint16 new_infomask,
2714 new_infomask2;
2715 bool have_tuple_lock = false;
2716 bool iscombo;
2717 bool all_visible_cleared = false;
2718 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2719 bool old_key_copied = false;
2720
2721 Assert(ItemPointerIsValid(tid));
2722
2723 /*
2724 * Forbid this during a parallel operation, lest it allocate a combo CID.
2725 * Other workers might need that combo CID for visibility checks, and we
2726 * have no provision for broadcasting it to them.
2727 */
2728 if (IsInParallelMode())
2729 ereport(ERROR,
2730 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2731 errmsg("cannot delete tuples during a parallel operation")));
2732
2733 block = ItemPointerGetBlockNumber(tid);
2734 buffer = ReadBuffer(relation, block);
2735 page = BufferGetPage(buffer);
2736
2737 /*
2738 * Before locking the buffer, pin the visibility map page if it appears to
2739 * be necessary. Since we haven't got the lock yet, someone else might be
2740 * in the middle of changing this, so we'll need to recheck after we have
2741 * the lock.
2742 */
2743 if (PageIsAllVisible(page))
2744 visibilitymap_pin(relation, block, &vmbuffer);
2745
2746 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2747
2748 /*
2749 * If we didn't pin the visibility map page and the page has become all
2750 * visible while we were busy locking the buffer, we'll have to unlock and
2751 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2752 * unfortunate, but hopefully shouldn't happen often.
2753 */
2754 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2755 {
2756 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2757 visibilitymap_pin(relation, block, &vmbuffer);
2758 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2759 }
2760
2761 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2762 Assert(ItemIdIsNormal(lp));
2763
2764 tp.t_tableOid = RelationGetRelid(relation);
2765 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2766 tp.t_len = ItemIdGetLength(lp);
2767 tp.t_self = *tid;
2768
2769 l1:
2770 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2771
2772 if (result == TM_Invisible)
2773 {
2774 UnlockReleaseBuffer(buffer);
2775 ereport(ERROR,
2776 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2777 errmsg("attempted to delete invisible tuple")));
2778 }
2779 else if (result == TM_BeingModified && wait)
2780 {
2781 TransactionId xwait;
2782 uint16 infomask;
2783
2784 /* must copy state data before unlocking buffer */
2785 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2786 infomask = tp.t_data->t_infomask;
2787
2788 /*
2789 * Sleep until concurrent transaction ends -- except when there's a
2790 * single locker and it's our own transaction. Note we don't care
2791 * which lock mode the locker has, because we need the strongest one.
2792 *
2793 * Before sleeping, we need to acquire tuple lock to establish our
2794 * priority for the tuple (see heap_lock_tuple). LockTuple will
2795 * release us when we are next-in-line for the tuple.
2796 *
2797 * If we are forced to "start over" below, we keep the tuple lock;
2798 * this arranges that we stay at the head of the line while rechecking
2799 * tuple state.
2800 */
2801 if (infomask & HEAP_XMAX_IS_MULTI)
2802 {
2803 bool current_is_member = false;
2804
2805 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2806 LockTupleExclusive, &current_is_member))
2807 {
2808 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2809
2810 /*
2811 * Acquire the lock, if necessary (but skip it when we're
2812 * requesting a lock and already have one; avoids deadlock).
2813 */
2814 if (!current_is_member)
2815 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2816 LockWaitBlock, &have_tuple_lock);
2817
2818 /* wait for multixact */
2819 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2820 relation, &(tp.t_self), XLTW_Delete,
2821 NULL);
2822 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2823
2824 /*
2825 * If xwait had just locked the tuple then some other xact
2826 * could update this tuple before we get to this point. Check
2827 * for xmax change, and start over if so.
2828 */
2829 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2830 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2831 xwait))
2832 goto l1;
2833 }
2834
2835 /*
2836 * You might think the multixact is necessarily done here, but not
2837 * so: it could have surviving members, namely our own xact or
2838 * other subxacts of this backend. It is legal for us to delete
2839 * the tuple in either case, however (the latter case is
2840 * essentially a situation of upgrading our former shared lock to
2841 * exclusive). We don't bother changing the on-disk hint bits
2842 * since we are about to overwrite the xmax altogether.
2843 */
2844 }
2845 else if (!TransactionIdIsCurrentTransactionId(xwait))
2846 {
2847 /*
2848 * Wait for regular transaction to end; but first, acquire tuple
2849 * lock.
2850 */
2851 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2852 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2853 LockWaitBlock, &have_tuple_lock);
2854 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2855 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2856
2857 /*
2858 * xwait is done, but if xwait had just locked the tuple then some
2859 * other xact could update this tuple before we get to this point.
2860 * Check for xmax change, and start over if so.
2861 */
2862 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2863 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2864 xwait))
2865 goto l1;
2866
2867 /* Otherwise check if it committed or aborted */
2868 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2869 }
2870
2871 /*
2872 * We may overwrite if previous xmax aborted, or if it committed but
2873 * only locked the tuple without updating it.
2874 */
2875 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2876 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2877 HeapTupleHeaderIsOnlyLocked(tp.t_data))
2878 result = TM_Ok;
2879 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2880 result = TM_Updated;
2881 else
2882 result = TM_Deleted;
2883 }
2884
2885 if (crosscheck != InvalidSnapshot && result == TM_Ok)
2886 {
2887 /* Perform additional check for transaction-snapshot mode RI updates */
2888 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2889 result = TM_Updated;
2890 }
2891
2892 if (result != TM_Ok)
2893 {
2894 Assert(result == TM_SelfModified ||
2895 result == TM_Updated ||
2896 result == TM_Deleted ||
2897 result == TM_BeingModified);
2898 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2899 Assert(result != TM_Updated ||
2900 !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2901 tmfd->ctid = tp.t_data->t_ctid;
2902 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2903 if (result == TM_SelfModified)
2904 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2905 else
2906 tmfd->cmax = InvalidCommandId;
2907 UnlockReleaseBuffer(buffer);
2908 if (have_tuple_lock)
2909 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2910 if (vmbuffer != InvalidBuffer)
2911 ReleaseBuffer(vmbuffer);
2912 return result;
2913 }
2914
2915 /*
2916 * We're about to do the actual delete -- check for conflict first, to
2917 * avoid possibly having to roll back work we've just done.
2918 *
2919 * This is safe without a recheck as long as there is no possibility of
2920 * another process scanning the page between this check and the delete
2921 * being visible to the scan (i.e., an exclusive buffer content lock is
2922 * continuously held from this point until the tuple delete is visible).
2923 */
2924 CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2925
2926 /* replace cid with a combo CID if necessary */
2927 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2928
2929 /*
2930 * Compute replica identity tuple before entering the critical section so
2931 * we don't PANIC upon a memory allocation failure.
2932 */
2933 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2934
2935 /*
2936 * If this is the first possibly-multixact-able operation in the current
2937 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2938 * certain that the transaction will never become a member of any older
2939 * MultiXactIds than that. (We have to do this even if we end up just
2940 * using our own TransactionId below, since some other backend could
2941 * incorporate our XID into a MultiXact immediately afterwards.)
2942 */
2943 MultiXactIdSetOldestMember();
2944
2945 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2946 tp.t_data->t_infomask, tp.t_data->t_infomask2,
2947 xid, LockTupleExclusive, true,
2948 &new_xmax, &new_infomask, &new_infomask2);
2949
2950 START_CRIT_SECTION();
2951
2952 /*
2953 * If this transaction commits, the tuple will become DEAD sooner or
2954 * later. Set flag that this page is a candidate for pruning once our xid
2955 * falls below the OldestXmin horizon. If the transaction finally aborts,
2956 * the subsequent page pruning will be a no-op and the hint will be
2957 * cleared.
2958 */
2959 PageSetPrunable(page, xid);
2960
2961 if (PageIsAllVisible(page))
2962 {
2963 all_visible_cleared = true;
2964 PageClearAllVisible(page);
2965 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2966 vmbuffer, VISIBILITYMAP_VALID_BITS);
2967 }
2968
2969 /* store transaction information of xact deleting the tuple */
2970 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2971 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2972 tp.t_data->t_infomask |= new_infomask;
2973 tp.t_data->t_infomask2 |= new_infomask2;
2974 HeapTupleHeaderClearHotUpdated(tp.t_data);
2975 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2976 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2977 /* Make sure there is no forward chain link in t_ctid */
2978 tp.t_data->t_ctid = tp.t_self;
2979
2980 /* Signal that this is actually a move into another partition */
2981 if (changingPart)
2982 HeapTupleHeaderSetMovedPartitions(tp.t_data);
2983
2984 MarkBufferDirty(buffer);
2985
2986 /*
2987 * XLOG stuff
2988 *
2989 * NB: heap_abort_speculative() uses the same xlog record and replay
2990 * routines.
2991 */
2992 if (RelationNeedsWAL(relation))
2993 {
2994 xl_heap_delete xlrec;
2995 xl_heap_header xlhdr;
2996 XLogRecPtr recptr;
2997
2998 /*
2999 * For logical decoding we need combo CIDs to properly decode the
3000 * catalog.
3001 */
3002 if (RelationIsAccessibleInLogicalDecoding(relation))
3003 log_heap_new_cid(relation, &tp);
3004
3005 xlrec.flags = 0;
3006 if (all_visible_cleared)
3007 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
3008 if (changingPart)
3009 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3010 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3011 tp.t_data->t_infomask2);
3012 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3013 xlrec.xmax = new_xmax;
3014
3015 if (old_key_tuple != NULL)
3016 {
3017 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3018 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3019 else
3020 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3021 }
3022
3023 XLogBeginInsert();
3024 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3025
3026 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3027
3028 /*
3029 * Log replica identity of the deleted tuple if there is one
3030 */
3031 if (old_key_tuple != NULL)
3032 {
3033 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3034 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3035 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3036
3037 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3038 XLogRegisterData((char *) old_key_tuple->t_data
3039 + SizeofHeapTupleHeader,
3040 old_key_tuple->t_len
3041 - SizeofHeapTupleHeader);
3042 }
3043
3044 /* filtering by origin on a row level is much more efficient */
3045 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3046
3047 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3048
3049 PageSetLSN(page, recptr);
3050 }
3051
3052 END_CRIT_SECTION();
3053
3054 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3055
3056 if (vmbuffer != InvalidBuffer)
3057 ReleaseBuffer(vmbuffer);
3058
3059 /*
3060 * If the tuple has toasted out-of-line attributes, we need to delete
3061 * those items too. We have to do this before releasing the buffer
3062 * because we need to look at the contents of the tuple, but it's OK to
3063 * release the content lock on the buffer first.
3064 */
3065 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3066 relation->rd_rel->relkind != RELKIND_MATVIEW)
3067 {
3068 /* toast table entries should never be recursively toasted */
3069 Assert(!HeapTupleHasExternal(&tp));
3070 }
3071 else if (HeapTupleHasExternal(&tp))
3072 heap_toast_delete(relation, &tp, false);
3073
3074 /*
3075 * Mark tuple for invalidation from system caches at next command
3076 * boundary. We have to do this before releasing the buffer because we
3077 * need to look at the contents of the tuple.
3078 */
3079 CacheInvalidateHeapTuple(relation, &tp, NULL);
3080
3081 /* Now we can release the buffer */
3082 ReleaseBuffer(buffer);
3083
3084 /*
3085 * Release the lmgr tuple lock, if we had it.
3086 */
3087 if (have_tuple_lock)
3088 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3089
3090 pgstat_count_heap_delete(relation);
3091
3092 if (old_key_tuple != NULL && old_key_copied)
3093 heap_freetuple(old_key_tuple);
3094
3095 return TM_Ok;
3096 }
3097
3098 /*
3099 * simple_heap_delete - delete a tuple
3100 *
3101 * This routine may be used to delete a tuple when concurrent updates of
3102 * the target tuple are not expected (for example, because we have a lock
3103 * on the relation associated with the tuple). Any failure is reported
3104 * via ereport().
3105 */
3106 void
3107 simple_heap_delete(Relation relation, ItemPointer tid)
3108 {
3109 TM_Result result;
3110 TM_FailureData tmfd;
3111
3112 result = heap_delete(relation, tid,
3113 GetCurrentCommandId(true), InvalidSnapshot,
3114 true /* wait for commit */ ,
3115 &tmfd, false /* changingPart */ );
3116 switch (result)
3117 {
3118 case TM_SelfModified:
3119 /* Tuple was already updated in current command? */
3120 elog(ERROR, "tuple already updated by self");
3121 break;
3122
3123 case TM_Ok:
3124 /* done successfully */
3125 break;
3126
3127 case TM_Updated:
3128 elog(ERROR, "tuple concurrently updated");
3129 break;
3130
3131 case TM_Deleted:
3132 elog(ERROR, "tuple concurrently deleted");
3133 break;
3134
3135 default:
3136 elog(ERROR, "unrecognized heap_delete status: %u", result);
3137 break;
3138 }
3139 }
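
/*
 * Catalog code normally reaches this via CatalogTupleDelete(); an
 * illustrative call site (the syscache id is schematic):
 *
 *		tup = SearchSysCache1(SOMEOID, ObjectIdGetDatum(objid));
 *		CatalogTupleDelete(rel, &tup->t_self);
 *		ReleaseSysCache(tup);
 */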
3140
3141 /*
3142 * heap_update - replace a tuple
3143 *
3144 * See table_tuple_update() for an explanation of the parameters, except that
3145 * this routine directly takes a tuple rather than a slot.
3146 *
3147 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3148 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3149 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3150 * generated by another transaction).
3151 */
3152 TM_Result
3153 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3154 CommandId cid, Snapshot crosscheck, bool wait,
3155 TM_FailureData *tmfd, LockTupleMode *lockmode)
3156 {
3157 TM_Result result;
3158 TransactionId xid = GetCurrentTransactionId();
3159 Bitmapset *hot_attrs;
3160 Bitmapset *key_attrs;
3161 Bitmapset *id_attrs;
3162 Bitmapset *interesting_attrs;
3163 Bitmapset *modified_attrs;
3164 ItemId lp;
3165 HeapTupleData oldtup;
3166 HeapTuple heaptup;
3167 HeapTuple old_key_tuple = NULL;
3168 bool old_key_copied = false;
3169 Page page;
3170 BlockNumber block;
3171 MultiXactStatus mxact_status;
3172 Buffer buffer,
3173 newbuf,
3174 vmbuffer = InvalidBuffer,
3175 vmbuffer_new = InvalidBuffer;
3176 bool need_toast;
3177 Size newtupsize,
3178 pagefree;
3179 bool have_tuple_lock = false;
3180 bool iscombo;
3181 bool use_hot_update = false;
3182 bool hot_attrs_checked = false;
3183 bool key_intact;
3184 bool all_visible_cleared = false;
3185 bool all_visible_cleared_new = false;
3186 bool checked_lockers;
3187 bool locker_remains;
3188 TransactionId xmax_new_tuple,
3189 xmax_old_tuple;
3190 uint16 infomask_old_tuple,
3191 infomask2_old_tuple,
3192 infomask_new_tuple,
3193 infomask2_new_tuple;
3194
3195 Assert(ItemPointerIsValid(otid));
3196
3197 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3198 Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3199 RelationGetNumberOfAttributes(relation));
3200
3201 /*
3202 * Forbid this during a parallel operation, lest it allocate a combo CID.
3203 * Other workers might need that combo CID for visibility checks, and we
3204 * have no provision for broadcasting it to them.
3205 */
3206 if (IsInParallelMode())
3207 ereport(ERROR,
3208 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3209 errmsg("cannot update tuples during a parallel operation")));
3210
3211 /*
3212 * Fetch the list of attributes to be checked for various operations.
3213 *
3214 * For HOT considerations, this is wasted effort if we fail to update or
3215 * have to put the new tuple on a different page. But we must compute the
3216 * list before obtaining buffer lock --- in the worst case, if we are
3217 * doing an update on one of the relevant system catalogs, we could
3218 * deadlock if we try to fetch the list later. In any case, the relcache
3219 * caches the data so this is usually pretty cheap.
3220 *
3221 * We also need columns used by the replica identity and columns that are
3222 * considered the "key" of rows in the table.
3223 *
3224 * Note that we get copies of each bitmap, so we need not worry about
3225 * relcache flush happening midway through.
3226 */
3227 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3228 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3229 id_attrs = RelationGetIndexAttrBitmap(relation,
3230 INDEX_ATTR_BITMAP_IDENTITY_KEY);
3231
3232
3233 block = ItemPointerGetBlockNumber(otid);
3234 buffer = ReadBuffer(relation, block);
3235 page = BufferGetPage(buffer);
3236
3237 interesting_attrs = NULL;
3238
3239 /*
3240 * If the page is already full, there is hardly any chance of doing a HOT
3241 * update on this page. It might be wasteful effort to look for index
3242 * column updates only to later reject HOT updates for lack of space in
3243 * the same page. So we are conservative and only fetch hot_attrs if the
3244 * page is not already full. Since we are already holding a pin on the
3245 * buffer, there is no chance that the buffer can get cleaned up
3246 * concurrently; and even if that were possible, in the worst case we lose a
3247 * chance to do a HOT update.
3248 */
3249 if (!PageIsFull(page))
3250 {
3251 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3252 hot_attrs_checked = true;
3253 }
3254 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3255 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3256
3257 /*
3258 * Before locking the buffer, pin the visibility map page if it appears to
3259 * be necessary. Since we haven't got the lock yet, someone else might be
3260 * in the middle of changing this, so we'll need to recheck after we have
3261 * the lock.
3262 */
3263 if (PageIsAllVisible(page))
3264 visibilitymap_pin(relation, block, &vmbuffer);
3265
3266 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3267
3268 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3269 Assert(ItemIdIsNormal(lp));
3270
3271 /*
3272 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3273 * properly.
3274 */
3275 oldtup.t_tableOid = RelationGetRelid(relation);
3276 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3277 oldtup.t_len = ItemIdGetLength(lp);
3278 oldtup.t_self = *otid;
3279
3280 /* the new tuple is ready, except for this: */
3281 newtup->t_tableOid = RelationGetRelid(relation);
3282
3283 /* Determine columns modified by the update. */
3284 modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3285 &oldtup, newtup);
3286
3287 /*
3288 * If we're not updating any "key" column, we can grab a weaker lock type.
3289 * This allows for more concurrency when we are running simultaneously
3290 * with foreign key checks.
3291 *
3292 * Note that if a column gets detoasted while executing the update, but
3293 * the value ends up being the same, this test will fail and we will use
3294 * the stronger lock. This is acceptable; the important case to optimize
3295 * is updates that don't manipulate key columns, not those that
3296 * serendipitously arrive at the same key values.
3297 */
3298 if (!bms_overlap(modified_attrs, key_attrs))
3299 {
3300 *lockmode = LockTupleNoKeyExclusive;
3301 mxact_status = MultiXactStatusNoKeyUpdate;
3302 key_intact = true;
3303
3304 /*
3305 * If this is the first possibly-multixact-able operation in the
3306 * current transaction, set my per-backend OldestMemberMXactId
3307 * setting. We can be certain that the transaction will never become a
3308 * member of any older MultiXactIds than that. (We have to do this
3309 * even if we end up just using our own TransactionId below, since
3310 * some other backend could incorporate our XID into a MultiXact
3311 * immediately afterwards.)
3312 */
3313 MultiXactIdSetOldestMember();
3314 }
3315 else
3316 {
3317 *lockmode = LockTupleExclusive;
3318 mxact_status = MultiXactStatusUpdate;
3319 key_intact = false;
3320 }
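
/*
 * Example of the distinction above: an UPDATE that touches only non-key
 * columns takes LockTupleNoKeyExclusive, which does not conflict with the
 * LockTupleKeyShare taken by concurrent foreign-key checks (SELECT ... FOR
 * KEY SHARE), whereas an UPDATE of a key column takes LockTupleExclusive
 * and must wait for them.
 */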
3321
3322 /*
3323 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3324 * otid may very well point at newtup->t_self, which we will overwrite
3325 * with the new tuple's location, so there's great risk of confusion if we
3326 * use otid anymore.
3327 */
3328
3329 l2:
3330 checked_lockers = false;
3331 locker_remains = false;
3332 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3333
3334 /* see below about the "no wait" case */
3335 Assert(result != TM_BeingModified || wait);
3336
3337 if (result == TM_Invisible)
3338 {
3339 UnlockReleaseBuffer(buffer);
3340 ereport(ERROR,
3341 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3342 errmsg("attempted to update invisible tuple")));
3343 }
3344 else if (result == TM_BeingModified && wait)
3345 {
3346 TransactionId xwait;
3347 uint16 infomask;
3348 bool can_continue = false;
3349
3350 /*
3351 * XXX note that we don't consider the "no wait" case here. This
3352 * isn't a problem currently because no caller uses that case, but it
3353 * should be fixed if such a caller is introduced. It wasn't a
3354 * problem previously because this code would always wait, but now
3355 * that some tuple locks do not conflict with one of the lock modes we
3356 * use, it is possible that this case is interesting to handle
3357 * specially.
3358 *
3359 * This may cause failures with third-party code that calls
3360 * heap_update directly.
3361 */
3362
3363 /* must copy state data before unlocking buffer */
3364 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3365 infomask = oldtup.t_data->t_infomask;
3366
3367 /*
3368 * Now we have to do something about the existing locker. If it's a
3369 * multi, sleep on it; we might be awakened before it is completely
3370 * gone (or even not sleep at all in some cases); we need to preserve
3371 * it as locker, unless it is gone completely.
3372 *
3373 * If it's not a multi, we need to check for sleeping conditions
3374 * before actually going to sleep. If the update doesn't conflict
3375 * with the locks, we just continue without sleeping (but making sure
3376 * it is preserved).
3377 *
3378 * Before sleeping, we need to acquire tuple lock to establish our
3379 * priority for the tuple (see heap_lock_tuple). LockTuple will
3380 * release us when we are next-in-line for the tuple. Note we must
3381 * not acquire the tuple lock until we're sure we're going to sleep;
3382 * otherwise we're open for race conditions with other transactions
3383 * holding the tuple lock which sleep on us.
3384 *
3385 * If we are forced to "start over" below, we keep the tuple lock;
3386 * this arranges that we stay at the head of the line while rechecking
3387 * tuple state.
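*
* (heap_acquire_tuplock below implements this "get in line" step; the
* have_tuple_lock flag records that we took the lock, so that it can be
* released on exit whether we fail or succeed.)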
3388 */
3389 if (infomask & HEAP_XMAX_IS_MULTI)
3390 {
3391 TransactionId update_xact;
3392 int remain;
3393 bool current_is_member = false;
3394
3395 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3396 *lockmode, &current_is_member))
3397 {
3398 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3399
3400 /*
3401 * Acquire the lock, if necessary (but skip it when we're
3402 * requesting a lock and already have one; avoids deadlock).
3403 */
3404 if (!current_is_member)
3405 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3406 LockWaitBlock, &have_tuple_lock);
3407
3408 /* wait for multixact */
3409 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3410 relation, &oldtup.t_self, XLTW_Update,
3411 &remain);
3412 checked_lockers = true;
3413 locker_remains = remain != 0;
3414 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3415
3416 /*
3417 * If xwait had just locked the tuple then some other xact
3418 * could update this tuple before we get to this point. Check
3419 * for xmax change, and start over if so.
3420 */
3421 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3422 infomask) ||
3423 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3424 xwait))
3425 goto l2;
3426 }
3427
3428 /*
3429 * Note that the multixact may not be done by now. It could have
3430 * surviving members: our own xact or other subxacts of this
3431 * backend, and also any other concurrent transaction that locked
3432 * the tuple with LockTupleKeyShare if we only got
3433 * LockTupleNoKeyExclusive. If this is the case, we have to be
3434 * careful to mark the updated tuple with the surviving members in
3435 * Xmax.
3436 *
3437 * Note that there could have been another update in the
3438 * MultiXact. In that case, we need to check whether it committed
3439 * or aborted. If it aborted we are safe to update it again;
3440 * otherwise there is an update conflict, and we have to return
3441 * TM_Updated or TM_Deleted below.
3442 *
3443 * In the LockTupleExclusive case, we still need to preserve the
3444 * surviving members: those would include the tuple locks we had
3445 * before this one, which are important to keep in case this
3446 * subxact aborts.
3447 */
3448 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3449 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3450 else
3451 update_xact = InvalidTransactionId;
3452
3453 /*
3454 * There was no UPDATE in the MultiXact; or it aborted. No
3455 * TransactionIdIsInProgress() call needed here, since we called
3456 * MultiXactIdWait() above.
3457 */
3458 if (!TransactionIdIsValid(update_xact) ||
3459 TransactionIdDidAbort(update_xact))
3460 can_continue = true;
3461 }
3462 else if (TransactionIdIsCurrentTransactionId(xwait))
3463 {
3464 /*
3465 * The only locker is ourselves; we can avoid grabbing the tuple
3466 * lock here, but must preserve our locking information.
3467 */
3468 checked_lockers = true;
3469 locker_remains = true;
3470 can_continue = true;
3471 }
3472 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3473 {
3474 /*
3475 * If it's just a key-share locker, and we're not changing the key
3476 * columns, we don't need to wait for it to end; but we need to
3477 * preserve it as locker.
3478 */
3479 checked_lockers = true;
3480 locker_remains = true;
3481 can_continue = true;
3482 }
3483 else
3484 {
3485 /*
3486 * Wait for regular transaction to end; but first, acquire tuple
3487 * lock.
3488 */
3489 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3490 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3491 LockWaitBlock, &have_tuple_lock);
3492 XactLockTableWait(xwait, relation, &oldtup.t_self,
3493 XLTW_Update);
3494 checked_lockers = true;
3495 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3496
3497 /*
3498 * xwait is done, but if xwait had just locked the tuple then some
3499 * other xact could update this tuple before we get to this point.
3500 * Check for xmax change, and start over if so.
3501 */
3502 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3503 !TransactionIdEquals(xwait,
3504 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3505 goto l2;
3506
3507 /* Otherwise check if it committed or aborted */
3508 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3509 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3510 can_continue = true;
3511 }
3512
3513 if (can_continue)
3514 result = TM_Ok;
3515 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3516 result = TM_Updated;
3517 else
3518 result = TM_Deleted;
3519 }
3520
3521 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3522 {
3523 /* Perform additional check for transaction-snapshot mode RI updates */
3524 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3525 {
3526 result = TM_Updated;
3527 Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3528 }
3529 }
3530
3531 if (result != TM_Ok)
3532 {
3533 Assert(result == TM_SelfModified ||
3534 result == TM_Updated ||
3535 result == TM_Deleted ||
3536 result == TM_BeingModified);
3537 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3538 Assert(result != TM_Updated ||
3539 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3540 tmfd->ctid = oldtup.t_data->t_ctid;
3541 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3542 if (result == TM_SelfModified)
3543 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3544 else
3545 tmfd->cmax = InvalidCommandId;
3546 UnlockReleaseBuffer(buffer);
3547 if (have_tuple_lock)
3548 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3549 if (vmbuffer != InvalidBuffer)
3550 ReleaseBuffer(vmbuffer);
3551 bms_free(hot_attrs);
3552 bms_free(key_attrs);
3553 bms_free(id_attrs);
3554 bms_free(modified_attrs);
3555 bms_free(interesting_attrs);
3556 return result;
3557 }
3558
3559 /*
3560 * If we didn't pin the visibility map page and the page has become all
3561 * visible while we were busy locking the buffer, or during some
3562 * subsequent window during which we had it unlocked, we'll have to unlock
3563 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3564 * bit unfortunate, especially since we'll now have to recheck whether the
3565 * tuple has been locked or updated under us, but hopefully it won't
3566 * happen very often.
3567 */
3568 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3569 {
3570 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3571 visibilitymap_pin(relation, block, &vmbuffer);
3572 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3573 goto l2;
3574 }
3575
3576 /* Fill in transaction status data */
3577
3578 /*
3579 * If the tuple we're updating is locked, we need to preserve the locking
3580 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3581 */
3582 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3583 oldtup.t_data->t_infomask,
3584 oldtup.t_data->t_infomask2,
3585 xid, *lockmode, true,
3586 &xmax_old_tuple, &infomask_old_tuple,
3587 &infomask2_old_tuple);
3588
3589 /*
3590 * And also prepare an Xmax value for the new copy of the tuple. If there
3591 * was no xmax previously, or there was one but all lockers are now gone,
3592 * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3593 * rare cases that might also be InvalidXid and yet not have the
3594 * HEAP_XMAX_INVALID bit set; that's fine.)
3595 */
3596 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3597 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3598 (checked_lockers && !locker_remains))
3599 xmax_new_tuple = InvalidTransactionId;
3600 else
3601 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3602
3603 if (!TransactionIdIsValid(xmax_new_tuple))
3604 {
3605 infomask_new_tuple = HEAP_XMAX_INVALID;
3606 infomask2_new_tuple = 0;
3607 }
3608 else
3609 {
3610 /*
3611 * If we found a valid Xmax for the new tuple, then the infomask bits
3612 * to use on the new tuple depend on what was there on the old one.
3613 * Note that since we're doing an update, the only possibility is that
3614 * the lockers had FOR KEY SHARE lock.
3615 */
3616 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3617 {
3618 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3619 &infomask2_new_tuple);
3620 }
3621 else
3622 {
3623 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3624 infomask2_new_tuple = 0;
3625 }
3626 }
3627
3628 /*
3629 * Prepare the new tuple with the appropriate initial values of Xmin and
3630 * Xmax, as well as initial infomask bits as computed above.
3631 */
3632 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3633 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3634 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3635 HeapTupleHeaderSetCmin(newtup->t_data, cid);
3636 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3637 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3638 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3639
3640 /*
3641 * Replace cid with a combo CID if necessary. Note that we already put
3642 * the plain cid into the new tuple.
3643 */
3644 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
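/*
* (A combo CID is needed when the same transaction both created this
* tuple and is now updating it: cmin and cmax must share the tuple's
* single command-id field, so a combo CID encodes the pair, and
* "iscombo" tells HeapTupleHeaderSetCmax below how to mark it.)
*/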
3645
3646 /*
3647 * If the toaster needs to be activated, OR if the new tuple will not fit
3648 * on the same page as the old, then we need to release the content lock
3649 * (but not the pin!) on the old tuple's buffer while we are off doing
3650 * TOAST and/or table-file-extension work. We must mark the old tuple to
3651 * show that it's locked, else other processes may try to update it
3652 * themselves.
3653 *
3654 * We need to invoke the toaster if there are already any out-of-line
3655 * toasted values present, or if the new tuple is over-threshold.
3656 */
3657 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3658 relation->rd_rel->relkind != RELKIND_MATVIEW)
3659 {
3660 /* toast table entries should never be recursively toasted */
3661 Assert(!HeapTupleHasExternal(&oldtup));
3662 Assert(!HeapTupleHasExternal(newtup));
3663 need_toast = false;
3664 }
3665 else
3666 need_toast = (HeapTupleHasExternal(&oldtup) ||
3667 HeapTupleHasExternal(newtup) ||
3668 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3669
3670 pagefree = PageGetHeapFreeSpace(page);
3671
3672 newtupsize = MAXALIGN(newtup->t_len);
3673
3674 if (need_toast || newtupsize > pagefree)
3675 {
3676 TransactionId xmax_lock_old_tuple;
3677 uint16 infomask_lock_old_tuple,
3678 infomask2_lock_old_tuple;
3679 bool cleared_all_frozen = false;
3680
3681 /*
3682 * To prevent concurrent sessions from updating the tuple, we have to
3683 * temporarily mark it locked, while we release the page-level lock.
3684 *
3685 * To satisfy the rule that any xid potentially appearing in a buffer
3686 * written out to disk must first be covered by WAL, we unfortunately
3687 * have to WAL-log this temporary modification. We can reuse
3688 * xl_heap_lock for this purpose. If we crash or error out before
3689 * following through with the actual update, xmax will be that of an
3690 * aborted transaction, allowing other sessions to proceed.
3691 */
3692
3693 /*
3694 * Compute xmax / infomask appropriate for locking the tuple. This has
3695 * to be done separately from the combo that's going to be used for
3696 * updating, because the potentially created multixact would otherwise
3697 * be wrong.
3698 */
3699 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3700 oldtup.t_data->t_infomask,
3701 oldtup.t_data->t_infomask2,
3702 xid, *lockmode, false,
3703 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3704 &infomask2_lock_old_tuple);
3705
3706 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3707
3708 START_CRIT_SECTION();
3709
3710 /* Clear obsolete visibility flags ... */
3711 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3712 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3713 HeapTupleClearHotUpdated(&oldtup);
3714 /* ... and store info about transaction updating this tuple */
3715 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3716 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3717 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3718 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3719 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3720
3721 /* temporarily make it look not-updated, but locked */
3722 oldtup.t_data->t_ctid = oldtup.t_self;
3723
3724 /*
3725 * Clear all-frozen bit on visibility map if needed. We could
3726 * immediately reset ALL_VISIBLE, but given that the WAL logging
3727 * overhead would be unchanged, that doesn't necessarily seem
3728 * worthwhile.
3729 */
3730 if (PageIsAllVisible(page) &&
3731 visibilitymap_clear(relation, block, vmbuffer,
3732 VISIBILITYMAP_ALL_FROZEN))
3733 cleared_all_frozen = true;
3734
3735 MarkBufferDirty(buffer);
3736
3737 if (RelationNeedsWAL(relation))
3738 {
3739 xl_heap_lock xlrec;
3740 XLogRecPtr recptr;
3741
3742 XLogBeginInsert();
3743 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3744
3745 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3746 xlrec.locking_xid = xmax_lock_old_tuple;
3747 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3748 oldtup.t_data->t_infomask2);
3749 xlrec.flags =
3750 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3751 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3752 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3753 PageSetLSN(page, recptr);
3754 }
3755
3756 END_CRIT_SECTION();
3757
3758 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3759
3760 /*
3761 * Let the toaster do its thing, if needed.
3762 *
3763 * Note: below this point, heaptup is the data we actually intend to
3764 * store into the relation; newtup is the caller's original untoasted
3765 * data.
3766 */
3767 if (need_toast)
3768 {
3769 /* Note we always use WAL and FSM during updates */
3770 heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3771 newtupsize = MAXALIGN(heaptup->t_len);
3772 }
3773 else
3774 heaptup = newtup;
3775
3776 /*
3777 * Now, do we need a new page for the tuple, or not? This is a bit
3778 * tricky since someone else could have added tuples to the page while
3779 * we weren't looking. We have to recheck the available space after
3780 * reacquiring the buffer lock. But don't bother to do that if the
3781 * former amount of free space is still not enough; it's unlikely
3782 * there's more free now than before.
3783 *
3784 * What's more, if we need to get a new page, we will need to acquire
3785 * buffer locks on both old and new pages. To avoid deadlock against
3786 * some other backend trying to get the same two locks in the other
3787 * order, we must be consistent about the order we get the locks in.
3788 * We use the rule "lock the lower-numbered page of the relation
3789 * first". To implement this, we must do RelationGetBufferForTuple
3790 * while not holding the lock on the old page, and we must rely on it
3791 * to get the locks on both pages in the correct order.
3792 *
3793 * Another consideration is that we need visibility map page pin(s) if
3794 * we will have to clear the all-visible flag on either page. If we
3795 * call RelationGetBufferForTuple, we rely on it to acquire any such
3796 * pins; but if we don't, we have to handle that here. Hence we need
3797 * a loop.
3798 */
3799 for (;;)
3800 {
3801 if (newtupsize > pagefree)
3802 {
3803 /* It doesn't fit, must use RelationGetBufferForTuple. */
3804 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3805 buffer, 0, NULL,
3806 &vmbuffer_new, &vmbuffer);
3807 /* We're all done. */
3808 break;
3809 }
3810 /* Acquire VM page pin if needed and we don't have it. */
3811 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3812 visibilitymap_pin(relation, block, &vmbuffer);
3813 /* Re-acquire the lock on the old tuple's page. */
3814 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3815 /* Re-check using the up-to-date free space */
3816 pagefree = PageGetHeapFreeSpace(page);
3817 if (newtupsize > pagefree ||
3818 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3819 {
3820 /*
3821 * Rats, it doesn't fit anymore, or somebody just now set the
3822 * all-visible flag. We must now unlock and loop to avoid
3823 * deadlock. Fortunately, this path should seldom be taken.
3824 */
3825 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3826 }
3827 else
3828 {
3829 /* We're all done. */
3830 newbuf = buffer;
3831 break;
3832 }
3833 }
3834 }
3835 else
3836 {
3837 /* No TOAST work needed, and it'll fit on same page */
3838 newbuf = buffer;
3839 heaptup = newtup;
3840 }
3841
3842 /*
3843 * We're about to do the actual update -- check for conflict first, to
3844 * avoid possibly having to roll back work we've just done.
3845 *
3846 * This is safe without a recheck as long as there is no possibility of
3847 * another process scanning the pages between this check and the update
3848 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3849 * continuously held from this point until the tuple update is visible).
3850 *
3851 * For the new tuple the only check needed is at the relation level, but
3852 * since both tuples are in the same relation and the check for oldtup
3853 * will include checking the relation level, there is no benefit to a
3854 * separate check for the new tuple.
3855 */
3856 CheckForSerializableConflictIn(relation, &oldtup.t_self,
3857 BufferGetBlockNumber(buffer));
3858
3859 /*
3860 * At this point newbuf and buffer are both pinned and locked, and newbuf
3861 * has enough space for the new tuple. If they are the same buffer, only
3862 * one pin is held.
3863 */
3864
3865 if (newbuf == buffer)
3866 {
3867 /*
3868 * Since the new tuple is going into the same page, we might be able
3869 * to do a HOT update. Check if any of the index columns have been
3870 * changed. If the page was already full, we may have skipped checking
3871 * for index columns, and also can't do a HOT update.
3872 */
3873 if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3874 use_hot_update = true;
3875 }
3876 else
3877 {
3878 /* Set a hint that the old page could use prune/defrag */
3879 PageSetFull(page);
3880 }
3881
3882 /*
3883 * Compute replica identity tuple before entering the critical section so
3884 * we don't PANIC upon a memory allocation failure.
3885 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3886 * logged.
3887 */
3888 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3889 bms_overlap(modified_attrs, id_attrs),
3890 &old_key_copied);
3891
3892 /* NO EREPORT(ERROR) from here till changes are logged */
3893 START_CRIT_SECTION();
3894
3895 /*
3896 * If this transaction commits, the old tuple will become DEAD sooner or
3897 * later. Set flag that this page is a candidate for pruning once our xid
3898 * falls below the OldestXmin horizon. If the transaction finally aborts,
3899 * the subsequent page pruning will be a no-op and the hint will be
3900 * cleared.
3901 *
3902 * XXX Should we set hint on newbuf as well? If the transaction aborts,
3903 * there would be a prunable tuple in the newbuf; but for now we choose
3904 * not to optimize for aborts. Note that heap_xlog_update must be kept in
3905 * sync if this decision changes.
3906 */
3907 PageSetPrunable(page, xid);
3908
3909 if (use_hot_update)
3910 {
3911 /* Mark the old tuple as HOT-updated */
3912 HeapTupleSetHotUpdated(&oldtup);
3913 /* And mark the new tuple as heap-only */
3914 HeapTupleSetHeapOnly(heaptup);
3915 /* Mark the caller's copy too, in case different from heaptup */
3916 HeapTupleSetHeapOnly(newtup);
3917 }
3918 else
3919 {
3920 /* Make sure tuples are correctly marked as not-HOT */
3921 HeapTupleClearHotUpdated(&oldtup);
3922 HeapTupleClearHeapOnly(heaptup);
3923 HeapTupleClearHeapOnly(newtup);
3924 }
3925
3926 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3927
3928
3929 /* Clear obsolete visibility flags, possibly set by ourselves above... */
3930 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3931 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3932 /* ... and store info about transaction updating this tuple */
3933 Assert(TransactionIdIsValid(xmax_old_tuple));
3934 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3935 oldtup.t_data->t_infomask |= infomask_old_tuple;
3936 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3937 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3938
3939 /* record address of new tuple in t_ctid of old one */
3940 oldtup.t_data->t_ctid = heaptup->t_self;
3941
3942 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3943 if (PageIsAllVisible(BufferGetPage(buffer)))
3944 {
3945 all_visible_cleared = true;
3946 PageClearAllVisible(BufferGetPage(buffer));
3947 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3948 vmbuffer, VISIBILITYMAP_VALID_BITS);
3949 }
3950 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3951 {
3952 all_visible_cleared_new = true;
3953 PageClearAllVisible(BufferGetPage(newbuf));
3954 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3955 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3956 }
3957
3958 if (newbuf != buffer)
3959 MarkBufferDirty(newbuf);
3960 MarkBufferDirty(buffer);
3961
3962 /* XLOG stuff */
3963 if (RelationNeedsWAL(relation))
3964 {
3965 XLogRecPtr recptr;
3966
3967 /*
3968 * For logical decoding we need combo CIDs to properly decode the
3969 * catalog.
3970 */
3971 if (RelationIsAccessibleInLogicalDecoding(relation))
3972 {
3973 log_heap_new_cid(relation, &oldtup);
3974 log_heap_new_cid(relation, heaptup);
3975 }
3976
3977 recptr = log_heap_update(relation, buffer,
3978 newbuf, &oldtup, heaptup,
3979 old_key_tuple,
3980 all_visible_cleared,
3981 all_visible_cleared_new);
3982 if (newbuf != buffer)
3983 {
3984 PageSetLSN(BufferGetPage(newbuf), recptr);
3985 }
3986 PageSetLSN(BufferGetPage(buffer), recptr);
3987 }
3988
3989 END_CRIT_SECTION();
3990
3991 if (newbuf != buffer)
3992 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3993 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3994
3995 /*
3996 * Mark old tuple for invalidation from system caches at next command
3997 * boundary, and mark the new tuple for invalidation in case we abort. We
3998 * have to do this before releasing the buffer because oldtup is in the
3999 * buffer. (heaptup is all in local memory, but it's necessary to process
4000 * both tuple versions in one call to inval.c so we can avoid redundant
4001 * sinval messages.)
4002 */
4003 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4004
4005 /* Now we can release the buffer(s) */
4006 if (newbuf != buffer)
4007 ReleaseBuffer(newbuf);
4008 ReleaseBuffer(buffer);
4009 if (BufferIsValid(vmbuffer_new))
4010 ReleaseBuffer(vmbuffer_new);
4011 if (BufferIsValid(vmbuffer))
4012 ReleaseBuffer(vmbuffer);
4013
4014 /*
4015 * Release the lmgr tuple lock, if we had it.
4016 */
4017 if (have_tuple_lock)
4018 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4019
4020 pgstat_count_heap_update(relation, use_hot_update);
4021
4022 /*
4023 * If heaptup is a private copy, release it. Don't forget to copy t_self
4024 * back to the caller's image, too.
4025 */
4026 if (heaptup != newtup)
4027 {
4028 newtup->t_self = heaptup->t_self;
4029 heap_freetuple(heaptup);
4030 }
4031
4032 if (old_key_tuple != NULL && old_key_copied)
4033 heap_freetuple(old_key_tuple);
4034
4035 bms_free(hot_attrs);
4036 bms_free(key_attrs);
4037 bms_free(id_attrs);
4038 bms_free(modified_attrs);
4039 bms_free(interesting_attrs);
4040
4041 return TM_Ok;
4042 }
4043
4044 /*
4045 * Check if the specified attribute's value is same in both given tuples.
4046 * Subroutine for HeapDetermineModifiedColumns.
4047 */
4048 static bool
4049 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4050 HeapTuple tup1, HeapTuple tup2)
4051 {
4052 Datum value1,
4053 value2;
4054 bool isnull1,
4055 isnull2;
4056 Form_pg_attribute att;
4057
4058 /*
4059 * If it's a whole-tuple reference, say "not equal". It's not really
4060 * worth supporting this case, since it could only succeed after a no-op
4061 * update, which is hardly a case worth optimizing for.
4062 */
4063 if (attrnum == 0)
4064 return false;
4065
4066 /*
4067 * Likewise, automatically say "not equal" for any system attribute other
4068 * than tableOID; we cannot expect these to be consistent in a HOT chain,
4069 * or even to be set correctly yet in the new tuple.
4070 */
4071 if (attrnum < 0)
4072 {
4073 if (attrnum != TableOidAttributeNumber)
4074 return false;
4075 }
4076
4077 /*
4078 * Extract the corresponding values. XXX this is pretty inefficient if
4079 * there are many indexed columns. Should HeapDetermineModifiedColumns do
4080 * a single heap_deform_tuple call on each tuple, instead? But that
4081 * doesn't work for system columns ...
4082 */
4083 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4084 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4085
4086 /*
4087 * If one value is NULL and other is not, then they are certainly not
4088 * equal
4089 */
4090 if (isnull1 != isnull2)
4091 return false;
4092
4093 /*
4094 * If both are NULL, they can be considered equal.
4095 */
4096 if (isnull1)
4097 return true;
4098
4099 /*
4100 * We do simple binary comparison of the two datums. This may be overly
4101 * strict because there can be multiple binary representations for the
4102 * same logical value. But we should be OK as long as there are no false
4103 * positives. Using a type-specific equality operator is messy because
4104 * there could be multiple notions of equality in different operator
4105 * classes; furthermore, we cannot safely invoke user-defined functions
4106 * while holding exclusive buffer lock.
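*
* (For instance, the same value can be stored as a compressed or an
* uncompressed varlena; such pairs compare as "not equal" here, which
* merely errs toward treating the column as modified.)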
4107 */
4108 if (attrnum <= 0)
4109 {
4110 /* The only allowed system columns are OIDs, so do this */
4111 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4112 }
4113 else
4114 {
4115 Assert(attrnum <= tupdesc->natts);
4116 att = TupleDescAttr(tupdesc, attrnum - 1);
4117 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4118 }
4119 }
4120
4121 /*
4122 * Check which columns are being updated.
4123 *
4124 * Given an updated tuple, determine (and return into the output bitmapset),
4125 * from those listed as interesting, the set of columns that changed.
4126 *
4127 * The input bitmapset is destructively modified; that is OK since this is
4128 * invoked at most once in heap_update.
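*
* Attribute numbers in both the input and output sets are offset by
* FirstLowInvalidHeapAttributeNumber so that system attributes (which
* have negative attnos) can be represented; e.g. user column 2 is
* stored as 2 - FirstLowInvalidHeapAttributeNumber.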
4129 */
4130 static Bitmapset *
4131 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4132 HeapTuple oldtup, HeapTuple newtup)
4133 {
4134 int attnum;
4135 Bitmapset *modified = NULL;
4136
4137 while ((attnum = bms_first_member(interesting_cols)) >= 0)
4138 {
4139 attnum += FirstLowInvalidHeapAttributeNumber;
4140
4141 if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4142 attnum, oldtup, newtup))
4143 modified = bms_add_member(modified,
4144 attnum - FirstLowInvalidHeapAttributeNumber);
4145 }
4146
4147 return modified;
4148 }
4149
4150 /*
4151 * simple_heap_update - replace a tuple
4152 *
4153 * This routine may be used to update a tuple when concurrent updates of
4154 * the target tuple are not expected (for example, because we have a lock
4155 * on the relation associated with the tuple). Any failure is reported
4156 * via ereport().
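*
* A minimal usage sketch (hypothetical caller; locking and index
* maintenance elided -- see CatalogTupleUpdate for the catalog wrapper
* that also updates indexes):
*
*    tup = heap_copytuple(oldtup);
*    ... modify tup's user attributes ...
*    simple_heap_update(rel, &tup->t_self, tup);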
4157 */
4158 void
4159 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4160 {
4161 TM_Result result;
4162 TM_FailureData tmfd;
4163 LockTupleMode lockmode;
4164
4165 result = heap_update(relation, otid, tup,
4166 GetCurrentCommandId(true), InvalidSnapshot,
4167 true /* wait for commit */ ,
4168 &tmfd, &lockmode);
4169 switch (result)
4170 {
4171 case TM_SelfModified:
4172 /* Tuple was already updated in current command? */
4173 elog(ERROR, "tuple already updated by self");
4174 break;
4175
4176 case TM_Ok:
4177 /* done successfully */
4178 break;
4179
4180 case TM_Updated:
4181 elog(ERROR, "tuple concurrently updated");
4182 break;
4183
4184 case TM_Deleted:
4185 elog(ERROR, "tuple concurrently deleted");
4186 break;
4187
4188 default:
4189 elog(ERROR, "unrecognized heap_update status: %u", result);
4190 break;
4191 }
4192 }
4193
4194
4195 /*
4196 * Return the MultiXactStatus corresponding to the given tuple lock mode.
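*
* For example, LockTupleShare with is_update = false maps to
* MultiXactStatusForShare, while LockTupleShare with is_update = true
* has no valid mapping (-1 in tupleLockExtraInfo) and elogs.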
4197 */
4198 static MultiXactStatus
4199 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4200 {
4201 int retval;
4202
4203 if (is_update)
4204 retval = tupleLockExtraInfo[mode].updstatus;
4205 else
4206 retval = tupleLockExtraInfo[mode].lockstatus;
4207
4208 if (retval == -1)
4209 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4210 is_update ? "true" : "false");
4211
4212 return (MultiXactStatus) retval;
4213 }
4214
4215 /*
4216 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4217 *
4218 * Note that this acquires a buffer pin, which the caller must release.
4219 *
4220 * Input parameters:
4221 * relation: relation containing tuple (caller must hold suitable lock)
4222 * tid: TID of tuple to lock
4223 * cid: current command ID (used for visibility test, and stored into
4224 * tuple's cmax if lock is successful)
4225 * mode: indicates if shared or exclusive tuple lock is desired
4226 * wait_policy: what to do if tuple lock is not available
4227 * follow_updates: if true, follow the update chain to also lock descendant
4228 * tuples.
4229 *
4230 * Output parameters:
4231 * *tuple: all fields filled in
4232 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4233 * *tmfd: filled in failure cases (see below)
4234 *
4235 * Function results are the same as the ones for table_tuple_lock().
4236 *
4237 * In the failure cases other than TM_Invisible, the routine fills
4238 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4239 * if necessary), and t_cmax (the last only for TM_SelfModified,
4240 * since we cannot obtain cmax from a combo CID generated by another
4241 * transaction).
4242 * See comments for struct TM_FailureData for additional info.
4243 *
4244 * See README.tuplock for a thorough explanation of this mechanism.
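*
* A typical call sequence (sketch only; failure handling elided, and
* "cid" assumed to be the caller's current command ID):
*
*    tuple.t_self = *tid;
*    result = heap_lock_tuple(relation, &tuple, cid,
*                             LockTupleExclusive, LockWaitBlock,
*                             false, &buffer, &tmfd);
*    if (result == TM_Ok)
*        ... examine tuple, then ReleaseBuffer(buffer) ...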
4245 */
4246 TM_Result
4247 heap_lock_tuple(Relation relation, HeapTuple tuple,
4248 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4249 bool follow_updates,
4250 Buffer *buffer, TM_FailureData *tmfd)
4251 {
4252 TM_Result result;
4253 ItemPointer tid = &(tuple->t_self);
4254 ItemId lp;
4255 Page page;
4256 Buffer vmbuffer = InvalidBuffer;
4257 BlockNumber block;
4258 TransactionId xid,
4259 xmax;
4260 uint16 old_infomask,
4261 new_infomask,
4262 new_infomask2;
4263 bool first_time = true;
4264 bool skip_tuple_lock = false;
4265 bool have_tuple_lock = false;
4266 bool cleared_all_frozen = false;
4267
4268 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4269 block = ItemPointerGetBlockNumber(tid);
4270
4271 /*
4272 * Before locking the buffer, pin the visibility map page if it appears to
4273 * be necessary. Since we haven't got the lock yet, someone else might be
4274 * in the middle of changing this, so we'll need to recheck after we have
4275 * the lock.
4276 */
4277 if (PageIsAllVisible(BufferGetPage(*buffer)))
4278 visibilitymap_pin(relation, block, &vmbuffer);
4279
4280 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4281
4282 page = BufferGetPage(*buffer);
4283 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4284 Assert(ItemIdIsNormal(lp));
4285
4286 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4287 tuple->t_len = ItemIdGetLength(lp);
4288 tuple->t_tableOid = RelationGetRelid(relation);
4289
4290 l3:
4291 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4292
4293 if (result == TM_Invisible)
4294 {
4295 /*
4296 * This is possible, but only when locking a tuple for ON CONFLICT
4297 * UPDATE. We return this value here rather than throwing an error in
4298 * order to give that case the opportunity to throw a more specific
4299 * error.
4300 */
4301 result = TM_Invisible;
4302 goto out_locked;
4303 }
4304 else if (result == TM_BeingModified ||
4305 result == TM_Updated ||
4306 result == TM_Deleted)
4307 {
4308 TransactionId xwait;
4309 uint16 infomask;
4310 uint16 infomask2;
4311 bool require_sleep;
4312 ItemPointerData t_ctid;
4313
4314 /* must copy state data before unlocking buffer */
4315 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4316 infomask = tuple->t_data->t_infomask;
4317 infomask2 = tuple->t_data->t_infomask2;
4318 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4319
4320 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4321
4322 /*
4323 * If any subtransaction of the current top transaction already holds
4324 * a lock as strong as or stronger than what we're requesting, we
4325 * effectively hold the desired lock already. We *must* succeed
4326 * without trying to take the tuple lock, else we will deadlock
4327 * against anyone wanting to acquire a stronger lock.
4328 *
4329 * Note we only do this the first time we loop on the HTSU result;
4330 * there is no point in testing in subsequent passes, because
4331 * evidently our own transaction cannot have acquired a new lock after
4332 * the first time we checked.
4333 */
4334 if (first_time)
4335 {
4336 first_time = false;
4337
4338 if (infomask & HEAP_XMAX_IS_MULTI)
4339 {
4340 int i;
4341 int nmembers;
4342 MultiXactMember *members;
4343
4344 /*
4345 * We don't need to allow old multixacts here; if that had
4346 * been the case, HeapTupleSatisfiesUpdate would have returned
4347 * TM_Ok and we wouldn't be here.
4348 */
4349 nmembers =
4350 GetMultiXactIdMembers(xwait, &members, false,
4351 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4352
4353 for (i = 0; i < nmembers; i++)
4354 {
4355 /* only consider members of our own transaction */
4356 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4357 continue;
4358
4359 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4360 {
4361 pfree(members);
4362 result = TM_Ok;
4363 goto out_unlocked;
4364 }
4365 else
4366 {
4367 /*
4368 * Disable acquisition of the heavyweight tuple lock.
4369 * Otherwise, when promoting a weaker lock, we might
4370 * deadlock with another locker that has acquired the
4371 * heavyweight tuple lock and is waiting for our
4372 * transaction to finish.
4373 *
4374 * Note that in this case we still need to wait for
4375 * the multixact if required, to avoid acquiring
4376 * conflicting locks.
4377 */
4378 skip_tuple_lock = true;
4379 }
4380 }
4381
4382 if (members)
4383 pfree(members);
4384 }
4385 else if (TransactionIdIsCurrentTransactionId(xwait))
4386 {
4387 switch (mode)
4388 {
4389 case LockTupleKeyShare:
4390 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4391 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4392 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4393 result = TM_Ok;
4394 goto out_unlocked;
4395 case LockTupleShare:
4396 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4397 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4398 {
4399 result = TM_Ok;
4400 goto out_unlocked;
4401 }
4402 break;
4403 case LockTupleNoKeyExclusive:
4404 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4405 {
4406 result = TM_Ok;
4407 goto out_unlocked;
4408 }
4409 break;
4410 case LockTupleExclusive:
4411 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4412 infomask2 & HEAP_KEYS_UPDATED)
4413 {
4414 result = TM_Ok;
4415 goto out_unlocked;
4416 }
4417 break;
4418 }
4419 }
4420 }
4421
4422 /*
4423 * Initially assume that we will have to wait for the locking
4424 * transaction(s) to finish. We check various cases below in which
4425 * this can be turned off.
4426 */
4427 require_sleep = true;
4428 if (mode == LockTupleKeyShare)
4429 {
4430 /*
4431 * If we're requesting KeyShare, and there's no update present, we
4432 * don't need to wait. Even if there is an update, we can still
4433 * continue if the key hasn't been modified.
4434 *
4435 * However, if there are updates, we need to walk the update chain
4436 * to mark future versions of the row as locked, too. That way,
4437 * if somebody deletes that future version, we're protected
4438 * against the key going away. This locking of future versions
4439 * could block momentarily, if a concurrent transaction is
4440 * deleting a key; or it could return a value to the effect that
4441 * the transaction deleting the key has already committed. So we
4442 * do this before re-locking the buffer; otherwise this would be
4443 * prone to deadlocks.
4444 *
4445 * Note that the TID we're locking was grabbed before we unlocked
4446 * the buffer. For it to change while we're not looking, the
4447 * other properties we're testing for below after re-locking the
4448 * buffer would also change, in which case we would restart this
4449 * loop above.
4450 */
4451 if (!(infomask2 & HEAP_KEYS_UPDATED))
4452 {
4453 bool updated;
4454
4455 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4456
4457 /*
4458 * If there are updates, follow the update chain; bail out if
4459 * that cannot be done.
4460 */
4461 if (follow_updates && updated)
4462 {
4463 TM_Result res;
4464
4465 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4466 GetCurrentTransactionId(),
4467 mode);
4468 if (res != TM_Ok)
4469 {
4470 result = res;
4471 /* recovery code expects to have buffer lock held */
4472 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4473 goto failed;
4474 }
4475 }
4476
4477 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4478
4479 /*
4480 * Make sure it's still an appropriate lock, else start over.
4481 * Also, if it wasn't updated before we released the lock, but
4482 * is updated now, we start over too; the reason is that we
4483 * now need to follow the update chain to lock the new
4484 * versions.
4485 */
4486 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4487 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4488 !updated))
4489 goto l3;
4490
4491 /* Things look okay, so we can skip sleeping */
4492 require_sleep = false;
4493
4494 /*
4495 * Note we allow Xmax to change here; other updaters/lockers
4496 * could have modified it before we grabbed the buffer lock.
4497 * However, this is not a problem, because with the recheck we
4498 * just did we ensure that they still don't conflict with the
4499 * lock we want.
4500 */
4501 }
4502 }
4503 else if (mode == LockTupleShare)
4504 {
4505 /*
4506 * If we're requesting Share, we can similarly avoid sleeping if
4507 * there's no update and no exclusive lock present.
4508 */
4509 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4510 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4511 {
4512 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4513
4514 /*
4515 * Make sure it's still an appropriate lock, else start over.
4516 * See above about allowing xmax to change.
4517 */
4518 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4519 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4520 goto l3;
4521 require_sleep = false;
4522 }
4523 }
4524 else if (mode == LockTupleNoKeyExclusive)
4525 {
4526 /*
4527 * If we're requesting NoKeyExclusive, we might also be able to
4528 * avoid sleeping; just ensure that there is no conflicting lock
4529 * already acquired.
4530 */
4531 if (infomask & HEAP_XMAX_IS_MULTI)
4532 {
4533 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4534 mode, NULL))
4535 {
4536 /*
4537 * No conflict, but if the xmax changed under us in the
4538 * meantime, start over.
4539 */
4540 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4541 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4542 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4543 xwait))
4544 goto l3;
4545
4546 /* otherwise, we're good */
4547 require_sleep = false;
4548 }
4549 }
4550 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4551 {
4552 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4553
4554 /* if the xmax changed in the meantime, start over */
4555 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4556 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4557 xwait))
4558 goto l3;
4559 /* otherwise, we're good */
4560 require_sleep = false;
4561 }
4562 }
4563
4564 /*
4565 * As a check independent from those above, we can also avoid sleeping
4566 * if the current transaction is the sole locker of the tuple. Note
4567 * that the strength of the lock already held is irrelevant; this is
4568 * not about recording the lock in Xmax (which will be done regardless
4569 * of this optimization, below). Also, note that the cases where we
4570 * hold a lock stronger than we are requesting are already handled
4571 * above by not doing anything.
4572 *
4573 * Note we only deal with the non-multixact case here; MultiXactIdWait
4574 * is well equipped to deal with this situation on its own.
4575 */
4576 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4577 TransactionIdIsCurrentTransactionId(xwait))
4578 {
4579 /* ... but if the xmax changed in the meantime, start over */
4580 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4581 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4582 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4583 xwait))
4584 goto l3;
4585 Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4586 require_sleep = false;
4587 }
4588
4589 /*
4590 * Time to sleep on the other transaction/multixact, if necessary.
4591 *
4592 * If the other transaction is an update/delete that's already
4593 * committed, then sleeping cannot possibly do any good: if we're
4594 * required to sleep, get out to raise an error instead.
4595 *
4596 * By here, we either have already acquired the buffer exclusive lock,
4597 * or we must wait for the locking transaction or multixact; so below
4598 * we ensure that we grab buffer lock after the sleep.
4599 */
4600 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4601 {
4602 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4603 goto failed;
4604 }
4605 else if (require_sleep)
4606 {
4607 /*
4608 * Acquire tuple lock to establish our priority for the tuple, or
4609 * die trying. LockTuple will release us when we are next-in-line
4610 * for the tuple. We must do this even if we are share-locking,
4611 * but not if we already have a weaker lock on the tuple.
4612 *
4613 * If we are forced to "start over" below, we keep the tuple lock;
4614 * this arranges that we stay at the head of the line while
4615 * rechecking tuple state.
4616 */
4617 if (!skip_tuple_lock &&
4618 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4619 &have_tuple_lock))
4620 {
4621 /*
4622 * This can only happen if wait_policy is Skip and the lock
4623 * couldn't be obtained.
4624 */
4625 result = TM_WouldBlock;
4626 /* recovery code expects to have buffer lock held */
4627 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4628 goto failed;
4629 }
4630
4631 if (infomask & HEAP_XMAX_IS_MULTI)
4632 {
4633 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4634
4635 /* We only ever lock tuples, never update them */
4636 if (status >= MultiXactStatusNoKeyUpdate)
4637 elog(ERROR, "invalid lock mode in heap_lock_tuple");
4638
4639 /* wait for multixact to end, or die trying */
4640 switch (wait_policy)
4641 {
4642 case LockWaitBlock:
4643 MultiXactIdWait((MultiXactId) xwait, status, infomask,
4644 relation, &tuple->t_self, XLTW_Lock, NULL);
4645 break;
4646 case LockWaitSkip:
4647 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4648 status, infomask, relation,
4649 NULL))
4650 {
4651 result = TM_WouldBlock;
4652 /* recovery code expects to have buffer lock held */
4653 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4654 goto failed;
4655 }
4656 break;
4657 case LockWaitError:
4658 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4659 status, infomask, relation,
4660 NULL))
4661 ereport(ERROR,
4662 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4663 errmsg("could not obtain lock on row in relation \"%s\"",
4664 RelationGetRelationName(relation))));
4665
4666 break;
4667 }
4668
4669 /*
4670 * Of course, the multixact might not be done here: if we're
4671 * requesting a light lock mode, other transactions with light
4672 * locks could still be alive, as well as locks owned by our
4673 * own xact or other subxacts of this backend. We need to
4674 * preserve the surviving MultiXact members. Note that it
4675 * isn't absolutely necessary in the latter case, but doing so
4676 * is simpler.
4677 */
4678 }
4679 else
4680 {
4681 /* wait for regular transaction to end, or die trying */
4682 switch (wait_policy)
4683 {
4684 case LockWaitBlock:
4685 XactLockTableWait(xwait, relation, &tuple->t_self,
4686 XLTW_Lock);
4687 break;
4688 case LockWaitSkip:
4689 if (!ConditionalXactLockTableWait(xwait))
4690 {
4691 result = TM_WouldBlock;
4692 /* recovery code expects to have buffer lock held */
4693 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4694 goto failed;
4695 }
4696 break;
4697 case LockWaitError:
4698 if (!ConditionalXactLockTableWait(xwait))
4699 ereport(ERROR,
4700 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4701 errmsg("could not obtain lock on row in relation \"%s\"",
4702 RelationGetRelationName(relation))));
4703 break;
4704 }
4705 }
4706
4707 /* if there are updates, follow the update chain */
4708 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4709 {
4710 TM_Result res;
4711
4712 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4713 GetCurrentTransactionId(),
4714 mode);
4715 if (res != TM_Ok)
4716 {
4717 result = res;
4718 /* recovery code expects to have buffer lock held */
4719 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4720 goto failed;
4721 }
4722 }
4723
4724 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4725
4726 /*
4727 * xwait is done, but if xwait had just locked the tuple then some
4728 * other xact could update this tuple before we get to this point.
4729 * Check for xmax change, and start over if so.
4730 */
4731 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4732 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4733 xwait))
4734 goto l3;
4735
4736 if (!(infomask & HEAP_XMAX_IS_MULTI))
4737 {
4738 /*
4739 * Otherwise check if it committed or aborted. Note we cannot
4740 * be here if the tuple was only locked by somebody who didn't
4741 * conflict with us; that would have been handled above. So
4742 * that transaction must necessarily be gone by now. But
4743 * don't check for this in the multixact case, because some
4744 * locker transactions might still be running.
4745 */
4746 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4747 }
4748 }
4749
4750 /* By here, we're certain that we hold buffer exclusive lock again */
4751
4752 /*
4753 * We may lock if previous xmax aborted, or if it committed but only
4754 * locked the tuple without updating it; or if we didn't have to wait
4755 * at all for whatever reason.
4756 */
4757 if (!require_sleep ||
4758 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4759 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4760 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4761 result = TM_Ok;
4762 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4763 result = TM_Updated;
4764 else
4765 result = TM_Deleted;
4766 }
4767
4768 failed:
4769 if (result != TM_Ok)
4770 {
4771 Assert(result == TM_SelfModified || result == TM_Updated ||
4772 result == TM_Deleted || result == TM_WouldBlock);
4773 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4774 Assert(result != TM_Updated ||
4775 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4776 tmfd->ctid = tuple->t_data->t_ctid;
4777 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4778 if (result == TM_SelfModified)
4779 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4780 else
4781 tmfd->cmax = InvalidCommandId;
4782 goto out_locked;
4783 }
4784
4785 /*
4786 * If we didn't pin the visibility map page and the page has become all
4787 * visible while we were busy locking the buffer, or during some
4788 * subsequent window during which we had it unlocked, we'll have to unlock
4789 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4790 * unfortunate, especially since we'll now have to recheck whether the
4791 * tuple has been locked or updated under us, but hopefully it won't
4792 * happen very often.
4793 */
4794 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4795 {
4796 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4797 visibilitymap_pin(relation, block, &vmbuffer);
4798 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4799 goto l3;
4800 }
4801
4802 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4803 old_infomask = tuple->t_data->t_infomask;
4804
4805 /*
4806 * If this is the first possibly-multixact-able operation in the current
4807 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4808 * certain that the transaction will never become a member of any older
4809 * MultiXactIds than that. (We have to do this even if we end up just
4810 * using our own TransactionId below, since some other backend could
4811 * incorporate our XID into a MultiXact immediately afterwards.)
4812 */
4813 MultiXactIdSetOldestMember();
4814
4815 /*
4816 * Compute the new xmax and infomask to store into the tuple. Note we do
4817 * not modify the tuple just yet, because that would leave it in the wrong
4818 * state if multixact.c elogs.
4819 */
4820 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4821 GetCurrentTransactionId(), mode, false,
4822 &xid, &new_infomask, &new_infomask2);
4823
4824 START_CRIT_SECTION();
4825
4826 /*
4827 * Store transaction information of xact locking the tuple.
4828 *
4829 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4830 * possibly generating a useless combo CID. Moreover, if we're locking a
4831 * previously updated tuple, it's important to preserve the Cmax.
4832 *
4833 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4834 * we would break the HOT chain.
4835 */
4836 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4837 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4838 tuple->t_data->t_infomask |= new_infomask;
4839 tuple->t_data->t_infomask2 |= new_infomask2;
4840 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4841 HeapTupleHeaderClearHotUpdated(tuple->t_data);
4842 HeapTupleHeaderSetXmax(tuple->t_data, xid);
4843
4844 /*
4845 * Make sure there is no forward chain link in t_ctid. Note that in the
4846 * cases where the tuple has been updated, we must not overwrite t_ctid,
4847 * because it was set by the updater. Moreover, if the tuple has been
4848 * updated, we need to follow the update chain to lock the new versions of
4849 * the tuple as well.
4850 */
4851 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4852 tuple->t_data->t_ctid = *tid;
4853
4854 /* Clear only the all-frozen bit on visibility map if needed */
4855 if (PageIsAllVisible(page) &&
4856 visibilitymap_clear(relation, block, vmbuffer,
4857 VISIBILITYMAP_ALL_FROZEN))
4858 cleared_all_frozen = true;
4859
4860
4861 MarkBufferDirty(*buffer);
4862
4863 /*
4864 * XLOG stuff. You might think that we don't need an XLOG record because
4865 * there is no state change worth restoring after a crash. You would be
4866 * wrong however: we have just written either a TransactionId or a
4867 * MultiXactId that may never have been seen on disk before, and we need
4868 * to make sure that there are XLOG entries covering those ID numbers.
4869 * Else the same IDs might be re-used after a crash, which would be
4870 * disastrous if this page made it to disk before the crash. Essentially
4871 * we have to enforce the WAL log-before-data rule even in this case.
4872 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4873 * entries for everything anyway.)
4874 */
4875 if (RelationNeedsWAL(relation))
4876 {
4877 xl_heap_lock xlrec;
4878 XLogRecPtr recptr;
4879
4880 XLogBeginInsert();
4881 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4882
4883 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4884 xlrec.locking_xid = xid;
4885 xlrec.infobits_set = compute_infobits(new_infomask,
4886 tuple->t_data->t_infomask2);
4887 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4888 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4889
4890 /* we don't decode row locks at the moment, so no need to log the origin */
4891
4892 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4893
4894 PageSetLSN(page, recptr);
4895 }
4896
4897 END_CRIT_SECTION();
4898
4899 result = TM_Ok;
4900
4901 out_locked:
4902 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4903
4904 out_unlocked:
4905 if (BufferIsValid(vmbuffer))
4906 ReleaseBuffer(vmbuffer);
4907
4908 /*
4909 * Don't update the visibility map here. Locking a tuple doesn't change
4910 * visibility info.
4911 */
4912
4913 /*
4914 * Now that we have successfully marked the tuple as locked, we can
4915 * release the lmgr tuple lock, if we had it.
4916 */
4917 if (have_tuple_lock)
4918 UnlockTupleTuplock(relation, tid, mode);
4919
4920 return result;
4921 }
4922
4923 /*
4924 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4925 * its normal, Xmax-based tuple lock.
4926 *
4927 * have_tuple_lock is an input and output parameter: on input, it indicates
4928 * whether the lock has previously been acquired (and this function does
4929 * nothing in that case). If this function returns success, have_tuple_lock
4930 * has been flipped to true.
4931 *
4932 * Returns false if it was unable to obtain the lock; this can only happen if
4933 * wait_policy is Skip.
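*
* Callers initialize their have_tuple_lock flag to false and, once done
* with the tuple, release the lock with UnlockTupleTuplock (see
* heap_lock_tuple for the typical pattern).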
4934 */
4935 static bool
4936 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4937 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4938 {
4939 if (*have_tuple_lock)
4940 return true;
4941
4942 switch (wait_policy)
4943 {
4944 case LockWaitBlock:
4945 LockTupleTuplock(relation, tid, mode);
4946 break;
4947
4948 case LockWaitSkip:
4949 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4950 return false;
4951 break;
4952
4953 case LockWaitError:
4954 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4955 ereport(ERROR,
4956 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4957 errmsg("could not obtain lock on row in relation \"%s\"",
4958 RelationGetRelationName(relation))));
4959 break;
4960 }
4961 *have_tuple_lock = true;
4962
4963 return true;
4964 }
4965
4966 /*
4967 * Given an original set of Xmax and infomask, and a transaction (identified by
4968 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4969 * corresponding infomasks to use on the tuple.
4970 *
4971 * Note that this might have side effects such as creating a new MultiXactId.
4972 *
4973 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4974 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4975 * but it was not running anymore. There is a race condition, which is that the
4976 * MultiXactId may have finished since then, but that uncommon case is handled
4977 * either here, or within MultiXactIdExpand.
4978 *
4979 * There is a similar race condition possible when the old xmax was a regular
4980 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4981 * window, but it's still possible to end up creating an unnecessary
4982 * MultiXactId. Fortunately this is harmless.
4983 */
4984 static void
4985 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4986 uint16 old_infomask2, TransactionId add_to_xmax,
4987 LockTupleMode mode, bool is_update,
4988 TransactionId *result_xmax, uint16 *result_infomask,
4989 uint16 *result_infomask2)
4990 {
4991 TransactionId new_xmax;
4992 uint16 new_infomask,
4993 new_infomask2;
4994
4995 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4996
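/*
 * "l5" is the restart point: several branches below simplify old_infomask
 * (e.g. by setting HEAP_XMAX_INVALID) and jump back here to recompute the
 * new xmax and infomask from the top.
 */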
4997 l5:
4998 new_infomask = 0;
4999 new_infomask2 = 0;
5000 if (old_infomask & HEAP_XMAX_INVALID)
5001 {
5002 /*
5003 * No previous locker; we just insert our own TransactionId.
5004 *
5005 * Note that it's critical that this case be the first one checked,
5006 * because there are several blocks below that come back to this one
5007 * to implement certain optimizations; old_infomask might contain
5008 * other dirty bits in those cases, but we don't really care.
5009 */
5010 if (is_update)
5011 {
5012 new_xmax = add_to_xmax;
5013 if (mode == LockTupleExclusive)
5014 new_infomask2 |= HEAP_KEYS_UPDATED;
5015 }
5016 else
5017 {
5018 new_infomask |= HEAP_XMAX_LOCK_ONLY;
5019 switch (mode)
5020 {
5021 case LockTupleKeyShare:
5022 new_xmax = add_to_xmax;
5023 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5024 break;
5025 case LockTupleShare:
5026 new_xmax = add_to_xmax;
5027 new_infomask |= HEAP_XMAX_SHR_LOCK;
5028 break;
5029 case LockTupleNoKeyExclusive:
5030 new_xmax = add_to_xmax;
5031 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5032 break;
5033 case LockTupleExclusive:
5034 new_xmax = add_to_xmax;
5035 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5036 new_infomask2 |= HEAP_KEYS_UPDATED;
5037 break;
5038 default:
5039 new_xmax = InvalidTransactionId; /* silence compiler */
5040 elog(ERROR, "invalid lock mode");
5041 }
5042 }
5043 }
5044 else if (old_infomask & HEAP_XMAX_IS_MULTI)
5045 {
5046 MultiXactStatus new_status;
5047
5048 /*
5049 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5050 * cross-check.
5051 */
5052 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5053
5054 /*
5055 * A multixact together with LOCK_ONLY set but neither lock bit set
5056 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5057 * anymore. This check is critical for databases upgraded by
5058 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5059 * that such multis are never passed.
5060 */
5061 if (HEAP_LOCKED_UPGRADED(old_infomask))
5062 {
5063 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5064 old_infomask |= HEAP_XMAX_INVALID;
5065 goto l5;
5066 }
5067
5068 /*
5069 * If the XMAX is already a MultiXactId, then we need to expand it to
5070 * include add_to_xmax; but if all the members were lockers and are
5071 * all gone, we can do away with the IS_MULTI bit and just set
5072 * add_to_xmax as the only locker/updater. If all lockers are gone
5073 * and we have an updater that aborted, we can also do without a
5074 * multi.
5075 *
5076 * The cost of doing GetMultiXactIdMembers would be paid by
5077 * MultiXactIdExpand if we weren't to do this, so this check is not
5078 * incurring extra work anyhow.
5079 */
5080 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5081 {
5082 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5083 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5084 old_infomask)))
5085 {
5086 /*
5087 * Reset these bits and restart; otherwise fall through to
5088 * create a new multi below.
5089 */
5090 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5091 old_infomask |= HEAP_XMAX_INVALID;
5092 goto l5;
5093 }
5094 }
5095
5096 new_status = get_mxact_status_for_lock(mode, is_update);
5097
5098 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5099 new_status);
5100 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5101 }
5102 else if (old_infomask & HEAP_XMAX_COMMITTED)
5103 {
5104 /*
5105 * It's a committed update, so we must preserve it as the updater of
5106 * the tuple.
5107 */
5108 MultiXactStatus status;
5109 MultiXactStatus new_status;
5110
5111 if (old_infomask2 & HEAP_KEYS_UPDATED)
5112 status = MultiXactStatusUpdate;
5113 else
5114 status = MultiXactStatusNoKeyUpdate;
5115
5116 new_status = get_mxact_status_for_lock(mode, is_update);
5117
5118 /*
5119 * Since it's not running, it's obviously impossible for the old
5120 * updater to be identical to the current one, so we need not check
5121 * for that case as we do in the block above.
5122 */
5123 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5124 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5125 }
5126 else if (TransactionIdIsInProgress(xmax))
5127 {
5128 /*
5129 * If the XMAX is a valid, in-progress TransactionId, then we need to
5130 * create a new MultiXactId that includes both the old locker or
5131 * updater and our own TransactionId.
5132 */
5133 MultiXactStatus new_status;
5134 MultiXactStatus old_status;
5135 LockTupleMode old_mode;
5136
5137 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5138 {
5139 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5140 old_status = MultiXactStatusForKeyShare;
5141 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5142 old_status = MultiXactStatusForShare;
5143 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5144 {
5145 if (old_infomask2 & HEAP_KEYS_UPDATED)
5146 old_status = MultiXactStatusForUpdate;
5147 else
5148 old_status = MultiXactStatusForNoKeyUpdate;
5149 }
5150 else
5151 {
5152 /*
5153 * LOCK_ONLY can be present alone only when a page has been
5154 * upgraded by pg_upgrade. But in that case,
5155 * TransactionIdIsInProgress() should have returned false. We
5156 * assume it's no longer locked in this case.
5157 */
5158 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5159 old_infomask |= HEAP_XMAX_INVALID;
5160 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5161 goto l5;
5162 }
5163 }
5164 else
5165 {
5166 /* it's an update, but which kind? */
5167 if (old_infomask2 & HEAP_KEYS_UPDATED)
5168 old_status = MultiXactStatusUpdate;
5169 else
5170 old_status = MultiXactStatusNoKeyUpdate;
5171 }
5172
5173 old_mode = TUPLOCK_from_mxstatus(old_status);
5174
5175 /*
5176 * If the lock to be acquired is for the same TransactionId as the
5177 * existing lock, there's an optimization possible: consider only the
5178 * strongest of both locks as the only one present, and restart.
5179 */
5180 if (xmax == add_to_xmax)
5181 {
5182 /*
5183 * Note that it's not possible for the original tuple to be
5184 * updated: we wouldn't be here because the tuple would have been
5185 * invisible and we wouldn't try to update it. As a subtlety,
5186 * this code can also run when traversing an update chain to lock
5187 * future versions of a tuple. But we wouldn't be here either,
5188 * because the add_to_xmax would be different from the original
5189 * updater.
5190 */
5191 Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5192
5193 /* acquire the strongest of both */
5194 if (mode < old_mode)
5195 mode = old_mode;
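/* (LockTupleMode values are ordered by increasing strength, so the
 * "<" test above compares lock strength) */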
5196 /* mustn't touch is_update */
5197
5198 old_infomask |= HEAP_XMAX_INVALID;
5199 goto l5;
5200 }
5201
5202 /* otherwise, just fall back to creating a new multixact */
5203 new_status = get_mxact_status_for_lock(mode, is_update);
5204 new_xmax = MultiXactIdCreate(xmax, old_status,
5205 add_to_xmax, new_status);
5206 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5207 }
5208 else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5209 TransactionIdDidCommit(xmax))
5210 {
5211 /*
5212 * It's a committed update, so we must preserve it as the updater of
5213 * the tuple.
5214 */
5215 MultiXactStatus status;
5216 MultiXactStatus new_status;
5217
5218 if (old_infomask2 & HEAP_KEYS_UPDATED)
5219 status = MultiXactStatusUpdate;
5220 else
5221 status = MultiXactStatusNoKeyUpdate;
5222
5223 new_status = get_mxact_status_for_lock(mode, is_update);
5224
5225 /*
5226 * Since it's not running, it's obviously impossible for the old
5227 * updater to be identical to the current one, so we need not check
5228 * for that case as we do in the block above.
5229 */
5230 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5231 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5232 }
5233 else
5234 {
5235 /*
5236 * Can get here iff the locking/updating transaction was running when
5237 * the infomask was extracted from the tuple, but finished before
5238 * TransactionIdIsInProgress got to run. Deal with it as if there was
5239 * no locker at all in the first place.
5240 */
5241 old_infomask |= HEAP_XMAX_INVALID;
5242 goto l5;
5243 }
5244
5245 *result_infomask = new_infomask;
5246 *result_infomask2 = new_infomask2;
5247 *result_xmax = new_xmax;
5248 }
5249
5250 /*
5251 * Subroutine for heap_lock_updated_tuple_rec.
5252 *
5253 * Given a hypothetical multixact status held by the transaction identified
5254 * with the given xid, does the current transaction need to wait, fail, or can
5255 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5256 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5257 * returned. If the lock is already held by the current transaction, return
5258 * TM_SelfModified. In case of a conflict with another transaction, a
5259 * different HeapTupleSatisfiesUpdate return code is returned.
5260 *
5261 * The held status is said to be hypothetical because it might correspond to a
5262 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5263 * way for simplicity of API.
5264 */
5265 static TM_Result
5266 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5267 LockTupleMode mode, HeapTuple tup,
5268 bool *needwait)
5269 {
5270 MultiXactStatus wantedstatus;
5271
5272 *needwait = false;
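/* "false": this function is used only for pure lock acquisition, never
 * for updates, so the wanted status is always a locker status */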
5273 wantedstatus = get_mxact_status_for_lock(mode, false);
5274
5275 /*
5276 * Note: we *must* check TransactionIdIsInProgress before
5277 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5278 * for an explanation.
5279 */
5280 if (TransactionIdIsCurrentTransactionId(xid))
5281 {
5282 /*
5283 * The tuple has already been locked by our own transaction. This is
5284 * very rare but can happen if multiple transactions are trying to
5285 * lock an ancient version of the same tuple.
5286 */
5287 return TM_SelfModified;
5288 }
5289 else if (TransactionIdIsInProgress(xid))
5290 {
5291 /*
5292 * If the locking transaction is running, what we do depends on
5293 * whether the lock modes conflict: if they do, then we must wait for
5294 * it to finish; otherwise we can fall through to lock this tuple
5295 * version without waiting.
5296 */
5297 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5298 LOCKMODE_from_mxstatus(wantedstatus)))
5299 {
5300 *needwait = true;
5301 }
5302
5303 /*
5304 * If we set needwait above, then this value doesn't matter;
5305 * otherwise, this value signals to caller that it's okay to proceed.
5306 */
5307 return TM_Ok;
5308 }
5309 else if (TransactionIdDidAbort(xid))
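/* the aborted transaction's lock or update is void, so no conflict */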
5310 return TM_Ok;
5311 else if (TransactionIdDidCommit(xid))
5312 {
5313 /*
5314 * The other transaction committed. If it was only a locker, then the
5315 * lock is completely gone now and we can return success; but if it
5316 * was an update, then what we do depends on whether the two lock
5317 * modes conflict. If they conflict, then we must report error to
5318 * caller. But if they don't, we can fall through to allow the current
5319 * transaction to lock the tuple.
5320 *
5321 * Note: the reason we worry about ISUPDATE here is because as soon as
5322 * a transaction ends, all its locks are gone and meaningless, and
5323 * thus we can ignore them; whereas its updates persist. In the
5324 * TransactionIdIsInProgress case, above, we don't need to check
5325 * because we know the lock is still "alive" and thus a conflict needs
5326 * always be checked.
5327 */
5328 if (!ISUPDATE_from_mxstatus(status))
5329 return TM_Ok;
5330
5331 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5332 LOCKMODE_from_mxstatus(wantedstatus)))
5333 {
5334 /* bummer */
5335 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5336 return TM_Updated;
5337 else
5338 return TM_Deleted;
5339 }
5340
5341 return TM_Ok;
5342 }
5343
5344 /* Not in progress, not aborted, not committed -- must have crashed */
5345 return TM_Ok;
5346 }
5347
5348
5349 /*
5350 * Recursive part of heap_lock_updated_tuple
5351 *
5352 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5353 * xid with the given mode; if this tuple is updated, recurse to lock the new
5354 * version as well.
5355 */
5356 static TM_Result
5357 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5358 LockTupleMode mode)
5359 {
5360 TM_Result result;
5361 ItemPointerData tupid;
5362 HeapTupleData mytup;
5363 Buffer buf;
5364 uint16 new_infomask,
5365 new_infomask2,
5366 old_infomask,
5367 old_infomask2;
5368 TransactionId xmax,
5369 new_xmax;
5370 TransactionId priorXmax = InvalidTransactionId;
5371 bool cleared_all_frozen = false;
5372 bool pinned_desired_page;
5373 Buffer vmbuffer = InvalidBuffer;
5374 BlockNumber block;
5375
5376 ItemPointerCopy(tid, &tupid);
5377
5378 for (;;)
5379 {
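/* reset per-iteration state before examining this tuple version */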
5380 new_infomask = 0;
5381 new_xmax = InvalidTransactionId;
5382 block = ItemPointerGetBlockNumber(&tupid);
5383 ItemPointerCopy(&tupid, &(mytup.t_self));
5384
5385 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5386 {
5387 /*
5388 * if we fail to find the updated version of the tuple, it's
5389 * because it was vacuumed/pruned away after its creator
5390 * transaction aborted. So behave as if we got to the end of the
5391 * chain, and there's no further tuple to lock: return success to
5392 * caller.
5393 */
5394 result = TM_Ok;
5395 goto out_unlocked;
5396 }
5397
5398 l4:
5399 CHECK_FOR_INTERRUPTS();
5400
5401 /*
5402 * Before locking the buffer, pin the visibility map page if it
5403 * appears to be necessary. Since we haven't got the lock yet,
5404 * someone else might be in the middle of changing this, so we'll need
5405 * to recheck after we have the lock.
5406 */
5407 if (PageIsAllVisible(BufferGetPage(buf)))
5408 {
5409 visibilitymap_pin(rel, block, &vmbuffer);
5410 pinned_desired_page = true;
5411 }
5412 else
5413 pinned_desired_page = false;
5414
5415 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5416
5417 /*
5418 * If we didn't pin the visibility map page and the page has become
5419 * all visible while we were busy locking the buffer, we'll have to
5420 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5421 * That's a bit unfortunate, but hopefully shouldn't happen often.
5422 *
5423 * Note: in some paths through this function, we will reach here
5424 * holding a pin on a vm page that may or may not be the one matching
5425 * this page. If this page isn't all-visible, we won't use the vm
5426 * page, but we hold onto such a pin till the end of the function.
5427 */
5428 if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5429 {
5430 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5431 visibilitymap_pin(rel, block, &vmbuffer);
5432 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5433 }
5434
5435 /*
5436 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5437 * end of the chain, we're done, so return success.
5438 */
5439 if (TransactionIdIsValid(priorXmax) &&
5440 !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5441 priorXmax))
5442 {
5443 result = TM_Ok;
5444 goto out_locked;
5445 }
5446
5447 /*
5448 * Also check Xmin: if this tuple was created by an aborted
5449 * (sub)transaction, then we already locked the last live one in the
5450 * chain, thus we're done, so return success.
5451 */
5452 if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5453 {
5454 result = TM_Ok;
5455 goto out_locked;
5456 }
5457
5458 old_infomask = mytup.t_data->t_infomask;
5459 old_infomask2 = mytup.t_data->t_infomask2;
5460 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5461
5462 /*
5463 * If this tuple version has been updated or locked by some concurrent
5464 * transaction(s), what we do depends on whether our lock mode
5465 * conflicts with what those other transactions hold, and also on the
5466 * status of them.
5467 */
5468 if (!(old_infomask & HEAP_XMAX_INVALID))
5469 {
5470 TransactionId rawxmax;
5471 bool needwait;
5472
5473 rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5474 if (old_infomask & HEAP_XMAX_IS_MULTI)
5475 {
5476 int nmembers;
5477 int i;
5478 MultiXactMember *members;
5479
5480 /*
5481 * We don't need a test for pg_upgrade'd tuples: this is only
5482 * applied to tuples after the first in an update chain. Said
5483 * first tuple in the chain may well be locked-in-9.2-and-
5484 * pg_upgraded, but that one was already locked by our caller,
5485 * not us; and any subsequent ones cannot be because our
5486 * caller must necessarily have obtained a snapshot later than
5487 * the pg_upgrade itself.
5488 */
5489 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5490
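/*
 * Fetch all member XIDs of the multixact so we can test each one for
 * conflicts against the requested lock mode.
 */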
5491 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5492 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5493 for (i = 0; i < nmembers; i++)
5494 {
5495 result = test_lockmode_for_conflict(members[i].status,
5496 members[i].xid,
5497 mode,
5498 &mytup,
5499 &needwait);
5500
5501 /*
5502 * If the tuple was already locked by ourselves in a
5503 * previous iteration of this (say heap_lock_tuple was
5504 * forced to restart the locking loop because of a change
5505 * in xmax), then we hold the lock already on this tuple
5506 * version and we don't need to do anything; and this is
5507 * not an error condition either. We just need to skip
5508 * this tuple and continue locking the next version in the
5509 * update chain.
5510 */
5511 if (result == TM_SelfModified)
5512 {
5513 pfree(members);
5514 goto next;
5515 }
5516
5517 if (needwait)
5518 {
5519 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5520 XactLockTableWait(members[i].xid, rel,
5521 &mytup.t_self,
5522 XLTW_LockUpdated);
5523 pfree(members);
5524 goto l4;
5525 }
5526 if (result != TM_Ok)
5527 {
5528 pfree(members);
5529 goto out_locked;
5530 }
5531 }
5532 if (members)
5533 pfree(members);
5534 }
5535 else
5536 {
5537 MultiXactStatus status;
5538
5539 /*
5540 * For a non-multi Xmax, we first need to compute the
5541 * corresponding MultiXactStatus by using the infomask bits.
5542 */
5543 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5544 {
5545 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5546 status = MultiXactStatusForKeyShare;
5547 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5548 status = MultiXactStatusForShare;
5549 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5550 {
5551 if (old_infomask2 & HEAP_KEYS_UPDATED)
5552 status = MultiXactStatusForUpdate;
5553 else
5554 status = MultiXactStatusForNoKeyUpdate;
5555 }
5556 else
5557 {
5558 /*
5559 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5560 * as share-locked in the old cluster) shouldn't be
5561 * seen in the middle of an update chain.
5562 */
5563 elog(ERROR, "invalid lock status in tuple");
5564 }
5565 }
5566 else
5567 {
5568 /* it's an update, but which kind? */
5569 if (old_infomask2 & HEAP_KEYS_UPDATED)
5570 status = MultiXactStatusUpdate;
5571 else
5572 status = MultiXactStatusNoKeyUpdate;
5573 }
5574
5575 result = test_lockmode_for_conflict(status, rawxmax, mode,
5576 &mytup, &needwait);
5577
5578 /*
5579 * If the tuple was already locked by ourselves in a previous
5580 * iteration of this (say heap_lock_tuple was forced to
5581 * restart the locking loop because of a change in xmax), then
5582 * we hold the lock already on this tuple version and we don't
5583 * need to do anything; and this is not an error condition
5584 * either. We just need to skip this tuple and continue
5585 * locking the next version in the update chain.
5586 */
5587 if (result == TM_SelfModified)
5588 goto next;
5589
5590 if (needwait)
5591 {
5592 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5593 XactLockTableWait(rawxmax, rel, &mytup.t_self,
5594 XLTW_LockUpdated);
5595 goto l4;
5596 }
5597 if (result != TM_Ok)
5598 {
5599 goto out_locked;
5600 }
5601 }
5602 }
5603
5604 /* compute the new Xmax and infomask values for the tuple ... */
5605 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5606 xid, mode, false,
5607 &new_xmax, &new_infomask, &new_infomask2);
5608
5609 if (PageIsAllVisible(BufferGetPage(buf)) &&
5610 visibilitymap_clear(rel, block, vmbuffer,
5611 VISIBILITYMAP_ALL_FROZEN))
5612 cleared_all_frozen = true;
5613
5614 START_CRIT_SECTION();
5615
5616 /* ... and set them */
5617 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5618 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5619 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5620 mytup.t_data->t_infomask |= new_infomask;
5621 mytup.t_data->t_infomask2 |= new_infomask2;
5622
5623 MarkBufferDirty(buf);
5624
5625 /* XLOG stuff */
5626 if (RelationNeedsWAL(rel))
5627 {
5628 xl_heap_lock_updated xlrec;
5629 XLogRecPtr recptr;
5630 Page page = BufferGetPage(buf);
5631
5632 XLogBeginInsert();
5633 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5634
5635 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5636 xlrec.xmax = new_xmax;
5637 xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5638 xlrec.flags =
5639 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5640
5641 XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5642
5643 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5644
5645 PageSetLSN(page, recptr);
5646 }
5647
5648 END_CRIT_SECTION();
5649
5650 next:
5651 /* if we find the end of update chain, we're done. */
5652 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5653 HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5654 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5655 HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5656 {
5657 result = TM_Ok;
5658 goto out_locked;
5659 }
5660
5661 /* tail recursion: advance to the next version and loop, rather than recursing */
5662 priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5663 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5664 UnlockReleaseBuffer(buf);
5665 }
5666
5667 result = TM_Ok;
5668
5669 out_locked:
5670 UnlockReleaseBuffer(buf);
5671
5672 out_unlocked:
5673 if (vmbuffer != InvalidBuffer)
5674 ReleaseBuffer(vmbuffer);
5675
5676 return result;
5677 }
5678
5679 /*
5680 * heap_lock_updated_tuple
5681 * Follow update chain when locking an updated tuple, acquiring locks (row
5682 * marks) on the updated versions.
5683 *
5684 * The initial tuple is assumed to be already locked.
5685 *
5686 * This function doesn't check visibility, it just unconditionally marks the
5687 * tuple(s) as locked. If any tuple in the updated chain is being deleted
5688 * concurrently (or updated with the key being modified), sleep until the
5689 * transaction doing it is finished.
5690 *
5691 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5692 * when we have to wait for other transactions to release them, as opposed to
5693 * what heap_lock_tuple does. The reason is that having more than one
5694 * transaction walking the chain is probably uncommon enough that risk of
5695 * starvation is not likely: one of the preconditions for being here is that
5696 * the snapshot in use predates the update that created this tuple (because we
5697 * started at an earlier version of the tuple), but at the same time such a
5698 * transaction cannot be using repeatable read or serializable isolation
5699 * levels, because that would lead to a serializability failure.
5700 */
5701 static TM_Result
5702 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5703 TransactionId xid, LockTupleMode mode)
5704 {
5705 /*
5706 * If the tuple has not been updated, or has moved into another partition
5707 * (effectively a delete) stop here.
5708 */
5709 if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5710 !ItemPointerEquals(&tuple->t_self, ctid))
5711 {
5712 /*
5713 * If this is the first possibly-multixact-able operation in the
5714 * current transaction, set my per-backend OldestMemberMXactId
5715 * setting. We can be certain that the transaction will never become a
5716 * member of any older MultiXactIds than that. (We have to do this
5717 * even if we end up just using our own TransactionId below, since
5718 * some other backend could incorporate our XID into a MultiXact
5719 * immediately afterwards.)
5720 */
5721 MultiXactIdSetOldestMember();
5722
5723 return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5724 }
5725
5726 /* nothing to lock */
5727 return TM_Ok;
5728 }
5729
5730 /*
5731 * heap_finish_speculative - mark speculative insertion as successful
5732 *
5733 * To successfully finish a speculative insertion we have to clear the
5734 * speculative token from the tuple. To do so, the t_ctid field, which will
5735 * contain a speculative token value, is modified in place to point to the
5736 * tuple itself, which is characteristic of a newly inserted ordinary tuple.
5737 *
5738 * NB: It is not ok to commit without either finishing or aborting a
5739 * speculative insertion. We could treat speculative tuples of committed
5740 * transactions implicitly as completed, but then we would have to be prepared
5741 * to deal with speculative tokens on committed tuples. That wouldn't be
5742 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5743 * but clearing the token at completion isn't very expensive either.
5744 * An explicit confirmation WAL record also makes logical decoding simpler.
5745 */
5746 void
5747 heap_finish_speculative(Relation relation, ItemPointer tid)
5748 {
5749 Buffer buffer;
5750 Page page;
5751 OffsetNumber offnum;
5752 ItemId lp = NULL;
5753 HeapTupleHeader htup;
5754
5755 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5756 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5757 page = (Page) BufferGetPage(buffer);
5758
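/* locate the target tuple's line pointer and verify it looks sane */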
5759 offnum = ItemPointerGetOffsetNumber(tid);
5760 if (PageGetMaxOffsetNumber(page) >= offnum)
5761 lp = PageGetItemId(page, offnum);
5762
5763 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5764 elog(ERROR, "invalid lp");
5765
5766 htup = (HeapTupleHeader) PageGetItem(page, lp);
5767
5768 /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5769 StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5770 "invalid speculative token constant");
5771
5772 /* NO EREPORT(ERROR) from here till changes are logged */
5773 START_CRIT_SECTION();
5774
5775 Assert(HeapTupleHeaderIsSpeculative(htup));
5776
5777 MarkBufferDirty(buffer);
5778
5779 /*
5780 * Replace the speculative insertion token with a real t_ctid, pointing to
5781 * itself like it does on regular tuples.
5782 */
5783 htup->t_ctid = *tid;
5784
5785 /* XLOG stuff */
5786 if (RelationNeedsWAL(relation))
5787 {
5788 xl_heap_confirm xlrec;
5789 XLogRecPtr recptr;
5790
5791 xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5792
5793 XLogBeginInsert();
5794
5795 /* We want the same filtering on this as on a plain insert */
5796 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5797
5798 XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5799 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5800
5801 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5802
5803 PageSetLSN(page, recptr);
5804 }
5805
5806 END_CRIT_SECTION();
5807
5808 UnlockReleaseBuffer(buffer);
5809 }
5810
5811 /*
5812 * heap_abort_speculative - kill a speculatively inserted tuple
5813 *
5814 * Marks a tuple that was speculatively inserted in the same command as dead,
5815 * by setting its xmin as invalid. That makes it immediately appear as dead
5816 * to all transactions, including our own. In particular, it makes
5817 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5818 * inserting a duplicate key value won't unnecessarily wait for our whole
5819 * transaction to finish (it'll just wait for our speculative insertion to
5820 * finish).
5821 *
5822 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5823 * that arise due to a mutual dependency that is not user visible. By
5824 * definition, unprincipled deadlocks cannot be prevented by the user
5825 * reordering lock acquisition in client code, because the implementation level
5826 * lock acquisitions are not under the user's direct control. If speculative
5827 * inserters did not take this precaution, then under high concurrency they
5828 * could deadlock with each other, which would not be acceptable.
5829 *
5830 * This is somewhat redundant with heap_delete, but we prefer to have a
5831 * dedicated routine with stripped down requirements. Note that this is also
5832 * used to delete the TOAST tuples created during speculative insertion.
5833 *
5834 * This routine does not affect logical decoding as it only looks at
5835 * confirmation records.
5836 */
5837 void
5838 heap_abort_speculative(Relation relation, ItemPointer tid)
5839 {
5840 TransactionId xid = GetCurrentTransactionId();
5841 ItemId lp;
5842 HeapTupleData tp;
5843 Page page;
5844 BlockNumber block;
5845 Buffer buffer;
5846 TransactionId prune_xid;
5847
5848 Assert(ItemPointerIsValid(tid));
5849
5850 block = ItemPointerGetBlockNumber(tid);
5851 buffer = ReadBuffer(relation, block);
5852 page = BufferGetPage(buffer);
5853
5854 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5855
5856 /*
5857 * Page can't be all visible, we just inserted into it, and are still
5858 * running.
5859 */
5860 Assert(!PageIsAllVisible(page));
5861
5862 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5863 Assert(ItemIdIsNormal(lp));
5864
5865 tp.t_tableOid = RelationGetRelid(relation);
5866 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5867 tp.t_len = ItemIdGetLength(lp);
5868 tp.t_self = *tid;
5869
5870 /*
5871 * Sanity check that the tuple really is a speculatively inserted tuple,
5872 * inserted by us.
5873 */
5874 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5875 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5876 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5877 elog(ERROR, "attempted to kill a non-speculative tuple");
5878 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5879
5880 /*
5881 * No need to check for serializable conflicts here. There is never a
5882 * need for a combo CID, either. No need to extract replica identity, or
5883 * do anything special with infomask bits.
5884 */
5885
5886 START_CRIT_SECTION();
5887
5888 /*
5889 * The tuple will become DEAD immediately. Flag that this page is a
5890 * candidate for pruning by setting xmin to TransactionXmin. While not
5891 * immediately prunable, it is the oldest xid we can cheaply determine
5892 * that's safe against wraparound / being older than the table's
5893 * relfrozenxid. To defend against the unlikely case of a new relation
5894 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
5895 * if so (vacuum can't subsequently move relfrozenxid to beyond
5896 * TransactionXmin, so there's no race here).
5897 */
5898 Assert(TransactionIdIsValid(TransactionXmin));
5899 if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
5900 prune_xid = relation->rd_rel->relfrozenxid;
5901 else
5902 prune_xid = TransactionXmin;
5903 PageSetPrunable(page, prune_xid);
5904
5905 /* store transaction information of xact deleting the tuple */
5906 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5907 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5908
5909 /*
5910 * Set the tuple header xmin to InvalidTransactionId. This makes the
5911 * tuple immediately invisible to everyone. (In particular, to any
5912 * transactions waiting on the speculative token, woken up later.)
5913 */
5914 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5915
5916 /* Clear the speculative insertion token too */
5917 tp.t_data->t_ctid = tp.t_self;
5918
5919 MarkBufferDirty(buffer);
5920
5921 /*
5922 * XLOG stuff
5923 *
5924 * The WAL records generated here match heap_delete(). The same recovery
5925 * routines are used.
5926 */
5927 if (RelationNeedsWAL(relation))
5928 {
5929 xl_heap_delete xlrec;
5930 XLogRecPtr recptr;
5931
5932 xlrec.flags = XLH_DELETE_IS_SUPER;
5933 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5934 tp.t_data->t_infomask2);
5935 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5936 xlrec.xmax = xid;
5937
5938 XLogBeginInsert();
5939 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5940 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5941
5942 /* No replica identity & replication origin logged */
5943
5944 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5945
5946 PageSetLSN(page, recptr);
5947 }
5948
5949 END_CRIT_SECTION();
5950
5951 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5952
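/*
 * Any out-of-line toast values were created as part of the same
 * speculative insertion, so delete them too; the "true" argument tells
 * heap_toast_delete they are speculative.
 */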
5953 if (HeapTupleHasExternal(&tp))
5954 {
5955 Assert(!IsToastRelation(relation));
5956 heap_toast_delete(relation, &tp, true);
5957 }
5958
5959 /*
5960 * Never need to mark tuple for invalidation, since catalogs don't support
5961 * speculative insertion
5962 */
5963
5964 /* Now we can release the buffer */
5965 ReleaseBuffer(buffer);
5966
5967 /* count deletion, as we counted the insertion too */
5968 pgstat_count_heap_delete(relation);
5969 }
5970
5971 /*
5972 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5973 *
5974 * Overwriting violates both MVCC and transactional safety, so the uses
5975 * of this function in Postgres are extremely limited. Nonetheless we
5976 * find some places to use it.
5977 *
5978 * The tuple cannot change size, and therefore it's reasonable to assume
5979 * that its null bitmap (if any) doesn't change either. So we just
5980 * overwrite the data portion of the tuple without touching the null
5981 * bitmap or any of the header fields.
5982 *
5983 * tuple is an in-memory tuple structure containing the data to be written
5984 * over the target tuple. Also, tuple->t_self identifies the target tuple.
5985 *
5986 * Note that the tuple updated here had better not come directly from the
5987 * syscache if the relation has a toast relation as this tuple could
5988 * include toast values that have been expanded, causing a failure here.
5989 */
5990 void
5991 heap_inplace_update(Relation relation, HeapTuple tuple)
5992 {
5993 Buffer buffer;
5994 Page page;
5995 OffsetNumber offnum;
5996 ItemId lp = NULL;
5997 HeapTupleHeader htup;
5998 uint32 oldlen;
5999 uint32 newlen;
6000
6001 /*
6002 * For now, we don't allow parallel updates. Unlike a regular update,
6003 * this should never create a combo CID, so it might be possible to relax
6004 * this restriction, but not without more thought and testing. It's not
6005 * clear that it would be useful, anyway.
6006 */
6007 if (IsInParallelMode())
6008 ereport(ERROR,
6009 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
6010 errmsg("cannot update tuples during a parallel operation")));
6011
6012 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6013 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6014 page = (Page) BufferGetPage(buffer);
6015
6016 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6017 if (PageGetMaxOffsetNumber(page) >= offnum)
6018 lp = PageGetItemId(page, offnum);
6019
6020 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6021 elog(ERROR, "invalid lp");
6022
6023 htup = (HeapTupleHeader) PageGetItem(page, lp);
6024
6025 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
6026 newlen = tuple->t_len - tuple->t_data->t_hoff;
6027 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6028 elog(ERROR, "wrong tuple length");
6029
6030 /* NO EREPORT(ERROR) from here till changes are logged */
6031 START_CRIT_SECTION();
6032
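/* overwrite only the data portion; header fields and null bitmap stay intact */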
6033 memcpy((char *) htup + htup->t_hoff,
6034 (char *) tuple->t_data + tuple->t_data->t_hoff,
6035 newlen);
6036
6037 MarkBufferDirty(buffer);
6038
6039 /* XLOG stuff */
6040 if (RelationNeedsWAL(relation))
6041 {
6042 xl_heap_inplace xlrec;
6043 XLogRecPtr recptr;
6044
6045 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6046
6047 XLogBeginInsert();
6048 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
6049
6050 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6051 XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6052
6053 /* inplace updates aren't decoded at the moment, so don't log the origin */
6054
6055 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6056
6057 PageSetLSN(page, recptr);
6058 }
6059
6060 END_CRIT_SECTION();
6061
6062 UnlockReleaseBuffer(buffer);
6063
6064 /*
6065 * Send out shared cache inval if necessary. Note that because we only
6066 * pass the new version of the tuple, this mustn't be used for any
6067 * operations that could change catcache lookup keys. But we aren't
6068 * bothering with index updates either, so that's true a fortiori.
6069 */
6070 if (!IsBootstrapProcessingMode())
6071 CacheInvalidateHeapTuple(relation, tuple, NULL);
6072 }
6073
6074 #define FRM_NOOP 0x0001
6075 #define FRM_INVALIDATE_XMAX 0x0002
6076 #define FRM_RETURN_IS_XID 0x0004
6077 #define FRM_RETURN_IS_MULTI 0x0008
6078 #define FRM_MARK_COMMITTED 0x0010
6079
6080 /*
6081 * FreezeMultiXactId
6082 * Determine what to do during freezing when a tuple is marked by a
6083 * MultiXactId.
6084 *
6085 * NB -- this might have the side-effect of creating a new MultiXactId!
6086 *
6087 * "flags" is an output value; it's used to tell caller what to do on return.
6088 * Possible flags are:
6089 * FRM_NOOP
6090 * don't do anything -- keep existing Xmax
6091 * FRM_INVALIDATE_XMAX
6092 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6093 * FRM_RETURN_IS_XID
6094 * The Xid return value is a single update Xid to set as xmax.
6095 * FRM_MARK_COMMITTED
6096 * Xmax can be marked as HEAP_XMAX_COMMITTED
6097 * FRM_RETURN_IS_MULTI
6098 * The return value is a new MultiXactId to set as new Xmax.
6099 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6100 */
6101 static TransactionId
6102 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6103 TransactionId relfrozenxid, TransactionId relminmxid,
6104 TransactionId cutoff_xid, MultiXactId cutoff_multi,
6105 uint16 *flags)
6106 {
6107 TransactionId xid = InvalidTransactionId;
6108 int i;
6109 MultiXactMember *members;
6110 int nmembers;
6111 bool need_replace;
6112 int nnewmembers;
6113 MultiXactMember *newmembers;
6114 bool has_lockers;
6115 TransactionId update_xid;
6116 bool update_committed;
6117
6118 *flags = 0;
6119
6120 /* We should only be called when the tuple's xmax is a MultiXactId */
6121 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6122
6123 if (!MultiXactIdIsValid(multi) ||
6124 HEAP_LOCKED_UPGRADED(t_infomask))
6125 {
6126 /* Ensure infomask bits are appropriately set/reset */
6127 *flags |= FRM_INVALIDATE_XMAX;
6128 return InvalidTransactionId;
6129 }
6130 else if (MultiXactIdPrecedes(multi, relminmxid))
6131 ereport(ERROR,
6132 (errcode(ERRCODE_DATA_CORRUPTED),
6133 errmsg_internal("found multixact %u from before relminmxid %u",
6134 multi, relminmxid)));
6135 else if (MultiXactIdPrecedes(multi, cutoff_multi))
6136 {
6137 /*
6138 * This old multi cannot possibly have members still running, but
6139 * verify just in case. If it was a locker only, it can be removed
6140 * without any further consideration; but if it contained an update,
6141 * we might need to preserve it.
6142 */
6143 if (MultiXactIdIsRunning(multi,
6144 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6145 ereport(ERROR,
6146 (errcode(ERRCODE_DATA_CORRUPTED),
6147 errmsg_internal("multixact %u from before cutoff %u found to be still running",
6148 multi, cutoff_multi)));
6149
6150 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6151 {
6152 *flags |= FRM_INVALIDATE_XMAX;
6153 xid = InvalidTransactionId; /* not strictly necessary */
6154 }
6155 else
6156 {
6157 /* replace multi by update xid */
6158 xid = MultiXactIdGetUpdateXid(multi, t_infomask);
6159
6160 /* wasn't only a lock, xid needs to be valid */
6161 Assert(TransactionIdIsValid(xid));
6162
6163 if (TransactionIdPrecedes(xid, relfrozenxid))
6164 ereport(ERROR,
6165 (errcode(ERRCODE_DATA_CORRUPTED),
6166 errmsg_internal("found update xid %u from before relfrozenxid %u",
6167 xid, relfrozenxid)));
6168
6169 /*
6170 * If the xid is older than the cutoff, it has to have aborted,
6171 * otherwise the tuple would have gotten pruned away.
6172 */
6173 if (TransactionIdPrecedes(xid, cutoff_xid))
6174 {
6175 if (TransactionIdDidCommit(xid))
6176 ereport(ERROR,
6177 (errcode(ERRCODE_DATA_CORRUPTED),
6178 errmsg_internal("cannot freeze committed update xid %u", xid)));
6179 *flags |= FRM_INVALIDATE_XMAX;
6180 xid = InvalidTransactionId; /* not strictly necessary */
6181 }
6182 else
6183 {
6184 *flags |= FRM_RETURN_IS_XID;
6185 }
6186 }
6187
6188 return xid;
6189 }
6190
6191 /*
6192 * This multixact might or might not have members still running, but
6193 * we know it's valid and is newer than the cutoff point for multis.
6194 * However, some member(s) of it may be below the cutoff for Xids, so we
6195 * need to walk the whole members array to figure out what to do, if
6196 * anything.
6197 */
6198
6199 nmembers =
6200 GetMultiXactIdMembers(multi, &members, false,
6201 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6202 if (nmembers <= 0)
6203 {
6204 /* Nothing worth keeping */
6205 *flags |= FRM_INVALIDATE_XMAX;
6206 return InvalidTransactionId;
6207 }
6208
6209 /* is there anything older than the cutoff? */
6210 need_replace = false;
6211 for (i = 0; i < nmembers; i++)
6212 {
6213 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6214 {
6215 need_replace = true;
6216 break;
6217 }
6218 }
6219
6220 /*
6221 * In the simplest case, there is no member older than the cutoff; we can
6222 * keep the existing MultiXactId as is.
6223 */
6224 if (!need_replace)
6225 {
6226 *flags |= FRM_NOOP;
6227 pfree(members);
6228 return InvalidTransactionId;
6229 }
6230
6231 /*
6232 * If the multi needs to be updated, figure out which members we need
6233 * to keep.
6234 */
6235 nnewmembers = 0;
6236 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6237 has_lockers = false;
6238 update_xid = InvalidTransactionId;
6239 update_committed = false;
6240
6241 for (i = 0; i < nmembers; i++)
6242 {
6243 /*
6244 * Determine whether to keep this member or ignore it.
6245 */
6246 if (ISUPDATE_from_mxstatus(members[i].status))
6247 {
6248 TransactionId xid = members[i].xid;
6249
6250 Assert(TransactionIdIsValid(xid));
6251 if (TransactionIdPrecedes(xid, relfrozenxid))
6252 ereport(ERROR,
6253 (errcode(ERRCODE_DATA_CORRUPTED),
6254 errmsg_internal("found update xid %u from before relfrozenxid %u",
6255 xid, relfrozenxid)));
6256
6257 /*
6258 * It's an update; should we keep it? If the transaction is known
6259 * aborted or crashed then it's okay to ignore it, otherwise not.
6260 * Note that an updater older than cutoff_xid cannot possibly be
6261 * committed, because HeapTupleSatisfiesVacuum would have returned
6262 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
6263 *
6264 * As with all tuple visibility routines, it's critical to test
6265 * TransactionIdIsInProgress before TransactionIdDidCommit,
6266 * because of race conditions explained in detail in
6267 * heapam_visibility.c.
6268 */
6269 if (TransactionIdIsCurrentTransactionId(xid) ||
6270 TransactionIdIsInProgress(xid))
6271 {
6272 Assert(!TransactionIdIsValid(update_xid));
6273 update_xid = xid;
6274 }
6275 else if (TransactionIdDidCommit(xid))
6276 {
6277 /*
6278 * The transaction committed, so we can tell caller to set
6279 * HEAP_XMAX_COMMITTED. (We can only do this because we know
6280 * the transaction is not running.)
6281 */
6282 Assert(!TransactionIdIsValid(update_xid));
6283 update_committed = true;
6284 update_xid = xid;
6285 }
6286 else
6287 {
6288 /*
6289 * Not in progress, not committed -- must be aborted or
6290 * crashed; we can ignore it.
6291 */
6292 }
6293
6294 /*
6295 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6296 * update Xid cannot possibly be older than the xid cutoff. The
6297 * presence of such a tuple would cause corruption, so be paranoid
6298 * and check.
6299 */
6300 if (TransactionIdIsValid(update_xid) &&
6301 TransactionIdPrecedes(update_xid, cutoff_xid))
6302 ereport(ERROR,
6303 (errcode(ERRCODE_DATA_CORRUPTED),
6304 errmsg_internal("found update xid %u from before xid cutoff %u",
6305 update_xid, cutoff_xid)));
6306
6307 /*
6308 * If we determined that it's an Xid corresponding to an update
6309 * that must be retained, additionally add it to the list of
6310 * members of the new Multi, in case we end up using that. (We
6311 * might still decide to use only an update Xid and not a multi,
6312 * but it's easier to maintain the list as we walk the old members
6313 * list.)
6314 */
6315 if (TransactionIdIsValid(update_xid))
6316 newmembers[nnewmembers++] = members[i];
6317 }
6318 else
6319 {
6320 /* We only keep lockers if they are still running */
6321 if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6322 TransactionIdIsInProgress(members[i].xid))
6323 {
6324 /* running locker cannot possibly be older than the cutoff */
6325 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6326 newmembers[nnewmembers++] = members[i];
6327 has_lockers = true;
6328 }
6329 }
6330 }
6331
6332 pfree(members);
6333
6334 if (nnewmembers == 0)
6335 {
6336 /* nothing worth keeping!? Tell caller to remove the whole thing */
6337 *flags |= FRM_INVALIDATE_XMAX;
6338 xid = InvalidTransactionId;
6339 }
6340 else if (TransactionIdIsValid(update_xid) && !has_lockers)
6341 {
6342 /*
6343 * If there's a single member and it's an update, pass it back alone
6344 * without creating a new Multi. (XXX we could do this when there's a
6345 * single remaining locker, too, but that would complicate the API too
6346 * much; moreover, the case with the single updater is more
6347 * interesting, because those are longer-lived.)
6348 */
6349 Assert(nnewmembers == 1);
6350 *flags |= FRM_RETURN_IS_XID;
6351 if (update_committed)
6352 *flags |= FRM_MARK_COMMITTED;
6353 xid = update_xid;
6354 }
6355 else
6356 {
6357 /*
6358 * Create a new multixact with the surviving members of the previous
6359 * one, to set as new Xmax in the tuple.
6360 */
6361 xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6362 *flags |= FRM_RETURN_IS_MULTI;
6363 }
6364
6365 pfree(newmembers);
6366
6367 return xid;
6368 }
6369
6370 /*
6371 * heap_prepare_freeze_tuple
6372 *
6373 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6374 * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6375 * setup enough state (in the *frz output argument) to later execute and
6376 * WAL-log what we would need to do, and return true. Return false if nothing
6377 * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6378 * will be totally frozen after these operations are performed and false if
6379 * more freezing will eventually be required.
6380 *
6381 * Caller is responsible for setting the offset field, if appropriate.
6382 *
6383 * It is assumed that the caller has checked the tuple with
6384 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6385 * (else we should be removing the tuple, not freezing it).
6386 *
6387 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6388 * XID older than it could neither be running nor seen as running by any
6389 * open transaction. This ensures that the replacement will not change
6390 * anyone's idea of the tuple state.
6391 * Similarly, cutoff_multi must be less than or equal to the smallest
6392 * MultiXactId used by any transaction currently open.
6393 *
6394 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6395 * that buffer.
6396 *
6397 * NB: It is not enough to set hint bits to indicate something is
6398 * committed/invalid -- they might not be set on a standby, or after crash
6399 * recovery. We really need to remove old xids.
6400 */
6401 bool
6402 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6403 TransactionId relfrozenxid, TransactionId relminmxid,
6404 TransactionId cutoff_xid, TransactionId cutoff_multi,
6405 xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6406 {
6407 bool changed = false;
6408 bool xmax_already_frozen = false;
6409 bool xmin_frozen;
6410 bool freeze_xmax;
6411 TransactionId xid;
6412
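/*
 * Initialize the freeze plan from the tuple's current state; any changes
 * are recorded in *frz and applied later by heap_execute_freeze_tuple.
 */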
6413 frz->frzflags = 0;
6414 frz->t_infomask2 = tuple->t_infomask2;
6415 frz->t_infomask = tuple->t_infomask;
6416 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6417
6418 /*
6419 * Process xmin. xmin_frozen has two slightly different meanings: in the
6420 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6421 * already a permanent value), while in the block below it is set true to
6422 * mean "xmin won't need freezing after what we do to it here" (false
6423 * otherwise). In both cases we're allowed to set totally_frozen, as far
6424 * as xmin is concerned.
6425 */
6426 xid = HeapTupleHeaderGetXmin(tuple);
6427 if (!TransactionIdIsNormal(xid))
6428 xmin_frozen = true;
6429 else
6430 {
6431 if (TransactionIdPrecedes(xid, relfrozenxid))
6432 ereport(ERROR,
6433 (errcode(ERRCODE_DATA_CORRUPTED),
6434 errmsg_internal("found xmin %u from before relfrozenxid %u",
6435 xid, relfrozenxid)));
6436
6437 xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6438 if (xmin_frozen)
6439 {
6440 if (!TransactionIdDidCommit(xid))
6441 ereport(ERROR,
6442 (errcode(ERRCODE_DATA_CORRUPTED),
6443 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6444 xid, cutoff_xid)));
6445
6446 frz->t_infomask |= HEAP_XMIN_FROZEN;
6447 changed = true;
6448 }
6449 }
6450
6451 /*
6452 * Process xmax. To thoroughly examine the current Xmax value we need to
6453 * resolve a MultiXactId to its member Xids, in case some of them are
6454 * below the given cutoff for Xids. In that case, those values might need
6455 * freezing, too. Also, if a multi needs freezing, we cannot simply take
6456 * it out --- if there's a live updater Xid, it needs to be kept.
6457 *
6458 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6459 */
6460 xid = HeapTupleHeaderGetRawXmax(tuple);
6461
6462 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6463 {
6464 TransactionId newxmax;
6465 uint16 flags;
6466
6467 newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6468 relfrozenxid, relminmxid,
6469 cutoff_xid, cutoff_multi, &flags);
6470
6471 freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
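/*
 * Only FRM_INVALIDATE_XMAX removes xmax outright; FRM_NOOP keeps the
 * existing value, while the FRM_RETURN_* cases install a replacement
 * below.
 */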
6472
6473 if (flags & FRM_RETURN_IS_XID)
6474 {
6475 /*
6476 * NB -- some of these transformations are only valid because we
6477 * know the return Xid is a tuple updater (i.e. not merely a
6478 * locker.) Also note that the only reason we don't explicitly
6479 * worry about HEAP_KEYS_UPDATED is because it lives in
6480 * t_infomask2 rather than t_infomask.
6481 */
6482 frz->t_infomask &= ~HEAP_XMAX_BITS;
6483 frz->xmax = newxmax;
6484 if (flags & FRM_MARK_COMMITTED)
6485 frz->t_infomask |= HEAP_XMAX_COMMITTED;
6486 changed = true;
6487 }
6488 else if (flags & FRM_RETURN_IS_MULTI)
6489 {
6490 uint16 newbits;
6491 uint16 newbits2;
6492
6493 /*
6494 * We can't use GetMultiXactIdHintBits directly on the new multi
6495 * here; that routine initializes the masks to all zeroes, which
6496 * would lose other bits we need. Doing it this way ensures all
6497 * unrelated bits remain untouched.
6498 */
6499 frz->t_infomask &= ~HEAP_XMAX_BITS;
6500 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6501 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6502 frz->t_infomask |= newbits;
6503 frz->t_infomask2 |= newbits2;
6504
6505 frz->xmax = newxmax;
6506
6507 changed = true;
6508 }
6509 }
6510 else if (TransactionIdIsNormal(xid))
6511 {
6512 if (TransactionIdPrecedes(xid, relfrozenxid))
6513 ereport(ERROR,
6514 (errcode(ERRCODE_DATA_CORRUPTED),
6515 errmsg_internal("found xmax %u from before relfrozenxid %u",
6516 xid, relfrozenxid)));
6517
6518 if (TransactionIdPrecedes(xid, cutoff_xid))
6519 {
6520 /*
6521 * If we freeze xmax, make absolutely sure that it's not an XID
6522 * that is important. (Note, a lock-only xmax can be removed
6523 * independent of committedness, since a committed lock holder has
6524 * released the lock).
6525 */
6526 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6527 TransactionIdDidCommit(xid))
6528 ereport(ERROR,
6529 (errcode(ERRCODE_DATA_CORRUPTED),
6530 errmsg_internal("cannot freeze committed xmax %u",
6531 xid)));
6532 freeze_xmax = true;
6533 }
6534 else
6535 freeze_xmax = false;
6536 }
6537 else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6538 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6539 {
6540 freeze_xmax = false;
6541 xmax_already_frozen = true;
6542 }
6543 else
6544 ereport(ERROR,
6545 (errcode(ERRCODE_DATA_CORRUPTED),
6546 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6547 xid, tuple->t_infomask)));
6548
6549 if (freeze_xmax)
6550 {
6551 Assert(!xmax_already_frozen);
6552
6553 frz->xmax = InvalidTransactionId;
6554
6555 /*
6556 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6557 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6558 * Also get rid of the HEAP_KEYS_UPDATED bit.
6559 */
6560 frz->t_infomask &= ~HEAP_XMAX_BITS;
6561 frz->t_infomask |= HEAP_XMAX_INVALID;
6562 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6563 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6564 changed = true;
6565 }
6566
6567 /*
6568 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6569 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6570 */
6571 if (tuple->t_infomask & HEAP_MOVED)
6572 {
6573 xid = HeapTupleHeaderGetXvac(tuple);
6574
6575 /*
6576 * For Xvac, we ignore the cutoff_xid and just always perform the
6577 * freeze operation. The oldest release in which such a value can
6578 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6579 * was removed in PostgreSQL 9.0. Note that if we were to respect
6580 * cutoff_xid here, we'd need to make sure to clear totally_frozen
6581 * when we skipped freezing on that basis.
6582 */
6583 if (TransactionIdIsNormal(xid))
6584 {
6585 /*
6586 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6587 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6588 * xvac transaction succeeded.
6589 */
6590 if (tuple->t_infomask & HEAP_MOVED_OFF)
6591 frz->frzflags |= XLH_INVALID_XVAC;
6592 else
6593 frz->frzflags |= XLH_FREEZE_XVAC;
6594
6595 /*
6596 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6597 * will already be set here, but there's a small chance not.
6598 */
6599 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6600 frz->t_infomask |= HEAP_XMIN_COMMITTED;
6601 changed = true;
6602 }
6603 }
6604
6605 *totally_frozen_p = (xmin_frozen &&
6606 (freeze_xmax || xmax_already_frozen));
6607 return changed;
6608 }
6609
6610 /*
6611 * heap_execute_freeze_tuple
6612 * Execute the prepared freezing of a tuple.
6613 *
6614 * Caller is responsible for ensuring that no other backend can access the
6615 * storage underlying this tuple, either by holding an exclusive lock on the
6616 * buffer containing it (which is what lazy VACUUM does), or by having it be
6617 * in private storage (which is what CLUSTER and friends do).
6618 *
6619 * Note: it might seem we could make the changes without exclusive lock, since
6620 * TransactionId read/write is assumed atomic anyway. However there is a race
6621 * condition: someone who just fetched an old XID that we overwrite here could
6622 * conceivably not finish checking the XID against pg_xact before we finish
6623 * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
6624 * exclusive lock ensures no other backend is in process of checking the
6625 * tuple status. Also, getting exclusive lock makes it safe to adjust the
6626 * infomask bits.
6627 *
6628 * NB: All code in here must be safe to execute during crash recovery!
6629 */
6630 void
6631 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6632 {
6633 HeapTupleHeaderSetXmax(tuple, frz->xmax);
6634
6635 if (frz->frzflags & XLH_FREEZE_XVAC)
6636 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6637
6638 if (frz->frzflags & XLH_INVALID_XVAC)
6639 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6640
6641 tuple->t_infomask = frz->t_infomask;
6642 tuple->t_infomask2 = frz->t_infomask2;
6643 }
6644
6645 /*
6646 * heap_freeze_tuple
6647 * Freeze tuple in place, without WAL logging.
6648 *
6649 * Useful for callers like CLUSTER that perform their own WAL logging.
6650 */
6651 bool
6652 heap_freeze_tuple(HeapTupleHeader tuple,
6653 TransactionId relfrozenxid, TransactionId relminmxid,
6654 TransactionId cutoff_xid, TransactionId cutoff_multi)
6655 {
6656 xl_heap_freeze_tuple frz;
6657 bool do_freeze;
6658 bool tuple_totally_frozen;
6659
6660 do_freeze = heap_prepare_freeze_tuple(tuple,
6661 relfrozenxid, relminmxid,
6662 cutoff_xid, cutoff_multi,
6663 &frz, &tuple_totally_frozen);
6664
6665 /*
6666 * Note that because this is not a WAL-logged operation, we don't need to
6667 * fill in the offset in the freeze record.
6668 */
6669
6670 if (do_freeze)
6671 heap_execute_freeze_tuple(tuple, &frz);
6672 return do_freeze;
6673 }
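
/*
 * Hedged usage sketch for the prepare/execute split above, modeled on a
 * lazy-VACUUM-style caller that holds an exclusive lock on 'buf' and has
 * collected freeze plans into frozen[] with heap_prepare_freeze_tuple()
 * (variable names here are invented, not taken from any actual caller):
 *
 *     if (nfrozen > 0)
 *     {
 *         START_CRIT_SECTION();
 *         MarkBufferDirty(buf);
 *         for (int i = 0; i < nfrozen; i++)
 *         {
 *             ItemId          itemid = PageGetItemId(page, frozen[i].offset);
 *             HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
 *
 *             heap_execute_freeze_tuple(htup, &frozen[i]);
 *         }
 *         if (RelationNeedsWAL(rel))
 *             PageSetLSN(page, log_heap_freeze(rel, buf, cutoff_xid,
 *                                              frozen, nfrozen));
 *         END_CRIT_SECTION();
 *     }
 *
 * Unlike heap_freeze_tuple(), a WAL-logging caller like this must fill in
 * frozen[i].offset before calling log_heap_freeze().
 */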
6674
6675 /*
6676 * For a given MultiXactId, return the hint bits that should be set in the
6677 * tuple's infomask.
6678 *
6679 * Normally this is called for a multixact that was just created, and is
6680 * therefore still in our local cache, so the GetMembers call is fast.
6681 */
6682 static void
6683 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6684 uint16 *new_infomask2)
6685 {
6686 int nmembers;
6687 MultiXactMember *members;
6688 int i;
6689 uint16 bits = HEAP_XMAX_IS_MULTI;
6690 uint16 bits2 = 0;
6691 bool has_update = false;
6692 LockTupleMode strongest = LockTupleKeyShare;
6693
6694 /*
6695 * We only use this in multis we just created, so they cannot be values
6696 * pre-pg_upgrade.
6697 */
6698 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6699
6700 for (i = 0; i < nmembers; i++)
6701 {
6702 LockTupleMode mode;
6703
6704 /*
6705 * Remember the strongest lock mode held by any member of the
6706 * multixact.
6707 */
6708 mode = TUPLOCK_from_mxstatus(members[i].status);
6709 if (mode > strongest)
6710 strongest = mode;
6711
6712 /* See what other bits we need */
6713 switch (members[i].status)
6714 {
6715 case MultiXactStatusForKeyShare:
6716 case MultiXactStatusForShare:
6717 case MultiXactStatusForNoKeyUpdate:
6718 break;
6719
6720 case MultiXactStatusForUpdate:
6721 bits2 |= HEAP_KEYS_UPDATED;
6722 break;
6723
6724 case MultiXactStatusNoKeyUpdate:
6725 has_update = true;
6726 break;
6727
6728 case MultiXactStatusUpdate:
6729 bits2 |= HEAP_KEYS_UPDATED;
6730 has_update = true;
6731 break;
6732 }
6733 }
6734
6735 if (strongest == LockTupleExclusive ||
6736 strongest == LockTupleNoKeyExclusive)
6737 bits |= HEAP_XMAX_EXCL_LOCK;
6738 else if (strongest == LockTupleShare)
6739 bits |= HEAP_XMAX_SHR_LOCK;
6740 else if (strongest == LockTupleKeyShare)
6741 bits |= HEAP_XMAX_KEYSHR_LOCK;
6742
6743 if (!has_update)
6744 bits |= HEAP_XMAX_LOCK_ONLY;
6745
6746 if (nmembers > 0)
6747 pfree(members);
6748
6749 *new_infomask = bits;
6750 *new_infomask2 = bits2;
6751 }
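
/*
 * Worked example of the mapping above: for a multi whose members are
 * {100: ForShare, 101: NoKeyUpdate}, the strongest lock mode is
 * LockTupleNoKeyExclusive, so the result is
 * HEAP_XMAX_IS_MULTI | HEAP_XMAX_EXCL_LOCK; has_update is true, so
 * HEAP_XMAX_LOCK_ONLY is not set, and because the update is not a key
 * update, HEAP_KEYS_UPDATED stays clear in infomask2.
 */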
6752
6753 /*
6754 * MultiXactIdGetUpdateXid
6755 *
6756 * Given a multixact Xmax and corresponding infomask, which does not have the
6757 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6758 * transaction.
6759 *
6760 * Caller is expected to check the status of the updating transaction, if
6761 * necessary.
6762 */
6763 static TransactionId
6764 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6765 {
6766 TransactionId update_xact = InvalidTransactionId;
6767 MultiXactMember *members;
6768 int nmembers;
6769
6770 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6771 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6772
6773 /*
6774 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6775 * pre-pg_upgrade.
6776 */
6777 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6778
6779 if (nmembers > 0)
6780 {
6781 int i;
6782
6783 for (i = 0; i < nmembers; i++)
6784 {
6785 /* Ignore lockers */
6786 if (!ISUPDATE_from_mxstatus(members[i].status))
6787 continue;
6788
6789 /* there can be at most one updater */
6790 Assert(update_xact == InvalidTransactionId);
6791 update_xact = members[i].xid;
6792 #ifndef USE_ASSERT_CHECKING
6793
6794 /*
6795 * in an assert-enabled build, walk the whole array to ensure
6796 * there's no other updater.
6797 */
6798 break;
6799 #endif
6800 }
6801
6802 pfree(members);
6803 }
6804
6805 return update_xact;
6806 }
6807
6808 /*
6809 * HeapTupleGetUpdateXid
6810 * As above, but use a HeapTupleHeader
6811 *
6812 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6813 * checking the hint bits.
6814 */
6815 TransactionId
6816 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6817 {
6818 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6819 tuple->t_infomask);
6820 }
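
/*
 * Hedged usage sketch: callers are expected to check the hint bits before
 * reaching the functions above, along these lines (tuple header access
 * only; no particular caller is implied):
 *
 *     TransactionId xmax;
 *
 *     if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) &&
 *         !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
 *         xmax = HeapTupleGetUpdateXid(tuple);
 *     else
 *         xmax = HeapTupleHeaderGetRawXmax(tuple);
 */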
6821
6822 /*
6823 * Does the given multixact conflict with the current transaction grabbing a
6824 * tuple lock of the given strength?
6825 *
6826 * The passed infomask pairs up with the given multixact in the tuple header.
6827 *
6828 * If current_is_member is not NULL, it is set to 'true' if the current
6829 * transaction is a member of the given multixact.
6830 */
6831 static bool
6832 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6833 LockTupleMode lockmode, bool *current_is_member)
6834 {
6835 int nmembers;
6836 MultiXactMember *members;
6837 bool result = false;
6838 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
6839
6840 if (HEAP_LOCKED_UPGRADED(infomask))
6841 return false;
6842
6843 nmembers = GetMultiXactIdMembers(multi, &members, false,
6844 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6845 if (nmembers >= 0)
6846 {
6847 int i;
6848
6849 for (i = 0; i < nmembers; i++)
6850 {
6851 TransactionId memxid;
6852 LOCKMODE memlockmode;
6853
6854 if (result && (current_is_member == NULL || *current_is_member))
6855 break;
6856
6857 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6858
6859 /* ignore members from current xact (but track their presence) */
6860 memxid = members[i].xid;
6861 if (TransactionIdIsCurrentTransactionId(memxid))
6862 {
6863 if (current_is_member != NULL)
6864 *current_is_member = true;
6865 continue;
6866 }
6867 else if (result)
6868 continue;
6869
6870 /* ignore members that don't conflict with the lock we want */
6871 if (!DoLockModesConflict(memlockmode, wanted))
6872 continue;
6873
6874 if (ISUPDATE_from_mxstatus(members[i].status))
6875 {
6876 /* ignore aborted updaters */
6877 if (TransactionIdDidAbort(memxid))
6878 continue;
6879 }
6880 else
6881 {
6882 /* ignore lockers-only that are no longer in progress */
6883 if (!TransactionIdIsInProgress(memxid))
6884 continue;
6885 }
6886
6887 /*
6888 * Whatever remains are either live lockers that conflict with our
6889 * wanted lock, or updaters that are not aborted. Either way, they
6890 * conflict with what we want. Set up to return true, but keep going to
6891 * look for the current transaction among the multixact members,
6892 * if needed.
6893 */
6894 result = true;
6895 }
6896 pfree(members);
6897 }
6898
6899 return result;
6900 }
6901
6902 /*
6903 * Do_MultiXactIdWait
6904 * Actual implementation for the two functions below.
6905 *
6906 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6907 * needed to ensure we only sleep on conflicting members, and the infomask is
6908 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6909 * indicates whether to use conditional lock acquisition, to allow callers to
6910 * fail if the lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
6911 * context information for error messages. 'remaining', if not NULL, receives
6912 * the number of members that are still running, including any (non-aborted)
6913 * subtransactions of our own transaction.
6914 *
6915 * We do this by sleeping on each member using XactLockTableWait. Any
6916 * members that belong to the current backend are *not* waited for, however;
6917 * this would not merely be useless but would lead to Assert failure inside
6918 * XactLockTableWait. By the time this returns, it is certain that all
6919 * transactions *of other backends* that were members of the MultiXactId
6920 * that conflict with the requested status are dead (and no new ones can have
6921 * been added, since it is not legal to add members to an existing
6922 * MultiXactId).
6923 *
6924 * But by the time we finish sleeping, someone else may have changed the Xmax
6925 * of the containing tuple, so the caller needs to iterate on us somehow.
6926 *
6927 * Note that in case we return false, the number of remaining members is
6928 * not to be trusted.
6929 */
6930 static bool
6931 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6932 uint16 infomask, bool nowait,
6933 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6934 int *remaining)
6935 {
6936 bool result = true;
6937 MultiXactMember *members;
6938 int nmembers;
6939 int remain = 0;
6940
6941 /* for pre-pg_upgrade tuples, no need to sleep at all */
6942 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6943 GetMultiXactIdMembers(multi, &members, false,
6944 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6945
6946 if (nmembers >= 0)
6947 {
6948 int i;
6949
6950 for (i = 0; i < nmembers; i++)
6951 {
6952 TransactionId memxid = members[i].xid;
6953 MultiXactStatus memstatus = members[i].status;
6954
6955 if (TransactionIdIsCurrentTransactionId(memxid))
6956 {
6957 remain++;
6958 continue;
6959 }
6960
6961 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6962 LOCKMODE_from_mxstatus(status)))
6963 {
6964 if (remaining && TransactionIdIsInProgress(memxid))
6965 remain++;
6966 continue;
6967 }
6968
6969 /*
6970 * This member conflicts with our multi, so we have to sleep (or
6971 * return failure, if asked to avoid waiting).
6972 *
6973 * Note that we don't set up an error context callback ourselves,
6974 * but instead we pass the info down to XactLockTableWait. This
6975 * might seem a bit wasteful because the context is set up and
6976 * torn down for each member of the multixact, but in reality it
6977 * should be barely noticeable, and it avoids duplicate code.
6978 */
6979 if (nowait)
6980 {
6981 result = ConditionalXactLockTableWait(memxid);
6982 if (!result)
6983 break;
6984 }
6985 else
6986 XactLockTableWait(memxid, rel, ctid, oper);
6987 }
6988
6989 pfree(members);
6990 }
6991
6992 if (remaining)
6993 *remaining = remain;
6994
6995 return result;
6996 }
6997
6998 /*
6999 * MultiXactIdWait
7000 * Sleep on a MultiXactId.
7001 *
7002 * By the time we finish sleeping, someone else may have changed the Xmax
7003 * of the containing tuple, so the caller needs to iterate on us somehow.
7004 *
7005 * We return (in *remaining, if not NULL) the number of members that are still
7006 * running, including any (non-aborted) subtransactions of our own transaction.
7007 */
7008 static void
7009 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7010 Relation rel, ItemPointer ctid, XLTW_Oper oper,
7011 int *remaining)
7012 {
7013 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7014 rel, ctid, oper, remaining);
7015 }
7016
7017 /*
7018 * ConditionalMultiXactIdWait
7019 * As above, but only lock if we can get the lock without blocking.
7020 *
7021 * By the time we finish sleeping, someone else may have changed the Xmax
7022 * of the containing tuple, so the caller needs to iterate on us somehow.
7023 *
7024 * If the multixact is now all gone, return true. Returns false if some
7025 * transactions might still be running.
7026 *
7027 * We return (in *remaining, if not NULL) the number of members that are still
7028 * running, including any (non-aborted) subtransactions of our own transaction.
7029 */
7030 static bool
7031 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7032 uint16 infomask, Relation rel, int *remaining)
7033 {
7034 return Do_MultiXactIdWait(multi, status, infomask, true,
7035 rel, NULL, XLTW_None, remaining);
7036 }
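
/*
 * Sketch of a typical caller pattern for the two wrappers above (details
 * vary by caller; 'mode', 'status' and the retry label are assumed
 * context, not a real code path): check for a conflict first, sleep only
 * if necessary, then re-read xmax, since it may have been replaced while
 * we slept.
 *
 *     if (DoesMultiXactIdConflict(multi, infomask, mode, NULL))
 *     {
 *         int     remain;
 *
 *         MultiXactIdWait(multi, status, infomask,
 *                         rel, &tuple->t_self, XLTW_Lock, &remain);
 *         goto recheck_xmax;
 *     }
 */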
7037
7038 /*
7039 * heap_tuple_needs_eventual_freeze
7040 *
7041 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7042 * will eventually require freezing. Similar to heap_tuple_needs_freeze,
7043 * but there's no cutoff, since we're trying to figure out whether freezing
7044 * will ever be needed, not whether it's needed now.
7045 */
7046 bool
7047 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7048 {
7049 TransactionId xid;
7050
7051 /*
7052 * If xmin is a normal transaction ID, this tuple is definitely not
7053 * frozen.
7054 */
7055 xid = HeapTupleHeaderGetXmin(tuple);
7056 if (TransactionIdIsNormal(xid))
7057 return true;
7058
7059 /*
7060 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7061 */
7062 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7063 {
7064 MultiXactId multi;
7065
7066 multi = HeapTupleHeaderGetRawXmax(tuple);
7067 if (MultiXactIdIsValid(multi))
7068 return true;
7069 }
7070 else
7071 {
7072 xid = HeapTupleHeaderGetRawXmax(tuple);
7073 if (TransactionIdIsNormal(xid))
7074 return true;
7075 }
7076
7077 if (tuple->t_infomask & HEAP_MOVED)
7078 {
7079 xid = HeapTupleHeaderGetXvac(tuple);
7080 if (TransactionIdIsNormal(xid))
7081 return true;
7082 }
7083
7084 return false;
7085 }
7086
7087 /*
7088 * heap_tuple_needs_freeze
7089 *
7090 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7091 * are older than the specified cutoff XID or MultiXactId. If so, return true.
7092 *
7093 * It doesn't matter whether the tuple is alive or dead; we are checking
7094 * to see if it needs to be removed or frozen to avoid wraparound.
7095 *
7096 * NB: Cannot rely on hint bits here, they might not be set after a crash or
7097 * on a standby.
7098 */
7099 bool
7100 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
7101 MultiXactId cutoff_multi, Buffer buf)
7102 {
7103 TransactionId xid;
7104
7105 xid = HeapTupleHeaderGetXmin(tuple);
7106 if (TransactionIdIsNormal(xid) &&
7107 TransactionIdPrecedes(xid, cutoff_xid))
7108 return true;
7109
7110 /*
7111 * The considerations for multixacts are complicated; look at
7112 * heap_prepare_freeze_tuple for justifications. This routine had better
7113 * be in sync with that one!
7114 */
7115 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7116 {
7117 MultiXactId multi;
7118
7119 multi = HeapTupleHeaderGetRawXmax(tuple);
7120 if (!MultiXactIdIsValid(multi))
7121 {
7122 /* no xmax set, ignore */
7123 ;
7124 }
7125 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7126 return true;
7127 else if (MultiXactIdPrecedes(multi, cutoff_multi))
7128 return true;
7129 else
7130 {
7131 MultiXactMember *members;
7132 int nmembers;
7133 int i;
7134
7135 /* need to check whether any member of the mxact is too old */
7136
7137 nmembers = GetMultiXactIdMembers(multi, &members, false,
7138 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7139
7140 for (i = 0; i < nmembers; i++)
7141 {
7142 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
7143 {
7144 pfree(members);
7145 return true;
7146 }
7147 }
7148 if (nmembers > 0)
7149 pfree(members);
7150 }
7151 }
7152 else
7153 {
7154 xid = HeapTupleHeaderGetRawXmax(tuple);
7155 if (TransactionIdIsNormal(xid) &&
7156 TransactionIdPrecedes(xid, cutoff_xid))
7157 return true;
7158 }
7159
7160 if (tuple->t_infomask & HEAP_MOVED)
7161 {
7162 xid = HeapTupleHeaderGetXvac(tuple);
7163 if (TransactionIdIsNormal(xid) &&
7164 TransactionIdPrecedes(xid, cutoff_xid))
7165 return true;
7166 }
7167
7168 return false;
7169 }
7170
7171 /*
7172 * If 'tuple' contains any visible XID greater than latestRemovedXid,
7173 * ratchet forwards latestRemovedXid to the greatest one found.
7174 * This is used as the basis for generating Hot Standby conflicts, so
7175 * if a tuple was never visible then removing it should not conflict
7176 * with queries.
7177 */
7178 void
7179 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
7180 TransactionId *latestRemovedXid)
7181 {
7182 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7183 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7184 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7185
7186 if (tuple->t_infomask & HEAP_MOVED)
7187 {
7188 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
7189 *latestRemovedXid = xvac;
7190 }
7191
7192 /*
7193 * Ignore tuples inserted by an aborted transaction or if the tuple was
7194 * updated/deleted by the inserting transaction.
7195 *
7196 * Look for a committed hint bit, or if no xmin bit is set, check clog.
7197 */
7198 if (HeapTupleHeaderXminCommitted(tuple) ||
7199 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7200 {
7201 if (xmax != xmin &&
7202 TransactionIdFollows(xmax, *latestRemovedXid))
7203 *latestRemovedXid = xmax;
7204 }
7205
7206 /* *latestRemovedXid may still be invalid at end */
7207 }
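
/*
 * Intended ratchet pattern, as a sketch (the loop shape belongs to the
 * caller, e.g. pruning or index deletion code):
 *
 *     TransactionId latestRemovedXid = InvalidTransactionId;
 *
 *     ... for each heap tuple header about to be removed ...
 *     HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
 *
 * The result drives Hot Standby conflict generation, and may legitimately
 * still be InvalidTransactionId at the end if no removed tuple was ever
 * visible.
 */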
7208
7209 #ifdef USE_PREFETCH
7210 /*
7211 * Helper function for heap_index_delete_tuples. Issues prefetch requests for
7212 * prefetch_count buffers. The prefetch_state keeps track of all the buffers
7213 * we can prefetch, and which have already been prefetched; each call to this
7214 * function picks up where the previous call left off.
7215 *
7216 * Note: we expect the deltids array to be sorted in an order that groups TIDs
7217 * by heap block, with all TIDs for each block appearing together in exactly
7218 * one group.
7219 */
7220 static void
7221 index_delete_prefetch_buffer(Relation rel,
7222 IndexDeletePrefetchState *prefetch_state,
7223 int prefetch_count)
7224 {
7225 BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
7226 int count = 0;
7227 int i;
7228 int ndeltids = prefetch_state->ndeltids;
7229 TM_IndexDelete *deltids = prefetch_state->deltids;
7230
7231 for (i = prefetch_state->next_item;
7232 i < ndeltids && count < prefetch_count;
7233 i++)
7234 {
7235 ItemPointer htid = &deltids[i].tid;
7236
7237 if (cur_hblkno == InvalidBlockNumber ||
7238 ItemPointerGetBlockNumber(htid) != cur_hblkno)
7239 {
7240 cur_hblkno = ItemPointerGetBlockNumber(htid);
7241 PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
7242 count++;
7243 }
7244 }
7245
7246 /*
7247 * Save the prefetch position so that next time we can continue from that
7248 * position.
7249 */
7250 prefetch_state->next_item = i;
7251 prefetch_state->cur_hblkno = cur_hblkno;
7252 }
7253 #endif
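
/*
 * Usage pattern (see heap_index_delete_tuples below): the caller primes
 * the pipeline with a full prefetch distance up front, then tops it up by
 * one buffer each time it actually reads a new block:
 *
 *     index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
 *     ...
 *     buf = ReadBuffer(rel, blkno);
 *     index_delete_prefetch_buffer(rel, &prefetch_state, 1);
 */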
7254
7255 /*
7256 * heapam implementation of tableam's index_delete_tuples interface.
7257 *
7258 * This helper function is called by index AMs during index tuple deletion.
7259 * See tableam header comments for an explanation of the interface implemented
7260 * here and a general theory of operation. Note that each call here is either
7261 * a simple index deletion call, or a bottom-up index deletion call.
7262 *
7263 * It's possible for this to generate a fair amount of I/O, since we may be
7264 * deleting hundreds of tuples from a single index block. To amortize that
7265 * cost to some degree, this uses prefetching and combines repeat accesses to
7266 * the same heap block.
7267 */
7268 TransactionId
7269 heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
7270 {
7271 /* Initial assumption is that earlier pruning took care of conflict */
7272 TransactionId latestRemovedXid = InvalidTransactionId;
7273 BlockNumber blkno = InvalidBlockNumber;
7274 Buffer buf = InvalidBuffer;
7275 Page page = NULL;
7276 OffsetNumber maxoff = InvalidOffsetNumber;
7277 TransactionId priorXmax;
7278 #ifdef USE_PREFETCH
7279 IndexDeletePrefetchState prefetch_state;
7280 int prefetch_distance;
7281 #endif
7282 SnapshotData SnapshotNonVacuumable;
7283 int finalndeltids = 0,
7284 nblocksaccessed = 0;
7285
7286 /* State that's only used in bottom-up index deletion case */
7287 int nblocksfavorable = 0;
7288 int curtargetfreespace = delstate->bottomupfreespace,
7289 lastfreespace = 0,
7290 actualfreespace = 0;
7291 bool bottomup_final_block = false;
7292
7293 InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
7294
7295 /* Sort caller's deltids array by TID for further processing */
7296 index_delete_sort(delstate);
7297
7298 /*
7299 * Bottom-up case: resort deltids array in an order attuned to where the
7300 * greatest number of promising TIDs are to be found, and determine how
7301 * many blocks from the start of sorted array should be considered
7302 * favorable. This will also shrink the deltids array in order to
7303 * eliminate completely unfavorable blocks up front.
7304 */
7305 if (delstate->bottomup)
7306 nblocksfavorable = bottomup_sort_and_shrink(delstate);
7307
7308 #ifdef USE_PREFETCH
7309 /* Initialize prefetch state. */
7310 prefetch_state.cur_hblkno = InvalidBlockNumber;
7311 prefetch_state.next_item = 0;
7312 prefetch_state.ndeltids = delstate->ndeltids;
7313 prefetch_state.deltids = delstate->deltids;
7314
7315 /*
7316 * Determine the prefetch distance that we will attempt to maintain.
7317 *
7318 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7319 * sure that isn't a catalog relation before we call code that does
7320 * syscache lookups, to avoid risk of deadlock.
7321 */
7322 if (IsCatalogRelation(rel))
7323 prefetch_distance = maintenance_io_concurrency;
7324 else
7325 prefetch_distance =
7326 get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
7327
7328 /* Cap initial prefetch distance for bottom-up deletion caller */
7329 if (delstate->bottomup)
7330 {
7331 Assert(nblocksfavorable >= 1);
7332 Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
7333 prefetch_distance = Min(prefetch_distance, nblocksfavorable);
7334 }
7335
7336 /* Start prefetching. */
7337 index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7338 #endif
7339
7340 /* Iterate over deltids, determine which to delete, check their horizon */
7341 Assert(delstate->ndeltids > 0);
7342 for (int i = 0; i < delstate->ndeltids; i++)
7343 {
7344 TM_IndexDelete *ideltid = &delstate->deltids[i];
7345 TM_IndexStatus *istatus = delstate->status + ideltid->id;
7346 ItemPointer htid = &ideltid->tid;
7347 OffsetNumber offnum;
7348
7349 /*
7350 * Read buffer, and perform required extra steps each time a new block
7351 * is encountered. Avoid refetching if it's the same block as the one
7352 * from the last htid.
7353 */
7354 if (blkno == InvalidBlockNumber ||
7355 ItemPointerGetBlockNumber(htid) != blkno)
7356 {
7357 /*
7358 * Consider giving up early for bottom-up index deletion caller
7359 * first. (Only prefetch next-next block afterwards, when it
7360 * becomes clear that we're at least going to access the next
7361 * block in line.)
7362 *
7363 * Sometimes the first block frees so much space for bottom-up
7364 * caller that the deletion process can end without accessing any
7365 * more blocks. It is usually necessary to access 2 or 3 blocks
7366 * per bottom-up deletion operation, though.
7367 */
7368 if (delstate->bottomup)
7369 {
7370 /*
7371 * We often allow the caller to delete a few additional items
7372 * whose entries we reached after the point where the caller's
7373 * space target was satisfied. The cost of accessing the page
7374 * was already paid at that point, so it made sense to finish
7375 * it off. When that happens, we finalize everything here
7376 * (by finishing off the whole bottom-up deletion operation
7377 * without needlessly paying the cost of accessing any more
7378 * blocks).
7379 */
7380 if (bottomup_final_block)
7381 break;
7382
7383 /*
7384 * Give up when we didn't enable our caller to free any
7385 * additional space as a result of processing the page that we
7386 * just finished up with. This rule is the main way in which
7387 * we keep the cost of bottom-up deletion under control.
7388 */
7389 if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
7390 break;
7391 lastfreespace = actualfreespace; /* for next time */
7392
7393 /*
7394 * Deletion operation (which is bottom-up) will definitely
7395 * access the next block in line. Prepare for that now.
7396 *
7397 * Decay target free space so that we don't hang on for too
7398 * long with a marginal case. (Space target is only truly
7399 * helpful when it allows us to recognize that we don't need
7400 * to access more than 1 or 2 blocks to satisfy caller due to
7401 * agreeable workload characteristics.)
7402 *
7403 * We are a bit more patient when we encounter contiguous
7404 * blocks, though: these are treated as favorable blocks. The
7405 * decay process is only applied when the next block in line
7406 * is not a favorable/contiguous block. This is not an
7407 * exception to the general rule; we still insist on finding
7408 * at least one deletable item per block accessed. See
7409 * bottomup_nblocksfavorable() for full details of the theory
7410 * behind favorable blocks and heap block locality in general.
7411 *
7412 * Note: The first block in line is always treated as a
7413 * favorable block, so the earliest possible point that the
7414 * decay can be applied is just before we access the second
7415 * block in line. The Assert() verifies this for us.
7416 */
7417 Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
7418 if (nblocksfavorable > 0)
7419 nblocksfavorable--;
7420 else
7421 curtargetfreespace /= 2;
7422 }
7423
7424 /* release old buffer */
7425 if (BufferIsValid(buf))
7426 UnlockReleaseBuffer(buf);
7427
7428 blkno = ItemPointerGetBlockNumber(htid);
7429 buf = ReadBuffer(rel, blkno);
7430 nblocksaccessed++;
7431 Assert(!delstate->bottomup ||
7432 nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
7433
7434 #ifdef USE_PREFETCH
7435
7436 /*
7437 * To maintain the prefetch distance, prefetch one more page for
7438 * each page we read.
7439 */
7440 index_delete_prefetch_buffer(rel, &prefetch_state, 1);
7441 #endif
7442
7443 LockBuffer(buf, BUFFER_LOCK_SHARE);
7444
7445 page = BufferGetPage(buf);
7446 maxoff = PageGetMaxOffsetNumber(page);
7447 }
7448
7449 if (istatus->knowndeletable)
7450 Assert(!delstate->bottomup && !istatus->promising);
7451 else
7452 {
7453 ItemPointerData tmp = *htid;
7454 HeapTupleData heapTuple;
7455
7456 /* Are any tuples from this HOT chain non-vacuumable? */
7457 if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
7458 &heapTuple, NULL, true))
7459 continue; /* can't delete entry */
7460
7461 /* Caller will delete, since whole HOT chain is vacuumable */
7462 istatus->knowndeletable = true;
7463
7464 /* Maintain index free space info for bottom-up deletion case */
7465 if (delstate->bottomup)
7466 {
7467 Assert(istatus->freespace > 0);
7468 actualfreespace += istatus->freespace;
7469 if (actualfreespace >= curtargetfreespace)
7470 bottomup_final_block = true;
7471 }
7472 }
7473
7474 /*
7475 * Maintain latestRemovedXid value for deletion operation as a whole
7476 * by advancing current value using heap tuple headers. This is
7477 * loosely based on the logic for pruning a HOT chain.
7478 */
7479 offnum = ItemPointerGetOffsetNumber(htid);
7480 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
7481 for (;;)
7482 {
7483 ItemId lp;
7484 HeapTupleHeader htup;
7485
7486 /* Some sanity checks */
7487 if (offnum < FirstOffsetNumber || offnum > maxoff)
7488 break;
7489
7490 lp = PageGetItemId(page, offnum);
7491 if (ItemIdIsRedirected(lp))
7492 {
7493 offnum = ItemIdGetRedirect(lp);
7494 continue;
7495 }
7496
7497 /*
7498 * We'll often encounter LP_DEAD line pointers (especially with an
7499 * entry marked knowndeletable by our caller up front). No heap
7500 * tuple headers get examined for an htid that leads us to an
7501 * LP_DEAD item. This is okay because the earlier pruning
7502 * operation that made the line pointer LP_DEAD in the first place
7503 * must have considered the original tuple header as part of
7504 * generating its own latestRemovedXid value.
7505 *
7506 * Relying on XLOG_HEAP2_PRUNE records like this is the same
7507 * strategy that index vacuuming uses in all cases. Index VACUUM
7508 * WAL records don't even have a latestRemovedXid field of their
7509 * own for this reason.
7510 */
7511 if (!ItemIdIsNormal(lp))
7512 break;
7513
7514 htup = (HeapTupleHeader) PageGetItem(page, lp);
7515
7516 /*
7517 * Check the tuple XMIN against prior XMAX, if any
7518 */
7519 if (TransactionIdIsValid(priorXmax) &&
7520 !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
7521 break;
7522
7523 HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
7524
7525 /*
7526 * If the tuple is not HOT-updated, then we are at the end of this
7527 * HOT-chain. No need to visit later tuples from the same update
7528 * chain (they get their own index entries) -- just move on to
7529 * next htid from index AM caller.
7530 */
7531 if (!HeapTupleHeaderIsHotUpdated(htup))
7532 break;
7533
7534 /* Advance to next HOT chain member */
7535 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
7536 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
7537 priorXmax = HeapTupleHeaderGetUpdateXid(htup);
7538 }
7539
7540 /* Enable further/final shrinking of deltids for caller */
7541 finalndeltids = i + 1;
7542 }
7543
7544 UnlockReleaseBuffer(buf);
7545
7546 /*
7547 * Shrink deltids array to exclude non-deletable entries at the end. This
7548 * is not just a minor optimization. Final deltids array size might be
7549 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
7550 * ndeltids being zero in all cases with zero total deletable entries.
7551 */
7552 Assert(finalndeltids > 0 || delstate->bottomup);
7553 delstate->ndeltids = finalndeltids;
7554
7555 return latestRemovedXid;
7556 }
7557
7558 /*
7559 * Specialized inlineable comparison function for index_delete_sort()
7560 */
7561 static inline int
7562 index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
7563 {
7564 ItemPointer tid1 = &deltid1->tid;
7565 ItemPointer tid2 = &deltid2->tid;
7566
7567 {
7568 BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
7569 BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
7570
7571 if (blk1 != blk2)
7572 return (blk1 < blk2) ? -1 : 1;
7573 }
7574 {
7575 OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
7576 OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
7577
7578 if (pos1 != pos2)
7579 return (pos1 < pos2) ? -1 : 1;
7580 }
7581
7582 Assert(false);
7583
7584 return 0;
7585 }
7586
7587 /*
7588 * Sort deltids array from delstate by TID. This prepares it for further
7589 * processing by heap_index_delete_tuples().
7590 *
7591 * This operation becomes a noticeable consumer of CPU cycles with some
7592 * workloads, so we go to the trouble of specialization/micro optimization.
7593 * We use shellsort for this because it's easy to specialize, compiles to
7594 * relatively few instructions, and is adaptive to presorted inputs/subsets
7595 * (which are typical here).
7596 */
7597 static void
7598 index_delete_sort(TM_IndexDeleteOp *delstate)
7599 {
7600 TM_IndexDelete *deltids = delstate->deltids;
7601 int ndeltids = delstate->ndeltids;
7602 int low = 0;
7603
7604 /*
7605 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
7606 *
7607 * This implementation is fast with array sizes up to ~4500. This covers
7608 * all supported BLCKSZ values.
7609 */
7610 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
7611
7612 /* Think carefully before changing anything here -- keep swaps cheap */
7613 StaticAssertStmt(sizeof(TM_IndexDelete) <= 8,
7614 "element size exceeds 8 bytes");
7615
7616 for (int g = 0; g < lengthof(gaps); g++)
7617 {
7618 for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
7619 {
7620 TM_IndexDelete d = deltids[i];
7621 int j = i;
7622
7623 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
7624 {
7625 deltids[j] = deltids[j - hi];
7626 j -= hi;
7627 }
7628 deltids[j] = d;
7629 }
7630 }
7631 }
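
/*
 * The same shellsort skeleton, shown standalone on a plain int array a[]
 * of length n, purely for illustration (the real sort above is
 * specialized for TM_IndexDelete precisely to keep swaps cheap):
 *
 *     static const int gaps[] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
 *
 *     for (int g = 0; g < lengthof(gaps); g++)
 *         for (int hi = gaps[g], i = hi; i < n; i++)
 *         {
 *             int d = a[i];
 *             int j = i;
 *
 *             while (j >= hi && a[j - hi] > d)
 *             {
 *                 a[j] = a[j - hi];
 *                 j -= hi;
 *             }
 *             a[j] = d;
 *         }
 */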
7632
7633 /*
7634 * Returns how many blocks should be considered favorable/contiguous for a
7635 * bottom-up index deletion pass. This is a number of heap blocks that starts
7636 * from and includes the first block in line.
7637 *
7638 * There is always at least one favorable block during bottom-up index
7639 * deletion. In the worst case (i.e. with totally random heap blocks) the
7640 * first block in line (the only favorable block) can be thought of as a
7641 * degenerate array of contiguous blocks that consists of a single block.
7642 * heap_index_delete_tuples() will expect this.
7643 *
7644 * Caller passes blockgroups, a description of the final order that deltids
7645 * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
7646 * processing. Note that deltids need not actually be sorted just yet (caller
7647 * only passes deltids to us so that we can interpret blockgroups).
7648 *
7649 * You might guess that the existence of contiguous blocks cannot matter much,
7650 * since in general the main factor that determines which blocks we visit is
7651 * the number of promising TIDs, which is a fixed hint from the index AM.
7652 * We're not really targeting the general case, though -- the actual goal is
7653 * to adapt our behavior to a wide variety of naturally occurring conditions.
7654 * The effects of most of the heuristics we apply are only noticeable in the
7655 * aggregate, over time and across many _related_ bottom-up index deletion
7656 * passes.
7657 *
7658 * Deeming certain blocks favorable allows heapam to recognize and adapt to
7659 * workloads where heap blocks visited during bottom-up index deletion can be
7660 * accessed contiguously, in the sense that each newly visited block is the
7661 * neighbor of the block that bottom-up deletion just finished processing (or
7662 * close enough to it). It will likely be cheaper to access more favorable
7663 * blocks sooner rather than later (e.g. in this pass, not across a series of
7664 * related bottom-up passes). Either way it is probably only a matter of time
7665 * (or a matter of further correlated version churn) before all blocks that
7666 * appear together as a single large batch of favorable blocks get accessed by
7667 * _some_ bottom-up pass. Large batches of favorable blocks tend to either
7668 * appear almost constantly or not even once (it all depends on per-index
7669 * workload characteristics).
7670 *
7671 * Note that the blockgroups sort order applies a power-of-two bucketing
7672 * scheme that creates opportunities for contiguous groups of blocks to get
7673 * batched together, at least with workloads that are naturally amenable to
7674 * being driven by heap block locality. This doesn't just enhance the spatial
7675 * locality of bottom-up heap block processing in the obvious way. It also
7676 * enables temporal locality of access, since sorting by heap block number
7677 * naturally tends to make the bottom-up processing order deterministic.
7678 *
7679 * Consider the following example to get a sense of how temporal locality
7680 * might matter: There is a heap relation with several indexes, each of which
7681 * is low to medium cardinality. It is subject to constant non-HOT updates.
7682 * The updates are skewed (in one part of the primary key, perhaps). None of
7683 * the indexes are logically modified by the UPDATE statements (if they were
7684 * then bottom-up index deletion would not be triggered in the first place).
7685 * Naturally, each new round of index tuples (for each heap tuple that gets a
7686 * heap_update() call) will have the same heap TID in each and every index.
7687 * Since these indexes are low cardinality and never get logically modified,
7688 * heapam processing during bottom-up deletion passes will access heap blocks
7689 * in approximately sequential order. Temporal locality of access occurs due
7690 * to bottom-up deletion passes behaving very similarly across each of the
7691 * indexes at any given moment. This keeps the number of buffer misses needed
7692 * to visit heap blocks to a minimum.
7693 */
7694 static int
7695 bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
7696 TM_IndexDelete *deltids)
7697 {
7698 int64 lastblock = -1;
7699 int nblocksfavorable = 0;
7700
7701 Assert(nblockgroups >= 1);
7702 Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
7703
7704 /*
7705 * We tolerate heap blocks that will be accessed only slightly out of
7706 * physical order. Small blips occur when a pair of almost-contiguous
7707 * blocks happen to fall into different buckets (perhaps due only to a
7708 * small difference in npromisingtids that the bucketing scheme didn't
7709 * quite manage to ignore). We effectively ignore these blips by applying
7710 * a small tolerance. The precise tolerance we use is a little arbitrary,
7711 * but it works well enough in practice.
7712 */
7713 for (int b = 0; b < nblockgroups; b++)
7714 {
7715 IndexDeleteCounts *group = blockgroups + b;
7716 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
7717 BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
7718
7719 if (lastblock != -1 &&
7720 ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
7721 (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
7722 break;
7723
7724 nblocksfavorable++;
7725 lastblock = block;
7726 }
7727
7728 /* Always indicate that there is at least 1 favorable block */
7729 Assert(nblocksfavorable >= 1);
7730
7731 return nblocksfavorable;
7732 }
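
/*
 * Worked example (invented block numbers, and assuming the tolerance
 * constant permits gaps of a couple of blocks): if the sorted block
 * groups start at heap blocks 200, 202, 203, 950, ..., then 200, 202 and
 * 203 each fall within tolerance of their predecessor and count as
 * favorable, while 950 ends the run -- nblocksfavorable is 3.
 */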
7733
7734 /*
7735 * qsort comparison function for bottomup_sort_and_shrink()
7736 */
7737 static int
7738 bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
7739 {
7740 const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
7741 const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
7742
7743 /*
7744 * Most significant field is npromisingtids (which we invert the order of
7745 * so as to sort in desc order).
7746 *
7747 * Caller should have already normalized npromisingtids fields into
7748 * power-of-two values (buckets).
7749 */
7750 if (group1->npromisingtids > group2->npromisingtids)
7751 return -1;
7752 if (group1->npromisingtids < group2->npromisingtids)
7753 return 1;
7754
7755 /*
7756 * Tiebreak: desc ntids sort order.
7757 *
7758 * We cannot expect power-of-two values for ntids fields. We should
7759 * behave as if they were already rounded up for us instead.
7760 */
7761 if (group1->ntids != group2->ntids)
7762 {
7763 uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
7764 uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
7765
7766 if (ntids1 > ntids2)
7767 return -1;
7768 if (ntids1 < ntids2)
7769 return 1;
7770 }
7771
7772 /*
7773 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
7774 * block in deltids array) order.
7775 *
7776 * This is equivalent to sorting in ascending heap block number order
7777 * (among otherwise equal subsets of the array). This approach allows us
7778 * to avoid accessing the out-of-line TID. (We rely on the assumption
7779 * that the deltids array was sorted in ascending heap TID order when
7780 * these offsets to the first TID from each heap block group were formed.)
7781 */
7782 if (group1->ifirsttid > group2->ifirsttid)
7783 return 1;
7784 if (group1->ifirsttid < group2->ifirsttid)
7785 return -1;
7786
7787 pg_unreachable();
7788
7789 return 0;
7790 }
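
/*
 * Example of the bucketing effect (invented counts): two groups whose raw
 * npromisingtids were 5 and 7 are both normalized to 8 by the caller, so
 * the first key ties.  If their ntids are 10 and 14, pg_nextpower2_32
 * maps both to 16, so the second key ties as well, and the ascending
 * ifirsttid tiebreak (equivalent to heap block order) decides.
 */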
7791
7792 /*
7793 * heap_index_delete_tuples() helper function for bottom-up deletion callers.
7794 *
7795 * Sorts deltids array in the order needed for useful processing by bottom-up
7796 * deletion. The array should already be sorted in TID order when we're
7797 * called. The sort process groups heap TIDs from deltids into heap block
7798 * groupings. Earlier/more-promising groups/blocks are usually those that are
7799 * known to have the most "promising" TIDs.
7800 *
7801 * Sets new size of deltids array (ndeltids) in state. deltids will only have
7802 * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
7803 * return. This often means that deltids will be shrunk to a small fraction
7804 * of its original size (we eliminate many heap blocks from consideration for
7805 * caller up front).
7806 *
7807 * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
7808 * for a definition and full details.
7809 */
7810 static int
7811 bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
7812 {
7813 IndexDeleteCounts *blockgroups;
7814 TM_IndexDelete *reordereddeltids;
7815 BlockNumber curblock = InvalidBlockNumber;
7816 int nblockgroups = 0;
7817 int ncopied = 0;
7818 int nblocksfavorable = 0;
7819
7820 Assert(delstate->bottomup);
7821 Assert(delstate->ndeltids > 0);
7822
7823 /* Calculate per-heap-block count of TIDs */
7824 blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
7825 for (int i = 0; i < delstate->ndeltids; i++)
7826 {
7827 TM_IndexDelete *ideltid = &delstate->deltids[i];
7828 TM_IndexStatus *istatus = delstate->status + ideltid->id;
7829 ItemPointer htid = &ideltid->tid;
7830 bool promising = istatus->promising;
7831
7832 if (curblock != ItemPointerGetBlockNumber(htid))
7833 {
7834 /* New block group */
7835 nblockgroups++;
7836
7837 Assert(curblock < ItemPointerGetBlockNumber(htid) ||
7838 !BlockNumberIsValid(curblock));
7839
7840 curblock = ItemPointerGetBlockNumber(htid);
7841 blockgroups[nblockgroups - 1].ifirsttid = i;
7842 blockgroups[nblockgroups - 1].ntids = 1;
7843 blockgroups[nblockgroups - 1].npromisingtids = 0;
7844 }
7845 else
7846 {
7847 blockgroups[nblockgroups - 1].ntids++;
7848 }
7849
7850 if (promising)
7851 blockgroups[nblockgroups - 1].npromisingtids++;
7852 }
7853
7854 /*
7855 * We're about ready to sort block groups to determine the optimal order
7856 * for visiting heap blocks. But before we do, round the number of
7857 * promising tuples for each block group up to the next power-of-two,
7858 * unless it is very low (less than 4), in which case we round up to 4.
7859 * npromisingtids is far too noisy to trust when choosing between a pair
7860 * of block groups that both have very low values.
7861 *
7862 * This scheme divides heap blocks/block groups into buckets. Each bucket
7863 * contains blocks that have _approximately_ the same number of promising
7864 * TIDs as each other. The goal is to ignore relatively small differences
7865 * in the total number of promising entries, so that the whole process can
7866 * give a little weight to heapam factors (like heap block locality)
7867 * instead. This isn't a trade-off, really -- we have nothing to lose. It
7868 * would be foolish to interpret small differences in npromisingtids
7869 * values as anything more than noise.
7870 *
7871 * We tiebreak on nhtids when sorting block group subsets that have the
7872 * same npromisingtids, but this has the same issues as npromisingtids,
7873 * and so nhtids is subject to the same power-of-two bucketing scheme. The
7874 * only reason that we don't fix nhtids in the same way here too is that
7875 * we'll need accurate nhtids values after the sort. We handle nhtids
7876 * bucketization dynamically instead (in the sort comparator).
7877 *
7878 * See bottomup_nblocksfavorable() for a full explanation of when and how
7879 * heap locality/favorable blocks can significantly influence when and how
7880 * heap blocks are accessed.
7881 */
7882 for (int b = 0; b < nblockgroups; b++)
7883 {
7884 IndexDeleteCounts *group = blockgroups + b;
7885
7886 /* Better off falling back on nhtids with low npromisingtids */
7887 if (group->npromisingtids <= 4)
7888 group->npromisingtids = 4;
7889 else
7890 group->npromisingtids =
7891 pg_nextpower2_32((uint32) group->npromisingtids);
7892 }
7893
7894 /* Sort groups and rearrange caller's deltids array */
7895 qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
7896 bottomup_sort_and_shrink_cmp);
7897 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
7898
7899 nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
7900 /* Determine number of favorable blocks at the start of final deltids */
7901 nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
7902 delstate->deltids);
7903
7904 for (int b = 0; b < nblockgroups; b++)
7905 {
7906 IndexDeleteCounts *group = blockgroups + b;
7907 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
7908
7909 memcpy(reordereddeltids + ncopied, firstdtid,
7910 sizeof(TM_IndexDelete) * group->ntids);
7911 ncopied += group->ntids;
7912 }
7913
7914 /* Copy final grouped and sorted TIDs back into start of caller's array */
7915 memcpy(delstate->deltids, reordereddeltids,
7916 sizeof(TM_IndexDelete) * ncopied);
7917 delstate->ndeltids = ncopied;
7918
7919 pfree(reordereddeltids);
7920 pfree(blockgroups);
7921
7922 return nblocksfavorable;
7923 }
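
/*
 * End-to-end illustration (invented numbers): given TID-sorted deltids
 * spanning blocks {10: 2 promising / 6 total, 11: 1/5, 97: 9/9}, the
 * grouping pass builds three block groups; bucketing rounds the promising
 * counts to {4, 4, 16}, so block 97's group sorts first, and the groups
 * for blocks 10 and 11 follow in block-number order via the ifirsttid
 * tiebreak.  Only the BOTTOMUP_MAX_NBLOCKS best groups survive into the
 * shrunken array.
 */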
7924
7925 /*
7926 * Perform XLogInsert for a heap-freeze operation. Caller must have already
7927 * modified the buffer and marked it dirty.
7928 */
7929 XLogRecPtr
7930 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7931 xl_heap_freeze_tuple *tuples, int ntuples)
7932 {
7933 xl_heap_freeze_page xlrec;
7934 XLogRecPtr recptr;
7935
7936 /* Caller should not call me on a non-WAL-logged relation */
7937 Assert(RelationNeedsWAL(reln));
7938 /* nor when there are no tuples to freeze */
7939 Assert(ntuples > 0);
7940
7941 xlrec.cutoff_xid = cutoff_xid;
7942 xlrec.ntuples = ntuples;
7943
7944 XLogBeginInsert();
7945 XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7946
7947 /*
7948 * The freeze plan array is not actually in the buffer, but pretend that
7949 * it is. When XLogInsert stores the whole buffer, the freeze plan need
7950 * not be stored too.
7951 */
7952 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7953 XLogRegisterBufData(0, (char *) tuples,
7954 ntuples * sizeof(xl_heap_freeze_tuple));
7955
7956 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7957
7958 return recptr;
7959 }
7960
7961 /*
7962 * Perform XLogInsert for a heap-visible operation. 'heap_buffer' contains
7963 * the block being marked all-visible, and vm_buffer is the buffer containing
7964 * the corresponding visibility map block. Both should have already been
7965 * modified and dirtied.
7966 *
7967 * If checksums are enabled, we also generate a full-page image of
7968 * heap_buffer, if necessary.
7969 */
7970 XLogRecPtr
7971 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7972 TransactionId cutoff_xid, uint8 vmflags)
7973 {
7974 xl_heap_visible xlrec;
7975 XLogRecPtr recptr;
7976 uint8 flags;
7977
7978 Assert(BufferIsValid(heap_buffer));
7979 Assert(BufferIsValid(vm_buffer));
7980
7981 xlrec.cutoff_xid = cutoff_xid;
7982 xlrec.flags = vmflags;
7983 XLogBeginInsert();
7984 XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7985
7986 XLogRegisterBuffer(0, vm_buffer, 0);
7987
7988 flags = REGBUF_STANDARD;
7989 if (!XLogHintBitIsNeeded())
7990 flags |= REGBUF_NO_IMAGE;
7991 XLogRegisterBuffer(1, heap_buffer, flags);
7992
7993 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7994
7995 return recptr;
7996 }
7997
7998 /*
7999 * Perform XLogInsert for a heap-update operation. Caller must already
8000 * have modified the buffer(s) and marked them dirty.
8001 */
8002 static XLogRecPtr
8003 log_heap_update(Relation reln, Buffer oldbuf,
8004 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
8005 HeapTuple old_key_tuple,
8006 bool all_visible_cleared, bool new_all_visible_cleared)
8007 {
8008 xl_heap_update xlrec;
8009 xl_heap_header xlhdr;
8010 xl_heap_header xlhdr_idx;
8011 uint8 info;
8012 uint16 prefix_suffix[2];
8013 uint16 prefixlen = 0,
8014 suffixlen = 0;
8015 XLogRecPtr recptr;
8016 Page page = BufferGetPage(newbuf);
8017 bool need_tuple_data = RelationIsLogicallyLogged(reln);
8018 bool init;
8019 int bufflags;
8020
8021 /* Caller should not call me on a non-WAL-logged relation */
8022 Assert(RelationNeedsWAL(reln));
8023
8024 XLogBeginInsert();
8025
8026 if (HeapTupleIsHeapOnly(newtup))
8027 info = XLOG_HEAP_HOT_UPDATE;
8028 else
8029 info = XLOG_HEAP_UPDATE;
8030
8031 /*
8032 * If the old and new tuple are on the same page, we only need to log the
8033 * parts of the new tuple that were changed. That saves on the amount of
8034 * WAL we need to write. Currently, we just count any unchanged bytes in
8035 * the beginning and end of the tuple. That's quick to check, and
8036 * perfectly covers the common case that only one field is updated.
8037 *
8038 * We could do this even if the old and new tuple are on different pages,
8039 * but only if we don't make a full-page image of the old page, which is
8040 * difficult to know in advance. Also, if the old tuple is corrupt for
8041 * some reason, it would allow the corruption to propagate to the new
8042 * page, so it seems best to avoid doing so. Under the general assumption that most
8043 * updates tend to create the new tuple version on the same page, there
8044 * isn't much to be gained by doing this across pages anyway.
8045 *
8046 * Skip this if we're taking a full-page image of the new page, as we
8047 * don't include the new tuple in the WAL record in that case. Also
8048 * disable if wal_level='logical', as logical decoding needs to be able to
8049 * read the new tuple in whole from the WAL record alone.
8050 */
8051 if (oldbuf == newbuf && !need_tuple_data &&
8052 !XLogCheckBufferNeedsBackup(newbuf))
8053 {
8054 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8055 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8056 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8057 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8058
8059 /* Check for common prefix between old and new tuple */
8060 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8061 {
8062 if (newp[prefixlen] != oldp[prefixlen])
8063 break;
8064 }
8065
8066 /*
8067 * Storing the length of the prefix takes 2 bytes, so we need to save
8068 * at least 3 bytes or there's no point.
8069 */
8070 if (prefixlen < 3)
8071 prefixlen = 0;
8072
8073 /* Same for suffix */
8074 for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
8075 {
8076 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8077 break;
8078 }
8079 if (suffixlen < 3)
8080 suffixlen = 0;
8081 }
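
	/*
	 * Illustration with invented bytes: if the old tuple data is
	 * "AAAABBBBCCCC" and the new data is "AAAAXXXXXXCCCC", the loops above
	 * compute prefixlen = 4 and suffixlen = 4, so only the middle "XXXXXX"
	 * of the new tuple needs to be logged, along with the two lengths.
	 */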
8082
8083 /* Prepare main WAL data chain */
8084 xlrec.flags = 0;
8085 if (all_visible_cleared)
8086 xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
8087 if (new_all_visible_cleared)
8088 xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
8089 if (prefixlen > 0)
8090 xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
8091 if (suffixlen > 0)
8092 xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
8093 if (need_tuple_data)
8094 {
8095 xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
8096 if (old_key_tuple)
8097 {
8098 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
8099 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
8100 else
8101 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
8102 }
8103 }
8104
8105 /* If the new tuple is the first and only tuple on the page... */
8106 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
8107 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
8108 {
8109 info |= XLOG_HEAP_INIT_PAGE;
8110 init = true;
8111 }
8112 else
8113 init = false;
8114
8115 /* Prepare WAL data for the old page */
8116 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
8117 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
8118 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
8119 oldtup->t_data->t_infomask2);
8120
8121 /* Prepare WAL data for the new page */
8122 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
8123 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
8124
8125 bufflags = REGBUF_STANDARD;
8126 if (init)
8127 bufflags |= REGBUF_WILL_INIT;
8128 if (need_tuple_data)
8129 bufflags |= REGBUF_KEEP_DATA;
8130
8131 XLogRegisterBuffer(0, newbuf, bufflags);
8132 if (oldbuf != newbuf)
8133 XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
8134
8135 XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
8136
8137 /*
8138 * Prepare WAL data for the new tuple.
8139 */
8140 if (prefixlen > 0 || suffixlen > 0)
8141 {
8142 if (prefixlen > 0 && suffixlen > 0)
8143 {
8144 prefix_suffix[0] = prefixlen;
8145 prefix_suffix[1] = suffixlen;
8146 XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
8147 }
8148 else if (prefixlen > 0)
8149 {
8150 XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
8151 }
8152 else
8153 {
8154 XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
8155 }
8156 }
8157
8158 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
8159 xlhdr.t_infomask = newtup->t_data->t_infomask;
8160 xlhdr.t_hoff = newtup->t_data->t_hoff;
8161 Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
8162
8163 /*
8164 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
8165 *
8166 * The 'data' doesn't include the common prefix or suffix.
8167 */
8168 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
8169 if (prefixlen == 0)
8170 {
8171 XLogRegisterBufData(0,
8172 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8173 newtup->t_len - SizeofHeapTupleHeader - suffixlen);
8174 }
8175 else
8176 {
8177 /*
8178 * Have to write the null bitmap and data after the common prefix as
8179 * two separate rdata entries.
8180 */
8181 /* bitmap [+ padding] [+ oid] */
8182 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
8183 {
8184 XLogRegisterBufData(0,
8185 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8186 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
8187 }
8188
8189 /* data after common prefix */
8190 XLogRegisterBufData(0,
8191 ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8192 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8193 }
8194
8195 /* We need to log a tuple identity */
8196 if (need_tuple_data && old_key_tuple)
8197 {
8198 /* don't really need this, but it's more comfy to decode */
8199 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8200 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8201 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8202
8203 XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8204
8205 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8206 XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8207 old_key_tuple->t_len - SizeofHeapTupleHeader);
8208 }
8209
8210 /* filtering by origin on a row level is much more efficient */
8211 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
8212
8213 recptr = XLogInsert(RM_HEAP_ID, info);
8214
8215 return recptr;
8216 }
8217
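/*
 * A minimal sketch (not part of the original file) of how a caller such as
 * heap_update() can derive the prefixlen/suffixlen consumed above: bytewise
 * comparison of the old and new tuple data areas, headers excluded.  The
 * helper name is hypothetical; it is shown only to illustrate the WAL-size
 * optimization.
 */
#ifdef NOT_USED
static void
example_compute_prefix_suffix(HeapTuple oldtup, HeapTuple newtup,
							  uint16 *prefixlen, uint16 *suffixlen)
{
	char	   *olddata = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
	char	   *newdata = (char *) newtup->t_data + newtup->t_data->t_hoff;
	int			oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
	int			newlen = newtup->t_len - newtup->t_data->t_hoff;
	int			minlen = Min(oldlen, newlen);
	int			p = 0;
	int			s = 0;

	/* longest common prefix of the data areas */
	while (p < minlen && olddata[p] == newdata[p])
		p++;

	/* longest common suffix, not overlapping the prefix */
	while (s < minlen - p &&
		   olddata[oldlen - s - 1] == newdata[newlen - s - 1])
		s++;

	*prefixlen = (uint16) p;
	*suffixlen = (uint16) s;
}
#endif
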
8218 /*
8219 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8220 *
8221 * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8222 * tuples.
8223 */
8224 static XLogRecPtr
8225 log_heap_new_cid(Relation relation, HeapTuple tup)
8226 {
8227 xl_heap_new_cid xlrec;
8228
8229 XLogRecPtr recptr;
8230 HeapTupleHeader hdr = tup->t_data;
8231
8232 Assert(ItemPointerIsValid(&tup->t_self));
8233 Assert(tup->t_tableOid != InvalidOid);
8234
8235 xlrec.top_xid = GetTopTransactionId();
8236 xlrec.target_node = relation->rd_node;
8237 xlrec.target_tid = tup->t_self;
8238
8239 /*
8240 * If the tuple got inserted & deleted in the same TX, we definitely have a
8241 * combo CID; set both cmin and cmax.
8242 */
8243 if (hdr->t_infomask & HEAP_COMBOCID)
8244 {
8245 Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8246 Assert(!HeapTupleHeaderXminInvalid(hdr));
8247 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8248 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8249 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
8250 }
8251 /* No combo CID, so only cmin or cmax can be set by this TX */
8252 else
8253 {
8254 /*
8255 * Tuple inserted.
8256 *
8257 * We need to check for LOCK ONLY because multixacts might be
8258 * transferred to the new tuple in case of FOR KEY SHARE updates in
8259 * which case there will be an xmax, although the tuple just got
8260 * inserted.
8261 */
8262 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8263 HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
8264 {
8265 xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8266 xlrec.cmax = InvalidCommandId;
8267 }
8268 /* Tuple from a different tx updated or deleted. */
8269 else
8270 {
8271 xlrec.cmin = InvalidCommandId;
8272 xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8273
8274 }
8275 xlrec.combocid = InvalidCommandId;
8276 }
8277
8278 /*
8279 * Note that we don't need to register the buffer here, because this
8280 * operation does not modify the page. The insert/update/delete that
8281 * called us certainly did, but that's WAL-logged separately.
8282 */
8283 XLogBeginInsert();
8284 XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8285
8286 /* will be looked at irrespective of origin */
8287
8288 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8289
8290 return recptr;
8291 }
8292
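/*
 * Illustrative summary (not in the original) of the cmin/cmax choices made
 * in log_heap_new_cid() above:
 *
 *     tuple state within this TX       cmin       cmax       combocid
 *     -------------------------------  ---------  ---------  ---------
 *     inserted and deleted (combo)     decoded    decoded    raw cid
 *     inserted (possibly locked)       raw cid    invalid    invalid
 *     updated/deleted by this TX only  invalid    raw cid    invalid
 */
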
8293 /*
8294 * Build a heap tuple representing the configured REPLICA IDENTITY to log for
8295 * the old tuple in an UPDATE or DELETE.
8296 *
8297 * Returns NULL if there's no need to log an identity or if there's no suitable
8298 * key defined.
8299 *
8300 * key_changed should be false if the caller knows that no replica identity
8301 * columns changed value. It's always true in the DELETE case.
8302 *
8303 * *copy is set to true if the returned tuple is a modified copy rather than
8304 * the same tuple that was passed in.
8305 */
8306 static HeapTuple
8307 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
8308 bool *copy)
8309 {
8310 TupleDesc desc = RelationGetDescr(relation);
8311 char replident = relation->rd_rel->relreplident;
8312 Bitmapset *idattrs;
8313 HeapTuple key_tuple;
8314 bool nulls[MaxHeapAttributeNumber];
8315 Datum values[MaxHeapAttributeNumber];
8316
8317 *copy = false;
8318
8319 if (!RelationIsLogicallyLogged(relation))
8320 return NULL;
8321
8322 if (replident == REPLICA_IDENTITY_NOTHING)
8323 return NULL;
8324
8325 if (replident == REPLICA_IDENTITY_FULL)
8326 {
8327 /*
8328 * When logging the entire old tuple, it very well could contain
8329 * toasted columns. If so, force them to be inlined.
8330 */
8331 if (HeapTupleHasExternal(tp))
8332 {
8333 *copy = true;
8334 tp = toast_flatten_tuple(tp, desc);
8335 }
8336 return tp;
8337 }
8338
8339 /* if the key hasn't changed and we're only logging the key, we're done */
8340 if (!key_changed)
8341 return NULL;
8342
8343 /* find out the replica identity columns */
8344 idattrs = RelationGetIndexAttrBitmap(relation,
8345 INDEX_ATTR_BITMAP_IDENTITY_KEY);
8346
8347 /*
8348 * If there are no defined replica identity columns, treat as !key_changed.
8349 * (This case should not be reachable from heap_update, since that should
8350 * calculate key_changed accurately. But heap_delete just passes constant
8351 * true for key_changed, so we can hit this case in deletes.)
8352 */
8353 if (bms_is_empty(idattrs))
8354 return NULL;
8355
8356 /*
8357 * Construct a new tuple containing only the replica identity columns,
8358 * with nulls elsewhere. While we're at it, assert that the replica
8359 * identity columns aren't null.
8360 */
8361 heap_deform_tuple(tp, desc, values, nulls);
8362
8363 for (int i = 0; i < desc->natts; i++)
8364 {
8365 if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
8366 idattrs))
8367 Assert(!nulls[i]);
8368 else
8369 nulls[i] = true;
8370 }
8371
8372 key_tuple = heap_form_tuple(desc, values, nulls);
8373 *copy = true;
8374
8375 bms_free(idattrs);
8376
8377 /*
8378 * If the tuple, which by here only contains indexed columns, still has
8379 * toasted columns, force them to be inlined. This is somewhat unlikely
8380 * since there are limits on the size of indexed columns, so we don't
8381 * duplicate toast_flatten_tuple()'s functionality in the above loop over
8382 * the indexed columns, even if it would be more efficient.
8383 */
8384 if (HeapTupleHasExternal(key_tuple))
8385 {
8386 HeapTuple oldtup = key_tuple;
8387
8388 key_tuple = toast_flatten_tuple(oldtup, desc);
8389 heap_freetuple(oldtup);
8390 }
8391
8392 return key_tuple;
8393 }
8394
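/*
 * A self-contained sketch (assumed example, not in the original) of the
 * deform/null-out/re-form pattern used by ExtractReplicaIdentity() above:
 * build a copy of a tuple in which every column outside 'keep' is nulled.
 * The helper name is hypothetical.
 */
#ifdef NOT_USED
static HeapTuple
example_project_tuple(TupleDesc desc, HeapTuple tup, Bitmapset *keep)
{
	Datum		values[MaxHeapAttributeNumber];
	bool		nulls[MaxHeapAttributeNumber];

	heap_deform_tuple(tup, desc, values, nulls);

	for (int i = 0; i < desc->natts; i++)
	{
		/* bitmapset members are offset by FirstLowInvalidHeapAttributeNumber */
		if (!bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, keep))
			nulls[i] = true;
	}

	return heap_form_tuple(desc, values, nulls);
}
#endif
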
8395 /*
8396 * Handles XLOG_HEAP2_PRUNE record type.
8397 *
8398 * Acquires a super-exclusive lock.
8399 */
8400 static void
8401 heap_xlog_prune(XLogReaderState *record)
8402 {
8403 XLogRecPtr lsn = record->EndRecPtr;
8404 xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record);
8405 Buffer buffer;
8406 RelFileNode rnode;
8407 BlockNumber blkno;
8408 XLogRedoAction action;
8409
8410 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8411
8412 /*
8413 * We're about to remove tuples. In Hot Standby mode, ensure that there are
8414 * no queries running for which the removed tuples are still visible.
8415 */
8416 if (InHotStandby)
8417 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
8418
8419 /*
8420 * If we have a full-page image, restore it (using a cleanup lock) and
8421 * we're done.
8422 */
8423 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
8424 &buffer);
8425 if (action == BLK_NEEDS_REDO)
8426 {
8427 Page page = (Page) BufferGetPage(buffer);
8428 OffsetNumber *end;
8429 OffsetNumber *redirected;
8430 OffsetNumber *nowdead;
8431 OffsetNumber *nowunused;
8432 int nredirected;
8433 int ndead;
8434 int nunused;
8435 Size datalen;
8436
8437 redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8438
8439 nredirected = xlrec->nredirected;
8440 ndead = xlrec->ndead;
8441 end = (OffsetNumber *) ((char *) redirected + datalen);
8442 nowdead = redirected + (nredirected * 2);
8443 nowunused = nowdead + ndead;
8444 nunused = (end - nowunused);
8445 Assert(nunused >= 0);
8446
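/*
 * Illustrative layout of the block 0 data decoded above (not in the
 * original):
 *
 *     [2 * nredirected OffsetNumbers]   redirect pairs (from, to)
 *     [ndead OffsetNumbers]             line pointers to mark LP_DEAD
 *     [remaining OffsetNumbers]         line pointers to mark LP_UNUSED
 */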
8447 /* Update all line pointers per the record, and repair fragmentation */
8448 heap_page_prune_execute(buffer,
8449 redirected, nredirected,
8450 nowdead, ndead,
8451 nowunused, nunused);
8452
8453 /*
8454 * Note: we don't worry about updating the page's prunability hints.
8455 * At worst this will cause an extra prune cycle to occur soon.
8456 */
8457
8458 PageSetLSN(page, lsn);
8459 MarkBufferDirty(buffer);
8460 }
8461
8462 if (BufferIsValid(buffer))
8463 {
8464 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8465
8466 UnlockReleaseBuffer(buffer);
8467
8468 /*
8469 * After pruning records from a page, it's useful to update the FSM
8470 * about it, as that may make the page a target for insertions later,
8471 * even if vacuum decides not to visit it (which is possible if it
8472 * gets marked all-visible).
8473 *
8474 * Do this regardless of a full-page image being applied, since the
8475 * FSM data is not in the page anyway.
8476 */
8477 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8478 }
8479 }
8480
8481 /*
8482 * Handles XLOG_HEAP2_VACUUM record type.
8483 *
8484 * Acquires an exclusive lock only.
8485 */
8486 static void
8487 heap_xlog_vacuum(XLogReaderState *record)
8488 {
8489 XLogRecPtr lsn = record->EndRecPtr;
8490 xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record);
8491 Buffer buffer;
8492 BlockNumber blkno;
8493 XLogRedoAction action;
8494
8495 /*
8496 * If we have a full-page image, restore it (without using a cleanup lock)
8497 * and we're done.
8498 */
8499 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false,
8500 &buffer);
8501 if (action == BLK_NEEDS_REDO)
8502 {
8503 Page page = (Page) BufferGetPage(buffer);
8504 OffsetNumber *nowunused;
8505 Size datalen;
8506 OffsetNumber *offnum;
8507
8508 nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8509
8510 /* Shouldn't be a record unless there's something to do */
8511 Assert(xlrec->nunused > 0);
8512
8513 /* Update all now-unused line pointers */
8514 offnum = nowunused;
8515 for (int i = 0; i < xlrec->nunused; i++)
8516 {
8517 OffsetNumber off = *offnum++;
8518 ItemId lp = PageGetItemId(page, off);
8519
8520 Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
8521 ItemIdSetUnused(lp);
8522 }
8523
8524 /* Attempt to truncate line pointer array now */
8525 PageTruncateLinePointerArray(page);
8526
8527 PageSetLSN(page, lsn);
8528 MarkBufferDirty(buffer);
8529 }
8530
8531 if (BufferIsValid(buffer))
8532 {
8533 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8534 RelFileNode rnode;
8535
8536 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8537
8538 UnlockReleaseBuffer(buffer);
8539
8540 /*
8541 * After vacuuming LP_DEAD items from a page, it's useful to update
8542 * the FSM about it, as that may make the page a target for insertions
8543 * later, even if vacuum decides not to visit it (which is possible if
8544 * it gets marked all-visible).
8545 *
8546 * Do this regardless of a full-page image being applied, since the
8547 * FSM data is not in the page anyway.
8548 */
8549 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8550 }
8551 }
8552
8553 /*
8554 * Replay XLOG_HEAP2_VISIBLE record.
8555 *
8556 * The critical integrity requirement here is that we must never end up with
8557 * a situation where the visibility map bit is set, and the page-level
8558 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
8559 * page modification would fail to clear the visibility map bit.
8560 */
8561 static void
8562 heap_xlog_visible(XLogReaderState *record)
8563 {
8564 XLogRecPtr lsn = record->EndRecPtr;
8565 xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8566 Buffer vmbuffer = InvalidBuffer;
8567 Buffer buffer;
8568 Page page;
8569 RelFileNode rnode;
8570 BlockNumber blkno;
8571 XLogRedoAction action;
8572
8573 XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
8574
8575 /*
8576 * If there are any Hot Standby transactions running that have an xmin
8577 * horizon old enough that this page isn't all-visible for them, they
8578 * might incorrectly decide that an index-only scan can skip a heap fetch.
8579 *
8580 * NB: It might be better to throw some kind of "soft" conflict here that
8581 * forces any index-only scan that is in flight to perform heap fetches,
8582 * rather than killing the transaction outright.
8583 */
8584 if (InHotStandby)
8585 ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
8586
8587 /*
8588 * Read the heap page, if it still exists. If the heap file has been dropped
8589 * or truncated later in recovery, we don't need to update the page, but we'd
8590 * better still update the visibility map.
8591 */
8592 action = XLogReadBufferForRedo(record, 1, &buffer);
8593 if (action == BLK_NEEDS_REDO)
8594 {
8595 /*
8596 * We don't bump the LSN of the heap page when setting the visibility
8597 * map bit (unless checksums or wal_log_hints are enabled, in which
8598 * case we must), because that would generate an unworkable volume of
8599 * full-page writes. This exposes us to torn page hazards, but since
8600 * we're not inspecting the existing page contents in any way, we
8601 * don't care.
8602 *
8603 * However, all operations that clear the visibility map bit *do* bump
8604 * the LSN, and those operations will only be replayed if the XLOG LSN
8605 * follows the page LSN. Thus, if the page LSN has advanced past our
8606 * XLOG record's LSN, we mustn't mark the page all-visible, because
8607 * the subsequent update won't be replayed to clear the flag.
8608 */
8609 page = BufferGetPage(buffer);
8610
8611 PageSetAllVisible(page);
8612
8613 MarkBufferDirty(buffer);
8614 }
8615 else if (action == BLK_RESTORED)
8616 {
8617 /*
8618 * If heap block was backed up, we already restored it and there's
8619 * nothing more to do. (This can only happen with checksums or
8620 * wal_log_hints enabled.)
8621 */
8622 }
8623
8624 if (BufferIsValid(buffer))
8625 {
8626 Size space = PageGetFreeSpace(BufferGetPage(buffer));
8627
8628 UnlockReleaseBuffer(buffer);
8629
8630 /*
8631 * Since FSM is not WAL-logged and only updated heuristically, it
8632 * easily becomes stale in standbys. If the standby is later promoted
8633 * and runs VACUUM, it will skip updating individual free space
8634 * figures for pages that became all-visible (or all-frozen, depending
8635 * on the vacuum mode), which is troublesome because FreeSpaceMapVacuum
8636 * then propagates overly optimistic free space values to upper FSM layers;
8637 * later inserters try to use such pages only to find out that they
8638 * are unusable. This can cause long stalls when there are many such
8639 * pages.
8640 *
8641 * Forestall those problems by updating FSM's idea about a page that
8642 * is becoming all-visible or all-frozen.
8643 *
8644 * Do this regardless of a full-page image being applied, since the
8645 * FSM data is not in the page anyway.
8646 */
8647 if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
8648 XLogRecordPageWithFreeSpace(rnode, blkno, space);
8649 }
8650
8651 /*
8652 * Even if we skipped the heap page update due to the LSN interlock, it's
8653 * still safe to update the visibility map. Any WAL record that clears
8654 * the visibility map bit does so before checking the page LSN, so any
8655 * bits that need to be cleared will still be cleared.
8656 */
8657 if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
8658 &vmbuffer) == BLK_NEEDS_REDO)
8659 {
8660 Page vmpage = BufferGetPage(vmbuffer);
8661 Relation reln;
8662
8663 /* initialize the page if it was read as zeros */
8664 if (PageIsNew(vmpage))
8665 PageInit(vmpage, BLCKSZ, 0);
8666
8667 /*
8668 * XLogReadBufferForRedoExtended locked the buffer. But
8669 * visibilitymap_set will handle locking itself.
8670 */
8671 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
8672
8673 reln = CreateFakeRelcacheEntry(rnode);
8674 visibilitymap_pin(reln, blkno, &vmbuffer);
8675
8676 /*
8677 * Don't set the bit if replay has already passed this point.
8678 *
8679 * It might be safe to do this unconditionally; if replay has passed
8680 * this point, we'll replay at least as far this time as we did
8681 * before, and if this bit needs to be cleared, the record responsible
8682 * for doing so would be replayed again and would clear it. For now,
8683 * out of an abundance of caution, we use the same test here that
8684 * we did for the heap page. If this results in a dropped bit, no
8685 * real harm is done; and the next VACUUM will fix it.
8686 */
8687 if (lsn > PageGetLSN(vmpage))
8688 visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
8689 xlrec->cutoff_xid, xlrec->flags);
8690
8691 ReleaseBuffer(vmbuffer);
8692 FreeFakeRelcacheEntry(reln);
8693 }
8694 else if (BufferIsValid(vmbuffer))
8695 UnlockReleaseBuffer(vmbuffer);
8696 }
8697
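/*
 * Hedged sketch (not in the original) of the integrity rule described in
 * the comment above heap_xlog_visible(): the visibility map bit may only be
 * set while the heap page's PD_ALL_VISIBLE flag is also set.  A hypothetical
 * checker could look like this; the real enforcement is the ordering of
 * operations during replay.
 */
#ifdef NOT_USED
static void
example_check_vm_invariant(Relation rel, BlockNumber blkno, Buffer heapbuf,
						   Buffer *vmbuf)
{
	if (visibilitymap_get_status(rel, blkno, vmbuf) & VISIBILITYMAP_ALL_VISIBLE)
		Assert(PageIsAllVisible(BufferGetPage(heapbuf)));
}
#endif
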
8698 /*
8699 * Replay XLOG_HEAP2_FREEZE_PAGE records
8700 */
8701 static void
8702 heap_xlog_freeze_page(XLogReaderState *record)
8703 {
8704 XLogRecPtr lsn = record->EndRecPtr;
8705 xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
8706 TransactionId cutoff_xid = xlrec->cutoff_xid;
8707 Buffer buffer;
8708 int ntup;
8709
8710 /*
8711 * In Hot Standby mode, ensure that there are no queries running that still
8712 * consider the frozen xids as running.
8713 */
8714 if (InHotStandby)
8715 {
8716 RelFileNode rnode;
8717 TransactionId latestRemovedXid = cutoff_xid;
8718
8719 TransactionIdRetreat(latestRemovedXid);
8720
8721 XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
8722 ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
8723 }
8724
8725 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8726 {
8727 Page page = BufferGetPage(buffer);
8728 xl_heap_freeze_tuple *tuples;
8729
8730 tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
8731
8732 /* now execute freeze plan for each frozen tuple */
8733 for (ntup = 0; ntup < xlrec->ntuples; ntup++)
8734 {
8735 xl_heap_freeze_tuple *xlrec_tp;
8736 ItemId lp;
8737 HeapTupleHeader tuple;
8738
8739 xlrec_tp = &tuples[ntup];
8740 lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8741 tuple = (HeapTupleHeader) PageGetItem(page, lp);
8742
8743 heap_execute_freeze_tuple(tuple, xlrec_tp);
8744 }
8745
8746 PageSetLSN(page, lsn);
8747 MarkBufferDirty(buffer);
8748 }
8749 if (BufferIsValid(buffer))
8750 UnlockReleaseBuffer(buffer);
8751 }
8752
8753 /*
8754 * Given an "infobits" field from an XLog record, set the correct bits in the
8755 * given infomask and infomask2 for the tuple touched by the record.
8756 *
8757 * (This is the reverse of compute_infobits).
8758 */
8759 static void
8760 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8761 {
8762 *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8763 HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8764 *infomask2 &= ~HEAP_KEYS_UPDATED;
8765
8766 if (infobits & XLHL_XMAX_IS_MULTI)
8767 *infomask |= HEAP_XMAX_IS_MULTI;
8768 if (infobits & XLHL_XMAX_LOCK_ONLY)
8769 *infomask |= HEAP_XMAX_LOCK_ONLY;
8770 if (infobits & XLHL_XMAX_EXCL_LOCK)
8771 *infomask |= HEAP_XMAX_EXCL_LOCK;
8772 /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8773 if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8774 *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8775
8776 if (infobits & XLHL_KEYS_UPDATED)
8777 *infomask2 |= HEAP_KEYS_UPDATED;
8778 }
8779
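/*
 * A minimal round-trip sketch (not part of the original): encoding a tuple's
 * infomask bits with compute_infobits() and decoding them again with
 * fix_infomask_from_infobits() should preserve the lock-related flags
 * handled above.  The helper name is hypothetical.
 */
#ifdef NOT_USED
static void
example_infobits_roundtrip(HeapTupleHeader htup)
{
	uint8		infobits = compute_infobits(htup->t_infomask,
											htup->t_infomask2);
	uint16		infomask = 0;
	uint16		infomask2 = 0;

	fix_infomask_from_infobits(infobits, &infomask, &infomask2);

	Assert((infomask & HEAP_XMAX_IS_MULTI) ==
		   (htup->t_infomask & HEAP_XMAX_IS_MULTI));
	Assert((infomask2 & HEAP_KEYS_UPDATED) ==
		   (htup->t_infomask2 & HEAP_KEYS_UPDATED));
}
#endif
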
8780 static void
8781 heap_xlog_delete(XLogReaderState *record)
8782 {
8783 XLogRecPtr lsn = record->EndRecPtr;
8784 xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8785 Buffer buffer;
8786 Page page;
8787 ItemId lp = NULL;
8788 HeapTupleHeader htup;
8789 BlockNumber blkno;
8790 RelFileNode target_node;
8791 ItemPointerData target_tid;
8792
8793 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8794 ItemPointerSetBlockNumber(&target_tid, blkno);
8795 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8796
8797 /*
8798 * The visibility map may need to be fixed even if the heap page is
8799 * already up-to-date.
8800 */
8801 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8802 {
8803 Relation reln = CreateFakeRelcacheEntry(target_node);
8804 Buffer vmbuffer = InvalidBuffer;
8805
8806 visibilitymap_pin(reln, blkno, &vmbuffer);
8807 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8808 ReleaseBuffer(vmbuffer);
8809 FreeFakeRelcacheEntry(reln);
8810 }
8811
8812 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8813 {
8814 page = BufferGetPage(buffer);
8815
8816 if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8817 lp = PageGetItemId(page, xlrec->offnum);
8818
8819 if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8820 elog(PANIC, "invalid lp");
8821
8822 htup = (HeapTupleHeader) PageGetItem(page, lp);
8823
8824 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8825 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8826 HeapTupleHeaderClearHotUpdated(htup);
8827 fix_infomask_from_infobits(xlrec->infobits_set,
8828 &htup->t_infomask, &htup->t_infomask2);
8829 if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8830 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8831 else
8832 HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8833 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8834
8835 /* Mark the page as a candidate for pruning */
8836 PageSetPrunable(page, XLogRecGetXid(record));
8837
8838 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8839 PageClearAllVisible(page);
8840
8841 /* Make sure t_ctid is set correctly */
8842 if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8843 HeapTupleHeaderSetMovedPartitions(htup);
8844 else
8845 htup->t_ctid = target_tid;
8846 PageSetLSN(page, lsn);
8847 MarkBufferDirty(buffer);
8848 }
8849 if (BufferIsValid(buffer))
8850 UnlockReleaseBuffer(buffer);
8851 }
8852
8853 static void
8854 heap_xlog_insert(XLogReaderState *record)
8855 {
8856 XLogRecPtr lsn = record->EndRecPtr;
8857 xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8858 Buffer buffer;
8859 Page page;
8860 union
8861 {
8862 HeapTupleHeaderData hdr;
8863 char data[MaxHeapTupleSize];
8864 } tbuf;
8865 HeapTupleHeader htup;
8866 xl_heap_header xlhdr;
8867 uint32 newlen;
8868 Size freespace = 0;
8869 RelFileNode target_node;
8870 BlockNumber blkno;
8871 ItemPointerData target_tid;
8872 XLogRedoAction action;
8873
8874 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8875 ItemPointerSetBlockNumber(&target_tid, blkno);
8876 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8877
8878 /*
8879 * The visibility map may need to be fixed even if the heap page is
8880 * already up-to-date.
8881 */
8882 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8883 {
8884 Relation reln = CreateFakeRelcacheEntry(target_node);
8885 Buffer vmbuffer = InvalidBuffer;
8886
8887 visibilitymap_pin(reln, blkno, &vmbuffer);
8888 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8889 ReleaseBuffer(vmbuffer);
8890 FreeFakeRelcacheEntry(reln);
8891 }
8892
8893 /*
8894 * If we inserted the first and only tuple on the page, re-initialize the
8895 * page from scratch.
8896 */
8897 if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8898 {
8899 buffer = XLogInitBufferForRedo(record, 0);
8900 page = BufferGetPage(buffer);
8901 PageInit(page, BufferGetPageSize(buffer), 0);
8902 action = BLK_NEEDS_REDO;
8903 }
8904 else
8905 action = XLogReadBufferForRedo(record, 0, &buffer);
8906 if (action == BLK_NEEDS_REDO)
8907 {
8908 Size datalen;
8909 char *data;
8910
8911 page = BufferGetPage(buffer);
8912
8913 if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8914 elog(PANIC, "invalid max offset number");
8915
8916 data = XLogRecGetBlockData(record, 0, &datalen);
8917
8918 newlen = datalen - SizeOfHeapHeader;
8919 Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8920 memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8921 data += SizeOfHeapHeader;
8922
8923 htup = &tbuf.hdr;
8924 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8925 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8926 memcpy((char *) htup + SizeofHeapTupleHeader,
8927 data,
8928 newlen);
8929 newlen += SizeofHeapTupleHeader;
8930 htup->t_infomask2 = xlhdr.t_infomask2;
8931 htup->t_infomask = xlhdr.t_infomask;
8932 htup->t_hoff = xlhdr.t_hoff;
8933 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8934 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8935 htup->t_ctid = target_tid;
8936
8937 if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8938 true, true) == InvalidOffsetNumber)
8939 elog(PANIC, "failed to add tuple");
8940
8941 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8942
8943 PageSetLSN(page, lsn);
8944
8945 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8946 PageClearAllVisible(page);
8947
8948 /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
8949 if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
8950 PageSetAllVisible(page);
8951
8952 MarkBufferDirty(buffer);
8953 }
8954 if (BufferIsValid(buffer))
8955 UnlockReleaseBuffer(buffer);
8956
8957 /*
8958 * If the page is running low on free space, update the FSM as well.
8959 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8960 * better than that without knowing the fill-factor for the table.
8961 *
8962 * XXX: Don't do this if the page was restored from full page image. We
8963 * don't bother to update the FSM in that case, it doesn't need to be
8964 * totally accurate anyway.
8965 */
8966 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8967 XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8968 }
8969
8970 /*
8971 * Handles MULTI_INSERT record type.
8972 */
8973 static void
8974 heap_xlog_multi_insert(XLogReaderState *record)
8975 {
8976 XLogRecPtr lsn = record->EndRecPtr;
8977 xl_heap_multi_insert *xlrec;
8978 RelFileNode rnode;
8979 BlockNumber blkno;
8980 Buffer buffer;
8981 Page page;
8982 union
8983 {
8984 HeapTupleHeaderData hdr;
8985 char data[MaxHeapTupleSize];
8986 } tbuf;
8987 HeapTupleHeader htup;
8988 uint32 newlen;
8989 Size freespace = 0;
8990 int i;
8991 bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8992 XLogRedoAction action;
8993
8994 /*
8995 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8996 * required.
8997 */
8998 xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8999
9000 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
9001
9002 /* check that the mutually exclusive flags are not both set */
9003 Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) &&
9004 (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
9005
9006 /*
9007 * The visibility map may need to be fixed even if the heap page is
9008 * already up-to-date.
9009 */
9010 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
9011 {
9012 Relation reln = CreateFakeRelcacheEntry(rnode);
9013 Buffer vmbuffer = InvalidBuffer;
9014
9015 visibilitymap_pin(reln, blkno, &vmbuffer);
9016 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
9017 ReleaseBuffer(vmbuffer);
9018 FreeFakeRelcacheEntry(reln);
9019 }
9020
9021 if (isinit)
9022 {
9023 buffer = XLogInitBufferForRedo(record, 0);
9024 page = BufferGetPage(buffer);
9025 PageInit(page, BufferGetPageSize(buffer), 0);
9026 action = BLK_NEEDS_REDO;
9027 }
9028 else
9029 action = XLogReadBufferForRedo(record, 0, &buffer);
9030 if (action == BLK_NEEDS_REDO)
9031 {
9032 char *tupdata;
9033 char *endptr;
9034 Size len;
9035
9036 /* Tuples are stored as block data */
9037 tupdata = XLogRecGetBlockData(record, 0, &len);
9038 endptr = tupdata + len;
9039
9040 page = (Page) BufferGetPage(buffer);
9041
9042 for (i = 0; i < xlrec->ntuples; i++)
9043 {
9044 OffsetNumber offnum;
9045 xl_multi_insert_tuple *xlhdr;
9046
9047 /*
9048 * If we're reinitializing the page, the tuples are stored in
9049 * order from FirstOffsetNumber. Otherwise there's an array of
9050 * offsets in the WAL record, and the tuples come after that.
9051 */
9052 if (isinit)
9053 offnum = FirstOffsetNumber + i;
9054 else
9055 offnum = xlrec->offsets[i];
9056 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9057 elog(PANIC, "invalid max offset number");
9058
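/*
 * Illustrative note (not in the original): the block data is a sequence
 * of SHORTALIGN'd entries, each laid out as
 *
 *     [xl_multi_insert_tuple] [datalen bytes of tuple data]
 *
 * so tupdata advances past SizeOfMultiInsertTuple + datalen per tuple
 * (plus any alignment padding) and must land exactly on endptr after
 * the last one.
 */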
9059 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
9060 tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
9061
9062 newlen = xlhdr->datalen;
9063 Assert(newlen <= MaxHeapTupleSize);
9064 htup = &tbuf.hdr;
9065 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9066 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
9067 memcpy((char *) htup + SizeofHeapTupleHeader,
9068 (char *) tupdata,
9069 newlen);
9070 tupdata += newlen;
9071
9072 newlen += SizeofHeapTupleHeader;
9073 htup->t_infomask2 = xlhdr->t_infomask2;
9074 htup->t_infomask = xlhdr->t_infomask;
9075 htup->t_hoff = xlhdr->t_hoff;
9076 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9077 HeapTupleHeaderSetCmin(htup, FirstCommandId);
9078 ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
9079 ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
9080
9081 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9082 if (offnum == InvalidOffsetNumber)
9083 elog(PANIC, "failed to add tuple");
9084 }
9085 if (tupdata != endptr)
9086 elog(PANIC, "total tuple length mismatch");
9087
9088 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9089
9090 PageSetLSN(page, lsn);
9091
9092 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
9093 PageClearAllVisible(page);
9094
9095 /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
9096 if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
9097 PageSetAllVisible(page);
9098
9099 MarkBufferDirty(buffer);
9100 }
9101 if (BufferIsValid(buffer))
9102 UnlockReleaseBuffer(buffer);
9103
9104 /*
9105 * If the page is running low on free space, update the FSM as well.
9106 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9107 * better than that without knowing the fill-factor for the table.
9108 *
9109 * XXX: Don't do this if the page was restored from full page image. We
9110 * don't bother to update the FSM in that case, it doesn't need to be
9111 * totally accurate anyway.
9112 */
9113 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
9114 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
9115 }
9116
9117 /*
9118 * Handles UPDATE and HOT_UPDATE
9119 */
9120 static void
9121 heap_xlog_update(XLogReaderState *record, bool hot_update)
9122 {
9123 XLogRecPtr lsn = record->EndRecPtr;
9124 xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
9125 RelFileNode rnode;
9126 BlockNumber oldblk;
9127 BlockNumber newblk;
9128 ItemPointerData newtid;
9129 Buffer obuffer,
9130 nbuffer;
9131 Page page;
9132 OffsetNumber offnum;
9133 ItemId lp = NULL;
9134 HeapTupleData oldtup;
9135 HeapTupleHeader htup;
9136 uint16 prefixlen = 0,
9137 suffixlen = 0;
9138 char *newp;
9139 union
9140 {
9141 HeapTupleHeaderData hdr;
9142 char data[MaxHeapTupleSize];
9143 } tbuf;
9144 xl_heap_header xlhdr;
9145 uint32 newlen;
9146 Size freespace = 0;
9147 XLogRedoAction oldaction;
9148 XLogRedoAction newaction;
9149
9150 /* initialize to keep the compiler quiet */
9151 oldtup.t_data = NULL;
9152 oldtup.t_len = 0;
9153
9154 XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
9155 if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
9156 {
9157 /* HOT updates are never done across pages */
9158 Assert(!hot_update);
9159 }
9160 else
9161 oldblk = newblk;
9162
9163 ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
9164
9165 /*
9166 * The visibility map may need to be fixed even if the heap page is
9167 * already up-to-date.
9168 */
9169 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
9170 {
9171 Relation reln = CreateFakeRelcacheEntry(rnode);
9172 Buffer vmbuffer = InvalidBuffer;
9173
9174 visibilitymap_pin(reln, oldblk, &vmbuffer);
9175 visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9176 ReleaseBuffer(vmbuffer);
9177 FreeFakeRelcacheEntry(reln);
9178 }
9179
9180 /*
9181 * In normal operation, it is important to lock the two pages in
9182 * page-number order, to avoid possible deadlocks against other update
9183 * operations going the other way. However, during WAL replay there can
9184 * be no other update happening, so we don't need to worry about that. But
9185 * we *do* need to worry that we don't expose an inconsistent state to Hot
9186 * Standby queries --- so the original page can't be unlocked before we've
9187 * added the new tuple to the new page.
9188 */
9189
9190 /* Deal with old tuple version */
9191 oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
9192 &obuffer);
9193 if (oldaction == BLK_NEEDS_REDO)
9194 {
9195 page = BufferGetPage(obuffer);
9196 offnum = xlrec->old_offnum;
9197 if (PageGetMaxOffsetNumber(page) >= offnum)
9198 lp = PageGetItemId(page, offnum);
9199
9200 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9201 elog(PANIC, "invalid lp");
9202
9203 htup = (HeapTupleHeader) PageGetItem(page, lp);
9204
9205 oldtup.t_data = htup;
9206 oldtup.t_len = ItemIdGetLength(lp);
9207
9208 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9209 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9210 if (hot_update)
9211 HeapTupleHeaderSetHotUpdated(htup);
9212 else
9213 HeapTupleHeaderClearHotUpdated(htup);
9214 fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
9215 &htup->t_infomask2);
9216 HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
9217 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9218 /* Set forward chain link in t_ctid */
9219 htup->t_ctid = newtid;
9220
9221 /* Mark the page as a candidate for pruning */
9222 PageSetPrunable(page, XLogRecGetXid(record));
9223
9224 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
9225 PageClearAllVisible(page);
9226
9227 PageSetLSN(page, lsn);
9228 MarkBufferDirty(obuffer);
9229 }
9230
9231 /*
9232 * Read the page the new tuple goes into, if different from old.
9233 */
9234 if (oldblk == newblk)
9235 {
9236 nbuffer = obuffer;
9237 newaction = oldaction;
9238 }
9239 else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9240 {
9241 nbuffer = XLogInitBufferForRedo(record, 0);
9242 page = (Page) BufferGetPage(nbuffer);
9243 PageInit(page, BufferGetPageSize(nbuffer), 0);
9244 newaction = BLK_NEEDS_REDO;
9245 }
9246 else
9247 newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
9248
9249 /*
9250 * The visibility map may need to be fixed even if the heap page is
9251 * already up-to-date.
9252 */
9253 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9254 {
9255 Relation reln = CreateFakeRelcacheEntry(rnode);
9256 Buffer vmbuffer = InvalidBuffer;
9257
9258 visibilitymap_pin(reln, newblk, &vmbuffer);
9259 visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9260 ReleaseBuffer(vmbuffer);
9261 FreeFakeRelcacheEntry(reln);
9262 }
9263
9264 /* Deal with new tuple */
9265 if (newaction == BLK_NEEDS_REDO)
9266 {
9267 char *recdata;
9268 char *recdata_end;
9269 Size datalen;
9270 Size tuplen;
9271
9272 recdata = XLogRecGetBlockData(record, 0, &datalen);
9273 recdata_end = recdata + datalen;
9274
9275 page = BufferGetPage(nbuffer);
9276
9277 offnum = xlrec->new_offnum;
9278 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9279 elog(PANIC, "invalid max offset number");
9280
9281 if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
9282 {
9283 Assert(newblk == oldblk);
9284 memcpy(&prefixlen, recdata, sizeof(uint16));
9285 recdata += sizeof(uint16);
9286 }
9287 if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
9288 {
9289 Assert(newblk == oldblk);
9290 memcpy(&suffixlen, recdata, sizeof(uint16));
9291 recdata += sizeof(uint16);
9292 }
9293
9294 memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
9295 recdata += SizeOfHeapHeader;
9296
9297 tuplen = recdata_end - recdata;
9298 Assert(tuplen <= MaxHeapTupleSize);
9299
9300 htup = &tbuf.hdr;
9301 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9302
9303 /*
9304 * Reconstruct the new tuple using the prefix and/or suffix from the
9305 * old tuple, and the data stored in the WAL record.
9306 */
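/*
 * Illustrative layout of the reconstructed tuple (not in the original):
 *
 *     header                      zeroed above, fields filled from xlhdr below
 *     bitmap [+ padding] [+ oid]  copied from the WAL record
 *     prefix                      copied from the old tuple (prefixlen bytes)
 *     middle                      copied from the WAL record
 *     suffix                      copied from the old tuple (suffixlen bytes)
 */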
9307 newp = (char *) htup + SizeofHeapTupleHeader;
9308 if (prefixlen > 0)
9309 {
9310 int len;
9311
9312 /* copy bitmap [+ padding] [+ oid] from WAL record */
9313 len = xlhdr.t_hoff - SizeofHeapTupleHeader;
9314 memcpy(newp, recdata, len);
9315 recdata += len;
9316 newp += len;
9317
9318 /* copy prefix from old tuple */
9319 memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
9320 newp += prefixlen;
9321
9322 /* copy new tuple data from WAL record */
9323 len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
9324 memcpy(newp, recdata, len);
9325 recdata += len;
9326 newp += len;
9327 }
9328 else
9329 {
9330 /*
9331 * copy bitmap [+ padding] [+ oid] + data from record, all in one
9332 * go
9333 */
9334 memcpy(newp, recdata, tuplen);
9335 recdata += tuplen;
9336 newp += tuplen;
9337 }
9338 Assert(recdata == recdata_end);
9339
9340 /* copy suffix from old tuple */
9341 if (suffixlen > 0)
9342 memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
9343
9344 newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
9345 htup->t_infomask2 = xlhdr.t_infomask2;
9346 htup->t_infomask = xlhdr.t_infomask;
9347 htup->t_hoff = xlhdr.t_hoff;
9348
9349 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9350 HeapTupleHeaderSetCmin(htup, FirstCommandId);
9351 HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
9352 /* Make sure there is no forward chain link in t_ctid */
9353 htup->t_ctid = newtid;
9354
9355 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9356 if (offnum == InvalidOffsetNumber)
9357 elog(PANIC, "failed to add tuple");
9358
9359 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9360 PageClearAllVisible(page);
9361
9362 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9363
9364 PageSetLSN(page, lsn);
9365 MarkBufferDirty(nbuffer);
9366 }
9367
9368 if (BufferIsValid(nbuffer) && nbuffer != obuffer)
9369 UnlockReleaseBuffer(nbuffer);
9370 if (BufferIsValid(obuffer))
9371 UnlockReleaseBuffer(obuffer);
9372
9373 /*
9374 * If the new page is running low on free space, update the FSM as well.
9375 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9376 * better than that without knowing the fill-factor for the table.
9377 *
9378 * However, don't update the FSM on HOT updates, because after crash
9379 * recovery, either the old or the new tuple will certainly be dead and
9380 * prunable. After pruning, the page will have roughly as much free space
9381 * as it did before the update, assuming the new tuple is about the same
9382 * size as the old one.
9383 *
9384 * XXX: Don't do this if the page was restored from full page image. We
9385 * don't bother to update the FSM in that case, it doesn't need to be
9386 * totally accurate anyway.
9387 */
9388 if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
9389 XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
9390 }
9391
9392 static void
9393 heap_xlog_confirm(XLogReaderState *record)
9394 {
9395 XLogRecPtr lsn = record->EndRecPtr;
9396 xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
9397 Buffer buffer;
9398 Page page;
9399 OffsetNumber offnum;
9400 ItemId lp = NULL;
9401 HeapTupleHeader htup;
9402
9403 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9404 {
9405 page = BufferGetPage(buffer);
9406
9407 offnum = xlrec->offnum;
9408 if (PageGetMaxOffsetNumber(page) >= offnum)
9409 lp = PageGetItemId(page, offnum);
9410
9411 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9412 elog(PANIC, "invalid lp");
9413
9414 htup = (HeapTupleHeader) PageGetItem(page, lp);
9415
9416 /*
9417 * Confirm tuple as actually inserted
9418 */
9419 ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
9420
9421 PageSetLSN(page, lsn);
9422 MarkBufferDirty(buffer);
9423 }
9424 if (BufferIsValid(buffer))
9425 UnlockReleaseBuffer(buffer);
9426 }
9427
9428 static void
9429 heap_xlog_lock(XLogReaderState *record)
9430 {
9431 XLogRecPtr lsn = record->EndRecPtr;
9432 xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
9433 Buffer buffer;
9434 Page page;
9435 OffsetNumber offnum;
9436 ItemId lp = NULL;
9437 HeapTupleHeader htup;
9438
9439 /*
9440 * The visibility map may need to be fixed even if the heap page is
9441 * already up-to-date.
9442 */
9443 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9444 {
9445 RelFileNode rnode;
9446 Buffer vmbuffer = InvalidBuffer;
9447 BlockNumber block;
9448 Relation reln;
9449
9450 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9451 reln = CreateFakeRelcacheEntry(rnode);
9452
9453 visibilitymap_pin(reln, block, &vmbuffer);
9454 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9455
9456 ReleaseBuffer(vmbuffer);
9457 FreeFakeRelcacheEntry(reln);
9458 }
9459
9460 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9461 {
9462 page = (Page) BufferGetPage(buffer);
9463
9464 offnum = xlrec->offnum;
9465 if (PageGetMaxOffsetNumber(page) >= offnum)
9466 lp = PageGetItemId(page, offnum);
9467
9468 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9469 elog(PANIC, "invalid lp");
9470
9471 htup = (HeapTupleHeader) PageGetItem(page, lp);
9472
9473 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9474 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9475 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9476 &htup->t_infomask2);
9477
9478 /*
9479 * Clear relevant update flags, but only if the modified infomask says
9480 * there's no update.
9481 */
9482 if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
9483 {
9484 HeapTupleHeaderClearHotUpdated(htup);
9485 /* Make sure there is no forward chain link in t_ctid */
9486 ItemPointerSet(&htup->t_ctid,
9487 BufferGetBlockNumber(buffer),
9488 offnum);
9489 }
9490 HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
9491 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9492 PageSetLSN(page, lsn);
9493 MarkBufferDirty(buffer);
9494 }
9495 if (BufferIsValid(buffer))
9496 UnlockReleaseBuffer(buffer);
9497 }
9498
9499 static void
9500 heap_xlog_lock_updated(XLogReaderState *record)
9501 {
9502 XLogRecPtr lsn = record->EndRecPtr;
9503 xl_heap_lock_updated *xlrec;
9504 Buffer buffer;
9505 Page page;
9506 OffsetNumber offnum;
9507 ItemId lp = NULL;
9508 HeapTupleHeader htup;
9509
9510 xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
9511
9512 /*
9513 * The visibility map may need to be fixed even if the heap page is
9514 * already up-to-date.
9515 */
9516 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9517 {
9518 RelFileNode rnode;
9519 Buffer vmbuffer = InvalidBuffer;
9520 BlockNumber block;
9521 Relation reln;
9522
9523 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9524 reln = CreateFakeRelcacheEntry(rnode);
9525
9526 visibilitymap_pin(reln, block, &vmbuffer);
9527 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9528
9529 ReleaseBuffer(vmbuffer);
9530 FreeFakeRelcacheEntry(reln);
9531 }
9532
9533 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9534 {
9535 page = BufferGetPage(buffer);
9536
9537 offnum = xlrec->offnum;
9538 if (PageGetMaxOffsetNumber(page) >= offnum)
9539 lp = PageGetItemId(page, offnum);
9540
9541 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9542 elog(PANIC, "invalid lp");
9543
9544 htup = (HeapTupleHeader) PageGetItem(page, lp);
9545
9546 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9547 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9548 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9549 &htup->t_infomask2);
9550 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9551
9552 PageSetLSN(page, lsn);
9553 MarkBufferDirty(buffer);
9554 }
9555 if (BufferIsValid(buffer))
9556 UnlockReleaseBuffer(buffer);
9557 }
9558
9559 static void
9560 heap_xlog_inplace(XLogReaderState *record)
9561 {
9562 XLogRecPtr lsn = record->EndRecPtr;
9563 xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9564 Buffer buffer;
9565 Page page;
9566 OffsetNumber offnum;
9567 ItemId lp = NULL;
9568 HeapTupleHeader htup;
9569 uint32 oldlen;
9570 Size newlen;
9571
9572 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9573 {
9574 char *newtup = XLogRecGetBlockData(record, 0, &newlen);
9575
9576 page = BufferGetPage(buffer);
9577
9578 offnum = xlrec->offnum;
9579 if (PageGetMaxOffsetNumber(page) >= offnum)
9580 lp = PageGetItemId(page, offnum);
9581
9582 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9583 elog(PANIC, "invalid lp");
9584
9585 htup = (HeapTupleHeader) PageGetItem(page, lp);
9586
9587 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9588 if (oldlen != newlen)
9589 elog(PANIC, "wrong tuple length");
9590
9591 memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9592
9593 PageSetLSN(page, lsn);
9594 MarkBufferDirty(buffer);
9595 }
9596 if (BufferIsValid(buffer))
9597 UnlockReleaseBuffer(buffer);
9598 }
9599
9600 void
9601 heap_redo(XLogReaderState *record)
9602 {
9603 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9604
9605 /*
9606 * These operations don't overwrite MVCC data so no conflict processing is
9607 * required. The ones in heap2 rmgr do.
9608 */
9609
9610 switch (info & XLOG_HEAP_OPMASK)
9611 {
9612 case XLOG_HEAP_INSERT:
9613 heap_xlog_insert(record);
9614 break;
9615 case XLOG_HEAP_DELETE:
9616 heap_xlog_delete(record);
9617 break;
9618 case XLOG_HEAP_UPDATE:
9619 heap_xlog_update(record, false);
9620 break;
9621 case XLOG_HEAP_TRUNCATE:
9622
9623 /*
9624 * TRUNCATE is a no-op because the actions are already logged as
9625 * SMGR WAL records. The TRUNCATE WAL record only exists for logical
9626 * decoding.
9627 */
9628 break;
9629 case XLOG_HEAP_HOT_UPDATE:
9630 heap_xlog_update(record, true);
9631 break;
9632 case XLOG_HEAP_CONFIRM:
9633 heap_xlog_confirm(record);
9634 break;
9635 case XLOG_HEAP_LOCK:
9636 heap_xlog_lock(record);
9637 break;
9638 case XLOG_HEAP_INPLACE:
9639 heap_xlog_inplace(record);
9640 break;
9641 default:
9642 elog(PANIC, "heap_redo: unknown op code %u", info);
9643 }
9644 }
9645
9646 void
9647 heap2_redo(XLogReaderState *record)
9648 {
9649 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9650
9651 switch (info & XLOG_HEAP_OPMASK)
9652 {
9653 case XLOG_HEAP2_PRUNE:
9654 heap_xlog_prune(record);
9655 break;
9656 case XLOG_HEAP2_VACUUM:
9657 heap_xlog_vacuum(record);
9658 break;
9659 case XLOG_HEAP2_FREEZE_PAGE:
9660 heap_xlog_freeze_page(record);
9661 break;
9662 case XLOG_HEAP2_VISIBLE:
9663 heap_xlog_visible(record);
9664 break;
9665 case XLOG_HEAP2_MULTI_INSERT:
9666 heap_xlog_multi_insert(record);
9667 break;
9668 case XLOG_HEAP2_LOCK_UPDATED:
9669 heap_xlog_lock_updated(record);
9670 break;
9671 case XLOG_HEAP2_NEW_CID:
9672
9673 /*
9674 * Nothing to do on a real replay, only used during logical
9675 * decoding.
9676 */
9677 break;
9678 case XLOG_HEAP2_REWRITE:
9679 heap_xlog_logical_rewrite(record);
9680 break;
9681 default:
9682 elog(PANIC, "heap2_redo: unknown op code %u", info);
9683 }
9684 }
9685
9686 /*
9687 * Mask a heap page before performing consistency checks on it.
9688 */
9689 void
9690 heap_mask(char *pagedata, BlockNumber blkno)
9691 {
9692 Page page = (Page) pagedata;
9693 OffsetNumber off;
9694
9695 mask_page_lsn_and_checksum(page);
9696
9697 mask_page_hint_bits(page);
9698 mask_unused_space(page);
9699
9700 for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
9701 {
9702 ItemId iid = PageGetItemId(page, off);
9703 char *page_item;
9704
9705 page_item = (char *) (page + ItemIdGetOffset(iid));
9706
9707 if (ItemIdIsNormal(iid))
9708 {
9709 HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
9710
9711 /*
9712 * If xmin of a tuple is not yet frozen, we should ignore
9713 * differences in hint bits, since they can be set without
9714 * emitting WAL.
9715 */
9716 if (!HeapTupleHeaderXminFrozen(page_htup))
9717 page_htup->t_infomask &= ~HEAP_XACT_MASK;
9718 else
9719 {
9720 /* We still need to mask the xmax hint bits. */
9721 page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
9722 page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
9723 }
9724
9725 /*
9726 * During replay, we set Command Id to FirstCommandId. Hence, mask
9727 * it. See heap_xlog_insert() for details.
9728 */
9729 page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
9730
9731 /*
9732 * For a speculative tuple, heap_insert() does not set ctid in the
9733 * caller-passed heap tuple itself, leaving the ctid field to
9734 * contain a speculative token value - a per-backend monotonically
9735 * increasing identifier. Besides, it does not WAL-log ctid under
9736 * any circumstances.
9737 *
9738 * During redo, heap_xlog_insert() sets t_ctid to current block
9739 * number and self offset number. It doesn't care about any
9740 * speculative insertions on the primary. Hence, we set t_ctid to
9741 * current block number and self offset number to ignore any
9742 * inconsistency.
9743 */
9744 if (HeapTupleHeaderIsSpeculative(page_htup))
9745 ItemPointerSet(&page_htup->t_ctid, blkno, off);
9746
9747 /*
9748 * NB: Not ignoring ctid changes due to the tuple having moved
9749 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
9750 * important information that needs to be in-sync between primary
9751 * and standby, and thus is WAL logged.
9752 */
9753 }
9754
9755 /*
9756 * Ignore any padding bytes after the tuple, when the length of the
9757 * item is not MAXALIGNed.
9758 */
9759 if (ItemIdHasStorage(iid))
9760 {
9761 int len = ItemIdGetLength(iid);
9762 int padlen = MAXALIGN(len) - len;
9763
9764 if (padlen > 0)
9765 memset(page_item + len, MASK_MARKER, padlen);
9766 }
9767 }
9768 }
9769
9770 /*
9771 * HeapCheckForSerializableConflictOut
9772 * We are reading a tuple. If it's not visible, there may be a
9773 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9774 * but has been deleted, there may be a rw-conflict out with the deleter.
9775 *
9776 * We will determine the top level xid of the writing transaction with which
9777 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9778 * for overlap with our own transaction.
9779 *
9780 * This function should be called just about anywhere in heapam.c where a
9781 * tuple has been read. The caller must hold at least a shared lock on the
9782 * buffer, because this function might set hint bits on the tuple. There is
9783 * currently no known reason to call this function from an index AM.
9784 */
9785 void
9786 HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9787 HeapTuple tuple, Buffer buffer,
9788 Snapshot snapshot)
9789 {
9790 TransactionId xid;
9791 HTSV_Result htsvResult;
9792
9793 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9794 return;
9795
9796 /*
9797 * Check to see whether the tuple has been written to by a concurrent
9798 * transaction, either to create it so that it is not visible to us, or to
9799 * delete it while it is visible to us. The "visible" bool indicates whether the
9800 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9801 * is going on with it.
9802 *
9803 * In the event of a concurrently inserted tuple that also happens to have
9804 * been concurrently updated (by a separate transaction), the xmin of the
9805 * tuple will be used -- not the updater's xid.
9806 */
9807 htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
9808 switch (htsvResult)
9809 {
9810 case HEAPTUPLE_LIVE:
9811 if (visible)
9812 return;
9813 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9814 break;
9815 case HEAPTUPLE_RECENTLY_DEAD:
9816 case HEAPTUPLE_DELETE_IN_PROGRESS:
9817 if (visible)
9818 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9819 else
9820 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9821
9822 if (TransactionIdPrecedes(xid, TransactionXmin))
9823 {
9824 /* This is like the HEAPTUPLE_DEAD case */
9825 Assert(!visible);
9826 return;
9827 }
9828 break;
9829 case HEAPTUPLE_INSERT_IN_PROGRESS:
9830 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9831 break;
9832 case HEAPTUPLE_DEAD:
9833 Assert(!visible);
9834 return;
9835 default:
9836
9837 /*
9838 * The only way to get to this default clause is if a new value is
9839 * added to the enum type without adding it to this switch
9840 * statement. That's a bug, so elog.
9841 */
9842 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9843
9844 /*
9845 * In spite of having all enum values covered and calling elog on
9846 * this default, some compilers think this is a code path which
9847 * allows xid to be used below without initialization. Silence
9848 * that warning.
9849 */
9850 xid = InvalidTransactionId;
9851 }
9852
9853 Assert(TransactionIdIsValid(xid));
9854 Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
9855
9856 /*
9857 * Find top level xid. Bail out if xid is too early to be a conflict, or
9858 * if it's our own xid.
9859 */
9860 if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
9861 return;
9862 xid = SubTransGetTopmostTransaction(xid);
9863 if (TransactionIdPrecedes(xid, TransactionXmin))
9864 return;
9865
9866 CheckForSerializableConflictOut(relation, xid, snapshot);
9867 }
9868
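/*
 * Hedged usage sketch (not in the original): the typical caller pattern for
 * HeapCheckForSerializableConflictOut() in this file, invoked after fetching
 * a tuple while the buffer is still at least share-locked.  The helper name
 * is hypothetical.
 */
#ifdef NOT_USED
static void
example_check_after_fetch(Relation rel, HeapTuple tuple, Buffer buf,
						  Snapshot snapshot)
{
	bool		visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buf);

	HeapCheckForSerializableConflictOut(visible, rel, tuple, buf, snapshot);
}
#endif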