1 /*-------------------------------------------------------------------------
2 *
3 * heapam.c
4 * heap access method code
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam.c
12 *
13 *
14 * INTERFACE ROUTINES
15 * heap_beginscan - begin relation scan
16 * heap_rescan - restart a relation scan
17 * heap_endscan - end relation scan
18 * heap_getnext - retrieve next tuple in scan
19 * heap_fetch - retrieve tuple with given tid
20 * heap_insert - insert tuple into a relation
21 * heap_multi_insert - insert multiple tuples into a relation
22 * heap_delete - delete a tuple from a relation
23 * heap_update - replace a tuple in a relation with another tuple
24 *
25 * NOTES
26 * This file contains the heap_ routines which implement
27 * the POSTGRES heap access method used for all POSTGRES
28 * relations.
29 *
30 *-------------------------------------------------------------------------
31 */
32 #include "postgres.h"
33
34 #include "access/bufmask.h"
35 #include "access/genam.h"
36 #include "access/heapam.h"
37 #include "access/heapam_xlog.h"
38 #include "access/heaptoast.h"
39 #include "access/hio.h"
40 #include "access/multixact.h"
41 #include "access/parallel.h"
42 #include "access/relscan.h"
43 #include "access/subtrans.h"
44 #include "access/sysattr.h"
45 #include "access/tableam.h"
46 #include "access/transam.h"
47 #include "access/valid.h"
48 #include "access/visibilitymap.h"
49 #include "access/xact.h"
50 #include "access/xlog.h"
51 #include "access/xloginsert.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "port/atomics.h"
57 #include "storage/bufmgr.h"
58 #include "storage/freespace.h"
59 #include "storage/lmgr.h"
60 #include "storage/predicate.h"
61 #include "storage/procarray.h"
62 #include "storage/smgr.h"
63 #include "storage/spin.h"
64 #include "storage/standby.h"
65 #include "utils/datum.h"
66 #include "utils/inval.h"
67 #include "utils/lsyscache.h"
68 #include "utils/relcache.h"
69 #include "utils/snapmgr.h"
70 #include "utils/spccache.h"
71
72
73 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74 TransactionId xid, CommandId cid, int options);
75 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76 Buffer newbuf, HeapTuple oldtup,
77 HeapTuple newtup, HeapTuple old_key_tuple,
78 bool all_visible_cleared, bool new_all_visible_cleared);
79 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
80 Bitmapset *interesting_cols,
81 HeapTuple oldtup, HeapTuple newtup);
82 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83 LockTupleMode mode, LockWaitPolicy wait_policy,
84 bool *have_tuple_lock);
85 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86 uint16 old_infomask2, TransactionId add_to_xmax,
87 LockTupleMode mode, bool is_update,
88 TransactionId *result_xmax, uint16 *result_infomask,
89 uint16 *result_infomask2);
90 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
91 ItemPointer ctid, TransactionId xid,
92 LockTupleMode mode);
93 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94 uint16 *new_infomask2);
95 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
96 uint16 t_infomask);
97 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98 LockTupleMode lockmode, bool *current_is_member);
99 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100 Relation rel, ItemPointer ctid, XLTW_Oper oper,
101 int *remaining);
102 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
103 uint16 infomask, Relation rel, int *remaining);
104 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
106 bool *copy);
107
108
109 /*
110 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112 * update them). This table (and the macros below) helps us determine the
113 * heavyweight lock mode and MultiXactStatus values to use for any particular
114 * tuple lock strength.
115 *
116 * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117 * instead.
118 */
119 static const struct
120 {
121 LOCKMODE hwlock;
122 int lockstatus;
123 int updstatus;
124 }
125
126 tupleLockExtraInfo[MaxLockTupleMode + 1] =
127 {
128 { /* LockTupleKeyShare */
129 AccessShareLock,
130 MultiXactStatusForKeyShare,
131 -1 /* KeyShare does not allow updating tuples */
132 },
133 { /* LockTupleShare */
134 RowShareLock,
135 MultiXactStatusForShare,
136 -1 /* Share does not allow updating tuples */
137 },
138 { /* LockTupleNoKeyExclusive */
139 ExclusiveLock,
140 MultiXactStatusForNoKeyUpdate,
141 MultiXactStatusNoKeyUpdate
142 },
143 { /* LockTupleExclusive */
144 AccessExclusiveLock,
145 MultiXactStatusForUpdate,
146 MultiXactStatusUpdate
147 }
148 };
149
150 /* Get the LOCKMODE for a given MultiXactStatus */
151 #define LOCKMODE_from_mxstatus(status) \
152 (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153
154 /*
155 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156 * This is more readable than having every caller translate it to lock.h's
157 * LOCKMODE.
158 */
159 #define LockTupleTuplock(rel, tup, mode) \
160 LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161 #define UnlockTupleTuplock(rel, tup, mode) \
162 UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163 #define ConditionalLockTupleTuplock(rel, tup, mode) \
164 ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165
166 #ifdef USE_PREFETCH
167 /*
168 * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169 * this structure to coordinate prefetching activity.
170 */
171 typedef struct
172 {
173 BlockNumber cur_hblkno;
174 int next_item;
175 int nitems;
176 ItemPointerData *tids;
177 } XidHorizonPrefetchState;
178 #endif
179
180 /*
181 * This table maps tuple lock strength values for each particular
182 * MultiXactStatus value.
183 */
184 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
185 {
186 LockTupleKeyShare, /* ForKeyShare */
187 LockTupleShare, /* ForShare */
188 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
189 LockTupleExclusive, /* ForUpdate */
190 LockTupleNoKeyExclusive, /* NoKeyUpdate */
191 LockTupleExclusive /* Update */
192 };
193
194 /* Get the LockTupleMode for a given MultiXactStatus */
195 #define TUPLOCK_from_mxstatus(status) \
196 (MultiXactStatusLock[(status)])
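/*
 * For example, the two tables compose as follows: MultiXactStatusForShare
 * maps to LockTupleShare via MultiXactStatusLock[], and the
 * tupleLockExtraInfo entry for LockTupleShare names RowShareLock as the
 * heavyweight lock, so LOCKMODE_from_mxstatus(MultiXactStatusForShare)
 * evaluates to RowShareLock with the definitions above.
 */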
197
198 /* ----------------------------------------------------------------
199 * heap support routines
200 * ----------------------------------------------------------------
201 */
202
203 /* ----------------
204 * initscan - scan code common to heap_beginscan and heap_rescan
205 * ----------------
206 */
207 static void
208 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209 {
210 ParallelBlockTableScanDesc bpscan = NULL;
211 bool allow_strat;
212 bool allow_sync;
213
214 /*
215 * Determine the number of blocks we have to scan.
216 *
217 * It is sufficient to do this once at scan start, since any tuples added
218 * while the scan is in progress will be invisible to my snapshot anyway.
219 * (That is not true when using a non-MVCC snapshot. However, we couldn't
220 * guarantee to return tuples added after scan start anyway, since they
221 * might go into pages we already scanned. To guarantee consistent
222 * results for a non-MVCC snapshot, the caller must hold some higher-level
223 * lock that ensures the interesting tuple(s) won't change.)
224 */
225 if (scan->rs_base.rs_parallel != NULL)
226 {
227 bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
228 scan->rs_nblocks = bpscan->phs_nblocks;
229 }
230 else
231 scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
232
233 /*
234 * If the table is large relative to NBuffers, use a bulk-read access
235 * strategy and enable synchronized scanning (see syncscan.c). Although
236 * the thresholds for these features could be different, we make them the
237 * same so that there are only two behaviors to tune rather than four.
238 * (However, some callers need to be able to disable one or both of these
239 * behaviors, independently of the size of the table; also there is a GUC
240 * variable that can disable synchronized scanning.)
241 *
242 * Note that table_block_parallelscan_initialize has a very similar test;
243 * if you change this, consider changing that one, too.
244 */
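/*
 * As a concrete example (assuming the default 8kB block size): with
 * shared_buffers = 128MB, NBuffers is 16384, so only tables larger than
 * NBuffers / 4 = 4096 blocks (32MB) get the bulk-read strategy and
 * synchronized scanning here.
 */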
245 if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246 scan->rs_nblocks > NBuffers / 4)
247 {
248 allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250 }
251 else
252 allow_strat = allow_sync = false;
253
254 if (allow_strat)
255 {
256 /* During a rescan, keep the previous strategy object. */
257 if (scan->rs_strategy == NULL)
258 scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
259 }
260 else
261 {
262 if (scan->rs_strategy != NULL)
263 FreeAccessStrategy(scan->rs_strategy);
264 scan->rs_strategy = NULL;
265 }
266
267 if (scan->rs_base.rs_parallel != NULL)
268 {
269 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
270 if (scan->rs_base.rs_parallel->phs_syncscan)
271 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272 else
273 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274 }
275 else if (keep_startblock)
276 {
277 /*
278 * When rescanning, we want to keep the previous startblock setting,
279 * so that rewinding a cursor doesn't generate surprising results.
280 * Reset the active syncscan setting, though.
281 */
282 if (allow_sync && synchronize_seqscans)
283 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284 else
285 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286 }
287 else if (allow_sync && synchronize_seqscans)
288 {
289 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290 scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291 }
292 else
293 {
294 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295 scan->rs_startblock = 0;
296 }
297
298 scan->rs_numblocks = InvalidBlockNumber;
299 scan->rs_inited = false;
300 scan->rs_ctup.t_data = NULL;
301 ItemPointerSetInvalid(&scan->rs_ctup.t_self);
302 scan->rs_cbuf = InvalidBuffer;
303 scan->rs_cblock = InvalidBlockNumber;
304
305 /* page-at-a-time fields are always invalid when not rs_inited */
306
307 /*
308 * copy the scan key, if appropriate
309 */
310 if (key != NULL)
311 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312
313 /*
314 * Currently, we only have a stats counter for sequential heap scans (but
315 * e.g. for bitmap scans the underlying bitmap index scans will be counted,
316 * and for sample scans we update stats for tuple fetches).
317 */
318 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
319 pgstat_count_heap_scan(scan->rs_base.rs_rd);
320 }
321
322 /*
323 * heap_setscanlimits - restrict range of a heapscan
324 *
325 * startBlk is the page to start at
326 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327 */
328 void
329 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
330 {
331 HeapScanDesc scan = (HeapScanDesc) sscan;
332
333 Assert(!scan->rs_inited); /* else too late to change */
334 /* else rs_startblock is significant */
335 Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336
337 /* Check startBlk is valid (but allow case of zero blocks...) */
338 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339
340 scan->rs_startblock = startBlk;
341 scan->rs_numblocks = numBlks;
342 }
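/*
 * Hypothetical usage sketch: to scan only the first ten blocks of a relation,
 * a caller could do
 *
 *		TableScanDesc sscan = heap_beginscan(rel, snapshot, 0, NULL,
 *											 NULL, SO_TYPE_SEQSCAN);
 *		heap_setscanlimits(sscan, 0, 10);
 *
 * before the first heap_getnext() call; per the asserts above, the limits
 * must be set before the scan starts and the scan must not have
 * SO_ALLOW_SYNC set.
 */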
343
344 /*
345 * heapgetpage - subroutine for heapgettup()
346 *
347 * This routine reads and pins the specified page of the relation.
348 * In page-at-a-time mode it performs additional work, namely determining
349 * which tuples on the page are visible.
350 */
351 void
352 heapgetpage(TableScanDesc sscan, BlockNumber page)
353 {
354 HeapScanDesc scan = (HeapScanDesc) sscan;
355 Buffer buffer;
356 Snapshot snapshot;
357 Page dp;
358 int lines;
359 int ntup;
360 OffsetNumber lineoff;
361 ItemId lpp;
362 bool all_visible;
363
364 Assert(page < scan->rs_nblocks);
365
366 /* release previous scan buffer, if any */
367 if (BufferIsValid(scan->rs_cbuf))
368 {
369 ReleaseBuffer(scan->rs_cbuf);
370 scan->rs_cbuf = InvalidBuffer;
371 }
372
373 /*
374 * Be sure to check for interrupts at least once per page. Checks at
375 * higher code levels won't be able to stop a seqscan that encounters many
376 * pages' worth of consecutive dead tuples.
377 */
378 CHECK_FOR_INTERRUPTS();
379
380 /* read page using selected strategy */
381 scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382 RBM_NORMAL, scan->rs_strategy);
383 scan->rs_cblock = page;
384
385 if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386 return;
387
388 buffer = scan->rs_cbuf;
389 snapshot = scan->rs_base.rs_snapshot;
390
391 /*
392 * Prune and repair fragmentation for the whole page, if possible.
393 */
394 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395
396 /*
397 * We must hold share lock on the buffer content while examining tuple
398 * visibility. Afterwards, however, the tuples we have found to be
399 * visible are guaranteed good as long as we hold the buffer pin.
400 */
401 LockBuffer(buffer, BUFFER_LOCK_SHARE);
402
403 dp = BufferGetPage(buffer);
404 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405 lines = PageGetMaxOffsetNumber(dp);
406 ntup = 0;
407
408 /*
409 * If the all-visible flag indicates that all tuples on the page are
410 * visible to everyone, we can skip the per-tuple visibility tests.
411 *
412 * Note: In hot standby, a tuple that's already visible to all
413 * transactions in the master might still be invisible to a read-only
414 * transaction in the standby. We partly handle this problem by tracking
415 * the minimum xmin of visible tuples as the cut-off XID while marking a
416 * page all-visible on master and WAL log that along with the visibility
417 * map SET operation. In hot standby, we wait for (or abort) all
418 * transactions that might not see one or more tuples on the
419 * page. That's how index-only scans work fine in hot standby. A crucial
420 * difference between index-only scans and heap scans is that the
421 * index-only scan completely relies on the visibility map, whereas a heap
422 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
423 * the page-level flag can be trusted in the same way, because it might
424 * get propagated somehow without being explicitly WAL-logged, e.g. via a
425 * full page write. Until we can prove that beyond doubt, let's check each
426 * tuple for visibility the hard way.
427 */
428 all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429
430 for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431 lineoff <= lines;
432 lineoff++, lpp++)
433 {
434 if (ItemIdIsNormal(lpp))
435 {
436 HeapTupleData loctup;
437 bool valid;
438
439 loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441 loctup.t_len = ItemIdGetLength(lpp);
442 ItemPointerSet(&(loctup.t_self), page, lineoff);
443
444 if (all_visible)
445 valid = true;
446 else
447 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448
449 HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
450 &loctup, buffer, snapshot);
451
452 if (valid)
453 scan->rs_vistuples[ntup++] = lineoff;
454 }
455 }
456
457 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
458
459 Assert(ntup <= MaxHeapTuplesPerPage);
460 scan->rs_ntuples = ntup;
461 }
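/*
 * Note: in page-at-a-time mode, after heapgetpage() returns,
 * rs_vistuples[0 .. rs_ntuples - 1] holds the offset numbers (in ascending
 * order) of the tuples on this page that are visible to the scan's snapshot;
 * heapgettup_pagemode() then iterates over just that array.
 */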
462
463 /* ----------------
464 * heapgettup - fetch next heap tuple
465 *
466 * Initialize the scan if not already done; then advance to the next
467 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468 * or set scan->rs_ctup.t_data = NULL if no more tuples.
469 *
470 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471 * by scan->rs_ctup".
472 *
473 * Note: the reason nkeys/key are passed separately, even though they are
474 * kept in the scan descriptor, is that the caller may not want us to check
475 * the scankeys.
476 *
477 * Note: when we fall off the end of the scan in either direction, we
478 * reset rs_inited. This means that a further request with the same
479 * scan direction will restart the scan, which is a bit odd, but a
480 * request with the opposite scan direction will start a fresh scan
481 * in the proper direction. The latter is required behavior for cursors,
482 * while the former case is generally undefined behavior in Postgres
483 * so we don't care too much.
484 * ----------------
485 */
486 static void
487 heapgettup(HeapScanDesc scan,
488 ScanDirection dir,
489 int nkeys,
490 ScanKey key)
491 {
492 HeapTuple tuple = &(scan->rs_ctup);
493 Snapshot snapshot = scan->rs_base.rs_snapshot;
494 bool backward = ScanDirectionIsBackward(dir);
495 BlockNumber page;
496 bool finished;
497 Page dp;
498 int lines;
499 OffsetNumber lineoff;
500 int linesleft;
501 ItemId lpp;
502
503 /*
504 * calculate next starting lineoff, given scan direction
505 */
506 if (ScanDirectionIsForward(dir))
507 {
508 if (!scan->rs_inited)
509 {
510 /*
511 * return null immediately if relation is empty
512 */
513 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514 {
515 Assert(!BufferIsValid(scan->rs_cbuf));
516 tuple->t_data = NULL;
517 return;
518 }
519 if (scan->rs_base.rs_parallel != NULL)
520 {
521 ParallelBlockTableScanDesc pbscan =
522 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
523
524 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525 pbscan);
526
527 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528 pbscan);
529
530 /* Other processes might have already finished the scan. */
531 if (page == InvalidBlockNumber)
532 {
533 Assert(!BufferIsValid(scan->rs_cbuf));
534 tuple->t_data = NULL;
535 return;
536 }
537 }
538 else
539 page = scan->rs_startblock; /* first page */
540 heapgetpage((TableScanDesc) scan, page);
541 lineoff = FirstOffsetNumber; /* first offnum */
542 scan->rs_inited = true;
543 }
544 else
545 {
546 /* continue from previously returned page/tuple */
547 page = scan->rs_cblock; /* current page */
548 lineoff = /* next offnum */
549 OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
550 }
551
552 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
553
554 dp = BufferGetPage(scan->rs_cbuf);
555 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556 lines = PageGetMaxOffsetNumber(dp);
557 /* page and lineoff now reference the physically next tid */
558
559 linesleft = lines - lineoff + 1;
560 }
561 else if (backward)
562 {
563 /* backward parallel scan not supported */
564 Assert(scan->rs_base.rs_parallel == NULL);
565
566 if (!scan->rs_inited)
567 {
568 /*
569 * return null immediately if relation is empty
570 */
571 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572 {
573 Assert(!BufferIsValid(scan->rs_cbuf));
574 tuple->t_data = NULL;
575 return;
576 }
577
578 /*
579 * Disable reporting to syncscan logic in a backwards scan; it's
580 * not very likely anyone else is doing the same thing at the same
581 * time, and much more likely that we'll just bollix things for
582 * forward scanners.
583 */
584 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585
586 /*
587 * Start from last page of the scan. Ensure we take into account
588 * rs_numblocks if it's been adjusted by heap_setscanlimits().
589 */
590 if (scan->rs_numblocks != InvalidBlockNumber)
591 page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
592 else if (scan->rs_startblock > 0)
593 page = scan->rs_startblock - 1;
594 else
595 page = scan->rs_nblocks - 1;
596 heapgetpage((TableScanDesc) scan, page);
597 }
598 else
599 {
600 /* continue from previously returned page/tuple */
601 page = scan->rs_cblock; /* current page */
602 }
603
604 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
605
606 dp = BufferGetPage(scan->rs_cbuf);
607 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
608 lines = PageGetMaxOffsetNumber(dp);
609
610 if (!scan->rs_inited)
611 {
612 lineoff = lines; /* final offnum */
613 scan->rs_inited = true;
614 }
615 else
616 {
617 lineoff = /* previous offnum */
618 OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
619 }
620 /* page and lineoff now reference the physically previous tid */
621
622 linesleft = lineoff;
623 }
624 else
625 {
626 /*
627 * ``no movement'' scan direction: refetch prior tuple
628 */
629 if (!scan->rs_inited)
630 {
631 Assert(!BufferIsValid(scan->rs_cbuf));
632 tuple->t_data = NULL;
633 return;
634 }
635
636 page = ItemPointerGetBlockNumber(&(tuple->t_self));
637 if (page != scan->rs_cblock)
638 heapgetpage((TableScanDesc) scan, page);
639
640 /* Since the tuple was previously fetched, needn't lock page here */
641 dp = BufferGetPage(scan->rs_cbuf);
642 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
643 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
644 lpp = PageGetItemId(dp, lineoff);
645 Assert(ItemIdIsNormal(lpp));
646
647 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
648 tuple->t_len = ItemIdGetLength(lpp);
649
650 return;
651 }
652
653 /*
654 * advance the scan until we find a qualifying tuple or run out of stuff
655 * to scan
656 */
657 lpp = PageGetItemId(dp, lineoff);
658 for (;;)
659 {
660 while (linesleft > 0)
661 {
662 if (ItemIdIsNormal(lpp))
663 {
664 bool valid;
665
666 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
667 tuple->t_len = ItemIdGetLength(lpp);
668 ItemPointerSet(&(tuple->t_self), page, lineoff);
669
670 /*
671 * if current tuple qualifies, return it.
672 */
673 valid = HeapTupleSatisfiesVisibility(tuple,
674 snapshot,
675 scan->rs_cbuf);
676
677 HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
678 tuple, scan->rs_cbuf,
679 snapshot);
680
681 if (valid && key != NULL)
682 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
683 nkeys, key, valid);
684
685 if (valid)
686 {
687 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
688 return;
689 }
690 }
691
692 /*
693 * otherwise move to the next item on the page
694 */
695 --linesleft;
696 if (backward)
697 {
698 --lpp; /* move back in this page's ItemId array */
699 --lineoff;
700 }
701 else
702 {
703 ++lpp; /* move forward in this page's ItemId array */
704 ++lineoff;
705 }
706 }
707
708 /*
709 * if we get here, it means we've exhausted the items on this page and
710 * it's time to move to the next.
711 */
712 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
713
714 /*
715 * advance to next/prior page and detect end of scan
716 */
717 if (backward)
718 {
719 finished = (page == scan->rs_startblock) ||
720 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
721 if (page == 0)
722 page = scan->rs_nblocks;
723 page--;
724 }
725 else if (scan->rs_base.rs_parallel != NULL)
726 {
727 ParallelBlockTableScanDesc pbscan =
728 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
729
730 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
731 pbscan);
732 finished = (page == InvalidBlockNumber);
733 }
734 else
735 {
736 page++;
737 if (page >= scan->rs_nblocks)
738 page = 0;
739 finished = (page == scan->rs_startblock) ||
740 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
741
742 /*
743 * Report our new scan position for synchronization purposes. We
744 * don't do that when moving backwards, however. That would just
745 * mess up any other forward-moving scanners.
746 *
747 * Note: we do this before checking for end of scan so that the
748 * final state of the position hint is back at the start of the
749 * rel. That's not strictly necessary, but otherwise when you run
750 * the same query multiple times the starting position would shift
751 * a little bit backwards on every invocation, which is confusing.
752 * We don't guarantee any specific ordering in general, though.
753 */
754 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
755 ss_report_location(scan->rs_base.rs_rd, page);
756 }
757
758 /*
759 * return NULL if we've exhausted all the pages
760 */
761 if (finished)
762 {
763 if (BufferIsValid(scan->rs_cbuf))
764 ReleaseBuffer(scan->rs_cbuf);
765 scan->rs_cbuf = InvalidBuffer;
766 scan->rs_cblock = InvalidBlockNumber;
767 tuple->t_data = NULL;
768 scan->rs_inited = false;
769 return;
770 }
771
772 heapgetpage((TableScanDesc) scan, page);
773
774 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
775
776 dp = BufferGetPage(scan->rs_cbuf);
777 TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
778 lines = PageGetMaxOffsetNumber((Page) dp);
779 linesleft = lines;
780 if (backward)
781 {
782 lineoff = lines;
783 lpp = PageGetItemId(dp, lines);
784 }
785 else
786 {
787 lineoff = FirstOffsetNumber;
788 lpp = PageGetItemId(dp, FirstOffsetNumber);
789 }
790 }
791 }
792
793 /* ----------------
794 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
795 *
796 * Same API as heapgettup, but used in page-at-a-time mode
797 *
798 * The internal logic is much the same as heapgettup's too, but there are some
799 * differences: we do not take the buffer content lock (that only needs to
800 * happen inside heapgetpage), and we iterate through just the tuples listed
801 * in rs_vistuples[] rather than all tuples on the page. Notice that
802 * lineindex is 0-based, where the corresponding loop variable lineoff in
803 * heapgettup is 1-based.
804 * ----------------
805 */
806 static void
807 heapgettup_pagemode(HeapScanDesc scan,
808 ScanDirection dir,
809 int nkeys,
810 ScanKey key)
811 {
812 HeapTuple tuple = &(scan->rs_ctup);
813 bool backward = ScanDirectionIsBackward(dir);
814 BlockNumber page;
815 bool finished;
816 Page dp;
817 int lines;
818 int lineindex;
819 OffsetNumber lineoff;
820 int linesleft;
821 ItemId lpp;
822
823 /*
824 * calculate next starting lineindex, given scan direction
825 */
826 if (ScanDirectionIsForward(dir))
827 {
828 if (!scan->rs_inited)
829 {
830 /*
831 * return null immediately if relation is empty
832 */
833 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
834 {
835 Assert(!BufferIsValid(scan->rs_cbuf));
836 tuple->t_data = NULL;
837 return;
838 }
839 if (scan->rs_base.rs_parallel != NULL)
840 {
841 ParallelBlockTableScanDesc pbscan =
842 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
843
844 table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
845 pbscan);
846
847 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
848 pbscan);
849
850 /* Other processes might have already finished the scan. */
851 if (page == InvalidBlockNumber)
852 {
853 Assert(!BufferIsValid(scan->rs_cbuf));
854 tuple->t_data = NULL;
855 return;
856 }
857 }
858 else
859 page = scan->rs_startblock; /* first page */
860 heapgetpage((TableScanDesc) scan, page);
861 lineindex = 0;
862 scan->rs_inited = true;
863 }
864 else
865 {
866 /* continue from previously returned page/tuple */
867 page = scan->rs_cblock; /* current page */
868 lineindex = scan->rs_cindex + 1;
869 }
870
871 dp = BufferGetPage(scan->rs_cbuf);
872 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
873 lines = scan->rs_ntuples;
874 /* page and lineindex now reference the next visible tid */
875
876 linesleft = lines - lineindex;
877 }
878 else if (backward)
879 {
880 /* backward parallel scan not supported */
881 Assert(scan->rs_base.rs_parallel == NULL);
882
883 if (!scan->rs_inited)
884 {
885 /*
886 * return null immediately if relation is empty
887 */
888 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
889 {
890 Assert(!BufferIsValid(scan->rs_cbuf));
891 tuple->t_data = NULL;
892 return;
893 }
894
895 /*
896 * Disable reporting to syncscan logic in a backwards scan; it's
897 * not very likely anyone else is doing the same thing at the same
898 * time, and much more likely that we'll just bollix things for
899 * forward scanners.
900 */
901 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
902
903 /*
904 * Start from last page of the scan. Ensure we take into account
905 * rs_numblocks if it's been adjusted by heap_setscanlimits().
906 */
907 if (scan->rs_numblocks != InvalidBlockNumber)
908 page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
909 else if (scan->rs_startblock > 0)
910 page = scan->rs_startblock - 1;
911 else
912 page = scan->rs_nblocks - 1;
913 heapgetpage((TableScanDesc) scan, page);
914 }
915 else
916 {
917 /* continue from previously returned page/tuple */
918 page = scan->rs_cblock; /* current page */
919 }
920
921 dp = BufferGetPage(scan->rs_cbuf);
922 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
923 lines = scan->rs_ntuples;
924
925 if (!scan->rs_inited)
926 {
927 lineindex = lines - 1;
928 scan->rs_inited = true;
929 }
930 else
931 {
932 lineindex = scan->rs_cindex - 1;
933 }
934 /* page and lineindex now reference the previous visible tid */
935
936 linesleft = lineindex + 1;
937 }
938 else
939 {
940 /*
941 * ``no movement'' scan direction: refetch prior tuple
942 */
943 if (!scan->rs_inited)
944 {
945 Assert(!BufferIsValid(scan->rs_cbuf));
946 tuple->t_data = NULL;
947 return;
948 }
949
950 page = ItemPointerGetBlockNumber(&(tuple->t_self));
951 if (page != scan->rs_cblock)
952 heapgetpage((TableScanDesc) scan, page);
953
954 /* Since the tuple was previously fetched, needn't lock page here */
955 dp = BufferGetPage(scan->rs_cbuf);
956 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
957 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
958 lpp = PageGetItemId(dp, lineoff);
959 Assert(ItemIdIsNormal(lpp));
960
961 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
962 tuple->t_len = ItemIdGetLength(lpp);
963
964 /* check that rs_cindex is in sync */
965 Assert(scan->rs_cindex < scan->rs_ntuples);
966 Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
967
968 return;
969 }
970
971 /*
972 * advance the scan until we find a qualifying tuple or run out of stuff
973 * to scan
974 */
975 for (;;)
976 {
977 while (linesleft > 0)
978 {
979 lineoff = scan->rs_vistuples[lineindex];
980 lpp = PageGetItemId(dp, lineoff);
981 Assert(ItemIdIsNormal(lpp));
982
983 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
984 tuple->t_len = ItemIdGetLength(lpp);
985 ItemPointerSet(&(tuple->t_self), page, lineoff);
986
987 /*
988 * if current tuple qualifies, return it.
989 */
990 if (key != NULL)
991 {
992 bool valid;
993
994 HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
995 nkeys, key, valid);
996 if (valid)
997 {
998 scan->rs_cindex = lineindex;
999 return;
1000 }
1001 }
1002 else
1003 {
1004 scan->rs_cindex = lineindex;
1005 return;
1006 }
1007
1008 /*
1009 * otherwise move to the next item on the page
1010 */
1011 --linesleft;
1012 if (backward)
1013 --lineindex;
1014 else
1015 ++lineindex;
1016 }
1017
1018 /*
1019 * if we get here, it means we've exhausted the items on this page and
1020 * it's time to move to the next.
1021 */
1022 if (backward)
1023 {
1024 finished = (page == scan->rs_startblock) ||
1025 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1026 if (page == 0)
1027 page = scan->rs_nblocks;
1028 page--;
1029 }
1030 else if (scan->rs_base.rs_parallel != NULL)
1031 {
1032 ParallelBlockTableScanDesc pbscan =
1033 (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1034
1035 page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1036 pbscan);
1037 finished = (page == InvalidBlockNumber);
1038 }
1039 else
1040 {
1041 page++;
1042 if (page >= scan->rs_nblocks)
1043 page = 0;
1044 finished = (page == scan->rs_startblock) ||
1045 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1046
1047 /*
1048 * Report our new scan position for synchronization purposes. We
1049 * don't do that when moving backwards, however. That would just
1050 * mess up any other forward-moving scanners.
1051 *
1052 * Note: we do this before checking for end of scan so that the
1053 * final state of the position hint is back at the start of the
1054 * rel. That's not strictly necessary, but otherwise when you run
1055 * the same query multiple times the starting position would shift
1056 * a little bit backwards on every invocation, which is confusing.
1057 * We don't guarantee any specific ordering in general, though.
1058 */
1059 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1060 ss_report_location(scan->rs_base.rs_rd, page);
1061 }
1062
1063 /*
1064 * return NULL if we've exhausted all the pages
1065 */
1066 if (finished)
1067 {
1068 if (BufferIsValid(scan->rs_cbuf))
1069 ReleaseBuffer(scan->rs_cbuf);
1070 scan->rs_cbuf = InvalidBuffer;
1071 scan->rs_cblock = InvalidBlockNumber;
1072 tuple->t_data = NULL;
1073 scan->rs_inited = false;
1074 return;
1075 }
1076
1077 heapgetpage((TableScanDesc) scan, page);
1078
1079 dp = BufferGetPage(scan->rs_cbuf);
1080 TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1081 lines = scan->rs_ntuples;
1082 linesleft = lines;
1083 if (backward)
1084 lineindex = lines - 1;
1085 else
1086 lineindex = 0;
1087 }
1088 }
1089
1090
1091 #if defined(DISABLE_COMPLEX_MACRO)
1092 /*
1093 * This is formatted oddly so that the correspondence to the macro
1094 * definition in access/htup_details.h is maintained.
1095 */
1096 Datum
1097 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1098 bool *isnull)
1099 {
1100 return (
1101 (attnum) > 0 ?
1102 (
1103 (*(isnull) = false),
1104 HeapTupleNoNulls(tup) ?
1105 (
1106 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1107 (
1108 fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1109 (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1110 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1111 )
1112 :
1113 nocachegetattr((tup), (attnum), (tupleDesc))
1114 )
1115 :
1116 (
1117 att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1118 (
1119 (*(isnull) = true),
1120 (Datum) NULL
1121 )
1122 :
1123 (
1124 nocachegetattr((tup), (attnum), (tupleDesc))
1125 )
1126 )
1127 )
1128 :
1129 (
1130 (Datum) NULL
1131 )
1132 );
1133 }
1134 #endif /* defined(DISABLE_COMPLEX_MACRO) */
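/*
 * Usage sketch (hypothetical caller): fastgetattr extracts attribute attnum
 * (1-based) from a tuple, e.g.
 *
 *		bool	isnull;
 *		Datum	d = fastgetattr(tup, 1, RelationGetDescr(rel), &isnull);
 *
 * Callers normally reach this through the fastgetattr()/heap_getattr()
 * macros in access/htup_details.h; this out-of-line version exists only for
 * DISABLE_COMPLEX_MACRO builds.
 */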
1135
1136
1137 /* ----------------------------------------------------------------
1138 * heap access method interface
1139 * ----------------------------------------------------------------
1140 */
1141
1142
1143 TableScanDesc
1144 heap_beginscan(Relation relation, Snapshot snapshot,
1145 int nkeys, ScanKey key,
1146 ParallelTableScanDesc parallel_scan,
1147 uint32 flags)
1148 {
1149 HeapScanDesc scan;
1150
1151 /*
1152 * increment relation ref count while scanning relation
1153 *
1154 * This is just to make really sure the relcache entry won't go away while
1155 * the scan has a pointer to it. Caller should be holding the rel open
1156 * anyway, so this is redundant in all normal scenarios...
1157 */
1158 RelationIncrementReferenceCount(relation);
1159
1160 /*
1161 * allocate and initialize scan descriptor
1162 */
1163 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1164
1165 scan->rs_base.rs_rd = relation;
1166 scan->rs_base.rs_snapshot = snapshot;
1167 scan->rs_base.rs_nkeys = nkeys;
1168 scan->rs_base.rs_flags = flags;
1169 scan->rs_base.rs_parallel = parallel_scan;
1170 scan->rs_strategy = NULL; /* set in initscan */
1171
1172 /*
1173 * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
1174 */
1175 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1176 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1177
1178 /*
1179 * For seqscan and sample scans in a serializable transaction, acquire a
1180 * predicate lock on the entire relation. This is required not only to
1181 * lock all the matching tuples, but also to conflict with new insertions
1182 * into the table. In an indexscan, we take page locks on the index pages
1183 * covering the range specified in the scan qual, but in a heap scan there
1184 * is nothing more fine-grained to lock. A bitmap scan is a different
1185 * story: there we have already scanned the index and locked the index
1186 * pages covering the predicate. But in that case we still have to lock
1187 * any matching heap tuples. For sample scan we could optimize the locking
1188 * to be at least page-level granularity, but we'd need to add per-tuple
1189 * locking for that.
1190 */
1191 if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1192 {
1193 /*
1194 * Ensure a missing snapshot is noticed reliably, even if the
1195 * isolation mode means predicate locking isn't performed (and
1196 * therefore the snapshot isn't used here).
1197 */
1198 Assert(snapshot);
1199 PredicateLockRelation(relation, snapshot);
1200 }
1201
1202 /* we only need to set this up once */
1203 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1204
1205 /*
1206 * we do this here instead of in initscan() because heap_rescan also calls
1207 * initscan() and we don't want to allocate memory again
1208 */
1209 if (nkeys > 0)
1210 scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1211 else
1212 scan->rs_base.rs_key = NULL;
1213
1214 initscan(scan, key, false);
1215
1216 return (TableScanDesc) scan;
1217 }
1218
1219 void
1220 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1221 bool allow_strat, bool allow_sync, bool allow_pagemode)
1222 {
1223 HeapScanDesc scan = (HeapScanDesc) sscan;
1224
1225 if (set_params)
1226 {
1227 if (allow_strat)
1228 scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1229 else
1230 scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1231
1232 if (allow_sync)
1233 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1234 else
1235 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1236
1237 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1238 IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1239 scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1240 else
1241 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1242 }
1243
1244 /*
1245 * unpin scan buffers
1246 */
1247 if (BufferIsValid(scan->rs_cbuf))
1248 ReleaseBuffer(scan->rs_cbuf);
1249
1250 /*
1251 * reinitialize scan descriptor
1252 */
1253 initscan(scan, key, true);
1254 }
1255
1256 void
1257 heap_endscan(TableScanDesc sscan)
1258 {
1259 HeapScanDesc scan = (HeapScanDesc) sscan;
1260
1261 /* Note: no locking manipulations needed */
1262
1263 /*
1264 * unpin scan buffers
1265 */
1266 if (BufferIsValid(scan->rs_cbuf))
1267 ReleaseBuffer(scan->rs_cbuf);
1268
1269 /*
1270 * decrement relation reference count and free scan descriptor storage
1271 */
1272 RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1273
1274 if (scan->rs_base.rs_key)
1275 pfree(scan->rs_base.rs_key);
1276
1277 if (scan->rs_strategy != NULL)
1278 FreeAccessStrategy(scan->rs_strategy);
1279
1280 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1281 UnregisterSnapshot(scan->rs_base.rs_snapshot);
1282
1283 pfree(scan);
1284 }
1285
1286 HeapTuple
1287 heap_getnext(TableScanDesc sscan, ScanDirection direction)
1288 {
1289 HeapScanDesc scan = (HeapScanDesc) sscan;
1290
1291 /*
1292 * This is still widely used directly, without going through table AM, so
1293 * add a safety check. It's possible we should, at a later point,
1294 * downgrade this to an assert. The reason for checking the AM routine,
1295 * rather than the AM oid, is that this allows writing regression tests
1296 * that create another AM reusing the heap handler.
1297 */
1298 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1299 ereport(ERROR,
1300 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1301 errmsg_internal("only heap AM is supported")));
1302
1303 /* Note: no locking manipulations needed */
1304
1305 if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1306 heapgettup_pagemode(scan, direction,
1307 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1308 else
1309 heapgettup(scan, direction,
1310 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1311
1312 if (scan->rs_ctup.t_data == NULL)
1313 return NULL;
1314
1315 /*
1316 * if we get here it means we have a new current scan tuple, so point to
1317 * the proper return buffer and return the tuple.
1318 */
1319
1320 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1321
1322 return &scan->rs_ctup;
1323 }
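/*
 * A minimal sketch of how the scan interface above is typically used
 * (real callers normally go through the tableam.h wrappers instead):
 *
 *		TableScanDesc scan;
 *		HeapTuple	tuple;
 *
 *		scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL, NULL,
 *							  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *							  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... use tuple; it is only valid until the next heap_getnext ...
 *		}
 *		heap_endscan(scan);
 *
 * The returned tuple points into the current scan buffer, so callers must
 * copy it if they need it to survive past the next fetch or the end of the
 * scan.
 */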
1324
1325 bool
1326 heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1327 {
1328 HeapScanDesc scan = (HeapScanDesc) sscan;
1329
1330 /* Note: no locking manipulations needed */
1331
1332 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1333 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1334 else
1335 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1336
1337 if (scan->rs_ctup.t_data == NULL)
1338 {
1339 ExecClearTuple(slot);
1340 return false;
1341 }
1342
1343 /*
1344 * if we get here it means we have a new current scan tuple, so point to
1345 * the proper return buffer and return the tuple.
1346 */
1347
1348 pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1349
1350 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1351 scan->rs_cbuf);
1352 return true;
1353 }
1354
1355 /*
1356 * heap_fetch - retrieve tuple with given tid
1357 *
1358 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1359 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1360 * against the specified snapshot.
1361 *
1362 * If successful (tuple found and passes snapshot time qual), then *userbuf
1363 * is set to the buffer holding the tuple and true is returned. The caller
1364 * must unpin the buffer when done with the tuple.
1365 *
1366 * If the tuple is not found (ie, item number references a deleted slot),
1367 * then tuple->t_data is set to NULL and false is returned.
1368 *
1369 * If the tuple is found but fails the time qual check, then false is returned
1370 * but tuple->t_data is left pointing to the tuple.
1371 *
1372 * heap_fetch does not follow HOT chains: only the exact TID requested will
1373 * be fetched.
1374 *
1375 * It is somewhat inconsistent that we ereport() on invalid block number but
1376 * return false on invalid item number. There are a couple of reasons though.
1377 * One is that the caller can relatively easily check the block number for
1378 * validity, but cannot check the item number without reading the page
1379 * itself. Another is that when we are following a t_ctid link, we can be
1380 * reasonably confident that the page number is valid (since VACUUM shouldn't
1381 * truncate off the destination page without having killed the referencing
1382 * tuple first), but the item number might well not be good.
1383 */
1384 bool
1385 heap_fetch(Relation relation,
1386 Snapshot snapshot,
1387 HeapTuple tuple,
1388 Buffer *userbuf)
1389 {
1390 ItemPointer tid = &(tuple->t_self);
1391 ItemId lp;
1392 Buffer buffer;
1393 Page page;
1394 OffsetNumber offnum;
1395 bool valid;
1396
1397 /*
1398 * Fetch and pin the appropriate page of the relation.
1399 */
1400 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1401
1402 /*
1403 * Need share lock on buffer to examine tuple commit status.
1404 */
1405 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1406 page = BufferGetPage(buffer);
1407 TestForOldSnapshot(snapshot, relation, page);
1408
1409 /*
1410 * We'd better check for out-of-range offnum, in case the page has been
1411 * vacuumed since the TID was obtained.
1412 */
1413 offnum = ItemPointerGetOffsetNumber(tid);
1414 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1415 {
1416 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1417 ReleaseBuffer(buffer);
1418 *userbuf = InvalidBuffer;
1419 tuple->t_data = NULL;
1420 return false;
1421 }
1422
1423 /*
1424 * get the item line pointer corresponding to the requested tid
1425 */
1426 lp = PageGetItemId(page, offnum);
1427
1428 /*
1429 * Must check for deleted tuple.
1430 */
1431 if (!ItemIdIsNormal(lp))
1432 {
1433 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1434 ReleaseBuffer(buffer);
1435 *userbuf = InvalidBuffer;
1436 tuple->t_data = NULL;
1437 return false;
1438 }
1439
1440 /*
1441 * fill in *tuple fields
1442 */
1443 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1444 tuple->t_len = ItemIdGetLength(lp);
1445 tuple->t_tableOid = RelationGetRelid(relation);
1446
1447 /*
1448 * check tuple visibility, then release lock
1449 */
1450 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1451
1452 if (valid)
1453 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1454 HeapTupleHeaderGetXmin(tuple->t_data));
1455
1456 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1457
1458 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1459
1460 if (valid)
1461 {
1462 /*
1463 * All checks passed, so return the tuple as valid. Caller is now
1464 * responsible for releasing the buffer.
1465 */
1466 *userbuf = buffer;
1467
1468 return true;
1469 }
1470
1471 /* Tuple failed time qual */
1472 ReleaseBuffer(buffer);
1473 *userbuf = InvalidBuffer;
1474
1475 return false;
1476 }
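/*
 * A minimal caller-side sketch for heap_fetch (hypothetical, following the
 * contract described above):
 *
 *		HeapTupleData tuple;
 *		Buffer		buf;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(rel, snapshot, &tuple, &buf))
 *		{
 *			... examine tuple.t_data / tuple.t_len ...
 *			ReleaseBuffer(buf);
 *		}
 *
 * On failure *userbuf is set to InvalidBuffer, so there is nothing for the
 * caller to release in that branch.
 */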
1477
1478 /*
1479 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1480 *
1481 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1482 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1483 * for the first chain member satisfying the given snapshot. If one is
1484 * found, we update *tid to reference that tuple's offset number, and
1485 * return true. If no match, return false without modifying *tid.
1486 *
1487 * heapTuple is a caller-supplied buffer. When a match is found, we return
1488 * the tuple here, in addition to updating *tid. If no match is found, the
1489 * contents of this buffer on return are undefined.
1490 *
1491 * If all_dead is not NULL, we check non-visible tuples to see if they are
1492 * globally dead; *all_dead is set true if all members of the HOT chain
1493 * are vacuumable, false if not.
1494 *
1495 * Unlike heap_fetch, the caller must already have pin and (at least) share
1496 * lock on the buffer; it is still pinned/locked at exit. Also unlike
1497 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1498 */
1499 bool
1500 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1501 Snapshot snapshot, HeapTuple heapTuple,
1502 bool *all_dead, bool first_call)
1503 {
1504 Page dp = (Page) BufferGetPage(buffer);
1505 TransactionId prev_xmax = InvalidTransactionId;
1506 BlockNumber blkno;
1507 OffsetNumber offnum;
1508 bool at_chain_start;
1509 bool valid;
1510 bool skip;
1511
1512 /* If this is not the first call, previous call returned a (live!) tuple */
1513 if (all_dead)
1514 *all_dead = first_call;
1515
1516 blkno = ItemPointerGetBlockNumber(tid);
1517 offnum = ItemPointerGetOffsetNumber(tid);
1518 at_chain_start = first_call;
1519 skip = !first_call;
1520
1521 Assert(TransactionIdIsValid(RecentGlobalXmin));
1522 Assert(BufferGetBlockNumber(buffer) == blkno);
1523
1524 /* Scan through possible multiple members of HOT-chain */
1525 for (;;)
1526 {
1527 ItemId lp;
1528
1529 /* check for bogus TID */
1530 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1531 break;
1532
1533 lp = PageGetItemId(dp, offnum);
1534
1535 /* check for unused, dead, or redirected items */
1536 if (!ItemIdIsNormal(lp))
1537 {
1538 /* We should only see a redirect at start of chain */
1539 if (ItemIdIsRedirected(lp) && at_chain_start)
1540 {
1541 /* Follow the redirect */
1542 offnum = ItemIdGetRedirect(lp);
1543 at_chain_start = false;
1544 continue;
1545 }
1546 /* else must be end of chain */
1547 break;
1548 }
1549
1550 /*
1551 * Update heapTuple to point to the element of the HOT chain we're
1552 * currently investigating. Having t_self set correctly is important
1553 * because the SSI checks and the *Satisfies routine for historical
1554 * MVCC snapshots need the correct tid to decide about the visibility.
1555 */
1556 heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1557 heapTuple->t_len = ItemIdGetLength(lp);
1558 heapTuple->t_tableOid = RelationGetRelid(relation);
1559 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1560
1561 /*
1562 * Shouldn't see a HEAP_ONLY tuple at chain start.
1563 */
1564 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1565 break;
1566
1567 /*
1568 * The xmin should match the previous xmax value, else chain is
1569 * broken.
1570 */
1571 if (TransactionIdIsValid(prev_xmax) &&
1572 !TransactionIdEquals(prev_xmax,
1573 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1574 break;
1575
1576 /*
1577 * When first_call is true (and thus, skip is initially false) we'll
1578 * return the first tuple we find. But on later passes, heapTuple
1579 * will initially be pointing to the tuple we returned last time.
1580 * Returning it again would be incorrect (and would loop forever), so
1581 * we skip it and return the next match we find.
1582 */
1583 if (!skip)
1584 {
1585 /* If it's visible per the snapshot, we must return it */
1586 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1587 HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1588 buffer, snapshot);
1589
1590 if (valid)
1591 {
1592 ItemPointerSetOffsetNumber(tid, offnum);
1593 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1594 HeapTupleHeaderGetXmin(heapTuple->t_data));
1595 if (all_dead)
1596 *all_dead = false;
1597 return true;
1598 }
1599 }
1600 skip = false;
1601
1602 /*
1603 * If we can't see it, maybe no one else can either. At caller
1604 * request, check whether all chain members are dead to all
1605 * transactions.
1606 *
1607 * Note: if you change the criterion here for what is "dead", fix the
1608 * planner's get_actual_variable_range() function to match.
1609 */
1610 if (all_dead && *all_dead &&
1611 !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
1612 *all_dead = false;
1613
1614 /*
1615 * Check to see if HOT chain continues past this tuple; if so fetch
1616 * the next offnum and loop around.
1617 */
1618 if (HeapTupleIsHotUpdated(heapTuple))
1619 {
1620 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1621 blkno);
1622 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1623 at_chain_start = false;
1624 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1625 }
1626 else
1627 break; /* end of chain */
1628 }
1629
1630 return false;
1631 }
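/*
 * Sketch of the calling convention (hypothetical caller): unlike heap_fetch,
 * the caller supplies the already-pinned buffer and must hold at least a
 * share lock on it, e.g.
 *
 *		ItemPointerData tid = ... root TID of the chain ...;
 *		HeapTupleData heapTuple;
 *		bool		all_dead;
 *		bool		found;
 *
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		found = heap_hot_search_buffer(&tid, rel, buf, snapshot,
 *									   &heapTuple, &all_dead, true);
 *		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *
 * On success, tid has been updated to the visible chain member and heapTuple
 * describes it; the buffer is left pinned and locked exactly as the caller
 * had it.
 */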
1632
1633 /*
1634 * heap_get_latest_tid - get the latest tid of a specified tuple
1635 *
1636 * Actually, this gets the latest version that is visible according to the
1637 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1638 * possibly uncommitted version.
1639 *
1640 * *tid is both an input and an output parameter: it is updated to
1641 * show the latest version of the row. Note that it will not be changed
1642 * if no version of the row passes the snapshot test.
1643 */
1644 void
1645 heap_get_latest_tid(TableScanDesc sscan,
1646 ItemPointer tid)
1647 {
1648 Relation relation = sscan->rs_rd;
1649 Snapshot snapshot = sscan->rs_snapshot;
1650 ItemPointerData ctid;
1651 TransactionId priorXmax;
1652
1653 /*
1654 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1655 * Assume that t_ctid links are valid however - there shouldn't be invalid
1656 * ones in the table.
1657 */
1658 Assert(ItemPointerIsValid(tid));
1659
1660 /*
1661 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1662 * need to examine, and *tid is the TID we will return if ctid turns out
1663 * to be bogus.
1664 *
1665 * Note that we will loop until we reach the end of the t_ctid chain.
1666 * Depending on the snapshot passed, there might be at most one visible
1667 * version of the row, but we don't try to optimize for that.
1668 */
1669 ctid = *tid;
1670 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1671 for (;;)
1672 {
1673 Buffer buffer;
1674 Page page;
1675 OffsetNumber offnum;
1676 ItemId lp;
1677 HeapTupleData tp;
1678 bool valid;
1679
1680 /*
1681 * Read, pin, and lock the page.
1682 */
1683 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1684 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1685 page = BufferGetPage(buffer);
1686 TestForOldSnapshot(snapshot, relation, page);
1687
1688 /*
1689 * Check for bogus item number. This is not treated as an error
1690 * condition because it can happen while following a t_ctid link. We
1691 * just assume that the prior tid is OK and return it unchanged.
1692 */
1693 offnum = ItemPointerGetOffsetNumber(&ctid);
1694 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1695 {
1696 UnlockReleaseBuffer(buffer);
1697 break;
1698 }
1699 lp = PageGetItemId(page, offnum);
1700 if (!ItemIdIsNormal(lp))
1701 {
1702 UnlockReleaseBuffer(buffer);
1703 break;
1704 }
1705
1706 /* OK to access the tuple */
1707 tp.t_self = ctid;
1708 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1709 tp.t_len = ItemIdGetLength(lp);
1710 tp.t_tableOid = RelationGetRelid(relation);
1711
1712 /*
1713 * After following a t_ctid link, we might arrive at an unrelated
1714 * tuple. Check for XMIN match.
1715 */
1716 if (TransactionIdIsValid(priorXmax) &&
1717 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1718 {
1719 UnlockReleaseBuffer(buffer);
1720 break;
1721 }
1722
1723 /*
1724 * Check tuple visibility; if visible, set it as the new result
1725 * candidate.
1726 */
1727 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1728 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1729 if (valid)
1730 *tid = ctid;
1731
1732 /*
1733 * If there's a valid t_ctid link, follow it, else we're done.
1734 */
1735 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1736 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1737 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1738 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1739 {
1740 UnlockReleaseBuffer(buffer);
1741 break;
1742 }
1743
1744 ctid = tp.t_data->t_ctid;
1745 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1746 UnlockReleaseBuffer(buffer);
1747 } /* end of loop */
1748 }
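/*
 * Example (hypothetical): given the TID of any version of a row, a caller
 * can find its newest visible version with
 *
 *		ItemPointerData tid = ... some known version's TID ...;
 *
 *		heap_get_latest_tid(sscan, &tid);
 *
 * after which tid points at the latest version visible to sscan's snapshot,
 * or is left unchanged if no version passes the snapshot test.
 */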
1749
1750
1751 /*
1752 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1753 *
1754 * This is called after we have waited for the XMAX transaction to terminate.
1755 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1756 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1757 * hint bit if possible --- but beware that that may not yet be possible,
1758 * if the transaction committed asynchronously.
1759 *
1760 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1761 * even if it commits.
1762 *
1763 * Hence callers should look only at XMAX_INVALID.
1764 *
1765 * Note this is not allowed for tuples whose xmax is a multixact.
1766 */
1767 static void
1768 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1769 {
1770 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1771 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1772
1773 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1774 {
1775 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1776 TransactionIdDidCommit(xid))
1777 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1778 xid);
1779 else
1780 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1781 InvalidTransactionId);
1782 }
1783 }
1784
1785
1786 /*
1787 * GetBulkInsertState - prepare status object for a bulk insert
1788 */
1789 BulkInsertState
1790 GetBulkInsertState(void)
1791 {
1792 BulkInsertState bistate;
1793
1794 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1795 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1796 bistate->current_buf = InvalidBuffer;
1797 return bistate;
1798 }
1799
1800 /*
1801 * FreeBulkInsertState - clean up after finishing a bulk insert
1802 */
1803 void
1804 FreeBulkInsertState(BulkInsertState bistate)
1805 {
1806 if (bistate->current_buf != InvalidBuffer)
1807 ReleaseBuffer(bistate->current_buf);
1808 FreeAccessStrategy(bistate->strategy);
1809 pfree(bistate);
1810 }
1811
1812 /*
1813 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1814 */
1815 void
1816 ReleaseBulkInsertStatePin(BulkInsertState bistate)
1817 {
1818 if (bistate->current_buf != InvalidBuffer)
1819 ReleaseBuffer(bistate->current_buf);
1820 bistate->current_buf = InvalidBuffer;
1821 }
1822
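/*
 * Illustrative sketch, not part of the original source: a bulk-loading
 * caller (COPY-style) is expected to bracket its inserts with the routines
 * above and reuse the same BulkInsertState for every tuple.  The loop and
 * the tuple source (next_tuple_to_load) are hypothetical.
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *
 *		while ((tup = next_tuple_to_load()) != NULL)
 *			heap_insert(rel, tup, GetCurrentCommandId(true), 0, bistate);
 *
 *		FreeBulkInsertState(bistate);
 */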
1823
1824 /*
1825 * heap_insert - insert tuple into a heap
1826 *
1827 * The new tuple is stamped with current transaction ID and the specified
1828 * command ID.
1829 *
1830 * See table_tuple_insert for comments about most of the input flags, except
1831 * that this routine directly takes a tuple rather than a slot.
1832 *
1833 * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
1834 * options; additionally, HEAP_INSERT_SPECULATIVE is used to
1835 * implement table_tuple_insert_speculative().
1836 *
1837 * On return the header fields of *tup are updated to match the stored tuple;
1838 * in particular tup->t_self receives the actual TID where the tuple was
1839 * stored. But note that any toasting of fields within the tuple data is NOT
1840 * reflected into *tup.
1841 */
1842 void
1843 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1844 int options, BulkInsertState bistate)
1845 {
1846 TransactionId xid = GetCurrentTransactionId();
1847 HeapTuple heaptup;
1848 Buffer buffer;
1849 Buffer vmbuffer = InvalidBuffer;
1850 bool all_visible_cleared = false;
1851
1852 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
1853 Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
1854 RelationGetNumberOfAttributes(relation));
1855
1856 /*
1857 * Fill in tuple header fields and toast the tuple if necessary.
1858 *
1859 * Note: below this point, heaptup is the data we actually intend to store
1860 * into the relation; tup is the caller's original untoasted data.
1861 */
1862 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1863
1864 /*
1865 * Find buffer to insert this tuple into. If the page is all visible,
1866 * this will also pin the requisite visibility map page.
1867 */
1868 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1869 InvalidBuffer, options, bistate,
1870 &vmbuffer, NULL);
1871
1872 /*
1873 * We're about to do the actual insert -- but check for conflict first, to
1874 * avoid possibly having to roll back work we've just done.
1875 *
1876 * This is safe without a recheck as long as there is no possibility of
1877 * another process scanning the page between this check and the insert
1878 * being visible to the scan (i.e., an exclusive buffer content lock is
1879 * continuously held from this point until the tuple insert is visible).
1880 *
1881 * For a heap insert, we only need to check for table-level SSI locks. Our
1882 * new tuple can't possibly conflict with existing tuple locks, and heap
1883 * page locks are only consolidated versions of tuple locks; they do not
1884 * lock "gaps" as index page locks do. So we don't need to specify a
1885 * buffer when making the call, which makes for a faster check.
1886 */
1887 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
1888
1889 /* NO EREPORT(ERROR) from here till changes are logged */
1890 START_CRIT_SECTION();
1891
1892 RelationPutHeapTuple(relation, buffer, heaptup,
1893 (options & HEAP_INSERT_SPECULATIVE) != 0);
1894
1895 if (PageIsAllVisible(BufferGetPage(buffer)))
1896 {
1897 all_visible_cleared = true;
1898 PageClearAllVisible(BufferGetPage(buffer));
1899 visibilitymap_clear(relation,
1900 ItemPointerGetBlockNumber(&(heaptup->t_self)),
1901 vmbuffer, VISIBILITYMAP_VALID_BITS);
1902 }
1903
1904 /*
1905 * XXX Should we set PageSetPrunable on this page ?
1906 *
1907 * The inserting transaction may eventually abort, thus making this tuple
1908 * DEAD and hence available for pruning. Though we don't want to optimize
1909 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1910 * aborted tuple will never be pruned until the next vacuum is triggered.
1911 *
1912 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1913 */
1914
1915 MarkBufferDirty(buffer);
1916
1917 /* XLOG stuff */
1918 if (RelationNeedsWAL(relation))
1919 {
1920 xl_heap_insert xlrec;
1921 xl_heap_header xlhdr;
1922 XLogRecPtr recptr;
1923 Page page = BufferGetPage(buffer);
1924 uint8 info = XLOG_HEAP_INSERT;
1925 int bufflags = 0;
1926
1927 /*
1928 * If this is a catalog, we need to transmit combocids to properly
1929 * decode, so log that as well.
1930 */
1931 if (RelationIsAccessibleInLogicalDecoding(relation))
1932 log_heap_new_cid(relation, heaptup);
1933
1934 /*
1935 * If this is the first and only tuple on the page, we can reinit the
1936 * page instead of restoring the whole thing. Set the flag, and hide
1937 * buffer references from XLogInsert.
1938 */
1939 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1940 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1941 {
1942 info |= XLOG_HEAP_INIT_PAGE;
1943 bufflags |= REGBUF_WILL_INIT;
1944 }
1945
1946 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1947 xlrec.flags = 0;
1948 if (all_visible_cleared)
1949 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1950 if (options & HEAP_INSERT_SPECULATIVE)
1951 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1952 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1953
1954 /*
1955 * For logical decoding, we need the tuple even if we're doing a full
1956 * page write, so make sure it's included even if we take a full-page
1957 * image. (XXX We could alternatively store a pointer into the FPW).
1958 */
1959 if (RelationIsLogicallyLogged(relation) &&
1960 !(options & HEAP_INSERT_NO_LOGICAL))
1961 {
1962 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1963 bufflags |= REGBUF_KEEP_DATA;
1964 }
1965
1966 XLogBeginInsert();
1967 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1968
1969 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1970 xlhdr.t_infomask = heaptup->t_data->t_infomask;
1971 xlhdr.t_hoff = heaptup->t_data->t_hoff;
1972
1973 /*
1974 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1975 * write the whole page to the xlog, we don't need to store
1976 * xl_heap_header in the xlog.
1977 */
1978 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
1979 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
1980 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
1981 XLogRegisterBufData(0,
1982 (char *) heaptup->t_data + SizeofHeapTupleHeader,
1983 heaptup->t_len - SizeofHeapTupleHeader);
1984
1985 /* filtering by origin on a row level is much more efficient */
1986 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
1987
1988 recptr = XLogInsert(RM_HEAP_ID, info);
1989
1990 PageSetLSN(page, recptr);
1991 }
1992
1993 END_CRIT_SECTION();
1994
1995 UnlockReleaseBuffer(buffer);
1996 if (vmbuffer != InvalidBuffer)
1997 ReleaseBuffer(vmbuffer);
1998
1999 /*
2000 * If tuple is cachable, mark it for invalidation from the caches in case
2001 * we abort. Note it is OK to do this after releasing the buffer, because
2002 * the heaptup data structure is all in local memory, not in the shared
2003 * buffer.
2004 */
2005 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2006
2007 /* Note: speculative insertions are counted too, even if aborted later */
2008 pgstat_count_heap_insert(relation, 1);
2009
2010 /*
2011 * If heaptup is a private copy, release it. Don't forget to copy t_self
2012 * back to the caller's image, too.
2013 */
2014 if (heaptup != tup)
2015 {
2016 tup->t_self = heaptup->t_self;
2017 heap_freetuple(heaptup);
2018 }
2019 }
2020
2021 /*
2022 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2023 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2024 * version of the tuple if it was toasted, or the original tuple if not. Note
2025 * that in any case, the header fields are also set in the original tuple.
2026 */
2027 static HeapTuple
2028 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2029 CommandId cid, int options)
2030 {
2031 /*
2032 * To allow parallel inserts, we need to ensure that they are safe to be
2033 * performed in workers. We have the infrastructure to allow parallel
2034 * inserts in general except for the cases where inserts generate a new
2035 * CommandId (e.g. inserts into a table having a foreign key column).
2036 */
2037 if (IsParallelWorker())
2038 ereport(ERROR,
2039 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2040 errmsg("cannot insert tuples in a parallel worker")));
2041
2042 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2043 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2044 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2045 HeapTupleHeaderSetXmin(tup->t_data, xid);
2046 if (options & HEAP_INSERT_FROZEN)
2047 HeapTupleHeaderSetXminFrozen(tup->t_data);
2048
2049 HeapTupleHeaderSetCmin(tup->t_data, cid);
2050 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2051 tup->t_tableOid = RelationGetRelid(relation);
2052
2053 /*
2054 * If the new tuple is too big for storage or contains already toasted
2055 * out-of-line attributes from some other relation, invoke the toaster.
2056 */
2057 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2058 relation->rd_rel->relkind != RELKIND_MATVIEW)
2059 {
2060 /* toast table entries should never be recursively toasted */
2061 Assert(!HeapTupleHasExternal(tup));
2062 return tup;
2063 }
2064 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2065 return heap_toast_insert_or_update(relation, tup, NULL, options);
2066 else
2067 return tup;
2068 }
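
/*
 * Worked example, illustrative only: with the default 8 kB block size,
 * TOAST_TUPLE_THRESHOLD is normally about 2 kB, so a (say) 5 kB tuple bound
 * for an ordinary table or matview is handed to heap_toast_insert_or_update
 * above, while the same tuple bound for any other relkind is asserted to
 * contain no external values and is returned unchanged.
 */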
2069
2070 /*
2071 * heap_multi_insert - insert multiple tuples into a heap
2072 *
2073 * This is like heap_insert(), but inserts multiple tuples in one operation.
2074 * That's faster than calling heap_insert() in a loop, because when multiple
2075 * tuples can be inserted on a single page, we can write just a single WAL
2076 * record covering all of them, and only need to lock/unlock the page once.
2077 *
2078 * Note: this leaks memory into the current memory context. You can create a
2079 * temporary context before calling this, if that's a problem.
2080 */
2081 void
2082 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2083 CommandId cid, int options, BulkInsertState bistate)
2084 {
2085 TransactionId xid = GetCurrentTransactionId();
2086 HeapTuple *heaptuples;
2087 int i;
2088 int ndone;
2089 PGAlignedBlock scratch;
2090 Page page;
2091 bool needwal;
2092 Size saveFreeSpace;
2093 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2094 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2095
2096 /* currently not needed (thus unsupported) for heap_multi_insert() */
2097 AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2098
2099 needwal = RelationNeedsWAL(relation);
2100 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2101 HEAP_DEFAULT_FILLFACTOR);
2102
2103 /* Toast and set header data in all the slots */
2104 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2105 for (i = 0; i < ntuples; i++)
2106 {
2107 HeapTuple tuple;
2108
2109 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2110 slots[i]->tts_tableOid = RelationGetRelid(relation);
2111 tuple->t_tableOid = slots[i]->tts_tableOid;
2112 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2113 options);
2114 }
2115
2116 /*
2117 * We're about to do the actual inserts -- but check for conflict first,
2118 * to minimize the possibility of having to roll back work we've just
2119 * done.
2120 *
2121 * A check here does not definitively prevent a serialization anomaly;
2122 * that check MUST be done at least past the point of acquiring an
2123 * exclusive buffer content lock on every buffer that will be affected,
2124 * and MAY be done after all inserts are reflected in the buffers and
2125 * those locks are released; otherwise there is a race condition. Since
2126 * multiple buffers can be locked and unlocked in the loop below, and it
2127 * would not be feasible to identify and lock all of those buffers before
2128 * the loop, we must do a final check at the end.
2129 *
2130 * The check here could be omitted with no loss of correctness; it is
2131 * present strictly as an optimization.
2132 *
2133 * For heap inserts, we only need to check for table-level SSI locks. Our
2134 * new tuples can't possibly conflict with existing tuple locks, and heap
2135 * page locks are only consolidated versions of tuple locks; they do not
2136 * lock "gaps" as index page locks do. So we don't need to specify a
2137 * buffer when making the call, which makes for a faster check.
2138 */
2139 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2140
2141 ndone = 0;
2142 while (ndone < ntuples)
2143 {
2144 Buffer buffer;
2145 Buffer vmbuffer = InvalidBuffer;
2146 bool all_visible_cleared = false;
2147 int nthispage;
2148
2149 CHECK_FOR_INTERRUPTS();
2150
2151 /*
2152 * Find buffer where at least the next tuple will fit. If the page is
2153 * all-visible, this will also pin the requisite visibility map page.
2154 */
2155 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2156 InvalidBuffer, options, bistate,
2157 &vmbuffer, NULL);
2158 page = BufferGetPage(buffer);
2159
2160 /* NO EREPORT(ERROR) from here till changes are logged */
2161 START_CRIT_SECTION();
2162
2163 /*
2164 * RelationGetBufferForTuple has ensured that the first tuple fits.
2165 * Put that on the page, and then as many other tuples as fit.
2166 */
2167 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2168
2169 /*
2170 * Note that heap_multi_insert is not used for catalog tuples yet, but
2171 * this will cover the gap once that is the case.
2172 */
2173 if (needwal && need_cids)
2174 log_heap_new_cid(relation, heaptuples[ndone]);
2175
2176 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2177 {
2178 HeapTuple heaptup = heaptuples[ndone + nthispage];
2179
2180 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2181 break;
2182
2183 RelationPutHeapTuple(relation, buffer, heaptup, false);
2184
2185 /*
2186 * We don't use heap_multi_insert for catalog tuples yet, but
2187 * better be prepared...
2188 */
2189 if (needwal && need_cids)
2190 log_heap_new_cid(relation, heaptup);
2191 }
2192
2193 if (PageIsAllVisible(page))
2194 {
2195 all_visible_cleared = true;
2196 PageClearAllVisible(page);
2197 visibilitymap_clear(relation,
2198 BufferGetBlockNumber(buffer),
2199 vmbuffer, VISIBILITYMAP_VALID_BITS);
2200 }
2201
2202 /*
2203 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2204 */
2205
2206 MarkBufferDirty(buffer);
2207
2208 /* XLOG stuff */
2209 if (needwal)
2210 {
2211 XLogRecPtr recptr;
2212 xl_heap_multi_insert *xlrec;
2213 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2214 char *tupledata;
2215 int totaldatalen;
2216 char *scratchptr = scratch.data;
2217 bool init;
2218 int bufflags = 0;
2219
2220 /*
2221 * If the page was previously empty, we can reinit the page
2222 * instead of restoring the whole thing.
2223 */
2224 init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2225 PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2226
2227 /* allocate xl_heap_multi_insert struct from the scratch area */
2228 xlrec = (xl_heap_multi_insert *) scratchptr;
2229 scratchptr += SizeOfHeapMultiInsert;
2230
2231 /*
2232 * Allocate the offsets array, unless we're reinitializing the page;
2233 * in that case the tuples are stored in order starting at
2234 * FirstOffsetNumber, so we don't need to store the offsets
2235 * explicitly.
2236 */
2237 if (!init)
2238 scratchptr += nthispage * sizeof(OffsetNumber);
2239
2240 /* the rest of the scratch space is used for tuple data */
2241 tupledata = scratchptr;
2242
2243 xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2244 xlrec->ntuples = nthispage;
2245
2246 /*
2247 * Write out an xl_multi_insert_tuple and the tuple data itself
2248 * for each tuple.
2249 */
2250 for (i = 0; i < nthispage; i++)
2251 {
2252 HeapTuple heaptup = heaptuples[ndone + i];
2253 xl_multi_insert_tuple *tuphdr;
2254 int datalen;
2255
2256 if (!init)
2257 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2258 /* xl_multi_insert_tuple needs two-byte alignment. */
2259 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2260 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2261
2262 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2263 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2264 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2265
2266 /* write bitmap [+ padding] [+ oid] + data */
2267 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2268 memcpy(scratchptr,
2269 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2270 datalen);
2271 tuphdr->datalen = datalen;
2272 scratchptr += datalen;
2273 }
2274 totaldatalen = scratchptr - tupledata;
2275 Assert((scratchptr - scratch.data) < BLCKSZ);
2276
2277 if (need_tuple_data)
2278 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2279
2280 /*
2281 * Signal that this is the last xl_heap_multi_insert record
2282 * emitted by this call to heap_multi_insert(). Needed for logical
2283 * decoding so it knows when to clean up temporary data.
2284 */
2285 if (ndone + nthispage == ntuples)
2286 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2287
2288 if (init)
2289 {
2290 info |= XLOG_HEAP_INIT_PAGE;
2291 bufflags |= REGBUF_WILL_INIT;
2292 }
2293
2294 /*
2295 * If we're doing logical decoding, include the new tuple data
2296 * even if we take a full-page image of the page.
2297 */
2298 if (need_tuple_data)
2299 bufflags |= REGBUF_KEEP_DATA;
2300
2301 XLogBeginInsert();
2302 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2303 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2304
2305 XLogRegisterBufData(0, tupledata, totaldatalen);
2306
2307 /* filtering by origin on a row level is much more efficient */
2308 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2309
2310 recptr = XLogInsert(RM_HEAP2_ID, info);
2311
2312 PageSetLSN(page, recptr);
2313 }
2314
2315 END_CRIT_SECTION();
2316
2317 UnlockReleaseBuffer(buffer);
2318 if (vmbuffer != InvalidBuffer)
2319 ReleaseBuffer(vmbuffer);
2320
2321 ndone += nthispage;
2322 }
2323
2324 /*
2325 * We're done with the actual inserts. Check for conflicts again, to
2326 * ensure that all rw-conflicts in to these inserts are detected. Without
2327 * this final check, a sequential scan of the heap may have locked the
2328 * table after the "before" check, missing one opportunity to detect the
2329 * conflict, and then scanned the table before the new tuples were there,
2330 * missing the other chance to detect the conflict.
2331 *
2332 * For heap inserts, we only need to check for table-level SSI locks. Our
2333 * new tuples can't possibly conflict with existing tuple locks, and heap
2334 * page locks are only consolidated versions of tuple locks; they do not
2335 * lock "gaps" as index page locks do. So we don't need to specify a
2336 * buffer when making the call.
2337 */
2338 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2339
2340 /*
2341 * If tuples are cachable, mark them for invalidation from the caches in
2342 * case we abort. Note it is OK to do this after releasing the buffer,
2343 * because the heaptuples data structure is all in local memory, not in
2344 * the shared buffer.
2345 */
2346 if (IsCatalogRelation(relation))
2347 {
2348 for (i = 0; i < ntuples; i++)
2349 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2350 }
2351
2352 /* copy t_self fields back to the caller's slots */
2353 for (i = 0; i < ntuples; i++)
2354 slots[i]->tts_tid = heaptuples[i]->t_self;
2355
2356 pgstat_count_heap_insert(relation, ntuples);
2357 }
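
/*
 * Illustrative sketch, not part of the original source: a batching caller
 * (e.g. COPY, going through the table AM layer) accumulates slots and then
 * hands the whole array over in one call.  buffered_slots, nbuffered and
 * bistate are hypothetical caller-side names.
 *
 *		heap_multi_insert(rel, buffered_slots, nbuffered,
 *						  GetCurrentCommandId(true), 0, bistate);
 *
 * Afterwards each slot's tts_tid holds the tuple's final TID, so the caller
 * can, for example, drive index insertions from the slots.
 */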
2358
2359 /*
2360 * simple_heap_insert - insert a tuple
2361 *
2362 * Currently, this routine differs from heap_insert only in supplying
2363 * a default command ID and not allowing access to the speedup options.
2364 *
2365 * This should be used rather than using heap_insert directly in most places
2366 * where we are modifying system catalogs.
2367 */
2368 void
2369 simple_heap_insert(Relation relation, HeapTuple tup)
2370 {
2371 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2372 }
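
/*
 * Illustrative sketch, not part of the original source: catalog code
 * typically forms a tuple, inserts it with simple_heap_insert, and then
 * maintains the catalog indexes itself, since the heap routines never touch
 * indexes.  values/nulls and the index state are assumed to be set up by
 * the caller (cf. CatalogOpenIndexes/CatalogIndexInsert in indexing.c).
 *
 *		tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
 *		simple_heap_insert(rel, tup);
 *		CatalogIndexInsert(indstate, tup);
 *		heap_freetuple(tup);
 */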
2373
2374 /*
2375 * Given infomask/infomask2, compute the bits that must be saved in the
2376 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2377 * xl_heap_lock_updated WAL records.
2378 *
2379 * See fix_infomask_from_infobits.
2380 */
2381 static uint8
2382 compute_infobits(uint16 infomask, uint16 infomask2)
2383 {
2384 return
2385 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2386 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2387 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2388 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2389 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2390 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2391 XLHL_KEYS_UPDATED : 0);
2392 }
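
/*
 * Worked example, illustrative only: for a tuple whose xmax is a multixact
 * holding only FOR KEY SHARE locks, the relevant bits are
 * HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_KEYSHR_LOCK, so
 *
 *		compute_infobits(infomask, infomask2) ==
 *			(XLHL_XMAX_IS_MULTI | XLHL_XMAX_LOCK_ONLY | XLHL_XMAX_KEYSHR_LOCK)
 *
 * HEAP_XMAX_SHR_LOCK needs no bit of its own because it is simply the
 * combination of HEAP_XMAX_EXCL_LOCK and HEAP_XMAX_KEYSHR_LOCK, which are
 * both transmitted individually.
 */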
2393
2394 /*
2395 * Given two versions of the same t_infomask for a tuple, compare them and
2396 * return whether the relevant status for a tuple Xmax has changed. This is
2397 * used after a buffer lock has been released and reacquired: we want to ensure
2398 * that the tuple state continues to be the same as it was when we previously
2399 * examined it.
2400 *
2401 * Note the Xmax field itself must be compared separately.
2402 */
2403 static inline bool
2404 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2405 {
2406 const uint16 interesting =
2407 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2408
2409 if ((new_infomask & interesting) != (old_infomask & interesting))
2410 return true;
2411
2412 return false;
2413 }
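
/*
 * Illustrative sketch, not part of the original source: the standard
 * recheck after reacquiring the buffer lock (used by heap_delete and
 * heap_update below) compares both the infomask and the raw xmax that were
 * saved before the lock was released:
 *
 *		if (xmax_infomask_changed(tup->t_data->t_infomask, infomask) ||
 *			!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tup->t_data),
 *								 xwait))
 *			goto l1;
 *
 * i.e. start over and re-examine the tuple from scratch.
 */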
2414
2415 /*
2416 * heap_delete - delete a tuple
2417 *
2418 * See table_tuple_delete() for an explanation of the parameters, except that
2419 * this routine directly takes a tuple rather than a slot.
2420 *
2421 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2422 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2423 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2424 * generated by another transaction).
2425 */
2426 TM_Result
2427 heap_delete(Relation relation, ItemPointer tid,
2428 CommandId cid, Snapshot crosscheck, bool wait,
2429 TM_FailureData *tmfd, bool changingPart)
2430 {
2431 TM_Result result;
2432 TransactionId xid = GetCurrentTransactionId();
2433 ItemId lp;
2434 HeapTupleData tp;
2435 Page page;
2436 BlockNumber block;
2437 Buffer buffer;
2438 Buffer vmbuffer = InvalidBuffer;
2439 TransactionId new_xmax;
2440 uint16 new_infomask,
2441 new_infomask2;
2442 bool have_tuple_lock = false;
2443 bool iscombo;
2444 bool all_visible_cleared = false;
2445 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2446 bool old_key_copied = false;
2447
2448 Assert(ItemPointerIsValid(tid));
2449
2450 /*
2451 * Forbid this during a parallel operation, lest it allocate a combocid.
2452 * Other workers might need that combocid for visibility checks, and we
2453 * have no provision for broadcasting it to them.
2454 */
2455 if (IsInParallelMode())
2456 ereport(ERROR,
2457 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2458 errmsg("cannot delete tuples during a parallel operation")));
2459
2460 block = ItemPointerGetBlockNumber(tid);
2461 buffer = ReadBuffer(relation, block);
2462 page = BufferGetPage(buffer);
2463
2464 /*
2465 * Before locking the buffer, pin the visibility map page if it appears to
2466 * be necessary. Since we haven't got the lock yet, someone else might be
2467 * in the middle of changing this, so we'll need to recheck after we have
2468 * the lock.
2469 */
2470 if (PageIsAllVisible(page))
2471 visibilitymap_pin(relation, block, &vmbuffer);
2472
2473 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2474
2475 /*
2476 * If we didn't pin the visibility map page and the page has become all
2477 * visible while we were busy locking the buffer, we'll have to unlock and
2478 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2479 * unfortunate, but hopefully shouldn't happen often.
2480 */
2481 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2482 {
2483 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2484 visibilitymap_pin(relation, block, &vmbuffer);
2485 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2486 }
2487
2488 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2489 Assert(ItemIdIsNormal(lp));
2490
2491 tp.t_tableOid = RelationGetRelid(relation);
2492 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2493 tp.t_len = ItemIdGetLength(lp);
2494 tp.t_self = *tid;
2495
2496 l1:
2497 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2498
2499 if (result == TM_Invisible)
2500 {
2501 UnlockReleaseBuffer(buffer);
2502 ereport(ERROR,
2503 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2504 errmsg("attempted to delete invisible tuple")));
2505 }
2506 else if (result == TM_BeingModified && wait)
2507 {
2508 TransactionId xwait;
2509 uint16 infomask;
2510
2511 /* must copy state data before unlocking buffer */
2512 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2513 infomask = tp.t_data->t_infomask;
2514
2515 /*
2516 * Sleep until concurrent transaction ends -- except when there's a
2517 * single locker and it's our own transaction. Note we don't care
2518 * which lock mode the locker has, because we need the strongest one.
2519 *
2520 * Before sleeping, we need to acquire tuple lock to establish our
2521 * priority for the tuple (see heap_lock_tuple). LockTuple will
2522 * release us when we are next-in-line for the tuple.
2523 *
2524 * If we are forced to "start over" below, we keep the tuple lock;
2525 * this arranges that we stay at the head of the line while rechecking
2526 * tuple state.
2527 */
2528 if (infomask & HEAP_XMAX_IS_MULTI)
2529 {
2530 bool current_is_member = false;
2531
2532 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2533 LockTupleExclusive, &current_is_member))
2534 {
2535 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2536
2537 /*
2538 * Acquire the lock, if necessary (but skip it when we're
2539 * requesting a lock and already have one; avoids deadlock).
2540 */
2541 if (!current_is_member)
2542 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2543 LockWaitBlock, &have_tuple_lock);
2544
2545 /* wait for multixact */
2546 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2547 relation, &(tp.t_self), XLTW_Delete,
2548 NULL);
2549 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2550
2551 /*
2552 * If xwait had just locked the tuple then some other xact
2553 * could update this tuple before we get to this point. Check
2554 * for xmax change, and start over if so.
2555 */
2556 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2557 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2558 xwait))
2559 goto l1;
2560 }
2561
2562 /*
2563 * You might think the multixact is necessarily done here, but not
2564 * so: it could have surviving members, namely our own xact or
2565 * other subxacts of this backend. It is legal for us to delete
2566 * the tuple in either case, however (the latter case is
2567 * essentially a situation of upgrading our former shared lock to
2568 * exclusive). We don't bother changing the on-disk hint bits
2569 * since we are about to overwrite the xmax altogether.
2570 */
2571 }
2572 else if (!TransactionIdIsCurrentTransactionId(xwait))
2573 {
2574 /*
2575 * Wait for regular transaction to end; but first, acquire tuple
2576 * lock.
2577 */
2578 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2579 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2580 LockWaitBlock, &have_tuple_lock);
2581 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2582 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2583
2584 /*
2585 * xwait is done, but if xwait had just locked the tuple then some
2586 * other xact could update this tuple before we get to this point.
2587 * Check for xmax change, and start over if so.
2588 */
2589 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2590 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2591 xwait))
2592 goto l1;
2593
2594 /* Otherwise check if it committed or aborted */
2595 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2596 }
2597
2598 /*
2599 * We may overwrite if previous xmax aborted, or if it committed but
2600 * only locked the tuple without updating it.
2601 */
2602 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2603 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2604 HeapTupleHeaderIsOnlyLocked(tp.t_data))
2605 result = TM_Ok;
2606 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2607 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2608 result = TM_Updated;
2609 else
2610 result = TM_Deleted;
2611 }
2612
2613 if (crosscheck != InvalidSnapshot && result == TM_Ok)
2614 {
2615 /* Perform additional check for transaction-snapshot mode RI updates */
2616 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2617 result = TM_Updated;
2618 }
2619
2620 if (result != TM_Ok)
2621 {
2622 Assert(result == TM_SelfModified ||
2623 result == TM_Updated ||
2624 result == TM_Deleted ||
2625 result == TM_BeingModified);
2626 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2627 Assert(result != TM_Updated ||
2628 !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2629 tmfd->ctid = tp.t_data->t_ctid;
2630 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2631 if (result == TM_SelfModified)
2632 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2633 else
2634 tmfd->cmax = InvalidCommandId;
2635 UnlockReleaseBuffer(buffer);
2636 if (have_tuple_lock)
2637 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2638 if (vmbuffer != InvalidBuffer)
2639 ReleaseBuffer(vmbuffer);
2640 return result;
2641 }
2642
2643 /*
2644 * We're about to do the actual delete -- check for conflict first, to
2645 * avoid possibly having to roll back work we've just done.
2646 *
2647 * This is safe without a recheck as long as there is no possibility of
2648 * another process scanning the page between this check and the delete
2649 * being visible to the scan (i.e., an exclusive buffer content lock is
2650 * continuously held from this point until the tuple delete is visible).
2651 */
2652 CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2653
2654 /* replace cid with a combo cid if necessary */
2655 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2656
2657 /*
2658 * Compute replica identity tuple before entering the critical section so
2659 * we don't PANIC upon a memory allocation failure.
2660 */
2661 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2662
2663 /*
2664 * If this is the first possibly-multixact-able operation in the current
2665 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2666 * certain that the transaction will never become a member of any older
2667 * MultiXactIds than that. (We have to do this even if we end up just
2668 * using our own TransactionId below, since some other backend could
2669 * incorporate our XID into a MultiXact immediately afterwards.)
2670 */
2671 MultiXactIdSetOldestMember();
2672
2673 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2674 tp.t_data->t_infomask, tp.t_data->t_infomask2,
2675 xid, LockTupleExclusive, true,
2676 &new_xmax, &new_infomask, &new_infomask2);
2677
2678 START_CRIT_SECTION();
2679
2680 /*
2681 * If this transaction commits, the tuple will become DEAD sooner or
2682 * later. Set flag that this page is a candidate for pruning once our xid
2683 * falls below the OldestXmin horizon. If the transaction finally aborts,
2684 * the subsequent page pruning will be a no-op and the hint will be
2685 * cleared.
2686 */
2687 PageSetPrunable(page, xid);
2688
2689 if (PageIsAllVisible(page))
2690 {
2691 all_visible_cleared = true;
2692 PageClearAllVisible(page);
2693 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2694 vmbuffer, VISIBILITYMAP_VALID_BITS);
2695 }
2696
2697 /* store transaction information of xact deleting the tuple */
2698 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2699 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2700 tp.t_data->t_infomask |= new_infomask;
2701 tp.t_data->t_infomask2 |= new_infomask2;
2702 HeapTupleHeaderClearHotUpdated(tp.t_data);
2703 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2704 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2705 /* Make sure there is no forward chain link in t_ctid */
2706 tp.t_data->t_ctid = tp.t_self;
2707
2708 /* Signal that this is actually a move into another partition */
2709 if (changingPart)
2710 HeapTupleHeaderSetMovedPartitions(tp.t_data);
2711
2712 MarkBufferDirty(buffer);
2713
2714 /*
2715 * XLOG stuff
2716 *
2717 * NB: heap_abort_speculative() uses the same xlog record and replay
2718 * routines.
2719 */
2720 if (RelationNeedsWAL(relation))
2721 {
2722 xl_heap_delete xlrec;
2723 xl_heap_header xlhdr;
2724 XLogRecPtr recptr;
2725
2726 /* For logical decode we need combocids to properly decode the catalog */
2727 if (RelationIsAccessibleInLogicalDecoding(relation))
2728 log_heap_new_cid(relation, &tp);
2729
2730 xlrec.flags = 0;
2731 if (all_visible_cleared)
2732 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2733 if (changingPart)
2734 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2735 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2736 tp.t_data->t_infomask2);
2737 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2738 xlrec.xmax = new_xmax;
2739
2740 if (old_key_tuple != NULL)
2741 {
2742 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2743 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2744 else
2745 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2746 }
2747
2748 XLogBeginInsert();
2749 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2750
2751 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2752
2753 /*
2754 * Log replica identity of the deleted tuple if there is one
2755 */
2756 if (old_key_tuple != NULL)
2757 {
2758 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2759 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2760 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2761
2762 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2763 XLogRegisterData((char *) old_key_tuple->t_data
2764 + SizeofHeapTupleHeader,
2765 old_key_tuple->t_len
2766 - SizeofHeapTupleHeader);
2767 }
2768
2769 /* filtering by origin on a row level is much more efficient */
2770 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2771
2772 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2773
2774 PageSetLSN(page, recptr);
2775 }
2776
2777 END_CRIT_SECTION();
2778
2779 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2780
2781 if (vmbuffer != InvalidBuffer)
2782 ReleaseBuffer(vmbuffer);
2783
2784 /*
2785 * If the tuple has toasted out-of-line attributes, we need to delete
2786 * those items too. We have to do this before releasing the buffer
2787 * because we need to look at the contents of the tuple, but it's OK to
2788 * release the content lock on the buffer first.
2789 */
2790 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2791 relation->rd_rel->relkind != RELKIND_MATVIEW)
2792 {
2793 /* toast table entries should never be recursively toasted */
2794 Assert(!HeapTupleHasExternal(&tp));
2795 }
2796 else if (HeapTupleHasExternal(&tp))
2797 heap_toast_delete(relation, &tp, false);
2798
2799 /*
2800 * Mark tuple for invalidation from system caches at next command
2801 * boundary. We have to do this before releasing the buffer because we
2802 * need to look at the contents of the tuple.
2803 */
2804 CacheInvalidateHeapTuple(relation, &tp, NULL);
2805
2806 /* Now we can release the buffer */
2807 ReleaseBuffer(buffer);
2808
2809 /*
2810 * Release the lmgr tuple lock, if we had it.
2811 */
2812 if (have_tuple_lock)
2813 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2814
2815 pgstat_count_heap_delete(relation);
2816
2817 if (old_key_tuple != NULL && old_key_copied)
2818 heap_freetuple(old_key_tuple);
2819
2820 return TM_Ok;
2821 }
2822
2823 /*
2824 * simple_heap_delete - delete a tuple
2825 *
2826 * This routine may be used to delete a tuple when concurrent updates of
2827 * the target tuple are not expected (for example, because we have a lock
2828 * on the relation associated with the tuple). Any failure is reported
2829 * via ereport().
2830 */
2831 void
2832 simple_heap_delete(Relation relation, ItemPointer tid)
2833 {
2834 TM_Result result;
2835 TM_FailureData tmfd;
2836
2837 result = heap_delete(relation, tid,
2838 GetCurrentCommandId(true), InvalidSnapshot,
2839 true /* wait for commit */ ,
2840 &tmfd, false /* changingPart */ );
2841 switch (result)
2842 {
2843 case TM_SelfModified:
2844 /* Tuple was already updated in current command? */
2845 elog(ERROR, "tuple already updated by self");
2846 break;
2847
2848 case TM_Ok:
2849 /* done successfully */
2850 break;
2851
2852 case TM_Updated:
2853 elog(ERROR, "tuple concurrently updated");
2854 break;
2855
2856 case TM_Deleted:
2857 elog(ERROR, "tuple concurrently deleted");
2858 break;
2859
2860 default:
2861 elog(ERROR, "unrecognized heap_delete status: %u", result);
2862 break;
2863 }
2864 }
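
/*
 * Illustrative sketch, not part of the original source: a typical caller
 * already holds an adequate lock on the relation and has the target
 * tuple's TID in hand (e.g. from a preceding scan), so the call is simply
 *
 *		simple_heap_delete(rel, &tup->t_self);
 *
 * Any concurrent-update surprise is turned into an error by the switch
 * above, so the caller needs no TM_Result handling of its own.
 */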
2865
2866 /*
2867 * heap_update - replace a tuple
2868 *
2869 * See table_tuple_update() for an explanation of the parameters, except that
2870 * this routine directly takes a tuple rather than a slot.
2871 *
2872 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2873 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2874 * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2875 * generated by another transaction).
2876 */
2877 TM_Result
2878 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2879 CommandId cid, Snapshot crosscheck, bool wait,
2880 TM_FailureData *tmfd, LockTupleMode *lockmode)
2881 {
2882 TM_Result result;
2883 TransactionId xid = GetCurrentTransactionId();
2884 Bitmapset *hot_attrs;
2885 Bitmapset *key_attrs;
2886 Bitmapset *id_attrs;
2887 Bitmapset *interesting_attrs;
2888 Bitmapset *modified_attrs;
2889 ItemId lp;
2890 HeapTupleData oldtup;
2891 HeapTuple heaptup;
2892 HeapTuple old_key_tuple = NULL;
2893 bool old_key_copied = false;
2894 Page page;
2895 BlockNumber block;
2896 MultiXactStatus mxact_status;
2897 Buffer buffer,
2898 newbuf,
2899 vmbuffer = InvalidBuffer,
2900 vmbuffer_new = InvalidBuffer;
2901 bool need_toast;
2902 Size newtupsize,
2903 pagefree;
2904 bool have_tuple_lock = false;
2905 bool iscombo;
2906 bool use_hot_update = false;
2907 bool hot_attrs_checked = false;
2908 bool key_intact;
2909 bool all_visible_cleared = false;
2910 bool all_visible_cleared_new = false;
2911 bool checked_lockers;
2912 bool locker_remains;
2913 TransactionId xmax_new_tuple,
2914 xmax_old_tuple;
2915 uint16 infomask_old_tuple,
2916 infomask2_old_tuple,
2917 infomask_new_tuple,
2918 infomask2_new_tuple;
2919
2920 Assert(ItemPointerIsValid(otid));
2921
2922 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2923 Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
2924 RelationGetNumberOfAttributes(relation));
2925
2926 /*
2927 * Forbid this during a parallel operation, lest it allocate a combocid.
2928 * Other workers might need that combocid for visibility checks, and we
2929 * have no provision for broadcasting it to them.
2930 */
2931 if (IsInParallelMode())
2932 ereport(ERROR,
2933 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2934 errmsg("cannot update tuples during a parallel operation")));
2935
2936 /*
2937 * Fetch the list of attributes to be checked for various operations.
2938 *
2939 * For HOT considerations, this is wasted effort if we fail to update or
2940 * have to put the new tuple on a different page. But we must compute the
2941 * list before obtaining buffer lock --- in the worst case, if we are
2942 * doing an update on one of the relevant system catalogs, we could
2943 * deadlock if we try to fetch the list later. In any case, the relcache
2944 * caches the data so this is usually pretty cheap.
2945 *
2946 * We also need columns used by the replica identity and columns that are
2947 * considered the "key" of rows in the table.
2948 *
2949 * Note that we get copies of each bitmap, so we need not worry about
2950 * relcache flush happening midway through.
2951 */
2952 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2953 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2954 id_attrs = RelationGetIndexAttrBitmap(relation,
2955 INDEX_ATTR_BITMAP_IDENTITY_KEY);
2956
2957
2958 block = ItemPointerGetBlockNumber(otid);
2959 buffer = ReadBuffer(relation, block);
2960 page = BufferGetPage(buffer);
2961
2962 interesting_attrs = NULL;
2963
2964 /*
2965 * If the page is already full, there is hardly any chance of doing a HOT
2966 * update on this page. It might be wasteful effort to look for index
2967 * column updates only to later reject HOT updates for lack of space in
2968 * the same page. So we are conservative and only fetch hot_attrs if the
2969 * page is not already full. Since we are already holding a pin on the
2970 * buffer, there is no chance that the buffer can get cleaned up
2971 * concurrently, and even if that were possible, in the worst case we lose a
2972 * chance to do a HOT update.
2973 */
2974 if (!PageIsFull(page))
2975 {
2976 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2977 hot_attrs_checked = true;
2978 }
2979 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2980 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2981
2982 /*
2983 * Before locking the buffer, pin the visibility map page if it appears to
2984 * be necessary. Since we haven't got the lock yet, someone else might be
2985 * in the middle of changing this, so we'll need to recheck after we have
2986 * the lock.
2987 */
2988 if (PageIsAllVisible(page))
2989 visibilitymap_pin(relation, block, &vmbuffer);
2990
2991 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2992
2993 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
2994 Assert(ItemIdIsNormal(lp));
2995
2996 /*
2997 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
2998 * properly.
2999 */
3000 oldtup.t_tableOid = RelationGetRelid(relation);
3001 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3002 oldtup.t_len = ItemIdGetLength(lp);
3003 oldtup.t_self = *otid;
3004
3005 /* the new tuple is ready, except for this: */
3006 newtup->t_tableOid = RelationGetRelid(relation);
3007
3008 /* Determine columns modified by the update. */
3009 modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3010 &oldtup, newtup);
3011
3012 /*
3013 * If we're not updating any "key" column, we can grab a weaker lock type.
3014 * This allows for more concurrency when we are running simultaneously
3015 * with foreign key checks.
3016 *
3017 * Note that if a column gets detoasted while executing the update, but
3018 * the value ends up being the same, this test will fail and we will use
3019 * the stronger lock. This is acceptable; the important case to optimize
3020 * is updates that don't manipulate key columns, not those that
3021 * serendipitously arrive at the same key values.
3022 */
3023 if (!bms_overlap(modified_attrs, key_attrs))
3024 {
3025 *lockmode = LockTupleNoKeyExclusive;
3026 mxact_status = MultiXactStatusNoKeyUpdate;
3027 key_intact = true;
3028
3029 /*
3030 * If this is the first possibly-multixact-able operation in the
3031 * current transaction, set my per-backend OldestMemberMXactId
3032 * setting. We can be certain that the transaction will never become a
3033 * member of any older MultiXactIds than that. (We have to do this
3034 * even if we end up just using our own TransactionId below, since
3035 * some other backend could incorporate our XID into a MultiXact
3036 * immediately afterwards.)
3037 */
3038 MultiXactIdSetOldestMember();
3039 }
3040 else
3041 {
3042 *lockmode = LockTupleExclusive;
3043 mxact_status = MultiXactStatusUpdate;
3044 key_intact = false;
3045 }
3046
3047 /*
3048 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3049 * otid may very well point at newtup->t_self, which we will overwrite
3050 * with the new tuple's location, so there's great risk of confusion if we
3051 * use otid anymore.
3052 */
3053
3054 l2:
3055 checked_lockers = false;
3056 locker_remains = false;
3057 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3058
3059 /* see below about the "no wait" case */
3060 Assert(result != TM_BeingModified || wait);
3061
3062 if (result == TM_Invisible)
3063 {
3064 UnlockReleaseBuffer(buffer);
3065 ereport(ERROR,
3066 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3067 errmsg("attempted to update invisible tuple")));
3068 }
3069 else if (result == TM_BeingModified && wait)
3070 {
3071 TransactionId xwait;
3072 uint16 infomask;
3073 bool can_continue = false;
3074
3075 /*
3076 * XXX note that we don't consider the "no wait" case here. This
3077 * isn't a problem currently because no caller uses that case, but it
3078 * should be fixed if such a caller is introduced. It wasn't a
3079 * problem previously because this code would always wait, but now
3080 * that some tuple locks do not conflict with one of the lock modes we
3081 * use, it is possible that this case is interesting to handle
3082 * specially.
3083 *
3084 * This may cause failures with third-party code that calls
3085 * heap_update directly.
3086 */
3087
3088 /* must copy state data before unlocking buffer */
3089 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3090 infomask = oldtup.t_data->t_infomask;
3091
3092 /*
3093 * Now we have to do something about the existing locker. If it's a
3094 * multi, sleep on it; we might be awakened before it is completely
3095 * gone (or even not sleep at all in some cases); we need to preserve
3096 * it as locker, unless it is gone completely.
3097 *
3098 * If it's not a multi, we need to check for sleeping conditions
3099 * before actually going to sleep. If the update doesn't conflict
3100 * with the locks, we just continue without sleeping (but making sure
3101 * it is preserved).
3102 *
3103 * Before sleeping, we need to acquire tuple lock to establish our
3104 * priority for the tuple (see heap_lock_tuple). LockTuple will
3105 * release us when we are next-in-line for the tuple. Note we must
3106 * not acquire the tuple lock until we're sure we're going to sleep;
3107 * otherwise we're open for race conditions with other transactions
3108 * holding the tuple lock which sleep on us.
3109 *
3110 * If we are forced to "start over" below, we keep the tuple lock;
3111 * this arranges that we stay at the head of the line while rechecking
3112 * tuple state.
3113 */
3114 if (infomask & HEAP_XMAX_IS_MULTI)
3115 {
3116 TransactionId update_xact;
3117 int remain;
3118 bool current_is_member = false;
3119
3120 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3121 *lockmode, &current_is_member))
3122 {
3123 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3124
3125 /*
3126 * Acquire the lock, if necessary (but skip it when we're
3127 * requesting a lock and already have one; avoids deadlock).
3128 */
3129 if (!current_is_member)
3130 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3131 LockWaitBlock, &have_tuple_lock);
3132
3133 /* wait for multixact */
3134 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3135 relation, &oldtup.t_self, XLTW_Update,
3136 &remain);
3137 checked_lockers = true;
3138 locker_remains = remain != 0;
3139 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3140
3141 /*
3142 * If xwait had just locked the tuple then some other xact
3143 * could update this tuple before we get to this point. Check
3144 * for xmax change, and start over if so.
3145 */
3146 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3147 infomask) ||
3148 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3149 xwait))
3150 goto l2;
3151 }
3152
3153 /*
3154 * Note that the multixact may not be done by now. It could have
3155 * surviving members; our own xact or other subxacts of this
3156 * backend, and also any other concurrent transaction that locked
3157 * the tuple with LockTupleKeyShare if we only got
3158 * LockTupleNoKeyExclusive. If this is the case, we have to be
3159 * careful to mark the updated tuple with the surviving members in
3160 * Xmax.
3161 *
3162 * Note that there could have been another update in the
3163 * MultiXact. In that case, we need to check whether it committed
3164 * or aborted. If it aborted we are safe to update it again;
3165 * otherwise there is an update conflict, and we have to return
3166 * TableTuple{Deleted, Updated} below.
3167 *
3168 * In the LockTupleExclusive case, we still need to preserve the
3169 * surviving members: those would include the tuple locks we had
3170 * before this one, which are important to keep in case this
3171 * subxact aborts.
3172 */
3173 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3174 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3175 else
3176 update_xact = InvalidTransactionId;
3177
3178 /*
3179 * There was no UPDATE in the MultiXact; or it aborted. No
3180 * TransactionIdIsInProgress() call needed here, since we called
3181 * MultiXactIdWait() above.
3182 */
3183 if (!TransactionIdIsValid(update_xact) ||
3184 TransactionIdDidAbort(update_xact))
3185 can_continue = true;
3186 }
3187 else if (TransactionIdIsCurrentTransactionId(xwait))
3188 {
3189 /*
3190 * The only locker is ourselves; we can avoid grabbing the tuple
3191 * lock here, but must preserve our locking information.
3192 */
3193 checked_lockers = true;
3194 locker_remains = true;
3195 can_continue = true;
3196 }
3197 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3198 {
3199 /*
3200 * If it's just a key-share locker, and we're not changing the key
3201 * columns, we don't need to wait for it to end; but we need to
3202 * preserve it as locker.
3203 */
3204 checked_lockers = true;
3205 locker_remains = true;
3206 can_continue = true;
3207 }
3208 else
3209 {
3210 /*
3211 * Wait for regular transaction to end; but first, acquire tuple
3212 * lock.
3213 */
3214 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3215 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3216 LockWaitBlock, &have_tuple_lock);
3217 XactLockTableWait(xwait, relation, &oldtup.t_self,
3218 XLTW_Update);
3219 checked_lockers = true;
3220 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3221
3222 /*
3223 * xwait is done, but if xwait had just locked the tuple then some
3224 * other xact could update this tuple before we get to this point.
3225 * Check for xmax change, and start over if so.
3226 */
3227 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3228 !TransactionIdEquals(xwait,
3229 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3230 goto l2;
3231
3232 /* Otherwise check if it committed or aborted */
3233 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3234 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3235 can_continue = true;
3236 }
3237
3238 if (can_continue)
3239 result = TM_Ok;
3240 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3241 HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3242 result = TM_Updated;
3243 else
3244 result = TM_Deleted;
3245 }
3246
3247 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3248 {
3249 /* Perform additional check for transaction-snapshot mode RI updates */
3250 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3251 {
3252 result = TM_Updated;
3253 Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3254 }
3255 }
3256
3257 if (result != TM_Ok)
3258 {
3259 Assert(result == TM_SelfModified ||
3260 result == TM_Updated ||
3261 result == TM_Deleted ||
3262 result == TM_BeingModified);
3263 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3264 Assert(result != TM_Updated ||
3265 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3266 tmfd->ctid = oldtup.t_data->t_ctid;
3267 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3268 if (result == TM_SelfModified)
3269 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3270 else
3271 tmfd->cmax = InvalidCommandId;
3272 UnlockReleaseBuffer(buffer);
3273 if (have_tuple_lock)
3274 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3275 if (vmbuffer != InvalidBuffer)
3276 ReleaseBuffer(vmbuffer);
3277 bms_free(hot_attrs);
3278 bms_free(key_attrs);
3279 bms_free(id_attrs);
3280 bms_free(modified_attrs);
3281 bms_free(interesting_attrs);
3282 return result;
3283 }
3284
3285 /*
3286 * If we didn't pin the visibility map page and the page has become all
3287 * visible while we were busy locking the buffer, or during some
3288 * subsequent window during which we had it unlocked, we'll have to unlock
3289 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3290 * bit unfortunate, especially since we'll now have to recheck whether the
3291 * tuple has been locked or updated under us, but hopefully it won't
3292 * happen very often.
3293 */
3294 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3295 {
3296 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3297 visibilitymap_pin(relation, block, &vmbuffer);
3298 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3299 goto l2;
3300 }
3301
3302 /* Fill in transaction status data */
3303
3304 /*
3305 * If the tuple we're updating is locked, we need to preserve the locking
3306 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3307 */
3308 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3309 oldtup.t_data->t_infomask,
3310 oldtup.t_data->t_infomask2,
3311 xid, *lockmode, true,
3312 &xmax_old_tuple, &infomask_old_tuple,
3313 &infomask2_old_tuple);
3314
3315 /*
3316 * And also prepare an Xmax value for the new copy of the tuple. If there
3317 * was no xmax previously, or there was one but all lockers are now gone,
3318 * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3319 * rare cases that might also be InvalidXid and yet not have the
3320 * HEAP_XMAX_INVALID bit set; that's fine.)
3321 */
3322 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3323 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3324 (checked_lockers && !locker_remains))
3325 xmax_new_tuple = InvalidTransactionId;
3326 else
3327 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3328
3329 if (!TransactionIdIsValid(xmax_new_tuple))
3330 {
3331 infomask_new_tuple = HEAP_XMAX_INVALID;
3332 infomask2_new_tuple = 0;
3333 }
3334 else
3335 {
3336 /*
3337 * If we found a valid Xmax for the new tuple, then the infomask bits
3338 * to use on the new tuple depend on what was there on the old one.
3339 * Note that since we're doing an update, the only possibility is that
3340 * the lockers had FOR KEY SHARE lock.
3341 */
3342 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3343 {
3344 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3345 &infomask2_new_tuple);
3346 }
3347 else
3348 {
3349 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3350 infomask2_new_tuple = 0;
3351 }
3352 }
3353
3354 /*
3355 * Prepare the new tuple with the appropriate initial values of Xmin and
3356 * Xmax, as well as initial infomask bits as computed above.
3357 */
3358 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3359 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3360 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3361 HeapTupleHeaderSetCmin(newtup->t_data, cid);
3362 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3363 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3364 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3365
3366 /*
3367 * Replace cid with a combo cid if necessary. Note that we already put
3368 * the plain cid into the new tuple.
3369 */
3370 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3371
3372 /*
3373 * If the toaster needs to be activated, OR if the new tuple will not fit
3374 * on the same page as the old, then we need to release the content lock
3375 * (but not the pin!) on the old tuple's buffer while we are off doing
3376 * TOAST and/or table-file-extension work. We must mark the old tuple to
3377 * show that it's locked, else other processes may try to update it
3378 * themselves.
3379 *
3380 * We need to invoke the toaster if there are already any out-of-line
3381 * toasted values present, or if the new tuple is over-threshold.
3382 */
3383 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3384 relation->rd_rel->relkind != RELKIND_MATVIEW)
3385 {
3386 /* toast table entries should never be recursively toasted */
3387 Assert(!HeapTupleHasExternal(&oldtup));
3388 Assert(!HeapTupleHasExternal(newtup));
3389 need_toast = false;
3390 }
3391 else
3392 need_toast = (HeapTupleHasExternal(&oldtup) ||
3393 HeapTupleHasExternal(newtup) ||
3394 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3395
3396 pagefree = PageGetHeapFreeSpace(page);
3397
3398 newtupsize = MAXALIGN(newtup->t_len);
3399
3400 if (need_toast || newtupsize > pagefree)
3401 {
3402 TransactionId xmax_lock_old_tuple;
3403 uint16 infomask_lock_old_tuple,
3404 infomask2_lock_old_tuple;
3405 bool cleared_all_frozen = false;
3406
3407 /*
3408 * To prevent concurrent sessions from updating the tuple, we have to
3409 * temporarily mark it locked, while we release the page-level lock.
3410 *
3411 * To satisfy the rule that any xid potentially appearing in a buffer
3412 * written out to disk must be covered by WAL, we unfortunately have to
3413 * WAL-log this temporary modification. We can reuse xl_heap_lock for
3414 * this purpose. If we crash/error before following through with the
3415 * actual update, xmax will be of an aborted transaction, allowing
3416 * other sessions to proceed.
3417 */
3418
3419 /*
3420 * Compute xmax / infomask appropriate for locking the tuple. This has
3421 * to be done separately from the combo that's going to be used for
3422 * updating, because the potentially created multixact would otherwise
3423 * be wrong.
3424 */
3425 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3426 oldtup.t_data->t_infomask,
3427 oldtup.t_data->t_infomask2,
3428 xid, *lockmode, false,
3429 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3430 &infomask2_lock_old_tuple);
3431
3432 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3433
3434 START_CRIT_SECTION();
3435
3436 /* Clear obsolete visibility flags ... */
3437 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3438 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3439 HeapTupleClearHotUpdated(&oldtup);
3440 /* ... and store info about transaction updating this tuple */
3441 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3442 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3443 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3444 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3445 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3446
3447 /* temporarily make it look not-updated, but locked */
3448 oldtup.t_data->t_ctid = oldtup.t_self;
3449
3450 /*
3451 * Clear all-frozen bit on visibility map if needed. We could
3452 * immediately reset ALL_VISIBLE, but given that the WAL logging
3453 * overhead would be unchanged, that doesn't seem necessarily
3454 * worthwhile.
3455 */
3456 if (PageIsAllVisible(page) &&
3457 visibilitymap_clear(relation, block, vmbuffer,
3458 VISIBILITYMAP_ALL_FROZEN))
3459 cleared_all_frozen = true;
3460
3461 MarkBufferDirty(buffer);
3462
3463 if (RelationNeedsWAL(relation))
3464 {
3465 xl_heap_lock xlrec;
3466 XLogRecPtr recptr;
3467
3468 XLogBeginInsert();
3469 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3470
3471 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3472 xlrec.locking_xid = xmax_lock_old_tuple;
3473 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3474 oldtup.t_data->t_infomask2);
3475 xlrec.flags =
3476 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3477 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3478 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3479 PageSetLSN(page, recptr);
3480 }
3481
3482 END_CRIT_SECTION();
3483
3484 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3485
3486 /*
3487 * Let the toaster do its thing, if needed.
3488 *
3489 * Note: below this point, heaptup is the data we actually intend to
3490 * store into the relation; newtup is the caller's original untoasted
3491 * data.
3492 */
3493 if (need_toast)
3494 {
3495 /* Note we always use WAL and FSM during updates */
3496 heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3497 newtupsize = MAXALIGN(heaptup->t_len);
3498 }
3499 else
3500 heaptup = newtup;
3501
3502 /*
3503 * Now, do we need a new page for the tuple, or not? This is a bit
3504 * tricky since someone else could have added tuples to the page while
3505 * we weren't looking. We have to recheck the available space after
3506 * reacquiring the buffer lock. But don't bother to do that if the
3507 * former amount of free space is still not enough; it's unlikely
3508 * there's more free now than before.
3509 *
3510 * What's more, if we need to get a new page, we will need to acquire
3511 * buffer locks on both old and new pages. To avoid deadlock against
3512 * some other backend trying to get the same two locks in the other
3513 * order, we must be consistent about the order we get the locks in.
3514 * We use the rule "lock the lower-numbered page of the relation
3515 * first". To implement this, we must do RelationGetBufferForTuple
3516 * while not holding the lock on the old page, and we must rely on it
3517 * to get the locks on both pages in the correct order.
3518 *
3519 * Another consideration is that we need visibility map page pin(s) if
3520 * we will have to clear the all-visible flag on either page. If we
3521 * call RelationGetBufferForTuple, we rely on it to acquire any such
3522 * pins; but if we don't, we have to handle that here. Hence we need
3523 * a loop.
3524 */
3525 for (;;)
3526 {
3527 if (newtupsize > pagefree)
3528 {
3529 /* It doesn't fit, must use RelationGetBufferForTuple. */
3530 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3531 buffer, 0, NULL,
3532 &vmbuffer_new, &vmbuffer);
3533 /* We're all done. */
3534 break;
3535 }
3536 /* Acquire VM page pin if needed and we don't have it. */
3537 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3538 visibilitymap_pin(relation, block, &vmbuffer);
3539 /* Re-acquire the lock on the old tuple's page. */
3540 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3541 /* Re-check using the up-to-date free space */
3542 pagefree = PageGetHeapFreeSpace(page);
3543 if (newtupsize > pagefree ||
3544 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3545 {
3546 /*
3547 * Rats, it doesn't fit anymore, or somebody just now set the
3548 * all-visible flag. We must now unlock and loop to avoid
3549 * deadlock. Fortunately, this path should seldom be taken.
3550 */
3551 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3552 }
3553 else
3554 {
3555 /* We're all done. */
3556 newbuf = buffer;
3557 break;
3558 }
3559 }
3560 }
3561 else
3562 {
3563 /* No TOAST work needed, and it'll fit on same page */
3564 newbuf = buffer;
3565 heaptup = newtup;
3566 }
3567
3568 /*
3569 * We're about to do the actual update -- check for conflict first, to
3570 * avoid possibly having to roll back work we've just done.
3571 *
3572 * This is safe without a recheck as long as there is no possibility of
3573 * another process scanning the pages between this check and the update
3574 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3575 * continuously held from this point until the tuple update is visible).
3576 *
3577 * For the new tuple the only check needed is at the relation level, but
3578 * since both tuples are in the same relation and the check for oldtup
3579 * will include checking the relation level, there is no benefit to a
3580 * separate check for the new tuple.
3581 */
3582 CheckForSerializableConflictIn(relation, &oldtup.t_self,
3583 BufferGetBlockNumber(buffer));
3584
3585 /*
3586 * At this point newbuf and buffer are both pinned and locked, and newbuf
3587 * has enough space for the new tuple. If they are the same buffer, only
3588 * one pin is held.
3589 */
3590
3591 if (newbuf == buffer)
3592 {
3593 /*
3594 * Since the new tuple is going into the same page, we might be able
3595 * to do a HOT update. Check if any of the index columns have been
3596 * changed. If the page was already full, we may have skipped checking
3597 * for index columns, and also can't do a HOT update.
3598 */
3599 if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3600 use_hot_update = true;
3601 }
3602 else
3603 {
3604 /* Set a hint that the old page could use prune/defrag */
3605 PageSetFull(page);
3606 }
3607
3608 /*
3609 * Compute replica identity tuple before entering the critical section so
3610 * we don't PANIC upon a memory allocation failure.
3611 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3612 * logged.
3613 */
3614 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3615 bms_overlap(modified_attrs, id_attrs),
3616 &old_key_copied);
3617
3618 /* NO EREPORT(ERROR) from here till changes are logged */
3619 START_CRIT_SECTION();
3620
3621 /*
3622 * If this transaction commits, the old tuple will become DEAD sooner or
3623 * later. Set flag that this page is a candidate for pruning once our xid
3624 * falls below the OldestXmin horizon. If the transaction finally aborts,
3625 * the subsequent page pruning will be a no-op and the hint will be
3626 * cleared.
3627 *
3628 * XXX Should we set hint on newbuf as well? If the transaction aborts,
3629 * there would be a prunable tuple in the newbuf; but for now we choose
3630 * not to optimize for aborts. Note that heap_xlog_update must be kept in
3631 * sync if this decision changes.
3632 */
3633 PageSetPrunable(page, xid);
3634
3635 if (use_hot_update)
3636 {
3637 /* Mark the old tuple as HOT-updated */
3638 HeapTupleSetHotUpdated(&oldtup);
3639 /* And mark the new tuple as heap-only */
3640 HeapTupleSetHeapOnly(heaptup);
3641 /* Mark the caller's copy too, in case different from heaptup */
3642 HeapTupleSetHeapOnly(newtup);
3643 }
3644 else
3645 {
3646 /* Make sure tuples are correctly marked as not-HOT */
3647 HeapTupleClearHotUpdated(&oldtup);
3648 HeapTupleClearHeapOnly(heaptup);
3649 HeapTupleClearHeapOnly(newtup);
3650 }
3651
3652 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3653
3654
3655 /* Clear obsolete visibility flags, possibly set by ourselves above... */
3656 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3657 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3658 /* ... and store info about transaction updating this tuple */
3659 Assert(TransactionIdIsValid(xmax_old_tuple));
3660 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3661 oldtup.t_data->t_infomask |= infomask_old_tuple;
3662 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3663 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3664
3665 /* record address of new tuple in t_ctid of old one */
3666 oldtup.t_data->t_ctid = heaptup->t_self;
3667
3668 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3669 if (PageIsAllVisible(BufferGetPage(buffer)))
3670 {
3671 all_visible_cleared = true;
3672 PageClearAllVisible(BufferGetPage(buffer));
3673 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3674 vmbuffer, VISIBILITYMAP_VALID_BITS);
3675 }
3676 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3677 {
3678 all_visible_cleared_new = true;
3679 PageClearAllVisible(BufferGetPage(newbuf));
3680 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3681 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3682 }
3683
3684 if (newbuf != buffer)
3685 MarkBufferDirty(newbuf);
3686 MarkBufferDirty(buffer);
3687
3688 /* XLOG stuff */
3689 if (RelationNeedsWAL(relation))
3690 {
3691 XLogRecPtr recptr;
3692
3693 /*
3694 * For logical decoding we need combocids to properly decode the
3695 * catalog.
3696 */
3697 if (RelationIsAccessibleInLogicalDecoding(relation))
3698 {
3699 log_heap_new_cid(relation, &oldtup);
3700 log_heap_new_cid(relation, heaptup);
3701 }
3702
3703 recptr = log_heap_update(relation, buffer,
3704 newbuf, &oldtup, heaptup,
3705 old_key_tuple,
3706 all_visible_cleared,
3707 all_visible_cleared_new);
3708 if (newbuf != buffer)
3709 {
3710 PageSetLSN(BufferGetPage(newbuf), recptr);
3711 }
3712 PageSetLSN(BufferGetPage(buffer), recptr);
3713 }
3714
3715 END_CRIT_SECTION();
3716
3717 if (newbuf != buffer)
3718 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3719 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720
3721 /*
3722 * Mark old tuple for invalidation from system caches at next command
3723 * boundary, and mark the new tuple for invalidation in case we abort. We
3724 * have to do this before releasing the buffer because oldtup is in the
3725 * buffer. (heaptup is all in local memory, but it's necessary to process
3726 * both tuple versions in one call to inval.c so we can avoid redundant
3727 * sinval messages.)
3728 */
3729 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3730
3731 /* Now we can release the buffer(s) */
3732 if (newbuf != buffer)
3733 ReleaseBuffer(newbuf);
3734 ReleaseBuffer(buffer);
3735 if (BufferIsValid(vmbuffer_new))
3736 ReleaseBuffer(vmbuffer_new);
3737 if (BufferIsValid(vmbuffer))
3738 ReleaseBuffer(vmbuffer);
3739
3740 /*
3741 * Release the lmgr tuple lock, if we had it.
3742 */
3743 if (have_tuple_lock)
3744 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3745
3746 pgstat_count_heap_update(relation, use_hot_update);
3747
3748 /*
3749 * If heaptup is a private copy, release it. Don't forget to copy t_self
3750 * back to the caller's image, too.
3751 */
3752 if (heaptup != newtup)
3753 {
3754 newtup->t_self = heaptup->t_self;
3755 heap_freetuple(heaptup);
3756 }
3757
3758 if (old_key_tuple != NULL && old_key_copied)
3759 heap_freetuple(old_key_tuple);
3760
3761 bms_free(hot_attrs);
3762 bms_free(key_attrs);
3763 bms_free(id_attrs);
3764 bms_free(modified_attrs);
3765 bms_free(interesting_attrs);
3766
3767 return TM_Ok;
3768 }
3769
3770 /*
3771 * Check if the specified attribute's value is same in both given tuples.
3772 * Subroutine for HeapDetermineModifiedColumns.
3773 */
3774 static bool
3775 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3776 HeapTuple tup1, HeapTuple tup2)
3777 {
3778 Datum value1,
3779 value2;
3780 bool isnull1,
3781 isnull2;
3782 Form_pg_attribute att;
3783
3784 /*
3785 * If it's a whole-tuple reference, say "not equal". It's not really
3786 * worth supporting this case, since it could only succeed after a no-op
3787 * update, which is hardly a case worth optimizing for.
3788 */
3789 if (attrnum == 0)
3790 return false;
3791
3792 /*
3793 * Likewise, automatically say "not equal" for any system attribute other
3794 * than tableOID; we cannot expect these to be consistent in a HOT chain,
3795 * or even to be set correctly yet in the new tuple.
3796 */
3797 if (attrnum < 0)
3798 {
3799 if (attrnum != TableOidAttributeNumber)
3800 return false;
3801 }
3802
3803 /*
3804 * Extract the corresponding values. XXX this is pretty inefficient if
3805 * there are many indexed columns. Should HeapDetermineModifiedColumns do
3806 * a single heap_deform_tuple call on each tuple, instead? But that
3807 * doesn't work for system columns ...
3808 */
3809 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3810 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3811
3812 /*
3813 * If one value is NULL and other is not, then they are certainly not
3814 * equal
3815 */
3816 if (isnull1 != isnull2)
3817 return false;
3818
3819 /*
3820 * If both are NULL, they can be considered equal.
3821 */
3822 if (isnull1)
3823 return true;
3824
3825 /*
3826 * We do simple binary comparison of the two datums. This may be overly
3827 * strict because there can be multiple binary representations for the
3828 * same logical value. But we should be OK as long as there are no false
3829 * positives. Using a type-specific equality operator is messy because
3830 * there could be multiple notions of equality in different operator
3831 * classes; furthermore, we cannot safely invoke user-defined functions
3832 * while holding exclusive buffer lock.
3833 */
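/*
 * Illustration (added; not in the original sources): two datums can be
 * logically equal yet binary-unequal.  For example, the numeric values
 * 1.0 and 1.00 are equal under the type's "=" operator but carry
 * different display scales, so datumIsEqual() below reports them as
 * different.  Such a false "not equal" merely forces a non-HOT update or
 * some extra logging; it never causes a real change to be missed.
 */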
3834 if (attrnum <= 0)
3835 {
3836 /* The only allowed system columns are OIDs, so do this */
3837 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3838 }
3839 else
3840 {
3841 Assert(attrnum <= tupdesc->natts);
3842 att = TupleDescAttr(tupdesc, attrnum - 1);
3843 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3844 }
3845 }
3846
3847 /*
3848 * Check which columns are being updated.
3849 *
3850 * Given an updated tuple, determine (and return into the output bitmapset),
3851 * from those listed as interesting, the set of columns that changed.
3852 *
3853 * The input bitmapset is destructively modified; that is OK since this is
3854 * invoked at most once in heap_update.
3855 */
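/*
 * Usage sketch (added for illustration; "rel", "oldtup" and "newtup" are
 * hypothetical).  Attribute numbers are offset by
 * FirstLowInvalidHeapAttributeNumber, both in the input set and in the
 * result:
 *
 *		Bitmapset  *cols = NULL;
 *		Bitmapset  *modified;
 *
 *		cols = bms_add_member(cols, 1 - FirstLowInvalidHeapAttributeNumber);
 *		cols = bms_add_member(cols, 3 - FirstLowInvalidHeapAttributeNumber);
 *		modified = HeapDetermineModifiedColumns(rel, cols, &oldtup, newtup);
 *		if (bms_is_member(3 - FirstLowInvalidHeapAttributeNumber, modified))
 *			... attribute 3 differs between the two tuples ...
 *
 * Remember that "cols" is consumed: bms_first_member() below empties it.
 */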
3856 static Bitmapset *
3857 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3858 HeapTuple oldtup, HeapTuple newtup)
3859 {
3860 int attnum;
3861 Bitmapset *modified = NULL;
3862
3863 while ((attnum = bms_first_member(interesting_cols)) >= 0)
3864 {
3865 attnum += FirstLowInvalidHeapAttributeNumber;
3866
3867 if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3868 attnum, oldtup, newtup))
3869 modified = bms_add_member(modified,
3870 attnum - FirstLowInvalidHeapAttributeNumber);
3871 }
3872
3873 return modified;
3874 }
3875
3876 /*
3877 * simple_heap_update - replace a tuple
3878 *
3879 * This routine may be used to update a tuple when concurrent updates of
3880 * the target tuple are not expected (for example, because we have a lock
3881 * on the relation associated with the tuple). Any failure is reported
3882 * via ereport().
3883 */
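/*
 * Usage sketch (illustrative only; not from the original file):
 *
 *		HeapTuple	newtup = heap_copytuple(oldtup);
 *
 *		... modify newtup's user data ...
 *		simple_heap_update(rel, &newtup->t_self, newtup);
 *
 * heap_copytuple() preserves t_self, so the copy still identifies the row
 * being replaced.  Catalog code normally reaches this routine through
 * CatalogTupleUpdate(), which also takes care of the catalog's indexes.
 */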
3884 void
3885 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3886 {
3887 TM_Result result;
3888 TM_FailureData tmfd;
3889 LockTupleMode lockmode;
3890
3891 result = heap_update(relation, otid, tup,
3892 GetCurrentCommandId(true), InvalidSnapshot,
3893 true /* wait for commit */ ,
3894 &tmfd, &lockmode);
3895 switch (result)
3896 {
3897 case TM_SelfModified:
3898 /* Tuple was already updated in current command? */
3899 elog(ERROR, "tuple already updated by self");
3900 break;
3901
3902 case TM_Ok:
3903 /* done successfully */
3904 break;
3905
3906 case TM_Updated:
3907 elog(ERROR, "tuple concurrently updated");
3908 break;
3909
3910 case TM_Deleted:
3911 elog(ERROR, "tuple concurrently deleted");
3912 break;
3913
3914 default:
3915 elog(ERROR, "unrecognized heap_update status: %u", result);
3916 break;
3917 }
3918 }
3919
3920
3921 /*
3922 * Return the MultiXactStatus corresponding to the given tuple lock mode.
3923 */
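/*
 * For example (added; the mapping lives in the tupleLockExtraInfo table
 * referenced below): get_mxact_status_for_lock(LockTupleKeyShare, false)
 * returns MultiXactStatusForKeyShare, while asking for an update status
 * with a mere share-level mode hits a -1 table entry and raises an error.
 */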
3924 static MultiXactStatus
3925 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3926 {
3927 int retval;
3928
3929 if (is_update)
3930 retval = tupleLockExtraInfo[mode].updstatus;
3931 else
3932 retval = tupleLockExtraInfo[mode].lockstatus;
3933
3934 if (retval == -1)
3935 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3936 is_update ? "true" : "false");
3937
3938 return (MultiXactStatus) retval;
3939 }
3940
3941 /*
3942 * heap_lock_tuple - lock a tuple in shared or exclusive mode
3943 *
3944 * Note that this acquires a buffer pin, which the caller must release.
3945 *
3946 * Input parameters:
3947 * relation: relation containing tuple (caller must hold suitable lock)
3948 * tid: TID of tuple to lock
3949 * cid: current command ID (used for visibility test, and stored into
3950 * tuple's cmax if lock is successful)
3951 * mode: indicates if shared or exclusive tuple lock is desired
3952 * wait_policy: what to do if tuple lock is not available
3953 * follow_updates: if true, follow the update chain to also lock descendant
3954 * tuples.
3955 *
3956 * Output parameters:
3957 * *tuple: all fields filled in
3958 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3959 * *tmfd: filled in failure cases (see below)
3960 *
3961 * Function results are the same as the ones for table_tuple_lock().
3962 *
3963 * In the failure cases other than TM_Invisible, the routine fills
3964 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3965 * if necessary), and t_cmax (the last only for TM_SelfModified,
3966 * since we cannot obtain cmax from a combocid generated by another
3967 * transaction).
3968 * See comments for struct TM_FailureData for additional info.
3969 *
3970 * See README.tuplock for a thorough explanation of this mechanism.
3971 */
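/*
 * Caller sketch (added for illustration; variable names are hypothetical):
 *
 *		HeapTupleData tuple;
 *		Buffer		buf;
 *		TM_FailureData tmfd;
 *		TM_Result	res;
 *
 *		tuple.t_self = *tid;
 *		res = heap_lock_tuple(rel, &tuple, GetCurrentCommandId(true),
 *							  LockTupleExclusive, LockWaitBlock,
 *							  false, &buf, &tmfd);
 *		if (res != TM_Ok)
 *			... inspect res and tmfd ...
 *		ReleaseBuffer(buf);
 *
 * The buffer pin must be released by the caller on both the success and
 * failure paths.  In core, this is normally reached via table_tuple_lock()
 * and heapam_tuple_lock() rather than called directly.
 */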
3972 TM_Result
3973 heap_lock_tuple(Relation relation, HeapTuple tuple,
3974 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3975 bool follow_updates,
3976 Buffer *buffer, TM_FailureData *tmfd)
3977 {
3978 TM_Result result;
3979 ItemPointer tid = &(tuple->t_self);
3980 ItemId lp;
3981 Page page;
3982 Buffer vmbuffer = InvalidBuffer;
3983 BlockNumber block;
3984 TransactionId xid,
3985 xmax;
3986 uint16 old_infomask,
3987 new_infomask,
3988 new_infomask2;
3989 bool first_time = true;
3990 bool skip_tuple_lock = false;
3991 bool have_tuple_lock = false;
3992 bool cleared_all_frozen = false;
3993
3994 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3995 block = ItemPointerGetBlockNumber(tid);
3996
3997 /*
3998 * Before locking the buffer, pin the visibility map page if it appears to
3999 * be necessary. Since we haven't got the lock yet, someone else might be
4000 * in the middle of changing this, so we'll need to recheck after we have
4001 * the lock.
4002 */
4003 if (PageIsAllVisible(BufferGetPage(*buffer)))
4004 visibilitymap_pin(relation, block, &vmbuffer);
4005
4006 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4007
4008 page = BufferGetPage(*buffer);
4009 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4010 Assert(ItemIdIsNormal(lp));
4011
4012 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4013 tuple->t_len = ItemIdGetLength(lp);
4014 tuple->t_tableOid = RelationGetRelid(relation);
4015
4016 l3:
4017 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4018
4019 if (result == TM_Invisible)
4020 {
4021 /*
4022 * This is possible, but only when locking a tuple for ON CONFLICT
4023 * UPDATE. We return this value here rather than throwing an error in
4024 * order to give that case the opportunity to throw a more specific
4025 * error.
4026 */
4027 result = TM_Invisible;
4028 goto out_locked;
4029 }
4030 else if (result == TM_BeingModified ||
4031 result == TM_Updated ||
4032 result == TM_Deleted)
4033 {
4034 TransactionId xwait;
4035 uint16 infomask;
4036 uint16 infomask2;
4037 bool require_sleep;
4038 ItemPointerData t_ctid;
4039
4040 /* must copy state data before unlocking buffer */
4041 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4042 infomask = tuple->t_data->t_infomask;
4043 infomask2 = tuple->t_data->t_infomask2;
4044 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4045
4046 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4047
4048 /*
4049 * If any subtransaction of the current top transaction already holds
4050 * a lock as strong as or stronger than what we're requesting, we
4051 * effectively hold the desired lock already. We *must* succeed
4052 * without trying to take the tuple lock, else we will deadlock
4053 * against anyone wanting to acquire a stronger lock.
4054 *
4055 * Note we only do this the first time we loop on the HTSU result;
4056 * there is no point in testing in subsequent passes, because
4057 * evidently our own transaction cannot have acquired a new lock after
4058 * the first time we checked.
4059 */
4060 if (first_time)
4061 {
4062 first_time = false;
4063
4064 if (infomask & HEAP_XMAX_IS_MULTI)
4065 {
4066 int i;
4067 int nmembers;
4068 MultiXactMember *members;
4069
4070 /*
4071 * We don't need to allow old multixacts here; if that had
4072 * been the case, HeapTupleSatisfiesUpdate would have returned
4073 * TM_Ok and we wouldn't be here.
4074 */
4075 nmembers =
4076 GetMultiXactIdMembers(xwait, &members, false,
4077 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4078
4079 for (i = 0; i < nmembers; i++)
4080 {
4081 /* only consider members of our own transaction */
4082 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4083 continue;
4084
4085 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4086 {
4087 pfree(members);
4088 result = TM_Ok;
4089 goto out_unlocked;
4090 }
4091 else
4092 {
4093 /*
4094 * Disable acquisition of the heavyweight tuple lock.
4095 * Otherwise, when promoting a weaker lock, we might
4096 * deadlock with another locker that has acquired the
4097 * heavyweight tuple lock and is waiting for our
4098 * transaction to finish.
4099 *
4100 * Note that in this case we still need to wait for
4101 * the multixact if required, to avoid acquiring
4102 * conflicting locks.
4103 */
4104 skip_tuple_lock = true;
4105 }
4106 }
4107
4108 if (members)
4109 pfree(members);
4110 }
4111 else if (TransactionIdIsCurrentTransactionId(xwait))
4112 {
4113 switch (mode)
4114 {
4115 case LockTupleKeyShare:
4116 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4117 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4118 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4119 result = TM_Ok;
4120 goto out_unlocked;
4121 case LockTupleShare:
4122 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4123 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4124 {
4125 result = TM_Ok;
4126 goto out_unlocked;
4127 }
4128 break;
4129 case LockTupleNoKeyExclusive:
4130 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4131 {
4132 result = TM_Ok;
4133 goto out_unlocked;
4134 }
4135 break;
4136 case LockTupleExclusive:
4137 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4138 infomask2 & HEAP_KEYS_UPDATED)
4139 {
4140 result = TM_Ok;
4141 goto out_unlocked;
4142 }
4143 break;
4144 }
4145 }
4146 }
4147
4148 /*
4149 * Initially assume that we will have to wait for the locking
4150 * transaction(s) to finish. We check various cases below in which
4151 * this can be turned off.
4152 */
4153 require_sleep = true;
4154 if (mode == LockTupleKeyShare)
4155 {
4156 /*
4157 * If we're requesting KeyShare, and there's no update present, we
4158 * don't need to wait. Even if there is an update, we can still
4159 * continue if the key hasn't been modified.
4160 *
4161 * However, if there are updates, we need to walk the update chain
4162 * to mark future versions of the row as locked, too. That way,
4163 * if somebody deletes that future version, we're protected
4164 * against the key going away. This locking of future versions
4165 * could block momentarily, if a concurrent transaction is
4166 * deleting a key; or it could return a value to the effect that
4167 * the transaction deleting the key has already committed. So we
4168 * do this before re-locking the buffer; otherwise this would be
4169 * prone to deadlocks.
4170 *
4171 * Note that the TID we're locking was grabbed before we unlocked
4172 * the buffer. For it to change while we're not looking, the
4173 * other properties we're testing for below after re-locking the
4174 * buffer would also change, in which case we would restart this
4175 * loop above.
4176 */
4177 if (!(infomask2 & HEAP_KEYS_UPDATED))
4178 {
4179 bool updated;
4180
4181 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4182
4183 /*
4184 * If there are updates, follow the update chain; bail out if
4185 * that cannot be done.
4186 */
4187 if (follow_updates && updated)
4188 {
4189 TM_Result res;
4190
4191 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4192 GetCurrentTransactionId(),
4193 mode);
4194 if (res != TM_Ok)
4195 {
4196 result = res;
4197 /* recovery code expects to have buffer lock held */
4198 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4199 goto failed;
4200 }
4201 }
4202
4203 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4204
4205 /*
4206 * Make sure it's still an appropriate lock, else start over.
4207 * Also, if it wasn't updated before we released the lock, but
4208 * is updated now, we start over too; the reason is that we
4209 * now need to follow the update chain to lock the new
4210 * versions.
4211 */
4212 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4213 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4214 !updated))
4215 goto l3;
4216
4217 /* Things look okay, so we can skip sleeping */
4218 require_sleep = false;
4219
4220 /*
4221 * Note we allow Xmax to change here; other updaters/lockers
4222 * could have modified it before we grabbed the buffer lock.
4223 * However, this is not a problem, because with the recheck we
4224 * just did we ensure that they still don't conflict with the
4225 * lock we want.
4226 */
4227 }
4228 }
4229 else if (mode == LockTupleShare)
4230 {
4231 /*
4232 * If we're requesting Share, we can similarly avoid sleeping if
4233 * there's no update and no exclusive lock present.
4234 */
4235 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4236 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4237 {
4238 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4239
4240 /*
4241 * Make sure it's still an appropriate lock, else start over.
4242 * See above about allowing xmax to change.
4243 */
4244 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4245 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4246 goto l3;
4247 require_sleep = false;
4248 }
4249 }
4250 else if (mode == LockTupleNoKeyExclusive)
4251 {
4252 /*
4253 * If we're requesting NoKeyExclusive, we might also be able to
4254 * avoid sleeping; just ensure that there is no conflicting lock
4255 * already acquired.
4256 */
4257 if (infomask & HEAP_XMAX_IS_MULTI)
4258 {
4259 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4260 mode, NULL))
4261 {
4262 /*
4263 * No conflict, but if the xmax changed under us in the
4264 * meantime, start over.
4265 */
4266 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4267 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4268 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4269 xwait))
4270 goto l3;
4271
4272 /* otherwise, we're good */
4273 require_sleep = false;
4274 }
4275 }
4276 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4277 {
4278 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4279
4280 /* if the xmax changed in the meantime, start over */
4281 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4282 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4283 xwait))
4284 goto l3;
4285 /* otherwise, we're good */
4286 require_sleep = false;
4287 }
4288 }
4289
4290 /*
4291 * As a check independent from those above, we can also avoid sleeping
4292 * if the current transaction is the sole locker of the tuple. Note
4293 * that the strength of the lock already held is irrelevant; this is
4294 * not about recording the lock in Xmax (which will be done regardless
4295 * of this optimization, below). Also, note that the cases where we
4296 * hold a lock stronger than we are requesting are already handled
4297 * above by not doing anything.
4298 *
4299 * Note we only deal with the non-multixact case here; MultiXactIdWait
4300 * is well equipped to deal with this situation on its own.
4301 */
4302 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4303 TransactionIdIsCurrentTransactionId(xwait))
4304 {
4305 /* ... but if the xmax changed in the meantime, start over */
4306 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4307 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4308 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4309 xwait))
4310 goto l3;
4311 Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4312 require_sleep = false;
4313 }
4314
4315 /*
4316 * Time to sleep on the other transaction/multixact, if necessary.
4317 *
4318 * If the other transaction is an update/delete that's already
4319 * committed, then sleeping cannot possibly do any good: if we're
4320 * required to sleep, get out to raise an error instead.
4321 *
4322 * By here, we either have already acquired the buffer exclusive lock,
4323 * or we must wait for the locking transaction or multixact; so below
4324 * we ensure that we grab buffer lock after the sleep.
4325 */
4326 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4327 {
4328 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4329 goto failed;
4330 }
4331 else if (require_sleep)
4332 {
4333 /*
4334 * Acquire tuple lock to establish our priority for the tuple, or
4335 * die trying. LockTuple will release us when we are next-in-line
4336 * for the tuple. We must do this even if we are share-locking,
4337 * but not if we already have a weaker lock on the tuple.
4338 *
4339 * If we are forced to "start over" below, we keep the tuple lock;
4340 * this arranges that we stay at the head of the line while
4341 * rechecking tuple state.
4342 */
4343 if (!skip_tuple_lock &&
4344 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4345 &have_tuple_lock))
4346 {
4347 /*
4348 * This can only happen if wait_policy is Skip and the lock
4349 * couldn't be obtained.
4350 */
4351 result = TM_WouldBlock;
4352 /* recovery code expects to have buffer lock held */
4353 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4354 goto failed;
4355 }
4356
4357 if (infomask & HEAP_XMAX_IS_MULTI)
4358 {
4359 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4360
4361 /* We only ever lock tuples, never update them */
4362 if (status >= MultiXactStatusNoKeyUpdate)
4363 elog(ERROR, "invalid lock mode in heap_lock_tuple");
4364
4365 /* wait for multixact to end, or die trying */
4366 switch (wait_policy)
4367 {
4368 case LockWaitBlock:
4369 MultiXactIdWait((MultiXactId) xwait, status, infomask,
4370 relation, &tuple->t_self, XLTW_Lock, NULL);
4371 break;
4372 case LockWaitSkip:
4373 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4374 status, infomask, relation,
4375 NULL))
4376 {
4377 result = TM_WouldBlock;
4378 /* recovery code expects to have buffer lock held */
4379 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4380 goto failed;
4381 }
4382 break;
4383 case LockWaitError:
4384 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4385 status, infomask, relation,
4386 NULL))
4387 ereport(ERROR,
4388 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4389 errmsg("could not obtain lock on row in relation \"%s\"",
4390 RelationGetRelationName(relation))));
4391
4392 break;
4393 }
4394
4395 /*
4396 * Of course, the multixact might not be done here: if we're
4397 * requesting a light lock mode, other transactions with light
4398 * locks could still be alive, as well as locks owned by our
4399 * own xact or other subxacts of this backend. We need to
4400 * preserve the surviving MultiXact members. Note that it
4401 * isn't absolutely necessary in the latter case, but doing so
4402 * is simpler.
4403 */
4404 }
4405 else
4406 {
4407 /* wait for regular transaction to end, or die trying */
4408 switch (wait_policy)
4409 {
4410 case LockWaitBlock:
4411 XactLockTableWait(xwait, relation, &tuple->t_self,
4412 XLTW_Lock);
4413 break;
4414 case LockWaitSkip:
4415 if (!ConditionalXactLockTableWait(xwait))
4416 {
4417 result = TM_WouldBlock;
4418 /* recovery code expects to have buffer lock held */
4419 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4420 goto failed;
4421 }
4422 break;
4423 case LockWaitError:
4424 if (!ConditionalXactLockTableWait(xwait))
4425 ereport(ERROR,
4426 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4427 errmsg("could not obtain lock on row in relation \"%s\"",
4428 RelationGetRelationName(relation))));
4429 break;
4430 }
4431 }
4432
4433 /* if there are updates, follow the update chain */
4434 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4435 {
4436 TM_Result res;
4437
4438 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4439 GetCurrentTransactionId(),
4440 mode);
4441 if (res != TM_Ok)
4442 {
4443 result = res;
4444 /* recovery code expects to have buffer lock held */
4445 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4446 goto failed;
4447 }
4448 }
4449
4450 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4451
4452 /*
4453 * xwait is done, but if xwait had just locked the tuple then some
4454 * other xact could update this tuple before we get to this point.
4455 * Check for xmax change, and start over if so.
4456 */
4457 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4458 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4459 xwait))
4460 goto l3;
4461
4462 if (!(infomask & HEAP_XMAX_IS_MULTI))
4463 {
4464 /*
4465 * Otherwise check if it committed or aborted. Note we cannot
4466 * be here if the tuple was only locked by somebody who didn't
4467 * conflict with us; that would have been handled above. So
4468 * that transaction must necessarily be gone by now. But
4469 * don't check for this in the multixact case, because some
4470 * locker transactions might still be running.
4471 */
4472 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4473 }
4474 }
4475
4476 /* By here, we're certain that we hold buffer exclusive lock again */
4477
4478 /*
4479 * We may lock if previous xmax aborted, or if it committed but only
4480 * locked the tuple without updating it; or if we didn't have to wait
4481 * at all for whatever reason.
4482 */
4483 if (!require_sleep ||
4484 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4485 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4486 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4487 result = TM_Ok;
4488 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4489 HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4490 result = TM_Updated;
4491 else
4492 result = TM_Deleted;
4493 }
4494
4495 failed:
4496 if (result != TM_Ok)
4497 {
4498 Assert(result == TM_SelfModified || result == TM_Updated ||
4499 result == TM_Deleted || result == TM_WouldBlock);
4500 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4501 Assert(result != TM_Updated ||
4502 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4503 tmfd->ctid = tuple->t_data->t_ctid;
4504 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4505 if (result == TM_SelfModified)
4506 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4507 else
4508 tmfd->cmax = InvalidCommandId;
4509 goto out_locked;
4510 }
4511
4512 /*
4513 * If we didn't pin the visibility map page and the page has become all
4514 * visible while we were busy locking the buffer, or during some
4515 * subsequent window during which we had it unlocked, we'll have to unlock
4516 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4517 * unfortunate, especially since we'll now have to recheck whether the
4518 * tuple has been locked or updated under us, but hopefully it won't
4519 * happen very often.
4520 */
4521 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4522 {
4523 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4524 visibilitymap_pin(relation, block, &vmbuffer);
4525 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4526 goto l3;
4527 }
4528
4529 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4530 old_infomask = tuple->t_data->t_infomask;
4531
4532 /*
4533 * If this is the first possibly-multixact-able operation in the current
4534 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4535 * certain that the transaction will never become a member of any older
4536 * MultiXactIds than that. (We have to do this even if we end up just
4537 * using our own TransactionId below, since some other backend could
4538 * incorporate our XID into a MultiXact immediately afterwards.)
4539 */
4540 MultiXactIdSetOldestMember();
4541
4542 /*
4543 * Compute the new xmax and infomask to store into the tuple. Note we do
4544 * not modify the tuple just yet, because that would leave it in the wrong
4545 * state if multixact.c elogs.
4546 */
4547 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4548 GetCurrentTransactionId(), mode, false,
4549 &xid, &new_infomask, &new_infomask2);
4550
4551 START_CRIT_SECTION();
4552
4553 /*
4554 * Store transaction information of xact locking the tuple.
4555 *
4556 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4557 * possibly generating a useless combo CID. Moreover, if we're locking a
4558 * previously updated tuple, it's important to preserve the Cmax.
4559 *
4560 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4561 * we would break the HOT chain.
4562 */
4563 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4564 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4565 tuple->t_data->t_infomask |= new_infomask;
4566 tuple->t_data->t_infomask2 |= new_infomask2;
4567 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4568 HeapTupleHeaderClearHotUpdated(tuple->t_data);
4569 HeapTupleHeaderSetXmax(tuple->t_data, xid);
4570
4571 /*
4572 * Make sure there is no forward chain link in t_ctid. Note that in the
4573 * cases where the tuple has been updated, we must not overwrite t_ctid,
4574 * because it was set by the updater. Moreover, if the tuple has been
4575 * updated, we need to follow the update chain to lock the new versions of
4576 * the tuple as well.
4577 */
4578 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4579 tuple->t_data->t_ctid = *tid;
4580
4581 /* Clear only the all-frozen bit on visibility map if needed */
4582 if (PageIsAllVisible(page) &&
4583 visibilitymap_clear(relation, block, vmbuffer,
4584 VISIBILITYMAP_ALL_FROZEN))
4585 cleared_all_frozen = true;
4586
4587
4588 MarkBufferDirty(*buffer);
4589
4590 /*
4591 * XLOG stuff. You might think that we don't need an XLOG record because
4592 * there is no state change worth restoring after a crash. You would be
4593 * wrong however: we have just written either a TransactionId or a
4594 * MultiXactId that may never have been seen on disk before, and we need
4595 * to make sure that there are XLOG entries covering those ID numbers.
4596 * Else the same IDs might be re-used after a crash, which would be
4597 * disastrous if this page made it to disk before the crash. Essentially
4598 * we have to enforce the WAL log-before-data rule even in this case.
4599 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4600 * entries for everything anyway.)
4601 */
4602 if (RelationNeedsWAL(relation))
4603 {
4604 xl_heap_lock xlrec;
4605 XLogRecPtr recptr;
4606
4607 XLogBeginInsert();
4608 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4609
4610 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4611 xlrec.locking_xid = xid;
4612 xlrec.infobits_set = compute_infobits(new_infomask,
4613 tuple->t_data->t_infomask2);
4614 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4615 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4616
4617 /* we don't decode row locks atm, so no need to log the origin */
4618
4619 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4620
4621 PageSetLSN(page, recptr);
4622 }
4623
4624 END_CRIT_SECTION();
4625
4626 result = TM_Ok;
4627
4628 out_locked:
4629 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4630
4631 out_unlocked:
4632 if (BufferIsValid(vmbuffer))
4633 ReleaseBuffer(vmbuffer);
4634
4635 /*
4636 * Don't update the visibility map here. Locking a tuple doesn't change
4637 * visibility info.
4638 */
4639
4640 /*
4641 * Now that we have successfully marked the tuple as locked, we can
4642 * release the lmgr tuple lock, if we had it.
4643 */
4644 if (have_tuple_lock)
4645 UnlockTupleTuplock(relation, tid, mode);
4646
4647 return result;
4648 }
4649
4650 /*
4651 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4652 * its normal, Xmax-based tuple lock.
4653 *
4654 * have_tuple_lock is an input and output parameter: on input, it indicates
4655 * whether the lock has previously been acquired (and this function does
4656 * nothing in that case). If this function returns success, have_tuple_lock
4657 * has been flipped to true.
4658 *
4659 * Returns false if it was unable to obtain the lock; this can only happen if
4660 * wait_policy is Skip.
4661 */
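/*
 * Illustrative call pattern (added; mirrors the use in heap_lock_tuple
 * above, with hypothetical variables):
 *
 *		bool		have_tuple_lock = false;
 *
 *		if (!heap_acquire_tuplock(rel, tid, LockTupleExclusive,
 *								  LockWaitSkip, &have_tuple_lock))
 *			return TM_WouldBlock;
 *		...
 *		if (have_tuple_lock)
 *			UnlockTupleTuplock(rel, tid, LockTupleExclusive);
 */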
4662 static bool
4663 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4664 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4665 {
4666 if (*have_tuple_lock)
4667 return true;
4668
4669 switch (wait_policy)
4670 {
4671 case LockWaitBlock:
4672 LockTupleTuplock(relation, tid, mode);
4673 break;
4674
4675 case LockWaitSkip:
4676 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4677 return false;
4678 break;
4679
4680 case LockWaitError:
4681 if (!ConditionalLockTupleTuplock(relation, tid, mode))
4682 ereport(ERROR,
4683 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4684 errmsg("could not obtain lock on row in relation \"%s\"",
4685 RelationGetRelationName(relation))));
4686 break;
4687 }
4688 *have_tuple_lock = true;
4689
4690 return true;
4691 }
4692
4693 /*
4694 * Given an original set of Xmax and infomask, and a transaction (identified by
4695 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4696 * corresponding infomasks to use on the tuple.
4697 *
4698 * Note that this might have side effects such as creating a new MultiXactId.
4699 *
4700 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4701 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4702 * but it was not running anymore. There is a race condition, which is that the
4703 * MultiXactId may have finished since then, but that uncommon case is handled
4704 * either here, or within MultiXactIdExpand.
4705 *
4706 * There is a similar race condition possible when the old xmax was a regular
4707 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4708 * window, but it's still possible to end up creating an unnecessary
4709 * MultiXactId. Fortunately this is harmless.
4710 */
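/*
 * Worked example (added for illustration): if the old tuple has
 * HEAP_XMAX_INVALID set (no live locker) and the current transaction asks
 * for a FOR KEY SHARE lock, the first branch below produces
 *
 *		*result_xmax	  = add_to_xmax (our own xid)
 *		*result_infomask  = HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_KEYSHR_LOCK
 *		*result_infomask2 = 0
 *
 * whereas a true update (is_update) under LockTupleExclusive yields a bare
 * xid in *result_xmax and HEAP_KEYS_UPDATED in *result_infomask2.
 */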
4711 static void
4712 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4713 uint16 old_infomask2, TransactionId add_to_xmax,
4714 LockTupleMode mode, bool is_update,
4715 TransactionId *result_xmax, uint16 *result_infomask,
4716 uint16 *result_infomask2)
4717 {
4718 TransactionId new_xmax;
4719 uint16 new_infomask,
4720 new_infomask2;
4721
4722 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4723
4724 l5:
4725 new_infomask = 0;
4726 new_infomask2 = 0;
4727 if (old_infomask & HEAP_XMAX_INVALID)
4728 {
4729 /*
4730 * No previous locker; we just insert our own TransactionId.
4731 *
4732 * Note that it's critical that this case be the first one checked,
4733 * because there are several blocks below that come back to this one
4734 * to implement certain optimizations; old_infomask might contain
4735 * other dirty bits in those cases, but we don't really care.
4736 */
4737 if (is_update)
4738 {
4739 new_xmax = add_to_xmax;
4740 if (mode == LockTupleExclusive)
4741 new_infomask2 |= HEAP_KEYS_UPDATED;
4742 }
4743 else
4744 {
4745 new_infomask |= HEAP_XMAX_LOCK_ONLY;
4746 switch (mode)
4747 {
4748 case LockTupleKeyShare:
4749 new_xmax = add_to_xmax;
4750 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4751 break;
4752 case LockTupleShare:
4753 new_xmax = add_to_xmax;
4754 new_infomask |= HEAP_XMAX_SHR_LOCK;
4755 break;
4756 case LockTupleNoKeyExclusive:
4757 new_xmax = add_to_xmax;
4758 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4759 break;
4760 case LockTupleExclusive:
4761 new_xmax = add_to_xmax;
4762 new_infomask |= HEAP_XMAX_EXCL_LOCK;
4763 new_infomask2 |= HEAP_KEYS_UPDATED;
4764 break;
4765 default:
4766 new_xmax = InvalidTransactionId; /* silence compiler */
4767 elog(ERROR, "invalid lock mode");
4768 }
4769 }
4770 }
4771 else if (old_infomask & HEAP_XMAX_IS_MULTI)
4772 {
4773 MultiXactStatus new_status;
4774
4775 /*
4776 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4777 * cross-check.
4778 */
4779 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4780
4781 /*
4782 * A multixact together with LOCK_ONLY set but neither lock bit set
4783 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4784 * anymore. This check is critical for databases upgraded by
4785 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4786 * that such multis are never passed.
4787 */
4788 if (HEAP_LOCKED_UPGRADED(old_infomask))
4789 {
4790 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4791 old_infomask |= HEAP_XMAX_INVALID;
4792 goto l5;
4793 }
4794
4795 /*
4796 * If the XMAX is already a MultiXactId, then we need to expand it to
4797 * include add_to_xmax; but if all the members were lockers and are
4798 * all gone, we can do away with the IS_MULTI bit and just set
4799 * add_to_xmax as the only locker/updater. If all lockers are gone
4800 * and we have an updater that aborted, we can also do without a
4801 * multi.
4802 *
4803 * The cost of doing GetMultiXactIdMembers would be paid by
4804 * MultiXactIdExpand if we weren't to do this, so this check is not
4805 * incurring extra work anyhow.
4806 */
4807 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4808 {
4809 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4810 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4811 old_infomask)))
4812 {
4813 /*
4814 * Reset these bits and restart; otherwise fall through to
4815 * create a new multi below.
4816 */
4817 old_infomask &= ~HEAP_XMAX_IS_MULTI;
4818 old_infomask |= HEAP_XMAX_INVALID;
4819 goto l5;
4820 }
4821 }
4822
4823 new_status = get_mxact_status_for_lock(mode, is_update);
4824
4825 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4826 new_status);
4827 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4828 }
4829 else if (old_infomask & HEAP_XMAX_COMMITTED)
4830 {
4831 /*
4832 * It's a committed update, so we need to preserve it as the updater of
4833 * the tuple.
4834 */
4835 MultiXactStatus status;
4836 MultiXactStatus new_status;
4837
4838 if (old_infomask2 & HEAP_KEYS_UPDATED)
4839 status = MultiXactStatusUpdate;
4840 else
4841 status = MultiXactStatusNoKeyUpdate;
4842
4843 new_status = get_mxact_status_for_lock(mode, is_update);
4844
4845 /*
4846 * since it's not running, it's obviously impossible for the old
4847 * updater to be identical to the current one, so we need not check
4848 * for that case as we do in the block above.
4849 */
4850 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4851 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4852 }
4853 else if (TransactionIdIsInProgress(xmax))
4854 {
4855 /*
4856 * If the XMAX is a valid, in-progress TransactionId, then we need to
4857 * create a new MultiXactId that includes both the old locker or
4858 * updater and our own TransactionId.
4859 */
4860 MultiXactStatus new_status;
4861 MultiXactStatus old_status;
4862 LockTupleMode old_mode;
4863
4864 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4865 {
4866 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4867 old_status = MultiXactStatusForKeyShare;
4868 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4869 old_status = MultiXactStatusForShare;
4870 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4871 {
4872 if (old_infomask2 & HEAP_KEYS_UPDATED)
4873 old_status = MultiXactStatusForUpdate;
4874 else
4875 old_status = MultiXactStatusForNoKeyUpdate;
4876 }
4877 else
4878 {
4879 /*
4880 * LOCK_ONLY can be present alone only when a page has been
4881 * upgraded by pg_upgrade. But in that case,
4882 * TransactionIdIsInProgress() should have returned false. We
4883 * assume it's no longer locked in this case.
4884 */
4885 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4886 old_infomask |= HEAP_XMAX_INVALID;
4887 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4888 goto l5;
4889 }
4890 }
4891 else
4892 {
4893 /* it's an update, but which kind? */
4894 if (old_infomask2 & HEAP_KEYS_UPDATED)
4895 old_status = MultiXactStatusUpdate;
4896 else
4897 old_status = MultiXactStatusNoKeyUpdate;
4898 }
4899
4900 old_mode = TUPLOCK_from_mxstatus(old_status);
4901
4902 /*
4903 * If the lock to be acquired is for the same TransactionId as the
4904 * existing lock, there's an optimization possible: consider only the
4905 * strongest of both locks as the only one present, and restart.
4906 */
4907 if (xmax == add_to_xmax)
4908 {
4909 /*
4910 * Note that it's not possible for the original tuple to be
4911 * updated: we wouldn't be here because the tuple would have been
4912 * invisible and we wouldn't try to update it. As a subtlety,
4913 * this code can also run when traversing an update chain to lock
4914 * future versions of a tuple. But we wouldn't be here either,
4915 * because the add_to_xmax would be different from the original
4916 * updater.
4917 */
4918 Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4919
4920 /* acquire the strongest of both */
4921 if (mode < old_mode)
4922 mode = old_mode;
4923 /* mustn't touch is_update */
4924
4925 old_infomask |= HEAP_XMAX_INVALID;
4926 goto l5;
4927 }
4928
4929 /* otherwise, just fall back to creating a new multixact */
4930 new_status = get_mxact_status_for_lock(mode, is_update);
4931 new_xmax = MultiXactIdCreate(xmax, old_status,
4932 add_to_xmax, new_status);
4933 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4934 }
4935 else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4936 TransactionIdDidCommit(xmax))
4937 {
4938 /*
4939 * It's a committed update, so we need to preserve it as the updater of the
4940 * tuple.
4941 */
4942 MultiXactStatus status;
4943 MultiXactStatus new_status;
4944
4945 if (old_infomask2 & HEAP_KEYS_UPDATED)
4946 status = MultiXactStatusUpdate;
4947 else
4948 status = MultiXactStatusNoKeyUpdate;
4949
4950 new_status = get_mxact_status_for_lock(mode, is_update);
4951
4952 /*
4953 * since it's not running, it's obviously impossible for the old
4954 * updater to be identical to the current one, so we need not check
4955 * for that case as we do in the block above.
4956 */
4957 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4958 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4959 }
4960 else
4961 {
4962 /*
4963 * Can get here iff the locking/updating transaction was running when
4964 * the infomask was extracted from the tuple, but finished before
4965 * TransactionIdIsInProgress got to run. Deal with it as if there was
4966 * no locker at all in the first place.
4967 */
4968 old_infomask |= HEAP_XMAX_INVALID;
4969 goto l5;
4970 }
4971
4972 *result_infomask = new_infomask;
4973 *result_infomask2 = new_infomask2;
4974 *result_xmax = new_xmax;
4975 }
4976
4977 /*
4978 * Subroutine for heap_lock_updated_tuple_rec.
4979 *
4980 * Given a hypothetical multixact status held by the transaction identified
4981 * with the given xid, does the current transaction need to wait, fail, or can
4982 * it continue if it wanted to acquire a lock of the given mode? "needwait"
4983 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4984 * returned. If the lock is already held by the current transaction, return
4985 * TM_SelfModified. In case of a conflict with another transaction, a
4986 * different HeapTupleSatisfiesUpdate return code is returned.
4987 *
4988 * The held status is said to be hypothetical because it might correspond to a
4989 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4990 * way for simplicity of API.
4991 */
4992 static TM_Result
4993 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4994 LockTupleMode mode, HeapTuple tup,
4995 bool *needwait)
4996 {
4997 MultiXactStatus wantedstatus;
4998
4999 *needwait = false;
5000 wantedstatus = get_mxact_status_for_lock(mode, false);
5001
5002 /*
5003 * Note: we *must* check TransactionIdIsInProgress before
5004 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5005 * for an explanation.
5006 */
5007 if (TransactionIdIsCurrentTransactionId(xid))
5008 {
5009 /*
5010 * The tuple has already been locked by our own transaction. This is
5011 * very rare but can happen if multiple transactions are trying to
5012 * lock an ancient version of the same tuple.
5013 */
5014 return TM_SelfModified;
5015 }
5016 else if (TransactionIdIsInProgress(xid))
5017 {
5018 /*
5019 * If the locking transaction is running, what we do depends on
5020 * whether the lock modes conflict: if they do, then we must wait for
5021 * it to finish; otherwise we can fall through to lock this tuple
5022 * version without waiting.
5023 */
5024 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5025 LOCKMODE_from_mxstatus(wantedstatus)))
5026 {
5027 *needwait = true;
5028 }
5029
5030 /*
5031 * If we set needwait above, then this value doesn't matter;
5032 * otherwise, this value signals to caller that it's okay to proceed.
5033 */
5034 return TM_Ok;
5035 }
5036 else if (TransactionIdDidAbort(xid))
5037 return TM_Ok;
5038 else if (TransactionIdDidCommit(xid))
5039 {
5040 /*
5041 * The other transaction committed. If it was only a locker, then the
5042 * lock is completely gone now and we can return success; but if it
5043 * was an update, then what we do depends on whether the two lock
5044 * modes conflict. If they conflict, then we must report error to
5045 * caller. But if they don't, we can fall through to allow the current
5046 * transaction to lock the tuple.
5047 *
5048 * Note: the reason we worry about ISUPDATE here is because as soon as
5049 * a transaction ends, all its locks are gone and meaningless, and
5050 * thus we can ignore them; whereas its updates persist. In the
5051 * TransactionIdIsInProgress case, above, we don't need to check
5052 * because we know the lock is still "alive" and thus a conflict needs
5053 * always be checked.
5054 */
5055 if (!ISUPDATE_from_mxstatus(status))
5056 return TM_Ok;
5057
5058 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5059 LOCKMODE_from_mxstatus(wantedstatus)))
5060 {
5061 /* bummer */
5062 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5063 HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5064 return TM_Updated;
5065 else
5066 return TM_Deleted;
5067 }
5068
5069 return TM_Ok;
5070 }
5071
5072 /* Not in progress, not aborted, not committed -- must have crashed */
5073 return TM_Ok;
5074 }
5075
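/*
 * Illustrative sketch only (not part of the original file): the calling
 * protocol that heap_lock_updated_tuple_rec (below) follows for each
 * locker/updater it finds.  The helper name and arguments are invented;
 * the point is the needwait/result contract: sleep and retry when asked
 * to, bail out on anything other than TM_Ok.
 */
#ifdef NOT_USED
static TM_Result
check_one_locker_sketch(Relation rel, HeapTuple tup, Buffer buf,
						MultiXactStatus memstatus, TransactionId memxid,
						LockTupleMode mode)
{
	TM_Result	res;
	bool		needwait;

	res = test_lockmode_for_conflict(memstatus, memxid, mode, tup, &needwait);
	if (needwait)
	{
		/* release the buffer lock before sleeping on the other transaction */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		XactLockTableWait(memxid, rel, &tup->t_self, XLTW_LockUpdated);
		/* the caller would now re-lock the buffer and retry from scratch */
	}
	return res;
}
#endif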
5076
5077 /*
5078 * Recursive part of heap_lock_updated_tuple
5079 *
5080 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5081 * xid with the given mode; if this tuple is updated, recurse to lock the new
5082 * version as well.
5083 */
5084 static TM_Result
5085 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5086 LockTupleMode mode)
5087 {
5088 TM_Result result;
5089 ItemPointerData tupid;
5090 HeapTupleData mytup;
5091 Buffer buf;
5092 uint16 new_infomask,
5093 new_infomask2,
5094 old_infomask,
5095 old_infomask2;
5096 TransactionId xmax,
5097 new_xmax;
5098 TransactionId priorXmax = InvalidTransactionId;
5099 bool cleared_all_frozen = false;
5100 bool pinned_desired_page;
5101 Buffer vmbuffer = InvalidBuffer;
5102 BlockNumber block;
5103
5104 ItemPointerCopy(tid, &tupid);
5105
5106 for (;;)
5107 {
5108 new_infomask = 0;
5109 new_xmax = InvalidTransactionId;
5110 block = ItemPointerGetBlockNumber(&tupid);
5111 ItemPointerCopy(&tupid, &(mytup.t_self));
5112
5113 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5114 {
5115 /*
5116 * if we fail to find the updated version of the tuple, it's
5117 * because it was vacuumed/pruned away after its creator
5118 * transaction aborted. So behave as if we got to the end of the
5119 * chain, and there's no further tuple to lock: return success to
5120 * caller.
5121 */
5122 result = TM_Ok;
5123 goto out_unlocked;
5124 }
5125
5126 l4:
5127 CHECK_FOR_INTERRUPTS();
5128
5129 /*
5130 * Before locking the buffer, pin the visibility map page if it
5131 * appears to be necessary. Since we haven't got the lock yet,
5132 * someone else might be in the middle of changing this, so we'll need
5133 * to recheck after we have the lock.
5134 */
5135 if (PageIsAllVisible(BufferGetPage(buf)))
5136 {
5137 visibilitymap_pin(rel, block, &vmbuffer);
5138 pinned_desired_page = true;
5139 }
5140 else
5141 pinned_desired_page = false;
5142
5143 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5144
5145 /*
5146 * If we didn't pin the visibility map page and the page has become
5147 * all visible while we were busy locking the buffer, we'll have to
5148 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5149 * That's a bit unfortunate, but hopefully shouldn't happen often.
5150 *
5151 * Note: in some paths through this function, we will reach here
5152 * holding a pin on a vm page that may or may not be the one matching
5153 * this page. If this page isn't all-visible, we won't use the vm
5154 * page, but we hold onto such a pin till the end of the function.
5155 */
5156 if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5157 {
5158 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5159 visibilitymap_pin(rel, block, &vmbuffer);
5160 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5161 }
5162
5163 /*
5164 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5165 * end of the chain, we're done, so return success.
5166 */
5167 if (TransactionIdIsValid(priorXmax) &&
5168 !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5169 priorXmax))
5170 {
5171 result = TM_Ok;
5172 goto out_locked;
5173 }
5174
5175 /*
5176 * Also check Xmin: if this tuple was created by an aborted
5177 * (sub)transaction, then we already locked the last live one in the
5178 * chain, thus we're done, so return success.
5179 */
5180 if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5181 {
5182 result = TM_Ok;
5183 goto out_locked;
5184 }
5185
5186 old_infomask = mytup.t_data->t_infomask;
5187 old_infomask2 = mytup.t_data->t_infomask2;
5188 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5189
5190 /*
5191 * If this tuple version has been updated or locked by some concurrent
5192 * transaction(s), what we do depends on whether our lock mode
5193 * conflicts with what those other transactions hold, and also on the
5194 * status of them.
5195 */
5196 if (!(old_infomask & HEAP_XMAX_INVALID))
5197 {
5198 TransactionId rawxmax;
5199 bool needwait;
5200
5201 rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5202 if (old_infomask & HEAP_XMAX_IS_MULTI)
5203 {
5204 int nmembers;
5205 int i;
5206 MultiXactMember *members;
5207
5208 /*
5209 * We don't need a test for pg_upgrade'd tuples: this is only
5210 * applied to tuples after the first in an update chain. Said
5211 * first tuple in the chain may well be locked-in-9.2-and-
5212 * pg_upgraded, but that one was already locked by our caller,
5213 * not us; and any subsequent ones cannot be because our
5214 * caller must necessarily have obtained a snapshot later than
5215 * the pg_upgrade itself.
5216 */
5217 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5218
5219 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5220 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5221 for (i = 0; i < nmembers; i++)
5222 {
5223 result = test_lockmode_for_conflict(members[i].status,
5224 members[i].xid,
5225 mode,
5226 &mytup,
5227 &needwait);
5228
5229 /*
5230 * If the tuple was already locked by ourselves in a
5231 * previous iteration of this (say heap_lock_tuple was
5232 * forced to restart the locking loop because of a change
5233 * in xmax), then we hold the lock already on this tuple
5234 * version and we don't need to do anything; and this is
5235 * not an error condition either. We just need to skip
5236 * this tuple and continue locking the next version in the
5237 * update chain.
5238 */
5239 if (result == TM_SelfModified)
5240 {
5241 pfree(members);
5242 goto next;
5243 }
5244
5245 if (needwait)
5246 {
5247 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5248 XactLockTableWait(members[i].xid, rel,
5249 &mytup.t_self,
5250 XLTW_LockUpdated);
5251 pfree(members);
5252 goto l4;
5253 }
5254 if (result != TM_Ok)
5255 {
5256 pfree(members);
5257 goto out_locked;
5258 }
5259 }
5260 if (members)
5261 pfree(members);
5262 }
5263 else
5264 {
5265 MultiXactStatus status;
5266
5267 /*
5268 * For a non-multi Xmax, we first need to compute the
5269 * corresponding MultiXactStatus by using the infomask bits.
5270 */
5271 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5272 {
5273 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5274 status = MultiXactStatusForKeyShare;
5275 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5276 status = MultiXactStatusForShare;
5277 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5278 {
5279 if (old_infomask2 & HEAP_KEYS_UPDATED)
5280 status = MultiXactStatusForUpdate;
5281 else
5282 status = MultiXactStatusForNoKeyUpdate;
5283 }
5284 else
5285 {
5286 /*
5287 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5288 * as share-locked in the old cluster) shouldn't be
5289 * seen in the middle of an update chain.
5290 */
5291 elog(ERROR, "invalid lock status in tuple");
5292 }
5293 }
5294 else
5295 {
5296 /* it's an update, but which kind? */
5297 if (old_infomask2 & HEAP_KEYS_UPDATED)
5298 status = MultiXactStatusUpdate;
5299 else
5300 status = MultiXactStatusNoKeyUpdate;
5301 }
5302
5303 result = test_lockmode_for_conflict(status, rawxmax, mode,
5304 &mytup, &needwait);
5305
5306 /*
5307 * If the tuple was already locked by ourselves in a previous
5308 * iteration of this (say heap_lock_tuple was forced to
5309 * restart the locking loop because of a change in xmax), then
5310 * we hold the lock already on this tuple version and we don't
5311 * need to do anything; and this is not an error condition
5312 * either. We just need to skip this tuple and continue
5313 * locking the next version in the update chain.
5314 */
5315 if (result == TM_SelfModified)
5316 goto next;
5317
5318 if (needwait)
5319 {
5320 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5321 XactLockTableWait(rawxmax, rel, &mytup.t_self,
5322 XLTW_LockUpdated);
5323 goto l4;
5324 }
5325 if (result != TM_Ok)
5326 {
5327 goto out_locked;
5328 }
5329 }
5330 }
5331
5332 /* compute the new Xmax and infomask values for the tuple ... */
5333 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5334 xid, mode, false,
5335 &new_xmax, &new_infomask, &new_infomask2);
5336
5337 if (PageIsAllVisible(BufferGetPage(buf)) &&
5338 visibilitymap_clear(rel, block, vmbuffer,
5339 VISIBILITYMAP_ALL_FROZEN))
5340 cleared_all_frozen = true;
5341
5342 START_CRIT_SECTION();
5343
5344 /* ... and set them */
5345 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5346 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5347 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5348 mytup.t_data->t_infomask |= new_infomask;
5349 mytup.t_data->t_infomask2 |= new_infomask2;
5350
5351 MarkBufferDirty(buf);
5352
5353 /* XLOG stuff */
5354 if (RelationNeedsWAL(rel))
5355 {
5356 xl_heap_lock_updated xlrec;
5357 XLogRecPtr recptr;
5358 Page page = BufferGetPage(buf);
5359
5360 XLogBeginInsert();
5361 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5362
5363 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5364 xlrec.xmax = new_xmax;
5365 xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5366 xlrec.flags =
5367 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5368
5369 XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5370
5371 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5372
5373 PageSetLSN(page, recptr);
5374 }
5375
5376 END_CRIT_SECTION();
5377
5378 next:
5379 /* if we find the end of the update chain, we're done. */
5380 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5381 HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5382 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5383 HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5384 {
5385 result = TM_Ok;
5386 goto out_locked;
5387 }
5388
5389 /* tail recursion */
5390 priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5391 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5392 UnlockReleaseBuffer(buf);
5393 }
5394
5395 result = TM_Ok;
5396
5397 out_locked:
5398 UnlockReleaseBuffer(buf);
5399
5400 out_unlocked:
5401 if (vmbuffer != InvalidBuffer)
5402 ReleaseBuffer(vmbuffer);
5403
5404 return result;
5405 }
5406
5407 /*
5408 * heap_lock_updated_tuple
5409 * Follow update chain when locking an updated tuple, acquiring locks (row
5410 * marks) on the updated versions.
5411 *
5412 * The initial tuple is assumed to be already locked.
5413 *
5414 * This function doesn't check visibility, it just unconditionally marks the
5415 * tuple(s) as locked. If any tuple in the updated chain is being deleted
5416 * concurrently (or updated with the key being modified), sleep until the
5417 * transaction doing it is finished.
5418 *
5419 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5420 * when we have to wait for other transactions to release them, as opposed to
5421 * what heap_lock_tuple does. The reason is that having more than one
5422 * transaction walking the chain is probably uncommon enough that risk of
5423 * starvation is not likely: one of the preconditions for being here is that
5424 * the snapshot in use predates the update that created this tuple (because we
5425 * started at an earlier version of the tuple), but at the same time such a
5426 * transaction cannot be using repeatable read or serializable isolation
5427 * levels, because that would lead to a serializability failure.
5428 */
5429 static TM_Result
5430 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5431 TransactionId xid, LockTupleMode mode)
5432 {
5433 /*
5434 * If the tuple has not been updated, or has moved into another partition
5435 * (effectively a delete) stop here.
5436 */
5437 if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5438 !ItemPointerEquals(&tuple->t_self, ctid))
5439 {
5440 /*
5441 * If this is the first possibly-multixact-able operation in the
5442 * current transaction, set my per-backend OldestMemberMXactId
5443 * setting. We can be certain that the transaction will never become a
5444 * member of any older MultiXactIds than that. (We have to do this
5445 * even if we end up just using our own TransactionId below, since
5446 * some other backend could incorporate our XID into a MultiXact
5447 * immediately afterwards.)
5448 */
5449 MultiXactIdSetOldestMember();
5450
5451 return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5452 }
5453
5454 /* nothing to lock */
5455 return TM_Ok;
5456 }
5457
5458 /*
5459 * heap_finish_speculative - mark speculative insertion as successful
5460 *
5461 * To successfully finish a speculative insertion we have to clear the
5462 * speculative token from the tuple. To do so the t_ctid field, which will contain a
5463 * speculative token value, is modified in place to point to the tuple itself,
5464 * which is characteristic of a newly inserted ordinary tuple.
5465 *
5466 * NB: It is not ok to commit without either finishing or aborting a
5467 * speculative insertion. We could treat speculative tuples of committed
5468 * transactions implicitly as completed, but then we would have to be prepared
5469 * to deal with speculative tokens on committed tuples. That wouldn't be
5470 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5471 * but clearing the token at completion isn't very expensive either.
5472 * An explicit confirmation WAL record also makes logical decoding simpler.
5473 */
5474 void
5475 heap_finish_speculative(Relation relation, ItemPointer tid)
5476 {
5477 Buffer buffer;
5478 Page page;
5479 OffsetNumber offnum;
5480 ItemId lp = NULL;
5481 HeapTupleHeader htup;
5482
5483 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5484 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5485 page = (Page) BufferGetPage(buffer);
5486
5487 offnum = ItemPointerGetOffsetNumber(tid);
5488 if (PageGetMaxOffsetNumber(page) >= offnum)
5489 lp = PageGetItemId(page, offnum);
5490
5491 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5492 elog(ERROR, "invalid lp");
5493
5494 htup = (HeapTupleHeader) PageGetItem(page, lp);
5495
5496 /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5497 StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5498 "invalid speculative token constant");
5499
5500 /* NO EREPORT(ERROR) from here till changes are logged */
5501 START_CRIT_SECTION();
5502
5503 Assert(HeapTupleHeaderIsSpeculative(htup));
5504
5505 MarkBufferDirty(buffer);
5506
5507 /*
5508 * Replace the speculative insertion token with a real t_ctid, pointing to
5509 * itself like it does on regular tuples.
5510 */
5511 htup->t_ctid = *tid;
5512
5513 /* XLOG stuff */
5514 if (RelationNeedsWAL(relation))
5515 {
5516 xl_heap_confirm xlrec;
5517 XLogRecPtr recptr;
5518
5519 xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5520
5521 XLogBeginInsert();
5522
5523 /* We want the same filtering on this as on a plain insert */
5524 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5525
5526 XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5527 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5528
5529 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5530
5531 PageSetLSN(page, recptr);
5532 }
5533
5534 END_CRIT_SECTION();
5535
5536 UnlockReleaseBuffer(buffer);
5537 }
5538
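/*
 * Illustrative sketch only (not part of the original file): how a caller
 * pairs heap_insert(..., HEAP_INSERT_SPECULATIVE) with the two completion
 * routines.  The real driver lives in the table AM layer; the function and
 * parameter names here are invented for the sketch.
 */
#ifdef NOT_USED
static void
speculative_insert_sketch(Relation rel, HeapTuple tup, CommandId cid,
						  uint32 spec_token, bool conflict_detected)
{
	/* stamp the speculative token into t_ctid before inserting */
	HeapTupleHeaderSetSpeculativeToken(tup->t_data, spec_token);
	heap_insert(rel, tup, cid, HEAP_INSERT_SPECULATIVE, NULL);

	if (!conflict_detected)
		heap_finish_speculative(rel, &tup->t_self); /* becomes an ordinary tuple */
	else
		heap_abort_speculative(rel, &tup->t_self);	/* immediately dead to everyone */
}
#endif
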
5539 /*
5540 * heap_abort_speculative - kill a speculatively inserted tuple
5541 *
5542 * Marks a tuple that was speculatively inserted in the same command as dead,
5543 * by setting its xmin as invalid. That makes it immediately appear as dead
5544 * to all transactions, including our own. In particular, it makes
5545 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5546 * inserting a duplicate key value won't unnecessarily wait for our whole
5547 * transaction to finish (it'll just wait for our speculative insertion to
5548 * finish).
5549 *
5550 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5551 * that arise due to a mutual dependency that is not user visible. By
5552 * definition, unprincipled deadlocks cannot be prevented by the user
5553 * reordering lock acquisition in client code, because the implementation-level
5554 * lock acquisitions are not under the user's direct control. If speculative
5555 * inserters did not take this precaution, then under high concurrency they
5556 * could deadlock with each other, which would not be acceptable.
5557 *
5558 * This is somewhat redundant with heap_delete, but we prefer to have a
5559 * dedicated routine with stripped down requirements. Note that this is also
5560 * used to delete the TOAST tuples created during speculative insertion.
5561 *
5562 * This routine does not affect logical decoding as it only looks at
5563 * confirmation records.
5564 */
5565 void
5566 heap_abort_speculative(Relation relation, ItemPointer tid)
5567 {
5568 TransactionId xid = GetCurrentTransactionId();
5569 ItemId lp;
5570 HeapTupleData tp;
5571 Page page;
5572 BlockNumber block;
5573 Buffer buffer;
5574 TransactionId prune_xid;
5575
5576 Assert(ItemPointerIsValid(tid));
5577
5578 block = ItemPointerGetBlockNumber(tid);
5579 buffer = ReadBuffer(relation, block);
5580 page = BufferGetPage(buffer);
5581
5582 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5583
5584 /*
5585 * Page can't be all visible, we just inserted into it, and are still
5586 * running.
5587 */
5588 Assert(!PageIsAllVisible(page));
5589
5590 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5591 Assert(ItemIdIsNormal(lp));
5592
5593 tp.t_tableOid = RelationGetRelid(relation);
5594 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5595 tp.t_len = ItemIdGetLength(lp);
5596 tp.t_self = *tid;
5597
5598 /*
5599 * Sanity check that the tuple really is a speculatively inserted tuple,
5600 * inserted by us.
5601 */
5602 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5603 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5604 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5605 elog(ERROR, "attempted to kill a non-speculative tuple");
5606 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5607
5608 /*
5609 * No need to check for serializable conflicts here. There is never a
5610 * need for a combocid, either. No need to extract replica identity, or
5611 * do anything special with infomask bits.
5612 */
5613
5614 START_CRIT_SECTION();
5615
5616 /*
5617 * The tuple will become DEAD immediately. Flag that this page is a
5618 * candidate for pruning by setting xmin to TransactionXmin. While not
5619 * immediately prunable, it is the oldest xid we can cheaply determine
5620 * that's safe against wraparound / being older than the table's
5621 * relfrozenxid. To defend against the unlikely case of a new relation
5622 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
5623 * if so (vacuum can't subsequently move relfrozenxid to beyond
5624 * TransactionXmin, so there's no race here).
5625 */
5626 Assert(TransactionIdIsValid(TransactionXmin));
5627 if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
5628 prune_xid = relation->rd_rel->relfrozenxid;
5629 else
5630 prune_xid = TransactionXmin;
5631 PageSetPrunable(page, prune_xid);
5632
5633 /* store transaction information of xact deleting the tuple */
5634 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5635 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5636
5637 /*
5638 * Set the tuple header xmin to InvalidTransactionId. This makes the
5639 * tuple immediately invisible to everyone. (In particular, to any
5640 * transactions waiting on the speculative token, woken up later.)
5641 */
5642 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5643
5644 /* Clear the speculative insertion token too */
5645 tp.t_data->t_ctid = tp.t_self;
5646
5647 MarkBufferDirty(buffer);
5648
5649 /*
5650 * XLOG stuff
5651 *
5652 * The WAL records generated here match heap_delete(). The same recovery
5653 * routines are used.
5654 */
5655 if (RelationNeedsWAL(relation))
5656 {
5657 xl_heap_delete xlrec;
5658 XLogRecPtr recptr;
5659
5660 xlrec.flags = XLH_DELETE_IS_SUPER;
5661 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5662 tp.t_data->t_infomask2);
5663 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5664 xlrec.xmax = xid;
5665
5666 XLogBeginInsert();
5667 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5668 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5669
5670 /* No replica identity & replication origin logged */
5671
5672 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5673
5674 PageSetLSN(page, recptr);
5675 }
5676
5677 END_CRIT_SECTION();
5678
5679 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5680
5681 if (HeapTupleHasExternal(&tp))
5682 {
5683 Assert(!IsToastRelation(relation));
5684 heap_toast_delete(relation, &tp, true);
5685 }
5686
5687 /*
5688 * Never need to mark tuple for invalidation, since catalogs don't support
5689 * speculative insertion
5690 */
5691
5692 /* Now we can release the buffer */
5693 ReleaseBuffer(buffer);
5694
5695 /* count deletion, as we counted the insertion too */
5696 pgstat_count_heap_delete(relation);
5697 }
5698
5699 /*
5700 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5701 *
5702 * Overwriting violates both MVCC and transactional safety, so the uses
5703 * of this function in Postgres are extremely limited. Nonetheless we
5704 * find some places to use it.
5705 *
5706 * The tuple cannot change size, and therefore it's reasonable to assume
5707 * that its null bitmap (if any) doesn't change either. So we just
5708 * overwrite the data portion of the tuple without touching the null
5709 * bitmap or any of the header fields.
5710 *
5711 * tuple is an in-memory tuple structure containing the data to be written
5712 * over the target tuple. Also, tuple->t_self identifies the target tuple.
5713 */
5714 void
5715 heap_inplace_update(Relation relation, HeapTuple tuple)
5716 {
5717 Buffer buffer;
5718 Page page;
5719 OffsetNumber offnum;
5720 ItemId lp = NULL;
5721 HeapTupleHeader htup;
5722 uint32 oldlen;
5723 uint32 newlen;
5724
5725 /*
5726 * For now, we don't allow parallel updates. Unlike a regular update,
5727 * this should never create a combo CID, so it might be possible to relax
5728 * this restriction, but not without more thought and testing. It's not
5729 * clear that it would be useful, anyway.
5730 */
5731 if (IsInParallelMode())
5732 ereport(ERROR,
5733 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5734 errmsg("cannot update tuples during a parallel operation")));
5735
5736 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5737 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5738 page = (Page) BufferGetPage(buffer);
5739
5740 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5741 if (PageGetMaxOffsetNumber(page) >= offnum)
5742 lp = PageGetItemId(page, offnum);
5743
5744 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5745 elog(ERROR, "invalid lp");
5746
5747 htup = (HeapTupleHeader) PageGetItem(page, lp);
5748
5749 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5750 newlen = tuple->t_len - tuple->t_data->t_hoff;
5751 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5752 elog(ERROR, "wrong tuple length");
5753
5754 /* NO EREPORT(ERROR) from here till changes are logged */
5755 START_CRIT_SECTION();
5756
5757 memcpy((char *) htup + htup->t_hoff,
5758 (char *) tuple->t_data + tuple->t_data->t_hoff,
5759 newlen);
5760
5761 MarkBufferDirty(buffer);
5762
5763 /* XLOG stuff */
5764 if (RelationNeedsWAL(relation))
5765 {
5766 xl_heap_inplace xlrec;
5767 XLogRecPtr recptr;
5768
5769 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5770
5771 XLogBeginInsert();
5772 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5773
5774 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5775 XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5776
5777 /* inplace updates aren't decoded atm, don't log the origin */
5778
5779 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5780
5781 PageSetLSN(page, recptr);
5782 }
5783
5784 END_CRIT_SECTION();
5785
5786 UnlockReleaseBuffer(buffer);
5787
5788 /*
5789 * Send out shared cache inval if necessary. Note that because we only
5790 * pass the new version of the tuple, this mustn't be used for any
5791 * operations that could change catcache lookup keys. But we aren't
5792 * bothering with index updates either, so that's true a fortiori.
5793 */
5794 if (!IsBootstrapProcessingMode())
5795 CacheInvalidateHeapTuple(relation, tuple, NULL);
5796 }
5797
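/*
 * Illustrative sketch only (not part of the original file): the typical
 * calling pattern, similar to how VACUUM overwrites statistics in pg_class.
 * Only fixed-width fields may be changed, so the tuple length stays the
 * same.  This assumes catalog/pg_class.h (not otherwise included by this
 * file); the function and variable names are invented for the sketch.
 */
#ifdef NOT_USED
#include "catalog/pg_class.h"

static void
inplace_update_sketch(Relation pg_class_rel, HeapTuple ctup, int32 new_relpages)
{
	Form_pg_class pgcform = (Form_pg_class) GETSTRUCT(ctup);

	/* modify a fixed-width field in the caller's copy of the tuple */
	pgcform->relpages = new_relpages;

	/* overwrite the on-disk tuple identified by ctup->t_self */
	heap_inplace_update(pg_class_rel, ctup);
}
#endif
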
5798 #define FRM_NOOP 0x0001
5799 #define FRM_INVALIDATE_XMAX 0x0002
5800 #define FRM_RETURN_IS_XID 0x0004
5801 #define FRM_RETURN_IS_MULTI 0x0008
5802 #define FRM_MARK_COMMITTED 0x0010
5803
5804 /*
5805 * FreezeMultiXactId
5806 * Determine what to do during freezing when a tuple is marked by a
5807 * MultiXactId.
5808 *
5809 * NB -- this might have the side-effect of creating a new MultiXactId!
5810 *
5811 * "flags" is an output value; it's used to tell caller what to do on return.
5812 * Possible flags are:
5813 * FRM_NOOP
5814 * don't do anything -- keep existing Xmax
5815 * FRM_INVALIDATE_XMAX
5816 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
5817 * FRM_RETURN_IS_XID
5818 * The Xid return value is a single update Xid to set as xmax.
5819 * FRM_MARK_COMMITTED
5820 * Xmax can be marked as HEAP_XMAX_COMMITTED
5821 * FRM_RETURN_IS_MULTI
5822 * The return value is a new MultiXactId to set as new Xmax.
5823 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
5824 */
5825 static TransactionId
5826 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
5827 TransactionId relfrozenxid, TransactionId relminmxid,
5828 TransactionId cutoff_xid, MultiXactId cutoff_multi,
5829 uint16 *flags)
5830 {
5831 TransactionId xid = InvalidTransactionId;
5832 int i;
5833 MultiXactMember *members;
5834 int nmembers;
5835 bool need_replace;
5836 int nnewmembers;
5837 MultiXactMember *newmembers;
5838 bool has_lockers;
5839 TransactionId update_xid;
5840 bool update_committed;
5841
5842 *flags = 0;
5843
5844 /* We should only be called in Multis */
5845 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
5846
5847 if (!MultiXactIdIsValid(multi) ||
5848 HEAP_LOCKED_UPGRADED(t_infomask))
5849 {
5850 /* Ensure infomask bits are appropriately set/reset */
5851 *flags |= FRM_INVALIDATE_XMAX;
5852 return InvalidTransactionId;
5853 }
5854 else if (MultiXactIdPrecedes(multi, relminmxid))
5855 ereport(ERROR,
5856 (errcode(ERRCODE_DATA_CORRUPTED),
5857 errmsg_internal("found multixact %u from before relminmxid %u",
5858 multi, relminmxid)));
5859 else if (MultiXactIdPrecedes(multi, cutoff_multi))
5860 {
5861 /*
5862 * This old multi cannot possibly have members still running, but
5863 * verify just in case. If it was a locker only, it can be removed
5864 * without any further consideration; but if it contained an update,
5865 * we might need to preserve it.
5866 */
5867 if (MultiXactIdIsRunning(multi,
5868 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
5869 ereport(ERROR,
5870 (errcode(ERRCODE_DATA_CORRUPTED),
5871 errmsg_internal("multixact %u from before cutoff %u found to be still running",
5872 multi, cutoff_multi)));
5873
5874 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
5875 {
5876 *flags |= FRM_INVALIDATE_XMAX;
5877 xid = InvalidTransactionId; /* not strictly necessary */
5878 }
5879 else
5880 {
5881 /* replace multi by update xid */
5882 xid = MultiXactIdGetUpdateXid(multi, t_infomask);
5883
5884 /* wasn't only a lock, xid needs to be valid */
5885 Assert(TransactionIdIsValid(xid));
5886
5887 if (TransactionIdPrecedes(xid, relfrozenxid))
5888 ereport(ERROR,
5889 (errcode(ERRCODE_DATA_CORRUPTED),
5890 errmsg_internal("found update xid %u from before relfrozenxid %u",
5891 xid, relfrozenxid)));
5892
5893 /*
5894 * If the xid is older than the cutoff, it has to have aborted,
5895 * otherwise the tuple would have gotten pruned away.
5896 */
5897 if (TransactionIdPrecedes(xid, cutoff_xid))
5898 {
5899 if (TransactionIdDidCommit(xid))
5900 ereport(ERROR,
5901 (errcode(ERRCODE_DATA_CORRUPTED),
5902 errmsg_internal("cannot freeze committed update xid %u", xid)));
5903 *flags |= FRM_INVALIDATE_XMAX;
5904 xid = InvalidTransactionId; /* not strictly necessary */
5905 }
5906 else
5907 {
5908 *flags |= FRM_RETURN_IS_XID;
5909 }
5910 }
5911
5912 return xid;
5913 }
5914
5915 /*
5916 * This multixact might have or might not have members still running, but
5917 * we know it's valid and is newer than the cutoff point for multis.
5918 * However, some member(s) of it may be below the cutoff for Xids, so we
5919 * need to walk the whole members array to figure out what to do, if
5920 * anything.
5921 */
5922
5923 nmembers =
5924 GetMultiXactIdMembers(multi, &members, false,
5925 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
5926 if (nmembers <= 0)
5927 {
5928 /* Nothing worth keeping */
5929 *flags |= FRM_INVALIDATE_XMAX;
5930 return InvalidTransactionId;
5931 }
5932
5933 /* is there anything older than the cutoff? */
5934 need_replace = false;
5935 for (i = 0; i < nmembers; i++)
5936 {
5937 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
5938 {
5939 need_replace = true;
5940 break;
5941 }
5942 }
5943
5944 /*
5945 * In the simplest case, there is no member older than the cutoff; we can
5946 * keep the existing MultiXactId as is.
5947 */
5948 if (!need_replace)
5949 {
5950 *flags |= FRM_NOOP;
5951 pfree(members);
5952 return InvalidTransactionId;
5953 }
5954
5955 /*
5956 * If the multi needs to be updated, figure out which members we need
5957 * to keep.
5958 */
5959 nnewmembers = 0;
5960 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
5961 has_lockers = false;
5962 update_xid = InvalidTransactionId;
5963 update_committed = false;
5964
5965 for (i = 0; i < nmembers; i++)
5966 {
5967 /*
5968 * Determine whether to keep this member or ignore it.
5969 */
5970 if (ISUPDATE_from_mxstatus(members[i].status))
5971 {
5972 TransactionId xid = members[i].xid;
5973
5974 Assert(TransactionIdIsValid(xid));
5975 if (TransactionIdPrecedes(xid, relfrozenxid))
5976 ereport(ERROR,
5977 (errcode(ERRCODE_DATA_CORRUPTED),
5978 errmsg_internal("found update xid %u from before relfrozenxid %u",
5979 xid, relfrozenxid)));
5980
5981 /*
5982 * It's an update; should we keep it? If the transaction is known
5983 * aborted or crashed then it's okay to ignore it, otherwise not.
5984 * Note that an updater older than cutoff_xid cannot possibly be
5985 * committed, because HeapTupleSatisfiesVacuum would have returned
5986 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
5987 *
5988 * As with all tuple visibility routines, it's critical to test
5989 * TransactionIdIsInProgress before TransactionIdDidCommit,
5990 * because of race conditions explained in detail in
5991 * heapam_visibility.c.
5992 */
5993 if (TransactionIdIsCurrentTransactionId(xid) ||
5994 TransactionIdIsInProgress(xid))
5995 {
5996 Assert(!TransactionIdIsValid(update_xid));
5997 update_xid = xid;
5998 }
5999 else if (TransactionIdDidCommit(xid))
6000 {
6001 /*
6002 * The transaction committed, so we can tell caller to set
6003 * HEAP_XMAX_COMMITTED. (We can only do this because we know
6004 * the transaction is not running.)
6005 */
6006 Assert(!TransactionIdIsValid(update_xid));
6007 update_committed = true;
6008 update_xid = xid;
6009 }
6010 else
6011 {
6012 /*
6013 * Not in progress, not committed -- must be aborted or
6014 * crashed; we can ignore it.
6015 */
6016 }
6017
6018 /*
6019 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6020 * update Xid cannot possibly be older than the xid cutoff. The
6021 * presence of such a tuple would cause corruption, so be paranoid
6022 * and check.
6023 */
6024 if (TransactionIdIsValid(update_xid) &&
6025 TransactionIdPrecedes(update_xid, cutoff_xid))
6026 ereport(ERROR,
6027 (errcode(ERRCODE_DATA_CORRUPTED),
6028 errmsg_internal("found update xid %u from before xid cutoff %u",
6029 update_xid, cutoff_xid)));
6030
6031 /*
6032 * If we determined that it's an Xid corresponding to an update
6033 * that must be retained, additionally add it to the list of
6034 * members of the new Multi, in case we end up using that. (We
6035 * might still decide to use only an update Xid and not a multi,
6036 * but it's easier to maintain the list as we walk the old members
6037 * list.)
6038 */
6039 if (TransactionIdIsValid(update_xid))
6040 newmembers[nnewmembers++] = members[i];
6041 }
6042 else
6043 {
6044 /* We only keep lockers if they are still running */
6045 if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6046 TransactionIdIsInProgress(members[i].xid))
6047 {
6048 /* running locker cannot possibly be older than the cutoff */
6049 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6050 newmembers[nnewmembers++] = members[i];
6051 has_lockers = true;
6052 }
6053 }
6054 }
6055
6056 pfree(members);
6057
6058 if (nnewmembers == 0)
6059 {
6060 /* nothing worth keeping!? Tell caller to remove the whole thing */
6061 *flags |= FRM_INVALIDATE_XMAX;
6062 xid = InvalidTransactionId;
6063 }
6064 else if (TransactionIdIsValid(update_xid) && !has_lockers)
6065 {
6066 /*
6067 * If there's a single member and it's an update, pass it back alone
6068 * without creating a new Multi. (XXX we could do this when there's a
6069 * single remaining locker, too, but that would complicate the API too
6070 * much; moreover, the case with the single updater is more
6071 * interesting, because those are longer-lived.)
6072 */
6073 Assert(nnewmembers == 1);
6074 *flags |= FRM_RETURN_IS_XID;
6075 if (update_committed)
6076 *flags |= FRM_MARK_COMMITTED;
6077 xid = update_xid;
6078 }
6079 else
6080 {
6081 /*
6082 * Create a new multixact with the surviving members of the previous
6083 * one, to set as new Xmax in the tuple.
6084 */
6085 xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6086 *flags |= FRM_RETURN_IS_MULTI;
6087 }
6088
6089 pfree(newmembers);
6090
6091 return xid;
6092 }
6093
6094 /*
6095 * heap_prepare_freeze_tuple
6096 *
6097 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6098 * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6099 * setup enough state (in the *frz output argument) to later execute and
6100 * WAL-log what we would need to do, and return true. Return false if nothing
6101 * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6102 * will be totally frozen after these operations are performed and false if
6103 * more freezing will eventually be required.
6104 *
6105 * Caller is responsible for setting the offset field, if appropriate.
6106 *
6107 * It is assumed that the caller has checked the tuple with
6108 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6109 * (else we should be removing the tuple, not freezing it).
6110 *
6111 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6112 * XID older than it could neither be running nor seen as running by any
6113 * open transaction. This ensures that the replacement will not change
6114 * anyone's idea of the tuple state.
6115 * Similarly, cutoff_multi must be less than or equal to the smallest
6116 * MultiXactId used by any transaction currently open.
6117 *
6118 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6119 * that buffer.
6120 *
6121 * NB: It is not enough to set hint bits to indicate something is
6122 * committed/invalid -- they might not be set on a standby, or after crash
6123 * recovery. We really need to remove old xids.
6124 */
6125 bool
6126 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6127 TransactionId relfrozenxid, TransactionId relminmxid,
6128 TransactionId cutoff_xid, TransactionId cutoff_multi,
6129 xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6130 {
6131 bool changed = false;
6132 bool xmax_already_frozen = false;
6133 bool xmin_frozen;
6134 bool freeze_xmax;
6135 TransactionId xid;
6136
6137 frz->frzflags = 0;
6138 frz->t_infomask2 = tuple->t_infomask2;
6139 frz->t_infomask = tuple->t_infomask;
6140 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6141
6142 /*
6143 * Process xmin. xmin_frozen has two slightly different meanings: in the
6144 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6145 * already a permanent value), while in the block below it is set true to
6146 * mean "xmin won't need freezing after what we do to it here" (false
6147 * otherwise). In both cases we're allowed to set totally_frozen, as far
6148 * as xmin is concerned.
6149 */
6150 xid = HeapTupleHeaderGetXmin(tuple);
6151 if (!TransactionIdIsNormal(xid))
6152 xmin_frozen = true;
6153 else
6154 {
6155 if (TransactionIdPrecedes(xid, relfrozenxid))
6156 ereport(ERROR,
6157 (errcode(ERRCODE_DATA_CORRUPTED),
6158 errmsg_internal("found xmin %u from before relfrozenxid %u",
6159 xid, relfrozenxid)));
6160
6161 xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6162 if (xmin_frozen)
6163 {
6164 if (!TransactionIdDidCommit(xid))
6165 ereport(ERROR,
6166 (errcode(ERRCODE_DATA_CORRUPTED),
6167 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6168 xid, cutoff_xid)));
6169
6170 frz->t_infomask |= HEAP_XMIN_FROZEN;
6171 changed = true;
6172 }
6173 }
6174
6175 /*
6176 * Process xmax. To thoroughly examine the current Xmax value we need to
6177 * resolve a MultiXactId to its member Xids, in case some of them are
6178 * below the given cutoff for Xids. In that case, those values might need
6179 * freezing, too. Also, if a multi needs freezing, we cannot simply take
6180 * it out --- if there's a live updater Xid, it needs to be kept.
6181 *
6182 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6183 */
6184 xid = HeapTupleHeaderGetRawXmax(tuple);
6185
6186 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6187 {
6188 TransactionId newxmax;
6189 uint16 flags;
6190
6191 newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6192 relfrozenxid, relminmxid,
6193 cutoff_xid, cutoff_multi, &flags);
6194
6195 freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6196
6197 if (flags & FRM_RETURN_IS_XID)
6198 {
6199 /*
6200 * NB -- some of these transformations are only valid because we
6201 * know the return Xid is a tuple updater (i.e. not merely a
6202 * locker.) Also note that the only reason we don't explicitly
6203 * worry about HEAP_KEYS_UPDATED is because it lives in
6204 * t_infomask2 rather than t_infomask.
6205 */
6206 frz->t_infomask &= ~HEAP_XMAX_BITS;
6207 frz->xmax = newxmax;
6208 if (flags & FRM_MARK_COMMITTED)
6209 frz->t_infomask |= HEAP_XMAX_COMMITTED;
6210 changed = true;
6211 }
6212 else if (flags & FRM_RETURN_IS_MULTI)
6213 {
6214 uint16 newbits;
6215 uint16 newbits2;
6216
6217 /*
6218 * We can't use GetMultiXactIdHintBits directly on the new multi
6219 * here; that routine initializes the masks to all zeroes, which
6220 * would lose other bits we need. Doing it this way ensures all
6221 * unrelated bits remain untouched.
6222 */
6223 frz->t_infomask &= ~HEAP_XMAX_BITS;
6224 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6225 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6226 frz->t_infomask |= newbits;
6227 frz->t_infomask2 |= newbits2;
6228
6229 frz->xmax = newxmax;
6230
6231 changed = true;
6232 }
6233 }
6234 else if (TransactionIdIsNormal(xid))
6235 {
6236 if (TransactionIdPrecedes(xid, relfrozenxid))
6237 ereport(ERROR,
6238 (errcode(ERRCODE_DATA_CORRUPTED),
6239 errmsg_internal("found xmax %u from before relfrozenxid %u",
6240 xid, relfrozenxid)));
6241
6242 if (TransactionIdPrecedes(xid, cutoff_xid))
6243 {
6244 /*
6245 * If we freeze xmax, make absolutely sure that it's not an XID
6246 * that is important. (Note, a lock-only xmax can be removed
6247 * independent of committedness, since a committed lock holder has
6248 * released the lock).
6249 */
6250 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6251 TransactionIdDidCommit(xid))
6252 ereport(ERROR,
6253 (errcode(ERRCODE_DATA_CORRUPTED),
6254 errmsg_internal("cannot freeze committed xmax %u",
6255 xid)));
6256 freeze_xmax = true;
6257 }
6258 else
6259 freeze_xmax = false;
6260 }
6261 else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6262 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6263 {
6264 freeze_xmax = false;
6265 xmax_already_frozen = true;
6266 }
6267 else
6268 ereport(ERROR,
6269 (errcode(ERRCODE_DATA_CORRUPTED),
6270 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6271 xid, tuple->t_infomask)));
6272
6273 if (freeze_xmax)
6274 {
6275 Assert(!xmax_already_frozen);
6276
6277 frz->xmax = InvalidTransactionId;
6278
6279 /*
6280 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6281 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6282 * Also get rid of the HEAP_KEYS_UPDATED bit.
6283 */
6284 frz->t_infomask &= ~HEAP_XMAX_BITS;
6285 frz->t_infomask |= HEAP_XMAX_INVALID;
6286 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6287 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6288 changed = true;
6289 }
6290
6291 /*
6292 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6293 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6294 */
6295 if (tuple->t_infomask & HEAP_MOVED)
6296 {
6297 xid = HeapTupleHeaderGetXvac(tuple);
6298
6299 /*
6300 * For Xvac, we ignore the cutoff_xid and just always perform the
6301 * freeze operation. The oldest release in which such a value can
6302 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6303 * was removed in PostgreSQL 9.0. Note that if we were to respect
6304 * cutoff_xid here, we'd need to make sure to clear totally_frozen
6305 * when we skipped freezing on that basis.
6306 */
6307 if (TransactionIdIsNormal(xid))
6308 {
6309 /*
6310 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6311 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6312 * xvac transaction succeeded.
6313 */
6314 if (tuple->t_infomask & HEAP_MOVED_OFF)
6315 frz->frzflags |= XLH_INVALID_XVAC;
6316 else
6317 frz->frzflags |= XLH_FREEZE_XVAC;
6318
6319 /*
6320 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6321 * will already be set here, but there's a small chance not.
6322 */
6323 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6324 frz->t_infomask |= HEAP_XMIN_COMMITTED;
6325 changed = true;
6326 }
6327 }
6328
6329 *totally_frozen_p = (xmin_frozen &&
6330 (freeze_xmax || xmax_already_frozen));
6331 return changed;
6332 }
6333
6334 /*
6335 * heap_execute_freeze_tuple
6336 * Execute the prepared freezing of a tuple.
6337 *
6338 * Caller is responsible for ensuring that no other backend can access the
6339 * storage underlying this tuple, either by holding an exclusive lock on the
6340 * buffer containing it (which is what lazy VACUUM does), or by having it be
6341 * in private storage (which is what CLUSTER and friends do).
6342 *
6343 * Note: it might seem we could make the changes without exclusive lock, since
6344 * TransactionId read/write is assumed atomic anyway. However there is a race
6345 * condition: someone who just fetched an old XID that we overwrite here could
6346 * conceivably not finish checking the XID against pg_xact before we finish
6347 * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
6348 * exclusive lock ensures no other backend is in process of checking the
6349 * tuple status. Also, getting exclusive lock makes it safe to adjust the
6350 * infomask bits.
6351 *
6352 * NB: All code in here must be safe to execute during crash recovery!
6353 */
6354 void
6355 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6356 {
6357 HeapTupleHeaderSetXmax(tuple, frz->xmax);
6358
6359 if (frz->frzflags & XLH_FREEZE_XVAC)
6360 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6361
6362 if (frz->frzflags & XLH_INVALID_XVAC)
6363 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6364
6365 tuple->t_infomask = frz->t_infomask;
6366 tuple->t_infomask2 = frz->t_infomask2;
6367 }
6368
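/*
 * Illustrative sketch only (not part of the original file): how a
 * VACUUM-like caller can combine heap_prepare_freeze_tuple() and
 * heap_execute_freeze_tuple(), deferring the page modifications and the WAL
 * record until every tuple on the page has been examined.  log_heap_freeze()
 * is the existing WAL helper declared in heapam_xlog.h; the function name
 * and the simplified bookkeeping are inventions of the sketch.
 */
#ifdef NOT_USED
static void
freeze_page_sketch(Relation rel, Buffer buf,
				   TransactionId relfrozenxid, TransactionId relminmxid,
				   TransactionId cutoff_xid, MultiXactId cutoff_multi)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber offnum,
				maxoff = PageGetMaxOffsetNumber(page);
	xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
	int			nfrozen = 0;
	int			i;

	/* first pass: decide what needs freezing, without changing the page */
	for (offnum = FirstOffsetNumber; offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		bool		totally_frozen;

		if (!ItemIdIsNormal(itemid))
			continue;
		htup = (HeapTupleHeader) PageGetItem(page, itemid);

		if (heap_prepare_freeze_tuple(htup, relfrozenxid, relminmxid,
									  cutoff_xid, cutoff_multi,
									  &frozen[nfrozen], &totally_frozen))
			frozen[nfrozen++].offset = offnum;	/* caller fills in the offset */
	}

	/* second pass: apply the prepared changes and WAL-log them */
	if (nfrozen > 0)
	{
		START_CRIT_SECTION();
		for (i = 0; i < nfrozen; i++)
		{
			ItemId		itemid = PageGetItemId(page, frozen[i].offset);
			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);

			heap_execute_freeze_tuple(htup, &frozen[i]);
		}
		MarkBufferDirty(buf);

		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr = log_heap_freeze(rel, buf, cutoff_xid,
												 frozen, nfrozen);

			PageSetLSN(page, recptr);
		}
		END_CRIT_SECTION();
	}
}
#endif
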
6369 /*
6370 * heap_freeze_tuple
6371 * Freeze tuple in place, without WAL logging.
6372 *
6373 * Useful for callers like CLUSTER that perform their own WAL logging.
6374 */
6375 bool
6376 heap_freeze_tuple(HeapTupleHeader tuple,
6377 TransactionId relfrozenxid, TransactionId relminmxid,
6378 TransactionId cutoff_xid, TransactionId cutoff_multi)
6379 {
6380 xl_heap_freeze_tuple frz;
6381 bool do_freeze;
6382 bool tuple_totally_frozen;
6383
6384 do_freeze = heap_prepare_freeze_tuple(tuple,
6385 relfrozenxid, relminmxid,
6386 cutoff_xid, cutoff_multi,
6387 &frz, &tuple_totally_frozen);
6388
6389 /*
6390 * Note that because this is not a WAL-logged operation, we don't need to
6391 * fill in the offset in the freeze record.
6392 */
6393
6394 if (do_freeze)
6395 heap_execute_freeze_tuple(tuple, &frz);
6396 return do_freeze;
6397 }
6398
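/*
 * Illustrative sketch only (not part of the original file): a CLUSTER-style
 * caller that operates on a tuple copy in private memory and does its own
 * WAL logging can simply call heap_freeze_tuple() directly.  Names are
 * invented for the sketch.
 */
#ifdef NOT_USED
static void
freeze_private_tuple_sketch(Relation OldHeap, HeapTuple copied_tup,
							TransactionId cutoff_xid, MultiXactId cutoff_multi)
{
	/* freeze in place; no buffer lock or WAL record is needed here */
	(void) heap_freeze_tuple(copied_tup->t_data,
							 OldHeap->rd_rel->relfrozenxid,
							 OldHeap->rd_rel->relminmxid,
							 cutoff_xid, cutoff_multi);
}
#endif
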
6399 /*
6400 * For a given MultiXactId, return the hint bits that should be set in the
6401 * tuple's infomask.
6402 *
6403 * Normally this should be called for a multixact that was just created, and
6404 * so is on our local cache, so the GetMembers call is fast.
6405 */
6406 static void
6407 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6408 uint16 *new_infomask2)
6409 {
6410 int nmembers;
6411 MultiXactMember *members;
6412 int i;
6413 uint16 bits = HEAP_XMAX_IS_MULTI;
6414 uint16 bits2 = 0;
6415 bool has_update = false;
6416 LockTupleMode strongest = LockTupleKeyShare;
6417
6418 /*
6419 * We only use this in multis we just created, so they cannot be values
6420 * pre-pg_upgrade.
6421 */
6422 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6423
6424 for (i = 0; i < nmembers; i++)
6425 {
6426 LockTupleMode mode;
6427
6428 /*
6429 * Remember the strongest lock mode held by any member of the
6430 * multixact.
6431 */
6432 mode = TUPLOCK_from_mxstatus(members[i].status);
6433 if (mode > strongest)
6434 strongest = mode;
6435
6436 /* See what other bits we need */
6437 switch (members[i].status)
6438 {
6439 case MultiXactStatusForKeyShare:
6440 case MultiXactStatusForShare:
6441 case MultiXactStatusForNoKeyUpdate:
6442 break;
6443
6444 case MultiXactStatusForUpdate:
6445 bits2 |= HEAP_KEYS_UPDATED;
6446 break;
6447
6448 case MultiXactStatusNoKeyUpdate:
6449 has_update = true;
6450 break;
6451
6452 case MultiXactStatusUpdate:
6453 bits2 |= HEAP_KEYS_UPDATED;
6454 has_update = true;
6455 break;
6456 }
6457 }
6458
6459 if (strongest == LockTupleExclusive ||
6460 strongest == LockTupleNoKeyExclusive)
6461 bits |= HEAP_XMAX_EXCL_LOCK;
6462 else if (strongest == LockTupleShare)
6463 bits |= HEAP_XMAX_SHR_LOCK;
6464 else if (strongest == LockTupleKeyShare)
6465 bits |= HEAP_XMAX_KEYSHR_LOCK;
6466
6467 if (!has_update)
6468 bits |= HEAP_XMAX_LOCK_ONLY;
6469
6470 if (nmembers > 0)
6471 pfree(members);
6472
6473 *new_infomask = bits;
6474 *new_infomask2 = bits2;
6475 }
6476
6477 /*
6478 * MultiXactIdGetUpdateXid
6479 *
6480 * Given a multixact Xmax and corresponding infomask, which does not have the
6481 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6482 * transaction.
6483 *
6484 * Caller is expected to check the status of the updating transaction, if
6485 * necessary.
6486 */
6487 static TransactionId
6488 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6489 {
6490 TransactionId update_xact = InvalidTransactionId;
6491 MultiXactMember *members;
6492 int nmembers;
6493
6494 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6495 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6496
6497 /*
6498 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6499 * pre-pg_upgrade.
6500 */
6501 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6502
6503 if (nmembers > 0)
6504 {
6505 int i;
6506
6507 for (i = 0; i < nmembers; i++)
6508 {
6509 /* Ignore lockers */
6510 if (!ISUPDATE_from_mxstatus(members[i].status))
6511 continue;
6512
6513 /* there can be at most one updater */
6514 Assert(update_xact == InvalidTransactionId);
6515 update_xact = members[i].xid;
6516 #ifndef USE_ASSERT_CHECKING
6517
6518 /*
6519 * In an assert-enabled build, walk the whole array to ensure
6520 * there's no other updater; otherwise we can stop at the first one.
6521 */
6522 break;
6523 #endif
6524 }
6525
6526 pfree(members);
6527 }
6528
6529 return update_xact;
6530 }
6531
6532 /*
6533 * HeapTupleGetUpdateXid
6534 * As above, but use a HeapTupleHeader
6535 *
6536 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6537 * checking the hint bits.
6538 */
6539 TransactionId
6540 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6541 {
6542 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6543 tuple->t_infomask);
6544 }
6545
6546 /*
6547 * Does the given multixact conflict with the current transaction grabbing a
6548 * tuple lock of the given strength?
6549 *
6550 * The passed infomask pairs up with the given multixact in the tuple header.
6551 *
6552 * If current_is_member is not NULL, it is set to 'true' if the current
6553 * transaction is a member of the given multixact.
6554 */
6555 static bool
6556 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6557 LockTupleMode lockmode, bool *current_is_member)
6558 {
6559 int nmembers;
6560 MultiXactMember *members;
6561 bool result = false;
6562 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
6563
6564 if (HEAP_LOCKED_UPGRADED(infomask))
6565 return false;
6566
6567 nmembers = GetMultiXactIdMembers(multi, &members, false,
6568 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6569 if (nmembers >= 0)
6570 {
6571 int i;
6572
6573 for (i = 0; i < nmembers; i++)
6574 {
6575 TransactionId memxid;
6576 LOCKMODE memlockmode;
6577
6578 if (result && (current_is_member == NULL || *current_is_member))
6579 break;
6580
6581 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6582
6583 /* ignore members from current xact (but track their presence) */
6584 memxid = members[i].xid;
6585 if (TransactionIdIsCurrentTransactionId(memxid))
6586 {
6587 if (current_is_member != NULL)
6588 *current_is_member = true;
6589 continue;
6590 }
6591 else if (result)
6592 continue;
6593
6594 /* ignore members that don't conflict with the lock we want */
6595 if (!DoLockModesConflict(memlockmode, wanted))
6596 continue;
6597
6598 if (ISUPDATE_from_mxstatus(members[i].status))
6599 {
6600 /* ignore aborted updaters */
6601 if (TransactionIdDidAbort(memxid))
6602 continue;
6603 }
6604 else
6605 {
6606 /* ignore lockers-only that are no longer in progress */
6607 if (!TransactionIdIsInProgress(memxid))
6608 continue;
6609 }
6610
6611 /*
6612 * Whatever remains are either live lockers that conflict with our
6613 * wanted lock, or updaters that are not aborted. Those conflict
6614 * with what we want. Set up to return true, but keep going to
6615 * look for the current transaction among the multixact members,
6616 * if needed.
6617 */
6618 result = true;
6619 }
6620 pfree(members);
6621 }
6622
6623 return result;
6624 }
6625
6626 /*
6627 * Do_MultiXactIdWait
6628 * Actual implementation for the two functions below.
6629 *
6630 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6631 * needed to ensure we only sleep on conflicting members, and the infomask is
6632 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6633 * indicates whether to use conditional lock acquisition, to allow callers to
6634 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
6635 * context information for error messages. 'remaining', if not NULL, receives
6636 * the number of members that are still running, including any (non-aborted)
6637 * subtransactions of our own transaction.
6638 *
6639 * We do this by sleeping on each member using XactLockTableWait. Any
6640 * members that belong to the current backend are *not* waited for, however;
6641 * this would not merely be useless but would lead to Assert failure inside
6642 * XactLockTableWait. By the time this returns, it is certain that all
6643 * transactions *of other backends* that were members of the MultiXactId
6644 * that conflict with the requested status are dead (and no new ones can have
6645 * been added, since it is not legal to add members to an existing
6646 * MultiXactId).
6647 *
6648 * But by the time we finish sleeping, someone else may have changed the Xmax
6649 * of the containing tuple, so the caller needs to iterate on us somehow.
6650 *
6651 * Note that in case we return false, the number of remaining members is
6652 * not to be trusted.
6653 */
6654 static bool
6655 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6656 uint16 infomask, bool nowait,
6657 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6658 int *remaining)
6659 {
6660 bool result = true;
6661 MultiXactMember *members;
6662 int nmembers;
6663 int remain = 0;
6664
6665 /* for pre-pg_upgrade tuples, no need to sleep at all */
6666 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6667 GetMultiXactIdMembers(multi, &members, false,
6668 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6669
6670 if (nmembers >= 0)
6671 {
6672 int i;
6673
6674 for (i = 0; i < nmembers; i++)
6675 {
6676 TransactionId memxid = members[i].xid;
6677 MultiXactStatus memstatus = members[i].status;
6678
6679 if (TransactionIdIsCurrentTransactionId(memxid))
6680 {
6681 remain++;
6682 continue;
6683 }
6684
6685 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6686 LOCKMODE_from_mxstatus(status)))
6687 {
6688 if (remaining && TransactionIdIsInProgress(memxid))
6689 remain++;
6690 continue;
6691 }
6692
6693 /*
6694 * This member conflicts with our multi, so we have to sleep (or
6695 * return failure, if asked to avoid waiting).
6696 *
6697 * Note that we don't set up an error context callback ourselves,
6698 * but instead we pass the info down to XactLockTableWait. This
6699 * might seem a bit wasteful because the context is set up and
6700 * torn down for each member of the multixact, but in reality it
6701 * should be barely noticeable, and it avoids duplicate code.
6702 */
6703 if (nowait)
6704 {
6705 result = ConditionalXactLockTableWait(memxid);
6706 if (!result)
6707 break;
6708 }
6709 else
6710 XactLockTableWait(memxid, rel, ctid, oper);
6711 }
6712
6713 pfree(members);
6714 }
6715
6716 if (remaining)
6717 *remaining = remain;
6718
6719 return result;
6720 }
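
/*
 * Illustrative sketch (added commentary, not original code): because the
 * tuple's Xmax may have changed by the time the sleep finishes, callers
 * wrap the wait in a recheck loop, roughly:
 *
 *		for (;;)
 *		{
 *			if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI))
 *				break;
 *			xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
 *			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *			MultiXactIdWait((MultiXactId) xwait, status, infomask,
 *							relation, &tuple->t_self, XLTW_Lock, NULL);
 *			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *		}
 *
 * 'buffer', 'tuple', 'xwait' and 'status' are hypothetical names here; the
 * point is only that the caller must re-examine xmax after waking up.
 */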
6721
6722 /*
6723 * MultiXactIdWait
6724 * Sleep on a MultiXactId.
6725 *
6726 * By the time we finish sleeping, someone else may have changed the Xmax
6727 * of the containing tuple, so the caller needs to iterate on us somehow.
6728 *
6729 * We return (in *remaining, if not NULL) the number of members that are still
6730 * running, including any (non-aborted) subtransactions of our own transaction.
6731 */
6732 static void
6733 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
6734 Relation rel, ItemPointer ctid, XLTW_Oper oper,
6735 int *remaining)
6736 {
6737 (void) Do_MultiXactIdWait(multi, status, infomask, false,
6738 rel, ctid, oper, remaining);
6739 }
6740
6741 /*
6742 * ConditionalMultiXactIdWait
6743 * As above, but only lock if we can get the lock without blocking.
6744 *
6745 * By the time we finish sleeping, someone else may have changed the Xmax
6746 * of the containing tuple, so the caller needs to iterate on us somehow.
6747 *
6748 * If the multixact is now all gone, return true. Returns false if some
6749 * transactions might still be running.
6750 *
6751 * We return (in *remaining, if not NULL) the number of members that are still
6752 * running, including any (non-aborted) subtransactions of our own transaction.
6753 */
6754 static bool
6755 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6756 uint16 infomask, Relation rel, int *remaining)
6757 {
6758 return Do_MultiXactIdWait(multi, status, infomask, true,
6759 rel, NULL, XLTW_None, remaining);
6760 }
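
/*
 * Illustrative sketch (added commentary, not original code): a caller
 * honoring a no-wait lock policy might do, roughly:
 *
 *		if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
 *										status, infomask, relation, NULL))
 *			ereport(ERROR,
 *					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 *					 errmsg("could not obtain lock on row in relation \"%s\"",
 *							RelationGetRelationName(relation))));
 *
 * 'xwait' and 'status' are hypothetical names for the tuple's raw xmax and
 * the wanted MultiXactStatus.
 */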
6761
6762 /*
6763 * heap_tuple_needs_eventual_freeze
6764 *
6765 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6766 * will eventually require freezing. Similar to heap_tuple_needs_freeze,
6767 * but there's no cutoff, since we're trying to figure out whether freezing
6768 * will ever be needed, not whether it's needed now.
6769 */
6770 bool
6771 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
6772 {
6773 TransactionId xid;
6774
6775 /*
6776 * If xmin is a normal transaction ID, this tuple is definitely not
6777 * frozen.
6778 */
6779 xid = HeapTupleHeaderGetXmin(tuple);
6780 if (TransactionIdIsNormal(xid))
6781 return true;
6782
6783 /*
6784 * If xmax is a valid xact or multixact, this tuple is also not frozen.
6785 */
6786 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6787 {
6788 MultiXactId multi;
6789
6790 multi = HeapTupleHeaderGetRawXmax(tuple);
6791 if (MultiXactIdIsValid(multi))
6792 return true;
6793 }
6794 else
6795 {
6796 xid = HeapTupleHeaderGetRawXmax(tuple);
6797 if (TransactionIdIsNormal(xid))
6798 return true;
6799 }
6800
6801 if (tuple->t_infomask & HEAP_MOVED)
6802 {
6803 xid = HeapTupleHeaderGetXvac(tuple);
6804 if (TransactionIdIsNormal(xid))
6805 return true;
6806 }
6807
6808 return false;
6809 }
6810
6811 /*
6812 * heap_tuple_needs_freeze
6813 *
6814 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6815 * are older than the specified cutoff XID or MultiXactId. If so, return true.
6816 *
6817 * It doesn't matter whether the tuple is alive or dead; we are checking
6818 * to see if a tuple needs to be removed or frozen to avoid wraparound.
6819 *
6820 * NB: Cannot rely on hint bits here, they might not be set after a crash or
6821 * on a standby.
6822 */
6823 bool
6824 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
6825 MultiXactId cutoff_multi, Buffer buf)
6826 {
6827 TransactionId xid;
6828
6829 xid = HeapTupleHeaderGetXmin(tuple);
6830 if (TransactionIdIsNormal(xid) &&
6831 TransactionIdPrecedes(xid, cutoff_xid))
6832 return true;
6833
6834 /*
6835 * The considerations for multixacts are complicated; look at
6836 * heap_prepare_freeze_tuple for justifications. This routine had better
6837 * be in sync with that one!
6838 */
6839 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6840 {
6841 MultiXactId multi;
6842
6843 multi = HeapTupleHeaderGetRawXmax(tuple);
6844 if (!MultiXactIdIsValid(multi))
6845 {
6846 /* no xmax set, ignore */
6847 ;
6848 }
6849 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
6850 return true;
6851 else if (MultiXactIdPrecedes(multi, cutoff_multi))
6852 return true;
6853 else
6854 {
6855 MultiXactMember *members;
6856 int nmembers;
6857 int i;
6858
6859 /* need to check whether any member of the mxact is too old */
6860
6861 nmembers = GetMultiXactIdMembers(multi, &members, false,
6862 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
6863
6864 for (i = 0; i < nmembers; i++)
6865 {
6866 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6867 {
6868 pfree(members);
6869 return true;
6870 }
6871 }
6872 if (nmembers > 0)
6873 pfree(members);
6874 }
6875 }
6876 else
6877 {
6878 xid = HeapTupleHeaderGetRawXmax(tuple);
6879 if (TransactionIdIsNormal(xid) &&
6880 TransactionIdPrecedes(xid, cutoff_xid))
6881 return true;
6882 }
6883
6884 if (tuple->t_infomask & HEAP_MOVED)
6885 {
6886 xid = HeapTupleHeaderGetXvac(tuple);
6887 if (TransactionIdIsNormal(xid) &&
6888 TransactionIdPrecedes(xid, cutoff_xid))
6889 return true;
6890 }
6891
6892 return false;
6893 }
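
/*
 * Illustrative sketch (added commentary, not original code): a VACUUM-style
 * caller that failed to get a cleanup lock on a page can still scan it under
 * a share lock and use this check to decide whether the page may be skipped
 * for now.  A hypothetical outline, using VACUUM's freeze cutoffs:
 *
 *		for (offnum = FirstOffsetNumber; offnum <= maxoff;
 *			 offnum = OffsetNumberNext(offnum))
 *		{
 *			ItemId		itemid = PageGetItemId(page, offnum);
 *
 *			if (!ItemIdIsNormal(itemid))
 *				continue;
 *			tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
 *			if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
 *										MultiXactCutoff, buf))
 *				return true;
 *		}
 *		return false;
 *
 * Returning true would mean the caller must wait for the cleanup lock and
 * freeze the page after all.
 */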
6894
6895 /*
6896 * If 'tuple' contains any visible XID greater than latestRemovedXid,
6897 * ratchet forwards latestRemovedXid to the greatest one found.
6898 * This is used as the basis for generating Hot Standby conflicts, so
6899 * if a tuple was never visible then removing it should not conflict
6900 * with queries.
6901 */
6902 void
6903 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
6904 TransactionId *latestRemovedXid)
6905 {
6906 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
6907 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
6908 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
6909
6910 if (tuple->t_infomask & HEAP_MOVED)
6911 {
6912 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
6913 *latestRemovedXid = xvac;
6914 }
6915
6916 /*
6917 * Ignore tuples inserted by an aborted transaction or if the tuple was
6918 * updated/deleted by the inserting transaction.
6919 *
6920 * Look for a committed hint bit, or if no xmin bit is set, check clog.
6921 * This needs to work on both primary and standby, where it is used to
6922 * assess btree delete records.
6923 */
6924 if (HeapTupleHeaderXminCommitted(tuple) ||
6925 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
6926 {
6927 if (xmax != xmin &&
6928 TransactionIdFollows(xmax, *latestRemovedXid))
6929 *latestRemovedXid = xmax;
6930 }
6931
6932 /* *latestRemovedXid may still be invalid at end */
6933 }
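
/*
 * Illustrative sketch (added commentary, not original code): the expected
 * calling pattern is to start from InvalidTransactionId and ratchet the
 * value forward over every tuple being removed, then hand the result to the
 * WAL record that drives Hot Standby conflict resolution:
 *
 *		TransactionId latestRemovedXid = InvalidTransactionId;
 *
 *		for (i = 0; i < ntuples; i++)
 *			HeapTupleHeaderAdvanceLatestRemovedXid(tuples[i],
 *													&latestRemovedXid);
 *
 * 'tuples' and 'ntuples' are hypothetical names for the set of tuple
 * headers about to be removed.
 */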
6934
6935 #ifdef USE_PREFETCH
6936 /*
6937 * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch
6938 * requests for the number of buffers indicated by prefetch_count. The
6939 * prefetch_state keeps track of all the buffers that we can prefetch and
6940 * which ones have already been prefetched; each call to this function picks
6941 * up where the previous call left off.
6942 */
6943 static void
6944 xid_horizon_prefetch_buffer(Relation rel,
6945 XidHorizonPrefetchState *prefetch_state,
6946 int prefetch_count)
6947 {
6948 BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
6949 int count = 0;
6950 int i;
6951 int nitems = prefetch_state->nitems;
6952 ItemPointerData *tids = prefetch_state->tids;
6953
6954 for (i = prefetch_state->next_item;
6955 i < nitems && count < prefetch_count;
6956 i++)
6957 {
6958 ItemPointer htid = &tids[i];
6959
6960 if (cur_hblkno == InvalidBlockNumber ||
6961 ItemPointerGetBlockNumber(htid) != cur_hblkno)
6962 {
6963 cur_hblkno = ItemPointerGetBlockNumber(htid);
6964 PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
6965 count++;
6966 }
6967 }
6968
6969 /*
6970 * Save the prefetch position so that next time we can continue from that
6971 * position.
6972 */
6973 prefetch_state->next_item = i;
6974 prefetch_state->cur_hblkno = cur_hblkno;
6975 }
6976 #endif
6977
6978 /*
6979 * Get the latestRemovedXid from the heap pages pointed at by the index
6980 * tuples being deleted.
6981 *
6982 * We used to do this during recovery rather than on the primary, but that
6983 * approach now appears inferior. It meant that the primary could generate
6984 * a lot of work for the standby without any back-pressure to slow down the
6985 * primary, and it required the standby to have reached consistency, whereas
6986 * we want to have correct information available even before that point.
6987 *
6988 * It's possible for this to generate a fair amount of I/O, since we may be
6989 * deleting hundreds of tuples from a single index block. To amortize that
6990 * cost to some degree, this uses prefetching and combines repeat accesses to
6991 * the same block.
6992 */
6993 TransactionId
6994 heap_compute_xid_horizon_for_tuples(Relation rel,
6995 ItemPointerData *tids,
6996 int nitems)
6997 {
6998 /* Initial assumption is that earlier pruning took care of conflict */
6999 TransactionId latestRemovedXid = InvalidTransactionId;
7000 BlockNumber blkno = InvalidBlockNumber;
7001 Buffer buf = InvalidBuffer;
7002 Page page = NULL;
7003 OffsetNumber maxoff = InvalidOffsetNumber;
7004 TransactionId priorXmax;
7005 #ifdef USE_PREFETCH
7006 XidHorizonPrefetchState prefetch_state;
7007 int prefetch_distance;
7008 #endif
7009
7010 /*
7011 * Sort to avoid repeated lookups for the same page, and to make it more
7012 * likely to access items in an efficient order. In particular, this
7013 * ensures that if there are multiple pointers to the same page, they all
7014 * get processed looking up and locking the page just once.
7015 */
7016 qsort((void *) tids, nitems, sizeof(ItemPointerData),
7017 (int (*) (const void *, const void *)) ItemPointerCompare);
7018
7019 #ifdef USE_PREFETCH
7020 /* Initialize prefetch state. */
7021 prefetch_state.cur_hblkno = InvalidBlockNumber;
7022 prefetch_state.next_item = 0;
7023 prefetch_state.nitems = nitems;
7024 prefetch_state.tids = tids;
7025
7026 /*
7027 * Compute the prefetch distance that we will attempt to maintain.
7028 *
7029 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7030 * sure that rel isn't a catalog relation before we call code that does
7031 * syscache lookups, to avoid risk of deadlock.
7032 */
7033 if (IsCatalogRelation(rel))
7034 prefetch_distance = maintenance_io_concurrency;
7035 else
7036 prefetch_distance =
7037 get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
7038
7039 /* Start prefetching. */
7040 xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7041 #endif
7042
7043 /* Iterate over all tids, and check their horizon */
7044 for (int i = 0; i < nitems; i++)
7045 {
7046 ItemPointer htid = &tids[i];
7047 OffsetNumber offnum;
7048
7049 /*
7050 * Read heap buffer, but avoid refetching if it's the same block as
7051 * required for the last tid.
7052 */
7053 if (blkno == InvalidBlockNumber ||
7054 ItemPointerGetBlockNumber(htid) != blkno)
7055 {
7056 /* release old buffer */
7057 if (BufferIsValid(buf))
7058 {
7059 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7060 ReleaseBuffer(buf);
7061 }
7062
7063 blkno = ItemPointerGetBlockNumber(htid);
7064
7065 buf = ReadBuffer(rel, blkno);
7066
7067 #ifdef USE_PREFETCH
7068
7069 /*
7070 * To maintain the prefetch distance, prefetch one more page for
7071 * each page we read.
7072 */
7073 xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
7074 #endif
7075
7076 LockBuffer(buf, BUFFER_LOCK_SHARE);
7077
7078 page = BufferGetPage(buf);
7079 maxoff = PageGetMaxOffsetNumber(page);
7080 }
7081
7082 /*
7083 * Maintain latestRemovedXid value for deletion operation as a whole
7084 * by advancing current value using heap tuple headers. This is
7085 * loosely based on the logic for pruning a HOT chain.
7086 */
7087 offnum = ItemPointerGetOffsetNumber(htid);
7088 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
7089 for (;;)
7090 {
7091 ItemId lp;
7092 HeapTupleHeader htup;
7093
7094 /* Some sanity checks */
7095 if (offnum < FirstOffsetNumber || offnum > maxoff)
7096 break;
7097
7098 lp = PageGetItemId(page, offnum);
7099 if (ItemIdIsRedirected(lp))
7100 {
7101 offnum = ItemIdGetRedirect(lp);
7102 continue;
7103 }
7104
7105 /*
7106 * We'll often encounter LP_DEAD line pointers. No need to do
7107 * anything more with htid when that happens. This is okay
7108 * because the earlier pruning operation that made the line
7109 * pointer LP_DEAD in the first place must have considered the
7110 * tuple header as part of generating its own latestRemovedXid
7111 * value.
7112 *
7113 * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the
7114 * same strategy that index vacuuming uses in all cases. Index
7115 * VACUUM WAL records don't even have a latestRemovedXid field of
7116 * their own for this reason.
7117 */
7118 if (!ItemIdIsNormal(lp))
7119 break;
7120
7121 htup = (HeapTupleHeader) PageGetItem(page, lp);
7122
7123 /*
7124 * Check the tuple XMIN against prior XMAX, if any
7125 */
7126 if (TransactionIdIsValid(priorXmax) &&
7127 !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
7128 break;
7129
7130 HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
7131
7132 /*
7133 * If the tuple is not HOT-updated, then we are at the end of this
7134 * HOT-chain. No need to visit later tuples from the same update
7135 * chain (they get their own index entries) -- just move on to
7136 * next htid from index AM caller.
7137 */
7138 if (!HeapTupleHeaderIsHotUpdated(htup))
7139 break;
7140
7141 /* Advance to next HOT chain member */
7142 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
7143 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
7144 priorXmax = HeapTupleHeaderGetUpdateXid(htup);
7145 }
7146 }
7147
7148 if (BufferIsValid(buf))
7149 {
7150 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7151 ReleaseBuffer(buf);
7152 }
7153
7154 return latestRemovedXid;
7155 }
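
/*
 * Added commentary: index AMs do not call this directly; they reach it
 * through the table AM wrapper, roughly:
 *
 *		latestRemovedXid =
 *			table_compute_xid_horizon_for_tuples(heapRel, deltids, ndeltids);
 *
 * where 'deltids' (a hypothetical name) is the array of heap TIDs taken from
 * the index tuples about to be deleted.  Note that the array is sorted in
 * place by this function.
 */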
7156
7157 /*
7158 * Perform XLogInsert to register a heap cleanup info message. These
7159 * messages are sent once per VACUUM and are required because
7160 * of the phasing of removal operations during a lazy VACUUM.
7161 * See comments for vacuum_log_cleanup_info().
7162 */
7163 XLogRecPtr
7164 log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7165 {
7166 xl_heap_cleanup_info xlrec;
7167 XLogRecPtr recptr;
7168
7169 xlrec.node = rnode;
7170 xlrec.latestRemovedXid = latestRemovedXid;
7171
7172 XLogBeginInsert();
7173 XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7174
7175 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7176
7177 return recptr;
7178 }
7179
7180 /*
7181 * Perform XLogInsert for a heap-clean operation. Caller must already
7182 * have modified the buffer and marked it dirty.
7183 *
7184 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7185 * zero-based tuple indexes. Now they are one-based like other uses
7186 * of OffsetNumber.
7187 *
7188 * We also include latestRemovedXid, which is the greatest XID present in
7189 * the removed tuples. That allows recovery processing to cancel or wait
7190 * for long standby queries that can still see these tuples.
7191 */
7192 XLogRecPtr
7193 log_heap_clean(Relation reln, Buffer buffer,
7194 OffsetNumber *redirected, int nredirected,
7195 OffsetNumber *nowdead, int ndead,
7196 OffsetNumber *nowunused, int nunused,
7197 TransactionId latestRemovedXid)
7198 {
7199 xl_heap_clean xlrec;
7200 XLogRecPtr recptr;
7201
7202 /* Caller should not call me on a non-WAL-logged relation */
7203 Assert(RelationNeedsWAL(reln));
7204
7205 xlrec.latestRemovedXid = latestRemovedXid;
7206 xlrec.nredirected = nredirected;
7207 xlrec.ndead = ndead;
7208
7209 XLogBeginInsert();
7210 XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7211
7212 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7213
7214 /*
7215 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7216 * that they are. When XLogInsert stores the whole buffer, the offset
7217 * arrays need not be stored too. Note that even if all three arrays are
7218 * empty, we want to expose the buffer as a candidate for whole-page
7219 * storage, since this record type implies a defragmentation operation
7220 * even if no line pointers changed state.
7221 */
7222 if (nredirected > 0)
7223 XLogRegisterBufData(0, (char *) redirected,
7224 nredirected * sizeof(OffsetNumber) * 2);
7225
7226 if (ndead > 0)
7227 XLogRegisterBufData(0, (char *) nowdead,
7228 ndead * sizeof(OffsetNumber));
7229
7230 if (nunused > 0)
7231 XLogRegisterBufData(0, (char *) nowunused,
7232 nunused * sizeof(OffsetNumber));
7233
7234 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7235
7236 return recptr;
7237 }
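
/*
 * Added commentary: the buffer data registered above is laid out as three
 * back-to-back arrays,
 *
 *		redirected[0 .. 2*nredirected - 1]		(old/new offset pairs)
 *		nowdead[0 .. ndead - 1]
 *		nowunused[0 .. nunused - 1]
 *
 * nunused itself is not stored in the record; redo derives it from the total
 * length of the block data, as heap_xlog_clean() below demonstrates.
 */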
7238
7239 /*
7240 * Perform XLogInsert for a heap-freeze operation. Caller must have already
7241 * modified the buffer and marked it dirty.
7242 */
7243 XLogRecPtr
7244 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7245 xl_heap_freeze_tuple *tuples, int ntuples)
7246 {
7247 xl_heap_freeze_page xlrec;
7248 XLogRecPtr recptr;
7249
7250 /* Caller should not call me on a non-WAL-logged relation */
7251 Assert(RelationNeedsWAL(reln));
7252 /* nor when there are no tuples to freeze */
7253 Assert(ntuples > 0);
7254
7255 xlrec.cutoff_xid = cutoff_xid;
7256 xlrec.ntuples = ntuples;
7257
7258 XLogBeginInsert();
7259 XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7260
7261 /*
7262 * The freeze plan array is not actually in the buffer, but pretend that
7263 * it is. When XLogInsert stores the whole buffer, the freeze plan need
7264 * not be stored too.
7265 */
7266 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7267 XLogRegisterBufData(0, (char *) tuples,
7268 ntuples * sizeof(xl_heap_freeze_tuple));
7269
7270 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7271
7272 return recptr;
7273 }
7274
7275 /*
7276 * Perform XLogInsert for a heap-visible operation. 'block' is the block
7277 * being marked all-visible, and vm_buffer is the buffer containing the
7278 * corresponding visibility map block. Both should have already been modified
7279 * and dirtied.
7280 *
7281 * If checksums are enabled, we also generate a full-page image of
7282 * heap_buffer, if necessary.
7283 */
7284 XLogRecPtr
7285 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7286 TransactionId cutoff_xid, uint8 vmflags)
7287 {
7288 xl_heap_visible xlrec;
7289 XLogRecPtr recptr;
7290 uint8 flags;
7291
7292 Assert(BufferIsValid(heap_buffer));
7293 Assert(BufferIsValid(vm_buffer));
7294
7295 xlrec.cutoff_xid = cutoff_xid;
7296 xlrec.flags = vmflags;
7297 XLogBeginInsert();
7298 XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7299
7300 XLogRegisterBuffer(0, vm_buffer, 0);
7301
7302 flags = REGBUF_STANDARD;
7303 if (!XLogHintBitIsNeeded())
7304 flags |= REGBUF_NO_IMAGE;
7305 XLogRegisterBuffer(1, heap_buffer, flags);
7306
7307 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7308
7309 return recptr;
7310 }
7311
7312 /*
7313 * Perform XLogInsert for a heap-update operation. Caller must already
7314 * have modified the buffer(s) and marked them dirty.
7315 */
7316 static XLogRecPtr
7317 log_heap_update(Relation reln, Buffer oldbuf,
7318 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7319 HeapTuple old_key_tuple,
7320 bool all_visible_cleared, bool new_all_visible_cleared)
7321 {
7322 xl_heap_update xlrec;
7323 xl_heap_header xlhdr;
7324 xl_heap_header xlhdr_idx;
7325 uint8 info;
7326 uint16 prefix_suffix[2];
7327 uint16 prefixlen = 0,
7328 suffixlen = 0;
7329 XLogRecPtr recptr;
7330 Page page = BufferGetPage(newbuf);
7331 bool need_tuple_data = RelationIsLogicallyLogged(reln);
7332 bool init;
7333 int bufflags;
7334
7335 /* Caller should not call me on a non-WAL-logged relation */
7336 Assert(RelationNeedsWAL(reln));
7337
7338 XLogBeginInsert();
7339
7340 if (HeapTupleIsHeapOnly(newtup))
7341 info = XLOG_HEAP_HOT_UPDATE;
7342 else
7343 info = XLOG_HEAP_UPDATE;
7344
7345 /*
7346 * If the old and new tuple are on the same page, we only need to log the
7347 * parts of the new tuple that were changed. That saves on the amount of
7348 * WAL we need to write. Currently, we just count any unchanged bytes in
7349 * the beginning and end of the tuple. That's quick to check, and
7350 * perfectly covers the common case that only one field is updated.
7351 *
7352 * We could do this even if the old and new tuple are on different pages,
7353 * but only if we don't make a full-page image of the old page, which is
7354 * difficult to know in advance. Also, if the old tuple is corrupt for
7355 * some reason, it would allow the corruption to propagate to the new page,
7356 * so it seems best to avoid. Under the general assumption that most
7357 * updates tend to create the new tuple version on the same page, there
7358 * isn't much to be gained by doing this across pages anyway.
7359 *
7360 * Skip this if we're taking a full-page image of the new page, as we
7361 * don't include the new tuple in the WAL record in that case. Also
7362 * disable if wal_level='logical', as logical decoding needs to be able to
7363 * read the new tuple in whole from the WAL record alone.
7364 */
7365 if (oldbuf == newbuf && !need_tuple_data &&
7366 !XLogCheckBufferNeedsBackup(newbuf))
7367 {
7368 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7369 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7370 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7371 int newlen = newtup->t_len - newtup->t_data->t_hoff;
7372
7373 /* Check for common prefix between old and new tuple */
7374 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7375 {
7376 if (newp[prefixlen] != oldp[prefixlen])
7377 break;
7378 }
7379
7380 /*
7381 * Storing the length of the prefix takes 2 bytes, so we need to save
7382 * at least 3 bytes or there's no point.
7383 */
7384 if (prefixlen < 3)
7385 prefixlen = 0;
7386
7387 /* Same for suffix */
7388 for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7389 {
7390 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7391 break;
7392 }
7393 if (suffixlen < 3)
7394 suffixlen = 0;
7395 }
7396
7397 /* Prepare main WAL data chain */
7398 xlrec.flags = 0;
7399 if (all_visible_cleared)
7400 xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7401 if (new_all_visible_cleared)
7402 xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7403 if (prefixlen > 0)
7404 xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7405 if (suffixlen > 0)
7406 xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7407 if (need_tuple_data)
7408 {
7409 xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7410 if (old_key_tuple)
7411 {
7412 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7413 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7414 else
7415 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7416 }
7417 }
7418
7419 /* If new tuple is the first and only tuple on page... */
7420 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7421 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7422 {
7423 info |= XLOG_HEAP_INIT_PAGE;
7424 init = true;
7425 }
7426 else
7427 init = false;
7428
7429 /* Prepare WAL data for the old page */
7430 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7431 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7432 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7433 oldtup->t_data->t_infomask2);
7434
7435 /* Prepare WAL data for the new page */
7436 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7437 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7438
7439 bufflags = REGBUF_STANDARD;
7440 if (init)
7441 bufflags |= REGBUF_WILL_INIT;
7442 if (need_tuple_data)
7443 bufflags |= REGBUF_KEEP_DATA;
7444
7445 XLogRegisterBuffer(0, newbuf, bufflags);
7446 if (oldbuf != newbuf)
7447 XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7448
7449 XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7450
7451 /*
7452 * Prepare WAL data for the new tuple.
7453 */
7454 if (prefixlen > 0 || suffixlen > 0)
7455 {
7456 if (prefixlen > 0 && suffixlen > 0)
7457 {
7458 prefix_suffix[0] = prefixlen;
7459 prefix_suffix[1] = suffixlen;
7460 XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7461 }
7462 else if (prefixlen > 0)
7463 {
7464 XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7465 }
7466 else
7467 {
7468 XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7469 }
7470 }
7471
7472 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7473 xlhdr.t_infomask = newtup->t_data->t_infomask;
7474 xlhdr.t_hoff = newtup->t_data->t_hoff;
7475 Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7476
7477 /*
7478 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7479 *
7480 * The 'data' doesn't include the common prefix or suffix.
7481 */
7482 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7483 if (prefixlen == 0)
7484 {
7485 XLogRegisterBufData(0,
7486 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7487 newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7488 }
7489 else
7490 {
7491 /*
7492 * Have to write the null bitmap and data after the common prefix as
7493 * two separate rdata entries.
7494 */
7495 /* bitmap [+ padding] [+ oid] */
7496 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7497 {
7498 XLogRegisterBufData(0,
7499 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7500 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7501 }
7502
7503 /* data after common prefix */
7504 XLogRegisterBufData(0,
7505 ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
7506 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
7507 }
7508
7509 /* We need to log a tuple identity */
7510 if (need_tuple_data && old_key_tuple)
7511 {
7512 /* don't really need this, but it's more convenient to decode */
7513 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
7514 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
7515 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
7516
7517 XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
7518
7519 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
7520 XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
7521 old_key_tuple->t_len - SizeofHeapTupleHeader);
7522 }
7523
7524 /* filtering by origin on a row level is much more efficient */
7525 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
7526
7527 recptr = XLogInsert(RM_HEAP_ID, info);
7528
7529 return recptr;
7530 }
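
/*
 * Added commentary: a worked example of the prefix/suffix compression above,
 * using hypothetical tuple contents.  Suppose the old and new tuple data
 * (past t_hoff) are
 *
 *		old:  AAAA BBBB CCCC
 *		new:  AAAA XYZ  CCCC
 *
 * The common prefix is 4 bytes and the common suffix is 4 bytes, both of
 * which clear the 3-byte threshold, so the record stores prefixlen = 4,
 * suffixlen = 4 and only the middle bytes "XYZ" as tuple data.  Redo
 * reassembles the complete new tuple by copying the prefix and suffix from
 * the old tuple version, which lives on the same page.
 */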
7531
7532 /*
7533 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
7534 *
7535 * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
7536 * tuples.
7537 */
7538 static XLogRecPtr
7539 log_heap_new_cid(Relation relation, HeapTuple tup)
7540 {
7541 xl_heap_new_cid xlrec;
7542
7543 XLogRecPtr recptr;
7544 HeapTupleHeader hdr = tup->t_data;
7545
7546 Assert(ItemPointerIsValid(&tup->t_self));
7547 Assert(tup->t_tableOid != InvalidOid);
7548
7549 xlrec.top_xid = GetTopTransactionId();
7550 xlrec.target_node = relation->rd_node;
7551 xlrec.target_tid = tup->t_self;
7552
7553 /*
7554 * If the tuple got inserted & deleted in the same TX, we definitely have
7555 * a combocid; set both cmin and cmax.
7556 */
7557 if (hdr->t_infomask & HEAP_COMBOCID)
7558 {
7559 Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
7560 Assert(!HeapTupleHeaderXminInvalid(hdr));
7561 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
7562 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
7563 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
7564 }
7565 /* No combocid, so only cmin or cmax can be set by this TX */
7566 else
7567 {
7568 /*
7569 * Tuple inserted.
7570 *
7571 * We need to check for LOCK ONLY because multixacts might be
7572 * transferred to the new tuple in case of FOR KEY SHARE updates, in
7573 * which case there will be an xmax even though the tuple was just
7574 * inserted.
7575 */
7576 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
7577 HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
7578 {
7579 xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
7580 xlrec.cmax = InvalidCommandId;
7581 }
7582 /* Tuple from a different tx updated or deleted. */
7583 else
7584 {
7585 xlrec.cmin = InvalidCommandId;
7586 xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
7587
7588 }
7589 xlrec.combocid = InvalidCommandId;
7590 }
7591
7592 /*
7593 * Note that we don't need to register the buffer here, because this
7594 * operation does not modify the page. The insert/update/delete that
7595 * called us certainly did, but that's WAL-logged separately.
7596 */
7597 XLogBeginInsert();
7598 XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
7599
7600 /* will be looked at irrespective of origin */
7601
7602 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
7603
7604 return recptr;
7605 }
7606
7607 /*
7608 * Build a heap tuple representing the configured REPLICA IDENTITY to represent
7609 * the old tuple in an UPDATE or DELETE.
7610 *
7611 * Returns NULL if there's no need to log an identity or if there's no suitable
7612 * key defined.
7613 *
7614 * key_changed should be false if caller knows that no replica identity
7615 * columns changed value. It's always true in the DELETE case.
7616 *
7617 * *copy is set to true if the returned tuple is a modified copy rather than
7618 * the same tuple that was passed in.
7619 */
7620 static HeapTuple
7621 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
7622 bool *copy)
7623 {
7624 TupleDesc desc = RelationGetDescr(relation);
7625 char replident = relation->rd_rel->relreplident;
7626 Bitmapset *idattrs;
7627 HeapTuple key_tuple;
7628 bool nulls[MaxHeapAttributeNumber];
7629 Datum values[MaxHeapAttributeNumber];
7630
7631 *copy = false;
7632
7633 if (!RelationIsLogicallyLogged(relation))
7634 return NULL;
7635
7636 if (replident == REPLICA_IDENTITY_NOTHING)
7637 return NULL;
7638
7639 if (replident == REPLICA_IDENTITY_FULL)
7640 {
7641 /*
7642 * When logging the entire old tuple, it very well could contain
7643 * toasted columns. If so, force them to be inlined.
7644 */
7645 if (HeapTupleHasExternal(tp))
7646 {
7647 *copy = true;
7648 tp = toast_flatten_tuple(tp, desc);
7649 }
7650 return tp;
7651 }
7652
7653 /* if the key hasn't changed and we're only logging the key, we're done */
7654 if (!key_changed)
7655 return NULL;
7656
7657 /* find out the replica identity columns */
7658 idattrs = RelationGetIndexAttrBitmap(relation,
7659 INDEX_ATTR_BITMAP_IDENTITY_KEY);
7660
7661 /*
7662 * If there's no defined replica identity columns, treat as !key_changed.
7663 * (This case should not be reachable from heap_update, since that should
7664 * calculate key_changed accurately. But heap_delete just passes constant
7665 * true for key_changed, so we can hit this case in deletes.)
7666 */
7667 if (bms_is_empty(idattrs))
7668 return NULL;
7669
7670 /*
7671 * Construct a new tuple containing only the replica identity columns,
7672 * with nulls elsewhere. While we're at it, assert that the replica
7673 * identity columns aren't null.
7674 */
7675 heap_deform_tuple(tp, desc, values, nulls);
7676
7677 for (int i = 0; i < desc->natts; i++)
7678 {
7679 if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
7680 idattrs))
7681 Assert(!nulls[i]);
7682 else
7683 nulls[i] = true;
7684 }
7685
7686 key_tuple = heap_form_tuple(desc, values, nulls);
7687 *copy = true;
7688
7689 bms_free(idattrs);
7690
7691 /*
7692 * If the tuple, which by here only contains indexed columns, still has
7693 * toasted columns, force them to be inlined. This is somewhat unlikely
7694 * since there are limits on the size of indexed columns, so we don't
7695 * duplicate toast_flatten_tuple()'s functionality in the above loop over
7696 * the indexed columns, even if it would be more efficient.
7697 */
7698 if (HeapTupleHasExternal(key_tuple))
7699 {
7700 HeapTuple oldtup = key_tuple;
7701
7702 key_tuple = toast_flatten_tuple(oldtup, desc);
7703 heap_freetuple(oldtup);
7704 }
7705
7706 return key_tuple;
7707 }
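
/*
 * Illustrative example (added commentary, hypothetical table), assuming
 * wal_level = logical so that RelationIsLogicallyLogged() passes:
 *
 *		CREATE TABLE t (id int PRIMARY KEY, val text);
 *		-- REPLICA IDENTITY DEFAULT, i.e. use the primary key
 *
 * An UPDATE that changes only 'val' passes key_changed = false and gets NULL
 * back (no old tuple needs to be logged), while a DELETE, or an UPDATE that
 * changes 'id', gets back a tuple with 'id' populated and 'val' nulled out,
 * flattened if it somehow contained toasted data.
 */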
7708
7709 /*
7710 * Handles CLEANUP_INFO
7711 */
7712 static void
7713 heap_xlog_cleanup_info(XLogReaderState *record)
7714 {
7715 xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
7716
7717 if (InHotStandby)
7718 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
7719
7720 /*
7721 * Actual operation is a no-op. Record type exists to provide a means for
7722 * conflict processing to occur before we begin index vacuum actions. See
7723 * vacuumlazy.c and also comments in btvacuumpage()
7724 */
7725
7726 /* Backup blocks are not used in cleanup_info records */
7727 Assert(!XLogRecHasAnyBlockRefs(record));
7728 }
7729
7730 /*
7731 * Handles XLOG_HEAP2_CLEAN record type
7732 */
7733 static void
7734 heap_xlog_clean(XLogReaderState *record)
7735 {
7736 XLogRecPtr lsn = record->EndRecPtr;
7737 xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
7738 Buffer buffer;
7739 RelFileNode rnode;
7740 BlockNumber blkno;
7741 XLogRedoAction action;
7742
7743 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
7744
7745 /*
7746 * We're about to remove tuples. In Hot Standby mode, ensure that there are
7747 * no queries running for which the removed tuples are still visible.
7748 *
7749 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
7750 * conflict on the records that cause MVCC failures for user queries. If
7751 * latestRemovedXid is invalid, skip conflict processing.
7752 */
7753 if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
7754 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
7755
7756 /*
7757 * If we have a full-page image, restore it (using a cleanup lock) and
7758 * we're done.
7759 */
7760 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
7761 &buffer);
7762 if (action == BLK_NEEDS_REDO)
7763 {
7764 Page page = (Page) BufferGetPage(buffer);
7765 OffsetNumber *end;
7766 OffsetNumber *redirected;
7767 OffsetNumber *nowdead;
7768 OffsetNumber *nowunused;
7769 int nredirected;
7770 int ndead;
7771 int nunused;
7772 Size datalen;
7773
7774 redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
7775
7776 nredirected = xlrec->nredirected;
7777 ndead = xlrec->ndead;
7778 end = (OffsetNumber *) ((char *) redirected + datalen);
7779 nowdead = redirected + (nredirected * 2);
7780 nowunused = nowdead + ndead;
7781 nunused = (end - nowunused);
7782 Assert(nunused >= 0);
7783
7784 /* Update all line pointers per the record, and repair fragmentation */
7785 heap_page_prune_execute(buffer,
7786 redirected, nredirected,
7787 nowdead, ndead,
7788 nowunused, nunused);
7789
7790 /*
7791 * Note: we don't worry about updating the page's prunability hints.
7792 * At worst this will cause an extra prune cycle to occur soon.
7793 */
7794
7795 PageSetLSN(page, lsn);
7796 MarkBufferDirty(buffer);
7797 }
7798
7799 if (BufferIsValid(buffer))
7800 {
7801 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
7802
7803 UnlockReleaseBuffer(buffer);
7804
7805 /*
7806 * After cleaning records from a page, it's useful to update the FSM
7807 * about it, as it may cause the page to become a target for insertions
7808 * later even if vacuum decides not to visit it (which is possible if
7809 * it gets marked all-visible).
7810 *
7811 * Do this regardless of a full-page image being applied, since the
7812 * FSM data is not in the page anyway.
7813 */
7814 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
7815 }
7816 }
7817
7818 /*
7819 * Replay XLOG_HEAP2_VISIBLE record.
7820 *
7821 * The critical integrity requirement here is that we must never end up with
7822 * a situation where the visibility map bit is set, and the page-level
7823 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
7824 * page modification would fail to clear the visibility map bit.
7825 */
7826 static void
7827 heap_xlog_visible(XLogReaderState *record)
7828 {
7829 XLogRecPtr lsn = record->EndRecPtr;
7830 xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
7831 Buffer vmbuffer = InvalidBuffer;
7832 Buffer buffer;
7833 Page page;
7834 RelFileNode rnode;
7835 BlockNumber blkno;
7836 XLogRedoAction action;
7837
7838 XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
7839
7840 /*
7841 * If there are any Hot Standby transactions running that have an xmin
7842 * horizon old enough that this page isn't all-visible for them, they
7843 * might incorrectly decide that an index-only scan can skip a heap fetch.
7844 *
7845 * NB: It might be better to throw some kind of "soft" conflict here that
7846 * forces any index-only scan that is in flight to perform heap fetches,
7847 * rather than killing the transaction outright.
7848 */
7849 if (InHotStandby)
7850 ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
7851
7852 /*
7853 * Read the heap page, if it still exists. If the heap file has been
7854 * dropped or truncated later in recovery, we don't need to update the
7855 * page, but we'd better still update the visibility map.
7856 */
7857 action = XLogReadBufferForRedo(record, 1, &buffer);
7858 if (action == BLK_NEEDS_REDO)
7859 {
7860 /*
7861 * We don't bump the LSN of the heap page when setting the visibility
7862 * map bit (unless checksums or wal_log_hints is enabled, in which
7863 * case we must), because that would generate an unworkable volume of
7864 * full-page writes. This exposes us to torn page hazards, but since
7865 * we're not inspecting the existing page contents in any way, we
7866 * don't care.
7867 *
7868 * However, all operations that clear the visibility map bit *do* bump
7869 * the LSN, and those operations will only be replayed if the XLOG LSN
7870 * follows the page LSN. Thus, if the page LSN has advanced past our
7871 * XLOG record's LSN, we mustn't mark the page all-visible, because
7872 * the subsequent update won't be replayed to clear the flag.
7873 */
7874 page = BufferGetPage(buffer);
7875
7876 PageSetAllVisible(page);
7877
7878 MarkBufferDirty(buffer);
7879 }
7880 else if (action == BLK_RESTORED)
7881 {
7882 /*
7883 * If heap block was backed up, we already restored it and there's
7884 * nothing more to do. (This can only happen with checksums or
7885 * wal_log_hints enabled.)
7886 */
7887 }
7888
7889 if (BufferIsValid(buffer))
7890 {
7891 Size space = PageGetFreeSpace(BufferGetPage(buffer));
7892
7893 UnlockReleaseBuffer(buffer);
7894
7895 /*
7896 * Since FSM is not WAL-logged and only updated heuristically, it
7897 * easily becomes stale in standbys. If the standby is later promoted
7898 * and runs VACUUM, it will skip updating individual free space
7899 * figures for pages that became all-visible (or all-frozen, depending
7900 * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
7901 * propagates too optimistic free space values to upper FSM layers;
7902 * later inserters try to use such pages only to find out that they
7903 * are unusable. This can cause long stalls when there are many such
7904 * pages.
7905 *
7906 * Forestall those problems by updating FSM's idea about a page that
7907 * is becoming all-visible or all-frozen.
7908 *
7909 * Do this regardless of a full-page image being applied, since the
7910 * FSM data is not in the page anyway.
7911 */
7912 if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
7913 XLogRecordPageWithFreeSpace(rnode, blkno, space);
7914 }
7915
7916 /*
7917 * Even if we skipped the heap page update due to the LSN interlock, it's
7918 * still safe to update the visibility map. Any WAL record that clears
7919 * the visibility map bit does so before checking the page LSN, so any
7920 * bits that need to be cleared will still be cleared.
7921 */
7922 if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
7923 &vmbuffer) == BLK_NEEDS_REDO)
7924 {
7925 Page vmpage = BufferGetPage(vmbuffer);
7926 Relation reln;
7927
7928 /* initialize the page if it was read as zeros */
7929 if (PageIsNew(vmpage))
7930 PageInit(vmpage, BLCKSZ, 0);
7931
7932 /*
7933 * XLogReadBufferForRedoExtended locked the buffer. But
7934 * visibilitymap_set will handle locking itself.
7935 */
7936 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
7937
7938 reln = CreateFakeRelcacheEntry(rnode);
7939 visibilitymap_pin(reln, blkno, &vmbuffer);
7940
7941 /*
7942 * Don't set the bit if replay has already passed this point.
7943 *
7944 * It might be safe to do this unconditionally; if replay has passed
7945 * this point, we'll replay at least as far this time as we did
7946 * before, and if this bit needs to be cleared, the record responsible
7947 * for doing so will be replayed again and will clear it. For right
7948 * now, out of an abundance of conservatism, we use the same test here
7949 * we did for the heap page. If this results in a dropped bit, no
7950 * real harm is done; and the next VACUUM will fix it.
7951 */
7952 if (lsn > PageGetLSN(vmpage))
7953 visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
7954 xlrec->cutoff_xid, xlrec->flags);
7955
7956 ReleaseBuffer(vmbuffer);
7957 FreeFakeRelcacheEntry(reln);
7958 }
7959 else if (BufferIsValid(vmbuffer))
7960 UnlockReleaseBuffer(vmbuffer);
7961 }
7962
7963 /*
7964 * Replay XLOG_HEAP2_FREEZE_PAGE records
7965 */
7966 static void
7967 heap_xlog_freeze_page(XLogReaderState *record)
7968 {
7969 XLogRecPtr lsn = record->EndRecPtr;
7970 xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
7971 TransactionId cutoff_xid = xlrec->cutoff_xid;
7972 Buffer buffer;
7973 int ntup;
7974
7975 /*
7976 * In Hot Standby mode, ensure that there are no queries running which still
7977 * consider the frozen xids as running.
7978 */
7979 if (InHotStandby)
7980 {
7981 RelFileNode rnode;
7982 TransactionId latestRemovedXid = cutoff_xid;
7983
7984 TransactionIdRetreat(latestRemovedXid);
7985
7986 XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
7987 ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
7988 }
7989
7990 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
7991 {
7992 Page page = BufferGetPage(buffer);
7993 xl_heap_freeze_tuple *tuples;
7994
7995 tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
7996
7997 /* now execute freeze plan for each frozen tuple */
7998 for (ntup = 0; ntup < xlrec->ntuples; ntup++)
7999 {
8000 xl_heap_freeze_tuple *xlrec_tp;
8001 ItemId lp;
8002 HeapTupleHeader tuple;
8003
8004 xlrec_tp = &tuples[ntup];
8005 lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8006 tuple = (HeapTupleHeader) PageGetItem(page, lp);
8007
8008 heap_execute_freeze_tuple(tuple, xlrec_tp);
8009 }
8010
8011 PageSetLSN(page, lsn);
8012 MarkBufferDirty(buffer);
8013 }
8014 if (BufferIsValid(buffer))
8015 UnlockReleaseBuffer(buffer);
8016 }
8017
8018 /*
8019 * Given an "infobits" field from an XLog record, set the correct bits in the
8020 * given infomask and infomask2 for the tuple touched by the record.
8021 *
8022 * (This is the reverse of compute_infobits).
8023 */
8024 static void
8025 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8026 {
8027 *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8028 HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8029 *infomask2 &= ~HEAP_KEYS_UPDATED;
8030
8031 if (infobits & XLHL_XMAX_IS_MULTI)
8032 *infomask |= HEAP_XMAX_IS_MULTI;
8033 if (infobits & XLHL_XMAX_LOCK_ONLY)
8034 *infomask |= HEAP_XMAX_LOCK_ONLY;
8035 if (infobits & XLHL_XMAX_EXCL_LOCK)
8036 *infomask |= HEAP_XMAX_EXCL_LOCK;
8037 /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8038 if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8039 *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8040
8041 if (infobits & XLHL_KEYS_UPDATED)
8042 *infomask2 |= HEAP_KEYS_UPDATED;
8043 }
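
/*
 * Added commentary: this is the redo-side counterpart of compute_infobits().
 * For example, a row locked by SELECT ... FOR NO KEY UPDATE is WAL-logged
 * with (XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY); replaying that through
 * this function sets HEAP_XMAX_EXCL_LOCK and HEAP_XMAX_LOCK_ONLY while
 * leaving HEAP_KEYS_UPDATED clear.  The shared-lock bit combination
 * (HEAP_XMAX_SHR_LOCK) is never reconstructed here, matching
 * compute_infobits(), which never emits it.
 */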
8044
8045 static void
8046 heap_xlog_delete(XLogReaderState *record)
8047 {
8048 XLogRecPtr lsn = record->EndRecPtr;
8049 xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8050 Buffer buffer;
8051 Page page;
8052 ItemId lp = NULL;
8053 HeapTupleHeader htup;
8054 BlockNumber blkno;
8055 RelFileNode target_node;
8056 ItemPointerData target_tid;
8057
8058 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8059 ItemPointerSetBlockNumber(&target_tid, blkno);
8060 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8061
8062 /*
8063 * The visibility map may need to be fixed even if the heap page is
8064 * already up-to-date.
8065 */
8066 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8067 {
8068 Relation reln = CreateFakeRelcacheEntry(target_node);
8069 Buffer vmbuffer = InvalidBuffer;
8070
8071 visibilitymap_pin(reln, blkno, &vmbuffer);
8072 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8073 ReleaseBuffer(vmbuffer);
8074 FreeFakeRelcacheEntry(reln);
8075 }
8076
8077 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8078 {
8079 page = BufferGetPage(buffer);
8080
8081 if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8082 lp = PageGetItemId(page, xlrec->offnum);
8083
8084 if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8085 elog(PANIC, "invalid lp");
8086
8087 htup = (HeapTupleHeader) PageGetItem(page, lp);
8088
8089 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8090 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8091 HeapTupleHeaderClearHotUpdated(htup);
8092 fix_infomask_from_infobits(xlrec->infobits_set,
8093 &htup->t_infomask, &htup->t_infomask2);
8094 if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8095 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8096 else
8097 HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8098 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8099
8100 /* Mark the page as a candidate for pruning */
8101 PageSetPrunable(page, XLogRecGetXid(record));
8102
8103 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8104 PageClearAllVisible(page);
8105
8106 /* Make sure t_ctid is set correctly */
8107 if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8108 HeapTupleHeaderSetMovedPartitions(htup);
8109 else
8110 htup->t_ctid = target_tid;
8111 PageSetLSN(page, lsn);
8112 MarkBufferDirty(buffer);
8113 }
8114 if (BufferIsValid(buffer))
8115 UnlockReleaseBuffer(buffer);
8116 }
8117
8118 static void
8119 heap_xlog_insert(XLogReaderState *record)
8120 {
8121 XLogRecPtr lsn = record->EndRecPtr;
8122 xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8123 Buffer buffer;
8124 Page page;
8125 union
8126 {
8127 HeapTupleHeaderData hdr;
8128 char data[MaxHeapTupleSize];
8129 } tbuf;
8130 HeapTupleHeader htup;
8131 xl_heap_header xlhdr;
8132 uint32 newlen;
8133 Size freespace = 0;
8134 RelFileNode target_node;
8135 BlockNumber blkno;
8136 ItemPointerData target_tid;
8137 XLogRedoAction action;
8138
8139 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8140 ItemPointerSetBlockNumber(&target_tid, blkno);
8141 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8142
8143 /*
8144 * The visibility map may need to be fixed even if the heap page is
8145 * already up-to-date.
8146 */
8147 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8148 {
8149 Relation reln = CreateFakeRelcacheEntry(target_node);
8150 Buffer vmbuffer = InvalidBuffer;
8151
8152 visibilitymap_pin(reln, blkno, &vmbuffer);
8153 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8154 ReleaseBuffer(vmbuffer);
8155 FreeFakeRelcacheEntry(reln);
8156 }
8157
8158 /*
8159 * If we inserted the first and only tuple on the page, re-initialize the
8160 * page from scratch.
8161 */
8162 if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8163 {
8164 buffer = XLogInitBufferForRedo(record, 0);
8165 page = BufferGetPage(buffer);
8166 PageInit(page, BufferGetPageSize(buffer), 0);
8167 action = BLK_NEEDS_REDO;
8168 }
8169 else
8170 action = XLogReadBufferForRedo(record, 0, &buffer);
8171 if (action == BLK_NEEDS_REDO)
8172 {
8173 Size datalen;
8174 char *data;
8175
8176 page = BufferGetPage(buffer);
8177
8178 if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8179 elog(PANIC, "invalid max offset number");
8180
8181 data = XLogRecGetBlockData(record, 0, &datalen);
8182
8183 newlen = datalen - SizeOfHeapHeader;
8184 Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8185 memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8186 data += SizeOfHeapHeader;
8187
8188 htup = &tbuf.hdr;
8189 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8190 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8191 memcpy((char *) htup + SizeofHeapTupleHeader,
8192 data,
8193 newlen);
8194 newlen += SizeofHeapTupleHeader;
8195 htup->t_infomask2 = xlhdr.t_infomask2;
8196 htup->t_infomask = xlhdr.t_infomask;
8197 htup->t_hoff = xlhdr.t_hoff;
8198 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8199 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8200 htup->t_ctid = target_tid;
8201
8202 if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8203 true, true) == InvalidOffsetNumber)
8204 elog(PANIC, "failed to add tuple");
8205
8206 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8207
8208 PageSetLSN(page, lsn);
8209
8210 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8211 PageClearAllVisible(page);
8212
8213 MarkBufferDirty(buffer);
8214 }
8215 if (BufferIsValid(buffer))
8216 UnlockReleaseBuffer(buffer);
8217
8218 /*
8219 * If the page is running low on free space, update the FSM as well.
8220 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8221 * better than that without knowing the fill-factor for the table.
8222 *
8223 * XXX: Don't do this if the page was restored from full page image. We
8224 * don't bother to update the FSM in that case, it doesn't need to be
8225 * totally accurate anyway.
8226 */
8227 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8228 XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8229 }
8230
8231 /*
8232 * Handles MULTI_INSERT record type.
8233 */
8234 static void
8235 heap_xlog_multi_insert(XLogReaderState *record)
8236 {
8237 XLogRecPtr lsn = record->EndRecPtr;
8238 xl_heap_multi_insert *xlrec;
8239 RelFileNode rnode;
8240 BlockNumber blkno;
8241 Buffer buffer;
8242 Page page;
8243 union
8244 {
8245 HeapTupleHeaderData hdr;
8246 char data[MaxHeapTupleSize];
8247 } tbuf;
8248 HeapTupleHeader htup;
8249 uint32 newlen;
8250 Size freespace = 0;
8251 int i;
8252 bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8253 XLogRedoAction action;
8254
8255 /*
8256 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8257 * required.
8258 */
8259 xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8260
8261 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8262
8263 /*
8264 * The visibility map may need to be fixed even if the heap page is
8265 * already up-to-date.
8266 */
8267 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8268 {
8269 Relation reln = CreateFakeRelcacheEntry(rnode);
8270 Buffer vmbuffer = InvalidBuffer;
8271
8272 visibilitymap_pin(reln, blkno, &vmbuffer);
8273 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8274 ReleaseBuffer(vmbuffer);
8275 FreeFakeRelcacheEntry(reln);
8276 }
8277
8278 if (isinit)
8279 {
8280 buffer = XLogInitBufferForRedo(record, 0);
8281 page = BufferGetPage(buffer);
8282 PageInit(page, BufferGetPageSize(buffer), 0);
8283 action = BLK_NEEDS_REDO;
8284 }
8285 else
8286 action = XLogReadBufferForRedo(record, 0, &buffer);
8287 if (action == BLK_NEEDS_REDO)
8288 {
8289 char *tupdata;
8290 char *endptr;
8291 Size len;
8292
8293 /* Tuples are stored as block data */
8294 tupdata = XLogRecGetBlockData(record, 0, &len);
8295 endptr = tupdata + len;
8296
8297 page = (Page) BufferGetPage(buffer);
8298
8299 for (i = 0; i < xlrec->ntuples; i++)
8300 {
8301 OffsetNumber offnum;
8302 xl_multi_insert_tuple *xlhdr;
8303
8304 /*
8305 * If we're reinitializing the page, the tuples are stored in
8306 * order from FirstOffsetNumber. Otherwise there's an array of
8307 * offsets in the WAL record, and the tuples come after that.
8308 */
8309 if (isinit)
8310 offnum = FirstOffsetNumber + i;
8311 else
8312 offnum = xlrec->offsets[i];
8313 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8314 elog(PANIC, "invalid max offset number");
8315
8316 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8317 tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8318
8319 newlen = xlhdr->datalen;
8320 Assert(newlen <= MaxHeapTupleSize);
8321 htup = &tbuf.hdr;
8322 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8323 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8324 memcpy((char *) htup + SizeofHeapTupleHeader,
8325 (char *) tupdata,
8326 newlen);
8327 tupdata += newlen;
8328
8329 newlen += SizeofHeapTupleHeader;
8330 htup->t_infomask2 = xlhdr->t_infomask2;
8331 htup->t_infomask = xlhdr->t_infomask;
8332 htup->t_hoff = xlhdr->t_hoff;
8333 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8334 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8335 ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8336 ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8337
8338 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8339 if (offnum == InvalidOffsetNumber)
8340 elog(PANIC, "failed to add tuple");
8341 }
8342 if (tupdata != endptr)
8343 elog(PANIC, "total tuple length mismatch");
8344
8345 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8346
8347 PageSetLSN(page, lsn);
8348
8349 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8350 PageClearAllVisible(page);
8351
8352 MarkBufferDirty(buffer);
8353 }
8354 if (BufferIsValid(buffer))
8355 UnlockReleaseBuffer(buffer);
8356
8357 /*
8358 * If the page is running low on free space, update the FSM as well.
8359 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8360 * better than that without knowing the fill-factor for the table.
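 * (With the default 8 kB BLCKSZ, the BLCKSZ / 5 test below amounts to
 * roughly 1.6 kB of remaining free space.)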
8361 *
 * XXX: Don't do this if the page was restored from a full-page image. We
 * don't bother to update the FSM in that case; it doesn't need to be
 * totally accurate anyway.
8365 */
8366 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8367 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8368 }
8369
8370 /*
8371 * Handles UPDATE and HOT_UPDATE
8372 */
8373 static void
heap_xlog_update(XLogReaderState *record, bool hot_update)
8375 {
8376 XLogRecPtr lsn = record->EndRecPtr;
8377 xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8378 RelFileNode rnode;
8379 BlockNumber oldblk;
8380 BlockNumber newblk;
8381 ItemPointerData newtid;
8382 Buffer obuffer,
8383 nbuffer;
8384 Page page;
8385 OffsetNumber offnum;
8386 ItemId lp = NULL;
8387 HeapTupleData oldtup;
8388 HeapTupleHeader htup;
8389 uint16 prefixlen = 0,
8390 suffixlen = 0;
8391 char *newp;
8392 union
8393 {
8394 HeapTupleHeaderData hdr;
8395 char data[MaxHeapTupleSize];
8396 } tbuf;
8397 xl_heap_header xlhdr;
8398 uint32 newlen;
8399 Size freespace = 0;
8400 XLogRedoAction oldaction;
8401 XLogRedoAction newaction;
8402
8403 /* initialize to keep the compiler quiet */
8404 oldtup.t_data = NULL;
8405 oldtup.t_len = 0;
8406
8407 XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8408 if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8409 {
8410 /* HOT updates are never done across pages */
8411 Assert(!hot_update);
8412 }
8413 else
8414 oldblk = newblk;
8415
8416 ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8417
8418 /*
8419 * The visibility map may need to be fixed even if the heap page is
8420 * already up-to-date.
8421 */
8422 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8423 {
8424 Relation reln = CreateFakeRelcacheEntry(rnode);
8425 Buffer vmbuffer = InvalidBuffer;
8426
8427 visibilitymap_pin(reln, oldblk, &vmbuffer);
8428 visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8429 ReleaseBuffer(vmbuffer);
8430 FreeFakeRelcacheEntry(reln);
8431 }
8432
8433 /*
8434 * In normal operation, it is important to lock the two pages in
8435 * page-number order, to avoid possible deadlocks against other update
8436 * operations going the other way. However, during WAL replay there can
8437 * be no other update happening, so we don't need to worry about that. But
8438 * we *do* need to worry that we don't expose an inconsistent state to Hot
8439 * Standby queries --- so the original page can't be unlocked before we've
8440 * added the new tuple to the new page.
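 *
 * In practice that means both the old and the new buffer are kept locked
 * until the end of this function, where they are released together.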
8441 */
8442
8443 /* Deal with old tuple version */
8444 oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8445 &obuffer);
8446 if (oldaction == BLK_NEEDS_REDO)
8447 {
8448 page = BufferGetPage(obuffer);
8449 offnum = xlrec->old_offnum;
8450 if (PageGetMaxOffsetNumber(page) >= offnum)
8451 lp = PageGetItemId(page, offnum);
8452
8453 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8454 elog(PANIC, "invalid lp");
8455
8456 htup = (HeapTupleHeader) PageGetItem(page, lp);
8457
8458 oldtup.t_data = htup;
8459 oldtup.t_len = ItemIdGetLength(lp);
8460
8461 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8462 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8463 if (hot_update)
8464 HeapTupleHeaderSetHotUpdated(htup);
8465 else
8466 HeapTupleHeaderClearHotUpdated(htup);
8467 fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8468 &htup->t_infomask2);
8469 HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8470 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8471 /* Set forward chain link in t_ctid */
8472 htup->t_ctid = newtid;
8473
8474 /* Mark the page as a candidate for pruning */
8475 PageSetPrunable(page, XLogRecGetXid(record));
8476
8477 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8478 PageClearAllVisible(page);
8479
8480 PageSetLSN(page, lsn);
8481 MarkBufferDirty(obuffer);
8482 }
8483
8484 /*
8485 * Read the page the new tuple goes into, if different from old.
8486 */
8487 if (oldblk == newblk)
8488 {
8489 nbuffer = obuffer;
8490 newaction = oldaction;
8491 }
8492 else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8493 {
8494 nbuffer = XLogInitBufferForRedo(record, 0);
8495 page = (Page) BufferGetPage(nbuffer);
8496 PageInit(page, BufferGetPageSize(nbuffer), 0);
8497 newaction = BLK_NEEDS_REDO;
8498 }
8499 else
8500 newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
8501
8502 /*
8503 * The visibility map may need to be fixed even if the heap page is
8504 * already up-to-date.
8505 */
8506 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8507 {
8508 Relation reln = CreateFakeRelcacheEntry(rnode);
8509 Buffer vmbuffer = InvalidBuffer;
8510
8511 visibilitymap_pin(reln, newblk, &vmbuffer);
8512 visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8513 ReleaseBuffer(vmbuffer);
8514 FreeFakeRelcacheEntry(reln);
8515 }
8516
8517 /* Deal with new tuple */
8518 if (newaction == BLK_NEEDS_REDO)
8519 {
8520 char *recdata;
8521 char *recdata_end;
8522 Size datalen;
8523 Size tuplen;
8524
8525 recdata = XLogRecGetBlockData(record, 0, &datalen);
8526 recdata_end = recdata + datalen;
8527
8528 page = BufferGetPage(nbuffer);
8529
8530 offnum = xlrec->new_offnum;
8531 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8532 elog(PANIC, "invalid max offset number");
8533
8534 if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
8535 {
8536 Assert(newblk == oldblk);
8537 memcpy(&prefixlen, recdata, sizeof(uint16));
8538 recdata += sizeof(uint16);
8539 }
8540 if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
8541 {
8542 Assert(newblk == oldblk);
8543 memcpy(&suffixlen, recdata, sizeof(uint16));
8544 recdata += sizeof(uint16);
8545 }
8546
8547 memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
8548 recdata += SizeOfHeapHeader;
8549
8550 tuplen = recdata_end - recdata;
8551 Assert(tuplen <= MaxHeapTupleSize);
8552
8553 htup = &tbuf.hdr;
8554 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8555
8556 /*
8557 * Reconstruct the new tuple using the prefix and/or suffix from the
8558 * old tuple, and the data stored in the WAL record.
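 *
 * The reconstructed tuple data ends up laid out as:
 *
 *   bitmap (+ padding) (+ oid)    taken from the WAL record
 *   prefixlen bytes               copied from the old tuple
 *   remaining new data            taken from the WAL record
 *   suffixlen bytes               copied from the old tuple
 *
 * where prefixlen and/or suffixlen are zero if the corresponding
 * XLH_UPDATE_*_FROM_OLD flag was not set in the record.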
8559 */
8560 newp = (char *) htup + SizeofHeapTupleHeader;
8561 if (prefixlen > 0)
8562 {
8563 int len;
8564
8565 /* copy bitmap [+ padding] [+ oid] from WAL record */
8566 len = xlhdr.t_hoff - SizeofHeapTupleHeader;
8567 memcpy(newp, recdata, len);
8568 recdata += len;
8569 newp += len;
8570
8571 /* copy prefix from old tuple */
8572 memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
8573 newp += prefixlen;
8574
8575 /* copy new tuple data from WAL record */
8576 len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
8577 memcpy(newp, recdata, len);
8578 recdata += len;
8579 newp += len;
8580 }
8581 else
8582 {
8583 /*
8584 * copy bitmap [+ padding] [+ oid] + data from record, all in one
8585 * go
8586 */
8587 memcpy(newp, recdata, tuplen);
8588 recdata += tuplen;
8589 newp += tuplen;
8590 }
8591 Assert(recdata == recdata_end);
8592
8593 /* copy suffix from old tuple */
8594 if (suffixlen > 0)
8595 memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
8596
8597 newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
8598 htup->t_infomask2 = xlhdr.t_infomask2;
8599 htup->t_infomask = xlhdr.t_infomask;
8600 htup->t_hoff = xlhdr.t_hoff;
8601
8602 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8603 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8604 HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
8605 /* Make sure there is no forward chain link in t_ctid */
8606 htup->t_ctid = newtid;
8607
8608 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8609 if (offnum == InvalidOffsetNumber)
8610 elog(PANIC, "failed to add tuple");
8611
8612 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8613 PageClearAllVisible(page);
8614
8615 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8616
8617 PageSetLSN(page, lsn);
8618 MarkBufferDirty(nbuffer);
8619 }
8620
8621 if (BufferIsValid(nbuffer) && nbuffer != obuffer)
8622 UnlockReleaseBuffer(nbuffer);
8623 if (BufferIsValid(obuffer))
8624 UnlockReleaseBuffer(obuffer);
8625
8626 /*
8627 * If the new page is running low on free space, update the FSM as well.
8628 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8629 * better than that without knowing the fill-factor for the table.
8630 *
8631 * However, don't update the FSM on HOT updates, because after crash
8632 * recovery, either the old or the new tuple will certainly be dead and
8633 * prunable. After pruning, the page will have roughly as much free space
8634 * as it did before the update, assuming the new tuple is about the same
8635 * size as the old one.
8636 *
 * XXX: Don't do this if the page was restored from a full-page image. We
 * don't bother to update the FSM in that case; it doesn't need to be
 * totally accurate anyway.
8640 */
8641 if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
8642 XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
8643 }
8644
8645 static void
heap_xlog_confirm(XLogReaderState *record)
8647 {
8648 XLogRecPtr lsn = record->EndRecPtr;
8649 xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
8650 Buffer buffer;
8651 Page page;
8652 OffsetNumber offnum;
8653 ItemId lp = NULL;
8654 HeapTupleHeader htup;
8655
8656 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8657 {
8658 page = BufferGetPage(buffer);
8659
8660 offnum = xlrec->offnum;
8661 if (PageGetMaxOffsetNumber(page) >= offnum)
8662 lp = PageGetItemId(page, offnum);
8663
8664 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8665 elog(PANIC, "invalid lp");
8666
8667 htup = (HeapTupleHeader) PageGetItem(page, lp);
8668
8669 /*
8670 * Confirm tuple as actually inserted
8671 */
8672 ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
8673
8674 PageSetLSN(page, lsn);
8675 MarkBufferDirty(buffer);
8676 }
8677 if (BufferIsValid(buffer))
8678 UnlockReleaseBuffer(buffer);
8679 }
8680
8681 static void
heap_xlog_lock(XLogReaderState *record)
8683 {
8684 XLogRecPtr lsn = record->EndRecPtr;
8685 xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
8686 Buffer buffer;
8687 Page page;
8688 OffsetNumber offnum;
8689 ItemId lp = NULL;
8690 HeapTupleHeader htup;
8691
8692 /*
8693 * The visibility map may need to be fixed even if the heap page is
8694 * already up-to-date.
8695 */
8696 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8697 {
8698 RelFileNode rnode;
8699 Buffer vmbuffer = InvalidBuffer;
8700 BlockNumber block;
8701 Relation reln;
8702
8703 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8704 reln = CreateFakeRelcacheEntry(rnode);
8705
8706 visibilitymap_pin(reln, block, &vmbuffer);
8707 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8708
8709 ReleaseBuffer(vmbuffer);
8710 FreeFakeRelcacheEntry(reln);
8711 }
8712
8713 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8714 {
8715 page = (Page) BufferGetPage(buffer);
8716
8717 offnum = xlrec->offnum;
8718 if (PageGetMaxOffsetNumber(page) >= offnum)
8719 lp = PageGetItemId(page, offnum);
8720
8721 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8722 elog(PANIC, "invalid lp");
8723
8724 htup = (HeapTupleHeader) PageGetItem(page, lp);
8725
8726 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8727 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8728 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8729 &htup->t_infomask2);
8730
8731 /*
8732 * Clear relevant update flags, but only if the modified infomask says
8733 * there's no update.
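 *
 * A lock-only xmax means the tuple was never updated, so it must not
 * appear HOT-updated and its t_ctid should simply point back at the
 * tuple itself.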
8734 */
8735 if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
8736 {
8737 HeapTupleHeaderClearHotUpdated(htup);
8738 /* Make sure there is no forward chain link in t_ctid */
8739 ItemPointerSet(&htup->t_ctid,
8740 BufferGetBlockNumber(buffer),
8741 offnum);
8742 }
8743 HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
8744 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8745 PageSetLSN(page, lsn);
8746 MarkBufferDirty(buffer);
8747 }
8748 if (BufferIsValid(buffer))
8749 UnlockReleaseBuffer(buffer);
8750 }
8751
8752 static void
heap_xlog_lock_updated(XLogReaderState *record)
8754 {
8755 XLogRecPtr lsn = record->EndRecPtr;
8756 xl_heap_lock_updated *xlrec;
8757 Buffer buffer;
8758 Page page;
8759 OffsetNumber offnum;
8760 ItemId lp = NULL;
8761 HeapTupleHeader htup;
8762
8763 xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
8764
8765 /*
8766 * The visibility map may need to be fixed even if the heap page is
8767 * already up-to-date.
8768 */
8769 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8770 {
8771 RelFileNode rnode;
8772 Buffer vmbuffer = InvalidBuffer;
8773 BlockNumber block;
8774 Relation reln;
8775
8776 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8777 reln = CreateFakeRelcacheEntry(rnode);
8778
8779 visibilitymap_pin(reln, block, &vmbuffer);
8780 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8781
8782 ReleaseBuffer(vmbuffer);
8783 FreeFakeRelcacheEntry(reln);
8784 }
8785
8786 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8787 {
8788 page = BufferGetPage(buffer);
8789
8790 offnum = xlrec->offnum;
8791 if (PageGetMaxOffsetNumber(page) >= offnum)
8792 lp = PageGetItemId(page, offnum);
8793
8794 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8795 elog(PANIC, "invalid lp");
8796
8797 htup = (HeapTupleHeader) PageGetItem(page, lp);
8798
8799 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8800 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8801 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8802 &htup->t_infomask2);
8803 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8804
8805 PageSetLSN(page, lsn);
8806 MarkBufferDirty(buffer);
8807 }
8808 if (BufferIsValid(buffer))
8809 UnlockReleaseBuffer(buffer);
8810 }
8811
8812 static void
heap_xlog_inplace(XLogReaderState *record)
8814 {
8815 XLogRecPtr lsn = record->EndRecPtr;
8816 xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
8817 Buffer buffer;
8818 Page page;
8819 OffsetNumber offnum;
8820 ItemId lp = NULL;
8821 HeapTupleHeader htup;
8822 uint32 oldlen;
8823 Size newlen;
8824
8825 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8826 {
8827 char *newtup = XLogRecGetBlockData(record, 0, &newlen);
8828
8829 page = BufferGetPage(buffer);
8830
8831 offnum = xlrec->offnum;
8832 if (PageGetMaxOffsetNumber(page) >= offnum)
8833 lp = PageGetItemId(page, offnum);
8834
8835 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8836 elog(PANIC, "invalid lp");
8837
8838 htup = (HeapTupleHeader) PageGetItem(page, lp);
8839
8840 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
8841 if (oldlen != newlen)
8842 elog(PANIC, "wrong tuple length");
8843
8844 memcpy((char *) htup + htup->t_hoff, newtup, newlen);
8845
8846 PageSetLSN(page, lsn);
8847 MarkBufferDirty(buffer);
8848 }
8849 if (BufferIsValid(buffer))
8850 UnlockReleaseBuffer(buffer);
8851 }
8852
8853 void
heap_redo(XLogReaderState *record)
8855 {
8856 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8857
8858 /*
 * These operations don't overwrite MVCC data, so no conflict processing is
8860 * required. The ones in heap2 rmgr do.
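 * (For example, heap_xlog_clean() and heap_xlog_freeze_page() may have to
 * resolve recovery conflicts against standby queries first.)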
8861 */
8862
8863 switch (info & XLOG_HEAP_OPMASK)
8864 {
8865 case XLOG_HEAP_INSERT:
8866 heap_xlog_insert(record);
8867 break;
8868 case XLOG_HEAP_DELETE:
8869 heap_xlog_delete(record);
8870 break;
8871 case XLOG_HEAP_UPDATE:
8872 heap_xlog_update(record, false);
8873 break;
8874 case XLOG_HEAP_TRUNCATE:
8875
8876 /*
8877 * TRUNCATE is a no-op because the actions are already logged as
 * SMGR WAL records. The TRUNCATE WAL record only exists for logical
 * decoding.
8880 */
8881 break;
8882 case XLOG_HEAP_HOT_UPDATE:
8883 heap_xlog_update(record, true);
8884 break;
8885 case XLOG_HEAP_CONFIRM:
8886 heap_xlog_confirm(record);
8887 break;
8888 case XLOG_HEAP_LOCK:
8889 heap_xlog_lock(record);
8890 break;
8891 case XLOG_HEAP_INPLACE:
8892 heap_xlog_inplace(record);
8893 break;
8894 default:
8895 elog(PANIC, "heap_redo: unknown op code %u", info);
8896 }
8897 }
8898
8899 void
heap2_redo(XLogReaderState *record)
8901 {
8902 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8903
8904 switch (info & XLOG_HEAP_OPMASK)
8905 {
8906 case XLOG_HEAP2_CLEAN:
8907 heap_xlog_clean(record);
8908 break;
8909 case XLOG_HEAP2_FREEZE_PAGE:
8910 heap_xlog_freeze_page(record);
8911 break;
8912 case XLOG_HEAP2_CLEANUP_INFO:
8913 heap_xlog_cleanup_info(record);
8914 break;
8915 case XLOG_HEAP2_VISIBLE:
8916 heap_xlog_visible(record);
8917 break;
8918 case XLOG_HEAP2_MULTI_INSERT:
8919 heap_xlog_multi_insert(record);
8920 break;
8921 case XLOG_HEAP2_LOCK_UPDATED:
8922 heap_xlog_lock_updated(record);
8923 break;
8924 case XLOG_HEAP2_NEW_CID:
8925
8926 /*
 * Nothing to do on a real replay; this record is only used during
 * logical decoding.
8929 */
8930 break;
8931 case XLOG_HEAP2_REWRITE:
8932 heap_xlog_logical_rewrite(record);
8933 break;
8934 default:
8935 elog(PANIC, "heap2_redo: unknown op code %u", info);
8936 }
8937 }
8938
8939 /*
8940 * Mask a heap page before performing consistency checks on it.
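 *
 * This is the masking callback used by WAL consistency checking (the
 * wal_consistency_checking GUC): fields that can legitimately differ
 * between primary and standby (hint bits, unused space, command ids and
 * so on) are blanked out before full-page images are compared.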
8941 */
8942 void
heap_mask(char *pagedata, BlockNumber blkno)
8944 {
8945 Page page = (Page) pagedata;
8946 OffsetNumber off;
8947
8948 mask_page_lsn_and_checksum(page);
8949
8950 mask_page_hint_bits(page);
8951 mask_unused_space(page);
8952
8953 for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
8954 {
8955 ItemId iid = PageGetItemId(page, off);
8956 char *page_item;
8957
8958 page_item = (char *) (page + ItemIdGetOffset(iid));
8959
8960 if (ItemIdIsNormal(iid))
8961 {
8962 HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
8963
8964 /*
8965 * If xmin of a tuple is not yet frozen, we should ignore
8966 * differences in hint bits, since they can be set without
8967 * emitting WAL.
8968 */
8969 if (!HeapTupleHeaderXminFrozen(page_htup))
8970 page_htup->t_infomask &= ~HEAP_XACT_MASK;
8971 else
8972 {
/* We still need to mask the xmax hint bits, though. */
8974 page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
8975 page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
8976 }
8977
8978 /*
8979 * During replay, we set Command Id to FirstCommandId. Hence, mask
8980 * it. See heap_xlog_insert() for details.
8981 */
8982 page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
8983
8984 /*
8985 * For a speculative tuple, heap_insert() does not set ctid in the
8986 * caller-passed heap tuple itself, leaving the ctid field to
8987 * contain a speculative token value - a per-backend monotonically
8988 * increasing identifier. Besides, it does not WAL-log ctid under
8989 * any circumstances.
8990 *
8991 * During redo, heap_xlog_insert() sets t_ctid to current block
8992 * number and self offset number. It doesn't care about any
8993 * speculative insertions in master. Hence, we set t_ctid to
8994 * current block number and self offset number to ignore any
8995 * inconsistency.
8996 */
8997 if (HeapTupleHeaderIsSpeculative(page_htup))
8998 ItemPointerSet(&page_htup->t_ctid, blkno, off);
8999
9000 /*
9001 * NB: Not ignoring ctid changes due to the tuple having moved
9002 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
9003 * important information that needs to be in-sync between primary
9004 * and standby, and thus is WAL logged.
9005 */
9006 }
9007
9008 /*
9009 * Ignore any padding bytes after the tuple, when the length of the
9010 * item is not MAXALIGNed.
9011 */
9012 if (ItemIdHasStorage(iid))
9013 {
9014 int len = ItemIdGetLength(iid);
9015 int padlen = MAXALIGN(len) - len;
9016
9017 if (padlen > 0)
9018 memset(page_item + len, MASK_MARKER, padlen);
9019 }
9020 }
9021 }
9022
9023 /*
9024 * HeapCheckForSerializableConflictOut
9025 * We are reading a tuple. If it's not visible, there may be a
9026 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9027 * but has been deleted, there may be a rw-conflict out with the deleter.
9028 *
9029 * We will determine the top level xid of the writing transaction with which
9030 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9031 * for overlap with our own transaction.
9032 *
9033 * This function should be called just about anywhere in heapam.c where a
9034 * tuple has been read. The caller must hold at least a shared lock on the
9035 * buffer, because this function might set hint bits on the tuple. There is
9036 * currently no known reason to call this function from an index AM.
9037 */
9038 void
HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9040 HeapTuple tuple, Buffer buffer,
9041 Snapshot snapshot)
9042 {
9043 TransactionId xid;
9044 HTSV_Result htsvResult;
9045
9046 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9047 return;
9048
9049 /*
9050 * Check to see whether the tuple has been written to by a concurrent
9051 * transaction, either to create it not visible to us, or to delete it
9052 * while it is visible to us. The "visible" bool indicates whether the
9053 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9054 * is going on with it.
9055 *
9056 * In the event of a concurrently inserted tuple that also happens to have
9057 * been concurrently updated (by a separate transaction), the xmin of the
9058 * tuple will be used -- not the updater's xid.
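 * (That corresponds to the HEAPTUPLE_INSERT_IN_PROGRESS branch of the
 * switch below, which always reports the inserter's xmin.)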
9059 */
9060 htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
9061 switch (htsvResult)
9062 {
9063 case HEAPTUPLE_LIVE:
9064 if (visible)
9065 return;
9066 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9067 break;
9068 case HEAPTUPLE_RECENTLY_DEAD:
9069 case HEAPTUPLE_DELETE_IN_PROGRESS:
9070 if (visible)
9071 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9072 else
9073 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9074
9075 if (TransactionIdPrecedes(xid, TransactionXmin))
9076 {
9077 /* This is like the HEAPTUPLE_DEAD case */
9078 Assert(!visible);
9079 return;
9080 }
9081 break;
9082 case HEAPTUPLE_INSERT_IN_PROGRESS:
9083 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9084 break;
9085 case HEAPTUPLE_DEAD:
9086 Assert(!visible);
9087 return;
9088 default:
9089
9090 /*
9091 * The only way to get to this default clause is if a new value is
9092 * added to the enum type without adding it to this switch
9093 * statement. That's a bug, so elog.
9094 */
9095 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9096
9097 /*
9098 * In spite of having all enum values covered and calling elog on
9099 * this default, some compilers think this is a code path which
9100 * allows xid to be used below without initialization. Silence
9101 * that warning.
9102 */
9103 xid = InvalidTransactionId;
9104 }
9105
9106 Assert(TransactionIdIsValid(xid));
9107 Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
9108
9109 /*
9110 * Find top level xid. Bail out if xid is too early to be a conflict, or
9111 * if it's our own xid.
9112 */
9113 if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
9114 return;
9115 xid = SubTransGetTopmostTransaction(xid);
9116 if (TransactionIdPrecedes(xid, TransactionXmin))
9117 return;
9118
9119 CheckForSerializableConflictOut(relation, xid, snapshot);
9120 }
9121