1 /*-------------------------------------------------------------------------
2 *
3 * heapam.c
4 * heap access method code
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam.c
12 *
13 *
14 * INTERFACE ROUTINES
15 * relation_open - open any relation by relation OID
16 * relation_openrv - open any relation specified by a RangeVar
17 * relation_close - close any relation
18 * heap_open - open a heap relation by relation OID
19 * heap_openrv - open a heap relation specified by a RangeVar
20 * heap_close - (now just a macro for relation_close)
21 * heap_beginscan - begin relation scan
22 * heap_rescan - restart a relation scan
23 * heap_endscan - end relation scan
24 * heap_getnext - retrieve next tuple in scan
25 * heap_fetch - retrieve tuple with given tid
26 * heap_insert - insert tuple into a relation
27 * heap_multi_insert - insert multiple tuples into a relation
28 * heap_delete - delete a tuple from a relation
29 * heap_update - replace a tuple in a relation with another tuple
30 * heap_sync - sync heap, for when no WAL has been written
31 *
32 * NOTES
33 * This file contains the heap_ routines which implement
34 * the POSTGRES heap access method used for all POSTGRES
35 * relations.
36 *
37 *-------------------------------------------------------------------------
38 */
39 #include "postgres.h"
40
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "catalog/index.h"
60 #include "miscadmin.h"
61 #include "pgstat.h"
62 #include "port/atomics.h"
63 #include "storage/bufmgr.h"
64 #include "storage/freespace.h"
65 #include "storage/lmgr.h"
66 #include "storage/predicate.h"
67 #include "storage/procarray.h"
68 #include "storage/smgr.h"
69 #include "storage/spin.h"
70 #include "storage/standby.h"
71 #include "utils/datum.h"
72 #include "utils/inval.h"
73 #include "utils/lsyscache.h"
74 #include "utils/relcache.h"
75 #include "utils/snapmgr.h"
76 #include "utils/syscache.h"
77 #include "utils/tqual.h"
78 #include "utils/memutils.h"
79 #include "nodes/execnodes.h"
80 #include "executor/executor.h"
81
82 /* GUC variable */
83 bool synchronize_seqscans = true;
84
85
86 static HeapScanDesc heap_beginscan_internal(Relation relation,
87 Snapshot snapshot,
88 int nkeys, ScanKey key,
89 ParallelHeapScanDesc parallel_scan,
90 bool allow_strat,
91 bool allow_sync,
92 bool allow_pagemode,
93 bool is_bitmapscan,
94 bool is_samplescan,
95 bool temp_snap);
96 static void heap_parallelscan_startblock_init(HeapScanDesc scan);
97 static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan);
98 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
99 TransactionId xid, CommandId cid, int options);
100 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
101 Buffer newbuf, HeapTuple oldtup,
102 HeapTuple newtup, HeapTuple old_key_tup,
103 bool all_visible_cleared, bool new_all_visible_cleared);
104 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
105 Bitmapset *interesting_cols,
106 HeapTuple oldtup, HeapTuple newtup);
107 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
108 LockTupleMode mode, LockWaitPolicy wait_policy,
109 bool *have_tuple_lock);
110 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
111 uint16 old_infomask2, TransactionId add_to_xmax,
112 LockTupleMode mode, bool is_update,
113 TransactionId *result_xmax, uint16 *result_infomask,
114 uint16 *result_infomask2);
115 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
116 ItemPointer ctid, TransactionId xid,
117 LockTupleMode mode);
118 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
119 uint16 *new_infomask2);
120 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
121 uint16 t_infomask);
122 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
123 LockTupleMode lockmode, bool *current_is_member);
124 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
125 Relation rel, ItemPointer ctid, XLTW_Oper oper,
126 int *remaining);
127 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
128 uint16 infomask, Relation rel, int *remaining);
129 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
130 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
131 bool *copy);
132 static bool ProjIndexIsUnchanged(Relation relation, HeapTuple oldtup, HeapTuple newtup);
133
134
135 /*
136 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
137 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
138 * update them). This table (and the macros below) helps us determine the
139 * heavyweight lock mode and MultiXactStatus values to use for any particular
140 * tuple lock strength.
141 *
142 * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
143 * instead.
144 */
145 static const struct
146 {
147 LOCKMODE hwlock;
148 int lockstatus;
149 int updstatus;
150 }
151
152 tupleLockExtraInfo[MaxLockTupleMode + 1] =
153 {
154 { /* LockTupleKeyShare */
155 AccessShareLock,
156 MultiXactStatusForKeyShare,
157 -1 /* KeyShare does not allow updating tuples */
158 },
159 { /* LockTupleShare */
160 RowShareLock,
161 MultiXactStatusForShare,
162 -1 /* Share does not allow updating tuples */
163 },
164 { /* LockTupleNoKeyExclusive */
165 ExclusiveLock,
166 MultiXactStatusForNoKeyUpdate,
167 MultiXactStatusNoKeyUpdate
168 },
169 { /* LockTupleExclusive */
170 AccessExclusiveLock,
171 MultiXactStatusForUpdate,
172 MultiXactStatusUpdate
173 }
174 };
175
176 /* Get the LOCKMODE for a given MultiXactStatus */
177 #define LOCKMODE_from_mxstatus(status) \
178 (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
179
180 /*
181 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
182 * This is more readable than having every caller translate it to lock.h's
183 * LOCKMODE.
184 */
185 #define LockTupleTuplock(rel, tup, mode) \
186 LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
187 #define UnlockTupleTuplock(rel, tup, mode) \
188 UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
189 #define ConditionalLockTupleTuplock(rel, tup, mode) \
190 ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
191
192 /*
193 * This table maps tuple lock strength values for each particular
194 * MultiXactStatus value.
195 */
196 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
197 {
198 LockTupleKeyShare, /* ForKeyShare */
199 LockTupleShare, /* ForShare */
200 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
201 LockTupleExclusive, /* ForUpdate */
202 LockTupleNoKeyExclusive, /* NoKeyUpdate */
203 LockTupleExclusive /* Update */
204 };
205
206 /* Get the LockTupleMode for a given MultiXactStatus */
207 #define TUPLOCK_from_mxstatus(status) \
208 (MultiXactStatusLock[(status)])
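/*
 * For example (a minimal sketch, not used below): starting from
 * MultiXactStatusForShare, the two tables above give LockTupleShare and
 * RowShareLock respectively:
 *
 *		MultiXactStatus status = MultiXactStatusForShare;
 *		LockTupleMode	mode   = TUPLOCK_from_mxstatus(status);
 *		LOCKMODE		hwlock = LOCKMODE_from_mxstatus(status);
 */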
209
210 /* ----------------------------------------------------------------
211 * heap support routines
212 * ----------------------------------------------------------------
213 */
214
215 /* ----------------
216 * initscan - scan code common to heap_beginscan and heap_rescan
217 * ----------------
218 */
219 static void
220 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
221 {
222 bool allow_strat;
223 bool allow_sync;
224
225 /*
226 * Determine the number of blocks we have to scan.
227 *
228 * It is sufficient to do this once at scan start, since any tuples added
229 * while the scan is in progress will be invisible to my snapshot anyway.
230 * (That is not true when using a non-MVCC snapshot. However, we couldn't
231 * guarantee to return tuples added after scan start anyway, since they
232 * might go into pages we already scanned. To guarantee consistent
233 * results for a non-MVCC snapshot, the caller must hold some higher-level
234 * lock that ensures the interesting tuple(s) won't change.)
235 */
236 if (scan->rs_parallel != NULL)
237 scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
238 else
239 scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
240
241 /*
242 * If the table is large relative to NBuffers, use a bulk-read access
243 * strategy and enable synchronized scanning (see syncscan.c). Although
244 * the thresholds for these features could be different, we make them the
245 * same so that there are only two behaviors to tune rather than four.
246 * (However, some callers need to be able to disable one or both of these
247 * behaviors, independently of the size of the table; also there is a GUC
248 * variable that can disable synchronized scanning.)
249 *
250 * Note that heap_parallelscan_initialize has a very similar test; if you
251 * change this, consider changing that one, too.
252 */
253 if (!RelationUsesLocalBuffers(scan->rs_rd) &&
254 scan->rs_nblocks > NBuffers / 4)
255 {
256 allow_strat = scan->rs_allow_strat;
257 allow_sync = scan->rs_allow_sync;
258 }
259 else
260 allow_strat = allow_sync = false;
261
262 if (allow_strat)
263 {
264 /* During a rescan, keep the previous strategy object. */
265 if (scan->rs_strategy == NULL)
266 scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
267 }
268 else
269 {
270 if (scan->rs_strategy != NULL)
271 FreeAccessStrategy(scan->rs_strategy);
272 scan->rs_strategy = NULL;
273 }
274
275 if (scan->rs_parallel != NULL)
276 {
277 /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
278 scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
279 }
280 else if (keep_startblock)
281 {
282 /*
283 * When rescanning, we want to keep the previous startblock setting,
284 * so that rewinding a cursor doesn't generate surprising results.
285 * Reset the active syncscan setting, though.
286 */
287 scan->rs_syncscan = (allow_sync && synchronize_seqscans);
288 }
289 else if (allow_sync && synchronize_seqscans)
290 {
291 scan->rs_syncscan = true;
292 scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
293 }
294 else
295 {
296 scan->rs_syncscan = false;
297 scan->rs_startblock = 0;
298 }
299
300 scan->rs_numblocks = InvalidBlockNumber;
301 scan->rs_inited = false;
302 scan->rs_ctup.t_data = NULL;
303 ItemPointerSetInvalid(&scan->rs_ctup.t_self);
304 scan->rs_cbuf = InvalidBuffer;
305 scan->rs_cblock = InvalidBlockNumber;
306
307 /* page-at-a-time fields are always invalid when not rs_inited */
308
309 /*
310 * copy the scan key, if appropriate
311 */
312 if (key != NULL)
313 memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
314
315 /*
316 * Currently, we don't have a stats counter for bitmap heap scans (but the
317 * underlying bitmap index scans will be counted) or sample scans (we only
318 * update stats for tuple fetches there)
319 */
320 if (!scan->rs_bitmapscan && !scan->rs_samplescan)
321 pgstat_count_heap_scan(scan->rs_rd);
322 }
323
324 /*
325 * heap_setscanlimits - restrict range of a heapscan
326 *
327 * startBlk is the page to start at
328 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
329 */
330 void
331 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
332 {
333 Assert(!scan->rs_inited); /* else too late to change */
334 Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
335
336 /* Check startBlk is valid (but allow case of zero blocks...) */
337 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
338
339 scan->rs_startblock = startBlk;
340 scan->rs_numblocks = numBlks;
341 }
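/*
 * A minimal usage sketch (assuming "rel" is an already-opened heap relation
 * and "snapshot" is an MVCC snapshot): the limits must be installed before
 * the first fetch, and syncscan must be disabled so that rs_startblock stays
 * where we put it:
 *
 *		HeapScanDesc scan = heap_beginscan_strat(rel, snapshot, 0, NULL,
 *												 true, false);
 *		heap_setscanlimits(scan, 10, 5);		scans only blocks 10..14
 *		while (heap_getnext(scan, ForwardScanDirection) != NULL)
 *			...
 *		heap_endscan(scan);
 */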
342
343 /*
344 * heapgetpage - subroutine for heapgettup()
345 *
346 * This routine reads and pins the specified page of the relation.
347 * In page-at-a-time mode it performs additional work, namely determining
348 * which tuples on the page are visible.
349 */
350 void
351 heapgetpage(HeapScanDesc scan, BlockNumber page)
352 {
353 Buffer buffer;
354 Snapshot snapshot;
355 Page dp;
356 int lines;
357 int ntup;
358 OffsetNumber lineoff;
359 ItemId lpp;
360 bool all_visible;
361
362 Assert(page < scan->rs_nblocks);
363
364 /* release previous scan buffer, if any */
365 if (BufferIsValid(scan->rs_cbuf))
366 {
367 ReleaseBuffer(scan->rs_cbuf);
368 scan->rs_cbuf = InvalidBuffer;
369 }
370
371 /*
372 * Be sure to check for interrupts at least once per page. Checks at
373 * higher code levels won't be able to stop a seqscan that encounters many
374 * pages' worth of consecutive dead tuples.
375 */
376 CHECK_FOR_INTERRUPTS();
377
378 /* read page using selected strategy */
379 scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
380 RBM_NORMAL, scan->rs_strategy);
381 scan->rs_cblock = page;
382
383 if (!scan->rs_pageatatime)
384 return;
385
386 buffer = scan->rs_cbuf;
387 snapshot = scan->rs_snapshot;
388
389 /*
390 * Prune and repair fragmentation for the whole page, if possible.
391 */
392 heap_page_prune_opt(scan->rs_rd, buffer);
393
394 /*
395 * We must hold share lock on the buffer content while examining tuple
396 * visibility. Afterwards, however, the tuples we have found to be
397 * visible are guaranteed good as long as we hold the buffer pin.
398 */
399 LockBuffer(buffer, BUFFER_LOCK_SHARE);
400
401 dp = BufferGetPage(buffer);
402 TestForOldSnapshot(snapshot, scan->rs_rd, dp);
403 lines = PageGetMaxOffsetNumber(dp);
404 ntup = 0;
405
406 /*
407 * If the all-visible flag indicates that all tuples on the page are
408 * visible to everyone, we can skip the per-tuple visibility tests.
409 *
410 * Note: In hot standby, a tuple that's already visible to all
411 * transactions in the master might still be invisible to a read-only
412 * transaction in the standby. We partly handle this problem by tracking
413 * the minimum xmin of visible tuples as the cut-off XID while marking a
414 * page all-visible on master and WAL log that along with the visibility
415 * map SET operation. In hot standby, we wait for (or abort) all
416 * transactions that could potentially fail to see one or more tuples on the
417 * page. That's how index-only scans work fine in hot standby. A crucial
418 * difference between index-only scans and heap scans is that the
419 * index-only scan completely relies on the visibility map, whereas a heap
420 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
421 * the page-level flag can be trusted in the same way, because it might
422 * get propagated somehow without being explicitly WAL-logged, e.g. via a
423 * full page write. Until we can prove that beyond doubt, let's check each
424 * tuple for visibility the hard way.
425 */
426 all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
427
428 for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
429 lineoff <= lines;
430 lineoff++, lpp++)
431 {
432 if (ItemIdIsNormal(lpp))
433 {
434 HeapTupleData loctup;
435 bool valid;
436
437 loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
438 loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
439 loctup.t_len = ItemIdGetLength(lpp);
440 ItemPointerSet(&(loctup.t_self), page, lineoff);
441
442 if (all_visible)
443 valid = true;
444 else
445 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
446
447 CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
448 buffer, snapshot);
449
450 if (valid)
451 scan->rs_vistuples[ntup++] = lineoff;
452 }
453 }
454
455 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
456
457 Assert(ntup <= MaxHeapTuplesPerPage);
458 scan->rs_ntuples = ntup;
459 }
460
461 /* ----------------
462 * heapgettup - fetch next heap tuple
463 *
464 * Initialize the scan if not already done; then advance to the next
465 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
466 * or set scan->rs_ctup.t_data = NULL if no more tuples.
467 *
468 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
469 * by scan->rs_ctup".
470 *
471 * Note: the reason nkeys/key are passed separately, even though they are
472 * kept in the scan descriptor, is that the caller may not want us to check
473 * the scankeys.
474 *
475 * Note: when we fall off the end of the scan in either direction, we
476 * reset rs_inited. This means that a further request with the same
477 * scan direction will restart the scan, which is a bit odd, but a
478 * request with the opposite scan direction will start a fresh scan
479 * in the proper direction. The latter is required behavior for cursors,
480 * while the former case is generally undefined behavior in Postgres
481 * so we don't care too much.
482 * ----------------
483 */
484 static void
485 heapgettup(HeapScanDesc scan,
486 ScanDirection dir,
487 int nkeys,
488 ScanKey key)
489 {
490 HeapTuple tuple = &(scan->rs_ctup);
491 Snapshot snapshot = scan->rs_snapshot;
492 bool backward = ScanDirectionIsBackward(dir);
493 BlockNumber page;
494 bool finished;
495 Page dp;
496 int lines;
497 OffsetNumber lineoff;
498 int linesleft;
499 ItemId lpp;
500
501 /*
502 * calculate next starting lineoff, given scan direction
503 */
504 if (ScanDirectionIsForward(dir))
505 {
506 if (!scan->rs_inited)
507 {
508 /*
509 * return null immediately if relation is empty
510 */
511 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
512 {
513 Assert(!BufferIsValid(scan->rs_cbuf));
514 tuple->t_data = NULL;
515 return;
516 }
517 if (scan->rs_parallel != NULL)
518 {
519 heap_parallelscan_startblock_init(scan);
520
521 page = heap_parallelscan_nextpage(scan);
522
523 /* Other processes might have already finished the scan. */
524 if (page == InvalidBlockNumber)
525 {
526 Assert(!BufferIsValid(scan->rs_cbuf));
527 tuple->t_data = NULL;
528 return;
529 }
530 }
531 else
532 page = scan->rs_startblock; /* first page */
533 heapgetpage(scan, page);
534 lineoff = FirstOffsetNumber; /* first offnum */
535 scan->rs_inited = true;
536 }
537 else
538 {
539 /* continue from previously returned page/tuple */
540 page = scan->rs_cblock; /* current page */
541 lineoff = /* next offnum */
542 OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
543 }
544
545 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
546
547 dp = BufferGetPage(scan->rs_cbuf);
548 TestForOldSnapshot(snapshot, scan->rs_rd, dp);
549 lines = PageGetMaxOffsetNumber(dp);
550 /* page and lineoff now reference the physically next tid */
551
552 linesleft = lines - lineoff + 1;
553 }
554 else if (backward)
555 {
556 /* backward parallel scan not supported */
557 Assert(scan->rs_parallel == NULL);
558
559 if (!scan->rs_inited)
560 {
561 /*
562 * return null immediately if relation is empty
563 */
564 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
565 {
566 Assert(!BufferIsValid(scan->rs_cbuf));
567 tuple->t_data = NULL;
568 return;
569 }
570
571 /*
572 * Disable reporting to syncscan logic in a backwards scan; it's
573 * not very likely anyone else is doing the same thing at the same
574 * time, and much more likely that we'll just bollix things for
575 * forward scanners.
576 */
577 scan->rs_syncscan = false;
578
579 /*
580 * Start from last page of the scan. Ensure we take into account
581 * rs_numblocks if it's been adjusted by heap_setscanlimits().
582 */
583 if (scan->rs_numblocks != InvalidBlockNumber)
584 page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
585 else if (scan->rs_startblock > 0)
586 page = scan->rs_startblock - 1;
587 else
588 page = scan->rs_nblocks - 1;
589 heapgetpage(scan, page);
590 }
591 else
592 {
593 /* continue from previously returned page/tuple */
594 page = scan->rs_cblock; /* current page */
595 }
596
597 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
598
599 dp = BufferGetPage(scan->rs_cbuf);
600 TestForOldSnapshot(snapshot, scan->rs_rd, dp);
601 lines = PageGetMaxOffsetNumber(dp);
602
603 if (!scan->rs_inited)
604 {
605 lineoff = lines; /* final offnum */
606 scan->rs_inited = true;
607 }
608 else
609 {
610 lineoff = /* previous offnum */
611 OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
612 }
613 /* page and lineoff now reference the physically previous tid */
614
615 linesleft = lineoff;
616 }
617 else
618 {
619 /*
620 * ``no movement'' scan direction: refetch prior tuple
621 */
622 if (!scan->rs_inited)
623 {
624 Assert(!BufferIsValid(scan->rs_cbuf));
625 tuple->t_data = NULL;
626 return;
627 }
628
629 page = ItemPointerGetBlockNumber(&(tuple->t_self));
630 if (page != scan->rs_cblock)
631 heapgetpage(scan, page);
632
633 /* Since the tuple was previously fetched, needn't lock page here */
634 dp = BufferGetPage(scan->rs_cbuf);
635 TestForOldSnapshot(snapshot, scan->rs_rd, dp);
636 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
637 lpp = PageGetItemId(dp, lineoff);
638 Assert(ItemIdIsNormal(lpp));
639
640 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
641 tuple->t_len = ItemIdGetLength(lpp);
642
643 return;
644 }
645
646 /*
647 * advance the scan until we find a qualifying tuple or run out of stuff
648 * to scan
649 */
650 lpp = PageGetItemId(dp, lineoff);
651 for (;;)
652 {
653 while (linesleft > 0)
654 {
655 if (ItemIdIsNormal(lpp))
656 {
657 bool valid;
658
659 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
660 tuple->t_len = ItemIdGetLength(lpp);
661 ItemPointerSet(&(tuple->t_self), page, lineoff);
662
663 /*
664 * if current tuple qualifies, return it.
665 */
666 valid = HeapTupleSatisfiesVisibility(tuple,
667 snapshot,
668 scan->rs_cbuf);
669
670 CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
671 scan->rs_cbuf, snapshot);
672
673 if (valid && key != NULL)
674 HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
675 nkeys, key, valid);
676
677 if (valid)
678 {
679 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
680 return;
681 }
682 }
683
684 /*
685 * otherwise move to the next item on the page
686 */
687 --linesleft;
688 if (backward)
689 {
690 --lpp; /* move back in this page's ItemId array */
691 --lineoff;
692 }
693 else
694 {
695 ++lpp; /* move forward in this page's ItemId array */
696 ++lineoff;
697 }
698 }
699
700 /*
701 * if we get here, it means we've exhausted the items on this page and
702 * it's time to move to the next.
703 */
704 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
705
706 /*
707 * advance to next/prior page and detect end of scan
708 */
709 if (backward)
710 {
711 finished = (page == scan->rs_startblock) ||
712 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
713 if (page == 0)
714 page = scan->rs_nblocks;
715 page--;
716 }
717 else if (scan->rs_parallel != NULL)
718 {
719 page = heap_parallelscan_nextpage(scan);
720 finished = (page == InvalidBlockNumber);
721 }
722 else
723 {
724 page++;
725 if (page >= scan->rs_nblocks)
726 page = 0;
727 finished = (page == scan->rs_startblock) ||
728 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
729
730 /*
731 * Report our new scan position for synchronization purposes. We
732 * don't do that when moving backwards, however. That would just
733 * mess up any other forward-moving scanners.
734 *
735 * Note: we do this before checking for end of scan so that the
736 * final state of the position hint is back at the start of the
737 * rel. That's not strictly necessary, but otherwise when you run
738 * the same query multiple times the starting position would shift
739 * a little bit backwards on every invocation, which is confusing.
740 * We don't guarantee any specific ordering in general, though.
741 */
742 if (scan->rs_syncscan)
743 ss_report_location(scan->rs_rd, page);
744 }
745
746 /*
747 * return NULL if we've exhausted all the pages
748 */
749 if (finished)
750 {
751 if (BufferIsValid(scan->rs_cbuf))
752 ReleaseBuffer(scan->rs_cbuf);
753 scan->rs_cbuf = InvalidBuffer;
754 scan->rs_cblock = InvalidBlockNumber;
755 tuple->t_data = NULL;
756 scan->rs_inited = false;
757 return;
758 }
759
760 heapgetpage(scan, page);
761
762 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
763
764 dp = BufferGetPage(scan->rs_cbuf);
765 TestForOldSnapshot(snapshot, scan->rs_rd, dp);
766 lines = PageGetMaxOffsetNumber((Page) dp);
767 linesleft = lines;
768 if (backward)
769 {
770 lineoff = lines;
771 lpp = PageGetItemId(dp, lines);
772 }
773 else
774 {
775 lineoff = FirstOffsetNumber;
776 lpp = PageGetItemId(dp, FirstOffsetNumber);
777 }
778 }
779 }
780
781 /* ----------------
782 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
783 *
784 * Same API as heapgettup, but used in page-at-a-time mode
785 *
786 * The internal logic is much the same as heapgettup's too, but there are some
787 * differences: we do not take the buffer content lock (that only needs to
788 * happen inside heapgetpage), and we iterate through just the tuples listed
789 * in rs_vistuples[] rather than all tuples on the page. Notice that
790 * lineindex is 0-based, where the corresponding loop variable lineoff in
791 * heapgettup is 1-based.
792 * ----------------
793 */
794 static void
795 heapgettup_pagemode(HeapScanDesc scan,
796 ScanDirection dir,
797 int nkeys,
798 ScanKey key)
799 {
800 HeapTuple tuple = &(scan->rs_ctup);
801 bool backward = ScanDirectionIsBackward(dir);
802 BlockNumber page;
803 bool finished;
804 Page dp;
805 int lines;
806 int lineindex;
807 OffsetNumber lineoff;
808 int linesleft;
809 ItemId lpp;
810
811 /*
812 * calculate next starting lineindex, given scan direction
813 */
814 if (ScanDirectionIsForward(dir))
815 {
816 if (!scan->rs_inited)
817 {
818 /*
819 * return null immediately if relation is empty
820 */
821 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
822 {
823 Assert(!BufferIsValid(scan->rs_cbuf));
824 tuple->t_data = NULL;
825 return;
826 }
827 if (scan->rs_parallel != NULL)
828 {
829 heap_parallelscan_startblock_init(scan);
830
831 page = heap_parallelscan_nextpage(scan);
832
833 /* Other processes might have already finished the scan. */
834 if (page == InvalidBlockNumber)
835 {
836 Assert(!BufferIsValid(scan->rs_cbuf));
837 tuple->t_data = NULL;
838 return;
839 }
840 }
841 else
842 page = scan->rs_startblock; /* first page */
843 heapgetpage(scan, page);
844 lineindex = 0;
845 scan->rs_inited = true;
846 }
847 else
848 {
849 /* continue from previously returned page/tuple */
850 page = scan->rs_cblock; /* current page */
851 lineindex = scan->rs_cindex + 1;
852 }
853
854 dp = BufferGetPage(scan->rs_cbuf);
855 TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
856 lines = scan->rs_ntuples;
857 /* page and lineindex now reference the next visible tid */
858
859 linesleft = lines - lineindex;
860 }
861 else if (backward)
862 {
863 /* backward parallel scan not supported */
864 Assert(scan->rs_parallel == NULL);
865
866 if (!scan->rs_inited)
867 {
868 /*
869 * return null immediately if relation is empty
870 */
871 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
872 {
873 Assert(!BufferIsValid(scan->rs_cbuf));
874 tuple->t_data = NULL;
875 return;
876 }
877
878 /*
879 * Disable reporting to syncscan logic in a backwards scan; it's
880 * not very likely anyone else is doing the same thing at the same
881 * time, and much more likely that we'll just bollix things for
882 * forward scanners.
883 */
884 scan->rs_syncscan = false;
885
886 /*
887 * Start from last page of the scan. Ensure we take into account
888 * rs_numblocks if it's been adjusted by heap_setscanlimits().
889 */
890 if (scan->rs_numblocks != InvalidBlockNumber)
891 page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
892 else if (scan->rs_startblock > 0)
893 page = scan->rs_startblock - 1;
894 else
895 page = scan->rs_nblocks - 1;
896 heapgetpage(scan, page);
897 }
898 else
899 {
900 /* continue from previously returned page/tuple */
901 page = scan->rs_cblock; /* current page */
902 }
903
904 dp = BufferGetPage(scan->rs_cbuf);
905 TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
906 lines = scan->rs_ntuples;
907
908 if (!scan->rs_inited)
909 {
910 lineindex = lines - 1;
911 scan->rs_inited = true;
912 }
913 else
914 {
915 lineindex = scan->rs_cindex - 1;
916 }
917 /* page and lineindex now reference the previous visible tid */
918
919 linesleft = lineindex + 1;
920 }
921 else
922 {
923 /*
924 * ``no movement'' scan direction: refetch prior tuple
925 */
926 if (!scan->rs_inited)
927 {
928 Assert(!BufferIsValid(scan->rs_cbuf));
929 tuple->t_data = NULL;
930 return;
931 }
932
933 page = ItemPointerGetBlockNumber(&(tuple->t_self));
934 if (page != scan->rs_cblock)
935 heapgetpage(scan, page);
936
937 /* Since the tuple was previously fetched, needn't lock page here */
938 dp = BufferGetPage(scan->rs_cbuf);
939 TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
940 lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
941 lpp = PageGetItemId(dp, lineoff);
942 Assert(ItemIdIsNormal(lpp));
943
944 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
945 tuple->t_len = ItemIdGetLength(lpp);
946
947 /* check that rs_cindex is in sync */
948 Assert(scan->rs_cindex < scan->rs_ntuples);
949 Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
950
951 return;
952 }
953
954 /*
955 * advance the scan until we find a qualifying tuple or run out of stuff
956 * to scan
957 */
958 for (;;)
959 {
960 while (linesleft > 0)
961 {
962 lineoff = scan->rs_vistuples[lineindex];
963 lpp = PageGetItemId(dp, lineoff);
964 Assert(ItemIdIsNormal(lpp));
965
966 tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
967 tuple->t_len = ItemIdGetLength(lpp);
968 ItemPointerSet(&(tuple->t_self), page, lineoff);
969
970 /*
971 * if current tuple qualifies, return it.
972 */
973 if (key != NULL)
974 {
975 bool valid;
976
977 HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
978 nkeys, key, valid);
979 if (valid)
980 {
981 scan->rs_cindex = lineindex;
982 return;
983 }
984 }
985 else
986 {
987 scan->rs_cindex = lineindex;
988 return;
989 }
990
991 /*
992 * otherwise move to the next item on the page
993 */
994 --linesleft;
995 if (backward)
996 --lineindex;
997 else
998 ++lineindex;
999 }
1000
1001 /*
1002 * if we get here, it means we've exhausted the items on this page and
1003 * it's time to move to the next.
1004 */
1005 if (backward)
1006 {
1007 finished = (page == scan->rs_startblock) ||
1008 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1009 if (page == 0)
1010 page = scan->rs_nblocks;
1011 page--;
1012 }
1013 else if (scan->rs_parallel != NULL)
1014 {
1015 page = heap_parallelscan_nextpage(scan);
1016 finished = (page == InvalidBlockNumber);
1017 }
1018 else
1019 {
1020 page++;
1021 if (page >= scan->rs_nblocks)
1022 page = 0;
1023 finished = (page == scan->rs_startblock) ||
1024 (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1025
1026 /*
1027 * Report our new scan position for synchronization purposes. We
1028 * don't do that when moving backwards, however. That would just
1029 * mess up any other forward-moving scanners.
1030 *
1031 * Note: we do this before checking for end of scan so that the
1032 * final state of the position hint is back at the start of the
1033 * rel. That's not strictly necessary, but otherwise when you run
1034 * the same query multiple times the starting position would shift
1035 * a little bit backwards on every invocation, which is confusing.
1036 * We don't guarantee any specific ordering in general, though.
1037 */
1038 if (scan->rs_syncscan)
1039 ss_report_location(scan->rs_rd, page);
1040 }
1041
1042 /*
1043 * return NULL if we've exhausted all the pages
1044 */
1045 if (finished)
1046 {
1047 if (BufferIsValid(scan->rs_cbuf))
1048 ReleaseBuffer(scan->rs_cbuf);
1049 scan->rs_cbuf = InvalidBuffer;
1050 scan->rs_cblock = InvalidBlockNumber;
1051 tuple->t_data = NULL;
1052 scan->rs_inited = false;
1053 return;
1054 }
1055
1056 heapgetpage(scan, page);
1057
1058 dp = BufferGetPage(scan->rs_cbuf);
1059 TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1060 lines = scan->rs_ntuples;
1061 linesleft = lines;
1062 if (backward)
1063 lineindex = lines - 1;
1064 else
1065 lineindex = 0;
1066 }
1067 }
1068
1069
1070 #if defined(DISABLE_COMPLEX_MACRO)
1071 /*
1072 * This is formatted so oddly so that the correspondence to the macro
1073 * definition in access/htup_details.h is maintained.
1074 */
1075 Datum
1076 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1077 bool *isnull)
1078 {
1079 return (
1080 (attnum) > 0 ?
1081 (
1082 (*(isnull) = false),
1083 HeapTupleNoNulls(tup) ?
1084 (
1085 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1086 (
1087 fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1088 (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1089 TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1090 )
1091 :
1092 nocachegetattr((tup), (attnum), (tupleDesc))
1093 )
1094 :
1095 (
1096 att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1097 (
1098 (*(isnull) = true),
1099 (Datum) NULL
1100 )
1101 :
1102 (
1103 nocachegetattr((tup), (attnum), (tupleDesc))
1104 )
1105 )
1106 )
1107 :
1108 (
1109 (Datum) NULL
1110 )
1111 );
1112 }
1113 #endif /* defined(DISABLE_COMPLEX_MACRO) */
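/*
 * A minimal sketch of a typical caller (normally reached via the heap_getattr
 * macro), assuming "tuple" came from a scan of "rel" and column 1 is of a
 * pass-by-value type:
 *
 *		TupleDesc	tupdesc = RelationGetDescr(rel);
 *		bool		isnull;
 *		Datum		d = fastgetattr(tuple, 1, tupdesc, &isnull);
 *
 *		if (!isnull)
 *			... use DatumGetInt32(d) or the appropriate DatumGet* macro ...
 */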
1114
1115
1116 /* ----------------------------------------------------------------
1117 * heap access method interface
1118 * ----------------------------------------------------------------
1119 */
1120
1121 /* ----------------
1122 * relation_open - open any relation by relation OID
1123 *
1124 * If lockmode is not "NoLock", the specified kind of lock is
1125 * obtained on the relation. (Generally, NoLock should only be
1126 * used if the caller knows it has some appropriate lock on the
1127 * relation already.)
1128 *
1129 * An error is raised if the relation does not exist.
1130 *
1131 * NB: a "relation" is anything with a pg_class entry. The caller is
1132 * expected to check whether the relkind is something it can handle.
1133 * ----------------
1134 */
1135 Relation
1136 relation_open(Oid relationId, LOCKMODE lockmode)
1137 {
1138 Relation r;
1139
1140 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1141
1142 /* Get the lock before trying to open the relcache entry */
1143 if (lockmode != NoLock)
1144 LockRelationOid(relationId, lockmode);
1145
1146 /* The relcache does all the real work... */
1147 r = RelationIdGetRelation(relationId);
1148
1149 if (!RelationIsValid(r))
1150 elog(ERROR, "could not open relation with OID %u", relationId);
1151
1152 /* Make note that we've accessed a temporary relation */
1153 if (RelationUsesLocalBuffers(r))
1154 MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1155
1156 pgstat_initstats(r);
1157
1158 return r;
1159 }
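/*
 * A minimal usage sketch (assuming "relid" is a valid pg_class OID): open
 * with a lock, use the relcache entry, then close while keeping the lock
 * until transaction end:
 *
 *		Relation	rel = relation_open(relid, AccessShareLock);
 *		... use rel ...
 *		relation_close(rel, NoLock);	lock is held until commit/abort
 */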
1160
1161 /* ----------------
1162 * try_relation_open - open any relation by relation OID
1163 *
1164 * Same as relation_open, except return NULL instead of failing
1165 * if the relation does not exist.
1166 * ----------------
1167 */
1168 Relation
1169 try_relation_open(Oid relationId, LOCKMODE lockmode)
1170 {
1171 Relation r;
1172
1173 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1174
1175 /* Get the lock first */
1176 if (lockmode != NoLock)
1177 LockRelationOid(relationId, lockmode);
1178
1179 /*
1180 * Now that we have the lock, probe to see if the relation really exists
1181 * or not.
1182 */
1183 if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1184 {
1185 /* Release useless lock */
1186 if (lockmode != NoLock)
1187 UnlockRelationOid(relationId, lockmode);
1188
1189 return NULL;
1190 }
1191
1192 /* Should be safe to do a relcache load */
1193 r = RelationIdGetRelation(relationId);
1194
1195 if (!RelationIsValid(r))
1196 elog(ERROR, "could not open relation with OID %u", relationId);
1197
1198 /* Make note that we've accessed a temporary relation */
1199 if (RelationUsesLocalBuffers(r))
1200 MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1201
1202 pgstat_initstats(r);
1203
1204 return r;
1205 }
1206
1207 /* ----------------
1208 * relation_openrv - open any relation specified by a RangeVar
1209 *
1210 * Same as relation_open, but the relation is specified by a RangeVar.
1211 * ----------------
1212 */
1213 Relation
1214 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1215 {
1216 Oid relOid;
1217
1218 /*
1219 * Check for shared-cache-inval messages before trying to open the
1220 * relation. This is needed even if we already hold a lock on the
1221 * relation, because GRANT/REVOKE are executed without taking any lock on
1222 * the target relation, and we want to be sure we see current ACL
1223 * information. We can skip this if asked for NoLock, on the assumption
1224 * that such a call is not the first one in the current command, and so we
1225 * should be reasonably up-to-date already. (XXX this all could stand to
1226 * be redesigned, but for the moment we'll keep doing this like it's been
1227 * done historically.)
1228 */
1229 if (lockmode != NoLock)
1230 AcceptInvalidationMessages();
1231
1232 /* Look up and lock the appropriate relation using namespace search */
1233 relOid = RangeVarGetRelid(relation, lockmode, false);
1234
1235 /* Let relation_open do the rest */
1236 return relation_open(relOid, NoLock);
1237 }
1238
1239 /* ----------------
1240 * relation_openrv_extended - open any relation specified by a RangeVar
1241 *
1242 * Same as relation_openrv, but with an additional missing_ok argument
1243 * allowing a NULL return rather than an error if the relation is not
1244 * found. (Note that some other causes, such as permissions problems,
1245 * will still result in an ereport.)
1246 * ----------------
1247 */
1248 Relation
1249 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1250 bool missing_ok)
1251 {
1252 Oid relOid;
1253
1254 /*
1255 * Check for shared-cache-inval messages before trying to open the
1256 * relation. See comments in relation_openrv().
1257 */
1258 if (lockmode != NoLock)
1259 AcceptInvalidationMessages();
1260
1261 /* Look up and lock the appropriate relation using namespace search */
1262 relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1263
1264 /* Return NULL on not-found */
1265 if (!OidIsValid(relOid))
1266 return NULL;
1267
1268 /* Let relation_open do the rest */
1269 return relation_open(relOid, NoLock);
1270 }
1271
1272 /* ----------------
1273 * relation_close - close any relation
1274 *
1275 * If lockmode is not "NoLock", we then release the specified lock.
1276 *
1277 * Note that it is often sensible to hold a lock beyond relation_close;
1278 * in that case, the lock is released automatically at xact end.
1279 * ----------------
1280 */
1281 void
1282 relation_close(Relation relation, LOCKMODE lockmode)
1283 {
1284 LockRelId relid = relation->rd_lockInfo.lockRelId;
1285
1286 Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1287
1288 /* The relcache does the real work... */
1289 RelationClose(relation);
1290
1291 if (lockmode != NoLock)
1292 UnlockRelationId(&relid, lockmode);
1293 }
1294
1295
1296 /* ----------------
1297 * heap_open - open a heap relation by relation OID
1298 *
1299 * This is essentially relation_open plus check that the relation
1300 * is not an index nor a composite type. (The caller should also
1301 * check that it's not a view or foreign table before assuming it has
1302 * storage.)
1303 * ----------------
1304 */
1305 Relation
1306 heap_open(Oid relationId, LOCKMODE lockmode)
1307 {
1308 Relation r;
1309
1310 r = relation_open(relationId, lockmode);
1311
1312 if (r->rd_rel->relkind == RELKIND_INDEX ||
1313 r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1314 ereport(ERROR,
1315 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1316 errmsg("\"%s\" is an index",
1317 RelationGetRelationName(r))));
1318 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1319 ereport(ERROR,
1320 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1321 errmsg("\"%s\" is a composite type",
1322 RelationGetRelationName(r))));
1323
1324 return r;
1325 }
1326
1327 /* ----------------
1328 * heap_openrv - open a heap relation specified
1329 * by a RangeVar node
1330 *
1331 * As above, but relation is specified by a RangeVar.
1332 * ----------------
1333 */
1334 Relation
1335 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1336 {
1337 Relation r;
1338
1339 r = relation_openrv(relation, lockmode);
1340
1341 if (r->rd_rel->relkind == RELKIND_INDEX ||
1342 r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1343 ereport(ERROR,
1344 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1345 errmsg("\"%s\" is an index",
1346 RelationGetRelationName(r))));
1347 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1348 ereport(ERROR,
1349 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1350 errmsg("\"%s\" is a composite type",
1351 RelationGetRelationName(r))));
1352
1353 return r;
1354 }
1355
1356 /* ----------------
1357 * heap_openrv_extended - open a heap relation specified
1358 * by a RangeVar node
1359 *
1360 * As above, but optionally return NULL instead of failing for
1361 * relation-not-found.
1362 * ----------------
1363 */
1364 Relation
1365 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1366 bool missing_ok)
1367 {
1368 Relation r;
1369
1370 r = relation_openrv_extended(relation, lockmode, missing_ok);
1371
1372 if (r)
1373 {
1374 if (r->rd_rel->relkind == RELKIND_INDEX ||
1375 r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
1376 ereport(ERROR,
1377 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1378 errmsg("\"%s\" is an index",
1379 RelationGetRelationName(r))));
1380 else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1381 ereport(ERROR,
1382 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1383 errmsg("\"%s\" is a composite type",
1384 RelationGetRelationName(r))));
1385 }
1386
1387 return r;
1388 }
1389
1390
1391 /* ----------------
1392 * heap_beginscan - begin relation scan
1393 *
1394 * heap_beginscan is the "standard" case.
1395 *
1396 * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1397 *
1398 * heap_beginscan_strat offers an extended API that lets the caller control
1399 * whether a nondefault buffer access strategy can be used, and whether
1400 * syncscan can be chosen (possibly resulting in the scan not starting from
1401 * block zero). Both of these default to true with plain heap_beginscan.
1402 *
1403 * heap_beginscan_bm is an alternative entry point for setting up a
1404 * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1405 * really quite unlike a standard seqscan, there is just enough commonality
1406 * to make it worth using the same data structure.
1407 *
1408 * heap_beginscan_sampling is an alternative entry point for setting up a
1409 * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1410 * using the same data structure although the behavior is rather different.
1411 * In addition to the options offered by heap_beginscan_strat, this call
1412 * also allows control of whether page-mode visibility checking is used.
1413 * ----------------
1414 */
1415 HeapScanDesc
1416 heap_beginscan(Relation relation, Snapshot snapshot,
1417 int nkeys, ScanKey key)
1418 {
1419 return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1420 true, true, true, false, false, false);
1421 }
1422
1423 HeapScanDesc
1424 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1425 {
1426 Oid relid = RelationGetRelid(relation);
1427 Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1428
1429 return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1430 true, true, true, false, false, true);
1431 }
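/*
 * A minimal sketch of a catalog scan using the routine above (the temporary
 * catalog snapshot registered here is unregistered by heap_endscan, since
 * temp_snap is set):
 *
 *		Relation	rel = heap_open(RelationRelationId, AccessShareLock);
 *		HeapScanDesc scan = heap_beginscan_catalog(rel, 0, NULL);
 *		HeapTuple	tup;
 *
 *		while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *			... examine (Form_pg_class) GETSTRUCT(tup) ...
 *		heap_endscan(scan);
 *		heap_close(rel, AccessShareLock);
 */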
1432
1433 HeapScanDesc
1434 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1435 int nkeys, ScanKey key,
1436 bool allow_strat, bool allow_sync)
1437 {
1438 return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1439 allow_strat, allow_sync, true,
1440 false, false, false);
1441 }
1442
1443 HeapScanDesc
1444 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1445 int nkeys, ScanKey key)
1446 {
1447 return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1448 false, false, true, true, false, false);
1449 }
1450
1451 HeapScanDesc
1452 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1453 int nkeys, ScanKey key,
1454 bool allow_strat, bool allow_sync, bool allow_pagemode)
1455 {
1456 return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1457 allow_strat, allow_sync, allow_pagemode,
1458 false, true, false);
1459 }
1460
1461 static HeapScanDesc
1462 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1463 int nkeys, ScanKey key,
1464 ParallelHeapScanDesc parallel_scan,
1465 bool allow_strat,
1466 bool allow_sync,
1467 bool allow_pagemode,
1468 bool is_bitmapscan,
1469 bool is_samplescan,
1470 bool temp_snap)
1471 {
1472 HeapScanDesc scan;
1473
1474 /*
1475 * increment relation ref count while scanning relation
1476 *
1477 * This is just to make really sure the relcache entry won't go away while
1478 * the scan has a pointer to it. Caller should be holding the rel open
1479 * anyway, so this is redundant in all normal scenarios...
1480 */
1481 RelationIncrementReferenceCount(relation);
1482
1483 /*
1484 * allocate and initialize scan descriptor
1485 */
1486 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1487
1488 scan->rs_rd = relation;
1489 scan->rs_snapshot = snapshot;
1490 scan->rs_nkeys = nkeys;
1491 scan->rs_bitmapscan = is_bitmapscan;
1492 scan->rs_samplescan = is_samplescan;
1493 scan->rs_strategy = NULL; /* set in initscan */
1494 scan->rs_allow_strat = allow_strat;
1495 scan->rs_allow_sync = allow_sync;
1496 scan->rs_temp_snap = temp_snap;
1497 scan->rs_parallel = parallel_scan;
1498
1499 /*
1500 * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1501 */
1502 scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1503
1504 /*
1505 * For a seqscan in a serializable transaction, acquire a predicate lock
1506 * on the entire relation. This is required not only to lock all the
1507 * matching tuples, but also to conflict with new insertions into the
1508 * table. In an indexscan, we take page locks on the index pages covering
1509 * the range specified in the scan qual, but in a heap scan there is
1510 * nothing more fine-grained to lock. A bitmap scan is a different story,
1511 * there we have already scanned the index and locked the index pages
1512 * covering the predicate. But in that case we still have to lock any
1513 * matching heap tuples.
1514 */
1515 if (!is_bitmapscan)
1516 PredicateLockRelation(relation, snapshot);
1517
1518 /* we only need to set this up once */
1519 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1520
1521 /*
1522 * we do this here instead of in initscan() because heap_rescan also calls
1523 * initscan() and we don't want to allocate memory again
1524 */
1525 if (nkeys > 0)
1526 scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1527 else
1528 scan->rs_key = NULL;
1529
1530 initscan(scan, key, false);
1531
1532 return scan;
1533 }
1534
1535 /* ----------------
1536 * heap_rescan - restart a relation scan
1537 * ----------------
1538 */
1539 void
1540 heap_rescan(HeapScanDesc scan,
1541 ScanKey key)
1542 {
1543 /*
1544 * unpin scan buffers
1545 */
1546 if (BufferIsValid(scan->rs_cbuf))
1547 ReleaseBuffer(scan->rs_cbuf);
1548
1549 /*
1550 * reinitialize scan descriptor
1551 */
1552 initscan(scan, key, true);
1553 }
1554
1555 /* ----------------
1556 * heap_rescan_set_params - restart a relation scan after changing params
1557 *
1558 * This call allows changing the buffer strategy, syncscan, and pagemode
1559 * options before starting a fresh scan. Note that although the actual use
1560 * of syncscan might change (effectively, enabling or disabling reporting),
1561 * the previously selected startblock will be kept.
1562 * ----------------
1563 */
1564 void
1565 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1566 bool allow_strat, bool allow_sync, bool allow_pagemode)
1567 {
1568 /* adjust parameters */
1569 scan->rs_allow_strat = allow_strat;
1570 scan->rs_allow_sync = allow_sync;
1571 scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1572 /* ... and rescan */
1573 heap_rescan(scan, key);
1574 }
1575
1576 /* ----------------
1577 * heap_endscan - end relation scan
1578 *
1579 * See how to integrate with index scans.
1580 * Check handling of reldesc caching.
1581 * ----------------
1582 */
1583 void
1584 heap_endscan(HeapScanDesc scan)
1585 {
1586 /* Note: no locking manipulations needed */
1587
1588 /*
1589 * unpin scan buffers
1590 */
1591 if (BufferIsValid(scan->rs_cbuf))
1592 ReleaseBuffer(scan->rs_cbuf);
1593
1594 /*
1595 * decrement relation reference count and free scan descriptor storage
1596 */
1597 RelationDecrementReferenceCount(scan->rs_rd);
1598
1599 if (scan->rs_key)
1600 pfree(scan->rs_key);
1601
1602 if (scan->rs_strategy != NULL)
1603 FreeAccessStrategy(scan->rs_strategy);
1604
1605 if (scan->rs_temp_snap)
1606 UnregisterSnapshot(scan->rs_snapshot);
1607
1608 pfree(scan);
1609 }
1610
1611 /* ----------------
1612 * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1613 *
1614 * Sadly, this doesn't reduce to a constant, because the size required
1615 * to serialize the snapshot can vary.
1616 * ----------------
1617 */
1618 Size
1619 heap_parallelscan_estimate(Snapshot snapshot)
1620 {
1621 return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1622 EstimateSnapshotSpace(snapshot));
1623 }
1624
1625 /* ----------------
1626 * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1627 *
1628 * Must allow as many bytes of shared memory as returned by
1629 * heap_parallelscan_estimate. Call this just once in the leader
1630 * process; then, individual workers attach via heap_beginscan_parallel.
1631 * ----------------
1632 */
1633 void
1634 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1635 Snapshot snapshot)
1636 {
1637 target->phs_relid = RelationGetRelid(relation);
1638 target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1639 /* compare phs_syncscan initialization to similar logic in initscan */
1640 target->phs_syncscan = synchronize_seqscans &&
1641 !RelationUsesLocalBuffers(relation) &&
1642 target->phs_nblocks > NBuffers / 4;
1643 SpinLockInit(&target->phs_mutex);
1644 target->phs_startblock = InvalidBlockNumber;
1645 pg_atomic_init_u64(&target->phs_nallocated, 0);
1646 if (IsMVCCSnapshot(snapshot))
1647 {
1648 SerializeSnapshot(snapshot, target->phs_snapshot_data);
1649 target->phs_snapshot_any = false;
1650 }
1651 else
1652 {
1653 Assert(snapshot == SnapshotAny);
1654 target->phs_snapshot_any = true;
1655 }
1656 }
1657
1658 /* ----------------
1659 * heap_parallelscan_reinitialize - reset a parallel scan
1660 *
1661 * Call this in the leader process. Caller is responsible for
1662 * making sure that all workers have finished the scan beforehand.
1663 * ----------------
1664 */
1665 void
1666 heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
1667 {
1668 pg_atomic_write_u64(&parallel_scan->phs_nallocated, 0);
1669 }
1670
1671 /* ----------------
1672 * heap_beginscan_parallel - join a parallel scan
1673 *
1674 * Caller must hold a suitable lock on the correct relation.
1675 * ----------------
1676 */
1677 HeapScanDesc
1678 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1679 {
1680 Snapshot snapshot;
1681
1682 Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1683
1684 if (!parallel_scan->phs_snapshot_any)
1685 {
1686 /* Snapshot was serialized -- restore it */
1687 snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1688 RegisterSnapshot(snapshot);
1689 }
1690 else
1691 {
1692 /* SnapshotAny passed by caller (not serialized) */
1693 snapshot = SnapshotAny;
1694 }
1695
1696 return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1697 true, true, true, false, false,
1698 !parallel_scan->phs_snapshot_any);
1699 }
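/*
 * A minimal sketch of the intended call sequence (the shared-memory plumbing
 * is elided and depends on the caller; "pcxt" is assumed to be a
 * ParallelContext set up elsewhere):
 *
 *	in the leader:
 *		Size	sz = heap_parallelscan_estimate(snapshot);
 *		ParallelHeapScanDesc pscan =
 *			(ParallelHeapScanDesc) shm_toc_allocate(pcxt->toc, sz);
 *		heap_parallelscan_initialize(pscan, rel, snapshot);
 *
 *	in each worker (and in the leader, if it also participates):
 *		HeapScanDesc scan = heap_beginscan_parallel(rel, pscan);
 *		while (heap_getnext(scan, ForwardScanDirection) != NULL)
 *			...
 *		heap_endscan(scan);
 */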
1700
1701 /* ----------------
1702 * heap_parallelscan_startblock_init - find and set the scan's startblock
1703 *
1704 * Determine where the parallel seq scan should start. This function may
1705 * be called many times, once by each parallel worker. We must be careful
1706 * only to set the startblock once.
1707 * ----------------
1708 */
1709 static void
1710 heap_parallelscan_startblock_init(HeapScanDesc scan)
1711 {
1712 BlockNumber sync_startpage = InvalidBlockNumber;
1713 ParallelHeapScanDesc parallel_scan;
1714
1715 Assert(scan->rs_parallel);
1716 parallel_scan = scan->rs_parallel;
1717
1718 retry:
1719 /* Grab the spinlock. */
1720 SpinLockAcquire(&parallel_scan->phs_mutex);
1721
1722 /*
1723 * If the scan's startblock has not yet been initialized, we must do so
1724 * now. If this is not a synchronized scan, we just start at block 0, but
1725 * if it is a synchronized scan, we must get the starting position from
1726 * the synchronized scan machinery. We can't hold the spinlock while
1727 * doing that, though, so release the spinlock, get the information we
1728 * need, and retry. If nobody else has initialized the scan in the
1729 * meantime, we'll fill in the value we fetched on the second time
1730 * through.
1731 */
1732 if (parallel_scan->phs_startblock == InvalidBlockNumber)
1733 {
1734 if (!parallel_scan->phs_syncscan)
1735 parallel_scan->phs_startblock = 0;
1736 else if (sync_startpage != InvalidBlockNumber)
1737 parallel_scan->phs_startblock = sync_startpage;
1738 else
1739 {
1740 SpinLockRelease(&parallel_scan->phs_mutex);
1741 sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1742 goto retry;
1743 }
1744 }
1745 SpinLockRelease(&parallel_scan->phs_mutex);
1746 }
1747
1748 /* ----------------
1749 * heap_parallelscan_nextpage - get the next page to scan
1750 *
1751 * Get the next page to scan. Even if there are no pages left to scan,
1752 * another backend could have grabbed a page to scan and not yet finished
1753 * looking at it, so it doesn't follow that the scan is done when the
1754 * first backend gets an InvalidBlockNumber return.
1755 * ----------------
1756 */
1757 static BlockNumber
1758 heap_parallelscan_nextpage(HeapScanDesc scan)
1759 {
1760 BlockNumber page;
1761 ParallelHeapScanDesc parallel_scan;
1762 uint64 nallocated;
1763
1764 Assert(scan->rs_parallel);
1765 parallel_scan = scan->rs_parallel;
1766
1767 /*
1768 * phs_nallocated tracks how many pages have been allocated to workers
1769 * already. When phs_nallocated >= rs_nblocks, all blocks have been
1770 * allocated.
1771 *
1772 * Because we use an atomic fetch-and-add to fetch the current value, the
1773 * phs_nallocated counter can exceed rs_nblocks: workers still increment
1774 * the value when they try to allocate the next block even though all
1775 * blocks have already been allocated. The counter must therefore be 64
1776 * bits wide, to avoid wrapping around when rs_nblocks is close
1777 * to 2^32.
1778 *
1779 * The actual page to return is calculated by adding the counter to the
1780 * starting block number, modulo nblocks.
1781 */
1782 	nallocated = pg_atomic_fetch_add_u64(&parallel_scan->phs_nallocated, 1);
1783 if (nallocated >= scan->rs_nblocks)
1784 page = InvalidBlockNumber; /* all blocks have been allocated */
1785 else
1786 page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks;
1787
1788 /*
1789 * Report scan location. Normally, we report the current page number.
1790 * When we reach the end of the scan, though, we report the starting page,
1791 * not the ending page, just so the starting positions for later scans
1792 * don't slew backwards. We only report the position at the end of the
1793 * scan once, though: subsequent callers will report nothing.
1794 */
1795 if (scan->rs_syncscan)
1796 {
1797 if (page != InvalidBlockNumber)
1798 ss_report_location(scan->rs_rd, page);
1799 else if (nallocated == scan->rs_nblocks)
1800 ss_report_location(scan->rs_rd, parallel_scan->phs_startblock);
1801 }
1802
1803 return page;
1804 }
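/*
 * Worked example of the arithmetic above, with made-up numbers: if
 * rs_nblocks = 8 and phs_startblock = 5, successive fetch-and-add results
 * nallocated = 0..7 map to pages 5,6,7,0,1,2,3,4; any nallocated >= 8
 * yields InvalidBlockNumber, and only the worker that drew nallocated == 8
 * reports the start block to the syncscan machinery.
 */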
1805
1806 /* ----------------
1807 * heap_update_snapshot
1808 *
1809 * Update snapshot info in heap scan descriptor.
1810 * ----------------
1811 */
1812 void
1813 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
1814 {
1815 Assert(IsMVCCSnapshot(snapshot));
1816
1817 RegisterSnapshot(snapshot);
1818 scan->rs_snapshot = snapshot;
1819 scan->rs_temp_snap = true;
1820 }
1821
1822 /* ----------------
1823 * heap_getnext - retrieve next tuple in scan
1824 *
1825 * Fix to work with index relations.
1826 * We don't return the buffer anymore, but you can get it from the
1827 * returned HeapTuple.
1828 * ----------------
1829 */
1830
1831 #ifdef HEAPDEBUGALL
1832 #define HEAPDEBUG_1 \
1833 elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1834 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1835 #define HEAPDEBUG_2 \
1836 elog(DEBUG2, "heap_getnext returning EOS")
1837 #define HEAPDEBUG_3 \
1838 elog(DEBUG2, "heap_getnext returning tuple")
1839 #else
1840 #define HEAPDEBUG_1
1841 #define HEAPDEBUG_2
1842 #define HEAPDEBUG_3
1843 #endif /* !defined(HEAPDEBUGALL) */
1844
1845
1846 HeapTuple
1847 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1848 {
1849 /* Note: no locking manipulations needed */
1850
1851 HEAPDEBUG_1; /* heap_getnext( info ) */
1852
1853 if (scan->rs_pageatatime)
1854 heapgettup_pagemode(scan, direction,
1855 scan->rs_nkeys, scan->rs_key);
1856 else
1857 heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1858
1859 if (scan->rs_ctup.t_data == NULL)
1860 {
1861 HEAPDEBUG_2; /* heap_getnext returning EOS */
1862 return NULL;
1863 }
1864
1865 /*
1866 * if we get here it means we have a new current scan tuple, so point to
1867 * the proper return buffer and return the tuple.
1868 */
1869 HEAPDEBUG_3; /* heap_getnext returning tuple */
1870
1871 pgstat_count_heap_getnext(scan->rs_rd);
1872
1873 return &(scan->rs_ctup);
1874 }
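/*
 * Typical (non-parallel) usage sketch, for illustration only; the relation
 * is assumed to be open and suitably locked, and error handling is omitted.
 * "process_tuple" is a placeholder for whatever the caller does with each
 * row; the returned tuple is only valid until the next heap_getnext call.
 *
 *		scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *			process_tuple(tuple);
 *		heap_endscan(scan);
 */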
1875
1876 /*
1877 * heap_fetch - retrieve tuple with given tid
1878 *
1879 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1880 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1881 * against the specified snapshot.
1882 *
1883 * If successful (tuple found and passes snapshot time qual), then *userbuf
1884 * is set to the buffer holding the tuple and true is returned. The caller
1885 * must unpin the buffer when done with the tuple.
1886 *
1887 * If the tuple is not found (ie, item number references a deleted slot),
1888 * then tuple->t_data is set to NULL and false is returned.
1889 *
1890 * If the tuple is found but fails the time qual check, then false is returned
1891 * but tuple->t_data is left pointing to the tuple.
1892 *
1893 * keep_buf determines what is done with the buffer in the false-result cases.
1894 * When the caller specifies keep_buf = true, we retain the pin on the buffer
1895 * and return it in *userbuf (so the caller must eventually unpin it); when
1896 * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1897 *
1898 * stats_relation is the relation to charge the heap_fetch operation against
1899 * for statistical purposes. (This could be the heap rel itself, an
1900 * associated index, or NULL to not count the fetch at all.)
1901 *
1902 * heap_fetch does not follow HOT chains: only the exact TID requested will
1903 * be fetched.
1904 *
1905 * It is somewhat inconsistent that we ereport() on invalid block number but
1906 * return false on invalid item number. There are a couple of reasons though.
1907 * One is that the caller can relatively easily check the block number for
1908 * validity, but cannot check the item number without reading the page
1909 * himself. Another is that when we are following a t_ctid link, we can be
1910 * reasonably confident that the page number is valid (since VACUUM shouldn't
1911 * truncate off the destination page without having killed the referencing
1912 * tuple first), but the item number might well not be good.
1913 */
1914 bool
1915 heap_fetch(Relation relation,
1916 Snapshot snapshot,
1917 HeapTuple tuple,
1918 Buffer *userbuf,
1919 bool keep_buf,
1920 Relation stats_relation)
1921 {
1922 ItemPointer tid = &(tuple->t_self);
1923 ItemId lp;
1924 Buffer buffer;
1925 Page page;
1926 OffsetNumber offnum;
1927 bool valid;
1928
1929 /*
1930 * Fetch and pin the appropriate page of the relation.
1931 */
1932 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1933
1934 /*
1935 * Need share lock on buffer to examine tuple commit status.
1936 */
1937 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1938 page = BufferGetPage(buffer);
1939 TestForOldSnapshot(snapshot, relation, page);
1940
1941 /*
1942 * We'd better check for out-of-range offnum, in case VACUUM has run since
1943 * the TID was obtained.
1944 */
1945 offnum = ItemPointerGetOffsetNumber(tid);
1946 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1947 {
1948 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1949 if (keep_buf)
1950 *userbuf = buffer;
1951 else
1952 {
1953 ReleaseBuffer(buffer);
1954 *userbuf = InvalidBuffer;
1955 }
1956 tuple->t_data = NULL;
1957 return false;
1958 }
1959
1960 /*
1961 * get the item line pointer corresponding to the requested tid
1962 */
1963 lp = PageGetItemId(page, offnum);
1964
1965 /*
1966 * Must check for deleted tuple.
1967 */
1968 if (!ItemIdIsNormal(lp))
1969 {
1970 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1971 if (keep_buf)
1972 *userbuf = buffer;
1973 else
1974 {
1975 ReleaseBuffer(buffer);
1976 *userbuf = InvalidBuffer;
1977 }
1978 tuple->t_data = NULL;
1979 return false;
1980 }
1981
1982 /*
1983 * fill in *tuple fields
1984 */
1985 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1986 tuple->t_len = ItemIdGetLength(lp);
1987 tuple->t_tableOid = RelationGetRelid(relation);
1988
1989 /*
1990 * check time qualification of tuple, then release lock
1991 */
1992 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1993
1994 if (valid)
1995 PredicateLockTuple(relation, tuple, snapshot);
1996
1997 CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1998
1999 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2000
2001 if (valid)
2002 {
2003 /*
2004 * All checks passed, so return the tuple as valid. Caller is now
2005 * responsible for releasing the buffer.
2006 */
2007 *userbuf = buffer;
2008
2009 /* Count the successful fetch against appropriate rel, if any */
2010 if (stats_relation != NULL)
2011 pgstat_count_heap_fetch(stats_relation);
2012
2013 return true;
2014 }
2015
2016 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
2017 if (keep_buf)
2018 *userbuf = buffer;
2019 else
2020 {
2021 ReleaseBuffer(buffer);
2022 *userbuf = InvalidBuffer;
2023 }
2024
2025 return false;
2026 }
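/*
 * Usage sketch (illustration only): fetch a known TID and release the pin
 * afterwards.  "rel", "snapshot", and "tid" are assumed to be supplied by
 * the caller; with keep_buf = false no buffer pin is returned on failure.
 *
 *		HeapTupleData tuple;
 *		Buffer		buf;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(rel, snapshot, &tuple, &buf, false, NULL))
 *		{
 *			(use tuple.t_data while the pin on buf is held)
 *			ReleaseBuffer(buf);
 *		}
 */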
2027
2028 /*
2029 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
2030 *
2031 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
2032 * of a HOT chain), and buffer is the buffer holding this tuple. We search
2033 * for the first chain member satisfying the given snapshot. If one is
2034 * found, we update *tid to reference that tuple's offset number, and
2035 * return true. If no match, return false without modifying *tid.
2036 *
2037 * heapTuple is a caller-supplied buffer. When a match is found, we return
2038 * the tuple here, in addition to updating *tid. If no match is found, the
2039 * contents of this buffer on return are undefined.
2040 *
2041 * If all_dead is not NULL, we check non-visible tuples to see if they are
2042 * globally dead; *all_dead is set true if all members of the HOT chain
2043 * are vacuumable, false if not.
2044 *
2045 * Unlike heap_fetch, the caller must already have pin and (at least) share
2046 * lock on the buffer; it is still pinned/locked at exit. Also unlike
2047 * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
2048 */
2049 bool
2050 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
2051 Snapshot snapshot, HeapTuple heapTuple,
2052 bool *all_dead, bool first_call)
2053 {
2054 Page dp = (Page) BufferGetPage(buffer);
2055 TransactionId prev_xmax = InvalidTransactionId;
2056 BlockNumber blkno;
2057 OffsetNumber offnum;
2058 bool at_chain_start;
2059 bool valid;
2060 bool skip;
2061
2062 /* If this is not the first call, previous call returned a (live!) tuple */
2063 if (all_dead)
2064 *all_dead = first_call;
2065
2066 blkno = ItemPointerGetBlockNumber(tid);
2067 offnum = ItemPointerGetOffsetNumber(tid);
2068 at_chain_start = first_call;
2069 skip = !first_call;
2070
2071 Assert(TransactionIdIsValid(RecentGlobalXmin));
2072 Assert(BufferGetBlockNumber(buffer) == blkno);
2073
2074 /* Scan through possible multiple members of HOT-chain */
2075 for (;;)
2076 {
2077 ItemId lp;
2078
2079 /* check for bogus TID */
2080 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2081 break;
2082
2083 lp = PageGetItemId(dp, offnum);
2084
2085 /* check for unused, dead, or redirected items */
2086 if (!ItemIdIsNormal(lp))
2087 {
2088 /* We should only see a redirect at start of chain */
2089 if (ItemIdIsRedirected(lp) && at_chain_start)
2090 {
2091 /* Follow the redirect */
2092 offnum = ItemIdGetRedirect(lp);
2093 at_chain_start = false;
2094 continue;
2095 }
2096 /* else must be end of chain */
2097 break;
2098 }
2099
2100 /*
2101 * Update heapTuple to point to the element of the HOT chain we're
2102 * currently investigating. Having t_self set correctly is important
2103 * because the SSI checks and the *Satisfies routine for historical
2104 * MVCC snapshots need the correct tid to decide about the visibility.
2105 */
2106 heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2107 heapTuple->t_len = ItemIdGetLength(lp);
2108 heapTuple->t_tableOid = RelationGetRelid(relation);
2109 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
2110
2111 /*
2112 * Shouldn't see a HEAP_ONLY tuple at chain start.
2113 */
2114 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2115 break;
2116
2117 /*
2118 * The xmin should match the previous xmax value, else chain is
2119 * broken.
2120 */
2121 if (TransactionIdIsValid(prev_xmax) &&
2122 !TransactionIdEquals(prev_xmax,
2123 HeapTupleHeaderGetXmin(heapTuple->t_data)))
2124 break;
2125
2126 /*
2127 * When first_call is true (and thus, skip is initially false) we'll
2128 * return the first tuple we find. But on later passes, heapTuple
2129 * will initially be pointing to the tuple we returned last time.
2130 * Returning it again would be incorrect (and would loop forever), so
2131 * we skip it and return the next match we find.
2132 */
2133 if (!skip)
2134 {
2135 /* If it's visible per the snapshot, we must return it */
2136 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2137 CheckForSerializableConflictOut(valid, relation, heapTuple,
2138 buffer, snapshot);
2139
2140 if (valid)
2141 {
2142 ItemPointerSetOffsetNumber(tid, offnum);
2143 PredicateLockTuple(relation, heapTuple, snapshot);
2144 if (all_dead)
2145 *all_dead = false;
2146 return true;
2147 }
2148 }
2149 skip = false;
2150
2151 /*
2152 * If we can't see it, maybe no one else can either. At caller
2153 * request, check whether all chain members are dead to all
2154 * transactions.
2155 *
2156 * Note: if you change the criterion here for what is "dead", fix the
2157 * planner's get_actual_variable_range() function to match.
2158 */
2159 if (all_dead && *all_dead &&
2160 !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2161 *all_dead = false;
2162
2163 /*
2164 * Check to see if HOT chain continues past this tuple; if so fetch
2165 * the next offnum and loop around.
2166 */
2167 if (HeapTupleIsHotUpdated(heapTuple))
2168 {
2169 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2170 blkno);
2171 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2172 at_chain_start = false;
2173 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2174 }
2175 else
2176 break; /* end of chain */
2177 }
2178
2179 return false;
2180 }
2181
2182 /*
2183 * heap_hot_search - search HOT chain for tuple satisfying snapshot
2184 *
2185 * This has the same API as heap_hot_search_buffer, except that the caller
2186 * does not provide the buffer containing the page, rather we access it
2187 * locally.
2188 */
2189 bool
2190 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2191 bool *all_dead)
2192 {
2193 bool result;
2194 Buffer buffer;
2195 HeapTupleData heapTuple;
2196
2197 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2198 LockBuffer(buffer, BUFFER_LOCK_SHARE);
2199 result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2200 &heapTuple, all_dead, true);
2201 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2202 ReleaseBuffer(buffer);
2203 return result;
2204 }
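/*
 * Illustration only: a hypothetical caller that merely needs to know
 * whether any live version of an index-referenced row exists could do
 *
 *		bool	all_dead;
 *
 *		found = heap_hot_search(&tid, heapRel, snapshot, &all_dead);
 *
 * and, if !found && all_dead, could additionally treat the referencing
 * index entry as killable, since no chain member is visible to anyone.
 */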
2205
2206 /*
2207 * heap_get_latest_tid - get the latest tid of a specified tuple
2208 *
2209 * Actually, this gets the latest version that is visible according to
2210 * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2211 * possibly uncommitted version.
2212 *
2213 * *tid is both an input and an output parameter: it is updated to
2214 * show the latest version of the row. Note that it will not be changed
2215 * if no version of the row passes the snapshot test.
2216 */
2217 void
2218 heap_get_latest_tid(Relation relation,
2219 Snapshot snapshot,
2220 ItemPointer tid)
2221 {
2222 BlockNumber blk;
2223 ItemPointerData ctid;
2224 TransactionId priorXmax;
2225
2226 /* this is to avoid Assert failures on bad input */
2227 if (!ItemPointerIsValid(tid))
2228 return;
2229
2230 /*
2231 * Since this can be called with user-supplied TID, don't trust the input
2232 * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2233 * don't check t_ctid links again this way. Note that it would not do to
2234 * call it just once and save the result, either.)
2235 */
2236 blk = ItemPointerGetBlockNumber(tid);
2237 if (blk >= RelationGetNumberOfBlocks(relation))
2238 elog(ERROR, "block number %u is out of range for relation \"%s\"",
2239 blk, RelationGetRelationName(relation));
2240
2241 /*
2242 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2243 * need to examine, and *tid is the TID we will return if ctid turns out
2244 * to be bogus.
2245 *
2246 * Note that we will loop until we reach the end of the t_ctid chain.
2247 * Depending on the snapshot passed, there might be at most one visible
2248 * version of the row, but we don't try to optimize for that.
2249 */
2250 ctid = *tid;
2251 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2252 for (;;)
2253 {
2254 Buffer buffer;
2255 Page page;
2256 OffsetNumber offnum;
2257 ItemId lp;
2258 HeapTupleData tp;
2259 bool valid;
2260
2261 /*
2262 * Read, pin, and lock the page.
2263 */
2264 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2265 LockBuffer(buffer, BUFFER_LOCK_SHARE);
2266 page = BufferGetPage(buffer);
2267 TestForOldSnapshot(snapshot, relation, page);
2268
2269 /*
2270 * Check for bogus item number. This is not treated as an error
2271 * condition because it can happen while following a t_ctid link. We
2272 * just assume that the prior tid is OK and return it unchanged.
2273 */
2274 offnum = ItemPointerGetOffsetNumber(&ctid);
2275 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2276 {
2277 UnlockReleaseBuffer(buffer);
2278 break;
2279 }
2280 lp = PageGetItemId(page, offnum);
2281 if (!ItemIdIsNormal(lp))
2282 {
2283 UnlockReleaseBuffer(buffer);
2284 break;
2285 }
2286
2287 /* OK to access the tuple */
2288 tp.t_self = ctid;
2289 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2290 tp.t_len = ItemIdGetLength(lp);
2291 tp.t_tableOid = RelationGetRelid(relation);
2292
2293 /*
2294 * After following a t_ctid link, we might arrive at an unrelated
2295 * tuple. Check for XMIN match.
2296 */
2297 if (TransactionIdIsValid(priorXmax) &&
2298 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
2299 {
2300 UnlockReleaseBuffer(buffer);
2301 break;
2302 }
2303
2304 /*
2305 * Check time qualification of tuple; if visible, set it as the new
2306 * result candidate.
2307 */
2308 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2309 CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2310 if (valid)
2311 *tid = ctid;
2312
2313 /*
2314 * If there's a valid t_ctid link, follow it, else we're done.
2315 */
2316 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2317 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2318 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
2319 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2320 {
2321 UnlockReleaseBuffer(buffer);
2322 break;
2323 }
2324
2325 ctid = tp.t_data->t_ctid;
2326 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2327 UnlockReleaseBuffer(buffer);
2328 } /* end of loop */
2329 }
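/*
 * Usage sketch (illustration only): chase a possibly-updated row to its
 * newest version visible under "snapshot"; tid is overwritten only if some
 * version passes the snapshot test.  "start_tid" is a placeholder.
 *
 *		ItemPointerData tid = start_tid;
 *
 *		heap_get_latest_tid(rel, snapshot, &tid);
 */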
2330
2331
2332 /*
2333 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2334 *
2335 * This is called after we have waited for the XMAX transaction to terminate.
2336 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2337 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2338 * hint bit if possible --- but beware that that may not yet be possible,
2339 * if the transaction committed asynchronously.
2340 *
2341 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2342 * even if it commits.
2343 *
2344 * Hence callers should look only at XMAX_INVALID.
2345 *
2346 * Note this is not allowed for tuples whose xmax is a multixact.
2347 */
2348 static void
2349 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2350 {
2351 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2352 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2353
2354 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2355 {
2356 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2357 TransactionIdDidCommit(xid))
2358 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2359 xid);
2360 else
2361 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2362 InvalidTransactionId);
2363 }
2364 }
2365
2366
2367 /*
2368 * GetBulkInsertState - prepare status object for a bulk insert
2369 */
2370 BulkInsertState
2371 GetBulkInsertState(void)
2372 {
2373 BulkInsertState bistate;
2374
2375 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2376 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2377 bistate->current_buf = InvalidBuffer;
2378 return bistate;
2379 }
2380
2381 /*
2382 * FreeBulkInsertState - clean up after finishing a bulk insert
2383 */
2384 void
2385 FreeBulkInsertState(BulkInsertState bistate)
2386 {
2387 if (bistate->current_buf != InvalidBuffer)
2388 ReleaseBuffer(bistate->current_buf);
2389 FreeAccessStrategy(bistate->strategy);
2390 pfree(bistate);
2391 }
2392
2393 /*
2394 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2395 */
2396 void
2397 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2398 {
2399 if (bistate->current_buf != InvalidBuffer)
2400 ReleaseBuffer(bistate->current_buf);
2401 bistate->current_buf = InvalidBuffer;
2402 }
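/*
 * Sketch of the intended bulk-insert pattern (roughly what COPY does), for
 * illustration only; "mycid" and the per-row tuple construction are
 * placeholders:
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *
 *		for each incoming row:
 *			heap_insert(rel, tup, mycid, 0, bistate);
 *
 *		FreeBulkInsertState(bistate);
 */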
2403
2404
2405 /*
2406 * heap_insert - insert tuple into a heap
2407 *
2408 * The new tuple is stamped with current transaction ID and the specified
2409 * command ID.
2410 *
2411 * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2412 * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2413 * requires that we arrange that all new tuples go into new pages not
2414 * containing any tuples from other transactions, and that the relation gets
2415 * fsync'd before commit. (See also heap_sync() comments)
2416 *
2417 * The HEAP_INSERT_SKIP_FSM option is passed directly to
2418 * RelationGetBufferForTuple, which see for more info.
2419 *
2420 * HEAP_INSERT_FROZEN should only be specified for inserts into
2421 * relfilenodes created during the current subtransaction and when
2422 * there are no prior snapshots or pre-existing portals open.
2423 * This causes rows to be frozen, which is an MVCC violation and
2424 * requires explicit options chosen by user.
2425 *
2426 * HEAP_INSERT_SPECULATIVE is used on so-called "speculative insertions",
2427 * which can be backed out afterwards without aborting the whole transaction.
2428 * Other sessions can wait for the speculative insertion to be confirmed,
2429 * turning it into a regular tuple, or aborted, as if it never existed.
2430 * Speculatively inserted tuples behave as "value locks" of short duration,
2431 * used to implement INSERT .. ON CONFLICT.
2432 *
2433 * HEAP_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
2434 * information for the tuple. This should solely be used during table rewrites
2435 * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
2436 * relation.
2437 *
2438 * Note that most of these options will be applied when inserting into the
2439 * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2440 * HEAP_INSERT_SPECULATIVE is explicitly ignored, as the toast data does not
2441 * partake in speculative insertion.
2442 *
2443 * The BulkInsertState object (if any; bistate can be NULL for default
2444 * behavior) is also just passed through to RelationGetBufferForTuple.
2445 *
2446 * The return value is the OID assigned to the tuple (either here or by the
2447 * caller), or InvalidOid if no OID. The header fields of *tup are updated
2448 * to match the stored tuple; in particular tup->t_self receives the actual
2449 * TID where the tuple was stored. But note that any toasting of fields
2450 * within the tuple data is NOT reflected into *tup.
2451 */
2452 Oid
2453 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2454 int options, BulkInsertState bistate)
2455 {
2456 TransactionId xid = GetCurrentTransactionId();
2457 HeapTuple heaptup;
2458 Buffer buffer;
2459 Buffer vmbuffer = InvalidBuffer;
2460 bool all_visible_cleared = false;
2461
2462 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2463 Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
2464 RelationGetNumberOfAttributes(relation));
2465
2466 /*
2467 * Fill in tuple header fields, assign an OID, and toast the tuple if
2468 * necessary.
2469 *
2470 * Note: below this point, heaptup is the data we actually intend to store
2471 * into the relation; tup is the caller's original untoasted data.
2472 */
2473 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2474
2475 /*
2476 * Find buffer to insert this tuple into. If the page is all visible,
2477 * this will also pin the requisite visibility map page.
2478 */
2479 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2480 InvalidBuffer, options, bistate,
2481 &vmbuffer, NULL);
2482
2483 /*
2484 * We're about to do the actual insert -- but check for conflict first, to
2485 * avoid possibly having to roll back work we've just done.
2486 *
2487 * This is safe without a recheck as long as there is no possibility of
2488 * another process scanning the page between this check and the insert
2489 * being visible to the scan (i.e., an exclusive buffer content lock is
2490 * continuously held from this point until the tuple insert is visible).
2491 *
2492 * For a heap insert, we only need to check for table-level SSI locks. Our
2493 * new tuple can't possibly conflict with existing tuple locks, and heap
2494 * page locks are only consolidated versions of tuple locks; they do not
2495 * lock "gaps" as index page locks do. So we don't need to specify a
2496 * buffer when making the call, which makes for a faster check.
2497 */
2498 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2499
2500 /* NO EREPORT(ERROR) from here till changes are logged */
2501 START_CRIT_SECTION();
2502
2503 RelationPutHeapTuple(relation, buffer, heaptup,
2504 (options & HEAP_INSERT_SPECULATIVE) != 0);
2505
2506 if (PageIsAllVisible(BufferGetPage(buffer)))
2507 {
2508 all_visible_cleared = true;
2509 PageClearAllVisible(BufferGetPage(buffer));
2510 visibilitymap_clear(relation,
2511 ItemPointerGetBlockNumber(&(heaptup->t_self)),
2512 vmbuffer, VISIBILITYMAP_VALID_BITS);
2513 }
2514
2515 /*
2516 * XXX Should we set PageSetPrunable on this page ?
2517 *
2518 * The inserting transaction may eventually abort thus making this tuple
2519 * DEAD and hence available for pruning. Though we don't want to optimize
2520 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2521 * aborted tuple will never be pruned until next vacuum is triggered.
2522 *
2523 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2524 */
2525
2526 MarkBufferDirty(buffer);
2527
2528 /* XLOG stuff */
2529 if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2530 {
2531 xl_heap_insert xlrec;
2532 xl_heap_header xlhdr;
2533 XLogRecPtr recptr;
2534 Page page = BufferGetPage(buffer);
2535 uint8 info = XLOG_HEAP_INSERT;
2536 int bufflags = 0;
2537
2538 /*
2539 * If this is a catalog, we need to transmit combocids to properly
2540 * decode, so log that as well.
2541 */
2542 if (RelationIsAccessibleInLogicalDecoding(relation))
2543 log_heap_new_cid(relation, heaptup);
2544
2545 /*
2546 * If this is the first and only tuple on the page, we can reinit the
2547 * page instead of restoring the whole thing. Set flag, and hide
2548 * buffer references from XLogInsert.
2549 */
2550 if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2551 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2552 {
2553 info |= XLOG_HEAP_INIT_PAGE;
2554 bufflags |= REGBUF_WILL_INIT;
2555 }
2556
2557 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2558 xlrec.flags = 0;
2559 if (all_visible_cleared)
2560 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2561 if (options & HEAP_INSERT_SPECULATIVE)
2562 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2563 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2564
2565 /*
2566 * For logical decoding, we need the tuple even if we're doing a full
2567 * page write, so make sure it's included even if we take a full-page
2568 * image. (XXX We could alternatively store a pointer into the FPW).
2569 */
2570 if (RelationIsLogicallyLogged(relation) &&
2571 !(options & HEAP_INSERT_NO_LOGICAL))
2572 {
2573 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2574 bufflags |= REGBUF_KEEP_DATA;
2575 }
2576
2577 XLogBeginInsert();
2578 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2579
2580 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2581 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2582 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2583
2584 /*
2585 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2586 * write the whole page to the xlog, we don't need to store
2587 * xl_heap_header in the xlog.
2588 */
2589 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2590 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2591 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2592 XLogRegisterBufData(0,
2593 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2594 heaptup->t_len - SizeofHeapTupleHeader);
2595
2596 /* filtering by origin on a row level is much more efficient */
2597 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2598
2599 recptr = XLogInsert(RM_HEAP_ID, info);
2600
2601 PageSetLSN(page, recptr);
2602 }
2603
2604 END_CRIT_SECTION();
2605
2606 UnlockReleaseBuffer(buffer);
2607 if (vmbuffer != InvalidBuffer)
2608 ReleaseBuffer(vmbuffer);
2609
2610 /*
2611 * If tuple is cachable, mark it for invalidation from the caches in case
2612 * we abort. Note it is OK to do this after releasing the buffer, because
2613 * the heaptup data structure is all in local memory, not in the shared
2614 * buffer.
2615 */
2616 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2617
2618 /* Note: speculative insertions are counted too, even if aborted later */
2619 pgstat_count_heap_insert(relation, 1);
2620
2621 /*
2622 * If heaptup is a private copy, release it. Don't forget to copy t_self
2623 * back to the caller's image, too.
2624 */
2625 if (heaptup != tup)
2626 {
2627 tup->t_self = heaptup->t_self;
2628 heap_freetuple(heaptup);
2629 }
2630
2631 return HeapTupleGetOid(tup);
2632 }
2633
2634 /*
2635 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2636 * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2637 * Returns a toasted version of the tuple if it was toasted, or the original
2638 * tuple if not. Note that in any case, the header fields are also set in
2639 * the original tuple.
2640 */
2641 static HeapTuple
2642 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2643 CommandId cid, int options)
2644 {
2645 /*
2646 * Parallel operations are required to be strictly read-only in a parallel
2647 * worker. Parallel inserts are not safe even in the leader in the
2648 * general case, because group locking means that heavyweight locks for
2649 * relation extension or GIN page locks will not conflict between members
2650 * of a lock group, but we don't prohibit that case here because there are
2651 * useful special cases that we can safely allow, such as CREATE TABLE AS.
2652 */
2653 if (IsParallelWorker())
2654 ereport(ERROR,
2655 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2656 errmsg("cannot insert tuples in a parallel worker")));
2657
2658 if (relation->rd_rel->relhasoids)
2659 {
2660 #ifdef NOT_USED
2661 /* this is redundant with an Assert in HeapTupleSetOid */
2662 Assert(tup->t_data->t_infomask & HEAP_HASOID);
2663 #endif
2664
2665 /*
2666 * If the object id of this tuple has already been assigned, trust the
2667 * caller. There are a couple of ways this can happen. At initial db
2668 * creation, the backend program sets oids for tuples. When we define
2669 * an index, we set the oid. Finally, in the future, we may allow
2670 * users to set their own object ids in order to support a persistent
2671 * object store (objects need to contain pointers to one another).
2672 */
2673 if (!OidIsValid(HeapTupleGetOid(tup)))
2674 HeapTupleSetOid(tup, GetNewOid(relation));
2675 }
2676 else
2677 {
2678 /* check there is not space for an OID */
2679 Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2680 }
2681
2682 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2683 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2684 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2685 HeapTupleHeaderSetXmin(tup->t_data, xid);
2686 if (options & HEAP_INSERT_FROZEN)
2687 HeapTupleHeaderSetXminFrozen(tup->t_data);
2688
2689 HeapTupleHeaderSetCmin(tup->t_data, cid);
2690 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2691 tup->t_tableOid = RelationGetRelid(relation);
2692
2693 /*
2694 * If the new tuple is too big for storage or contains already toasted
2695 * out-of-line attributes from some other relation, invoke the toaster.
2696 */
2697 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2698 relation->rd_rel->relkind != RELKIND_MATVIEW)
2699 {
2700 /* toast table entries should never be recursively toasted */
2701 Assert(!HeapTupleHasExternal(tup));
2702 return tup;
2703 }
2704 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2705 return toast_insert_or_update(relation, tup, NULL, options);
2706 else
2707 return tup;
2708 }
2709
2710 /*
2711 * heap_multi_insert - insert multiple tuples into a heap
2712 *
2713 * This is like heap_insert(), but inserts multiple tuples in one operation.
2714 * That's faster than calling heap_insert() in a loop, because when multiple
2715 * tuples can be inserted on a single page, we can write just a single WAL
2716 * record covering all of them, and only need to lock/unlock the page once.
2717 *
2718 * Note: this leaks memory into the current memory context. You can create a
2719 * temporary context before calling this, if that's a problem.
2720 */
2721 void
2722 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2723 CommandId cid, int options, BulkInsertState bistate)
2724 {
2725 TransactionId xid = GetCurrentTransactionId();
2726 HeapTuple *heaptuples;
2727 int i;
2728 int ndone;
2729 PGAlignedBlock scratch;
2730 Page page;
2731 bool needwal;
2732 Size saveFreeSpace;
2733 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2734 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2735
2736 /* currently not needed (thus unsupported) for heap_multi_insert() */
2737 AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2738
2739 needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2740 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2741 HEAP_DEFAULT_FILLFACTOR);
2742
2743 /* Toast and set header data in all the tuples */
2744 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2745 for (i = 0; i < ntuples; i++)
2746 heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2747 xid, cid, options);
2748
2749 /*
2750 * We're about to do the actual inserts -- but check for conflict first,
2751 * to minimize the possibility of having to roll back work we've just
2752 * done.
2753 *
2754 * A check here does not definitively prevent a serialization anomaly;
2755 * that check MUST be done at least past the point of acquiring an
2756 * exclusive buffer content lock on every buffer that will be affected,
2757 * and MAY be done after all inserts are reflected in the buffers and
2758 * those locks are released; otherwise there is a race condition. Since
2759 * multiple buffers can be locked and unlocked in the loop below, and it
2760 * would not be feasible to identify and lock all of those buffers before
2761 * the loop, we must do a final check at the end.
2762 *
2763 * The check here could be omitted with no loss of correctness; it is
2764 * present strictly as an optimization.
2765 *
2766 * For heap inserts, we only need to check for table-level SSI locks. Our
2767 * new tuples can't possibly conflict with existing tuple locks, and heap
2768 * page locks are only consolidated versions of tuple locks; they do not
2769 * lock "gaps" as index page locks do. So we don't need to specify a
2770 * buffer when making the call, which makes for a faster check.
2771 */
2772 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2773
2774 ndone = 0;
2775 while (ndone < ntuples)
2776 {
2777 Buffer buffer;
2778 Buffer vmbuffer = InvalidBuffer;
2779 bool all_visible_cleared = false;
2780 int nthispage;
2781
2782 CHECK_FOR_INTERRUPTS();
2783
2784 /*
2785 * Find buffer where at least the next tuple will fit. If the page is
2786 * all-visible, this will also pin the requisite visibility map page.
2787 */
2788 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2789 InvalidBuffer, options, bistate,
2790 &vmbuffer, NULL);
2791 page = BufferGetPage(buffer);
2792
2793 /* NO EREPORT(ERROR) from here till changes are logged */
2794 START_CRIT_SECTION();
2795
2796 /*
2797 * RelationGetBufferForTuple has ensured that the first tuple fits.
2798 * Put that on the page, and then as many other tuples as fit.
2799 */
2800 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2801 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2802 {
2803 HeapTuple heaptup = heaptuples[ndone + nthispage];
2804
2805 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2806 break;
2807
2808 RelationPutHeapTuple(relation, buffer, heaptup, false);
2809
2810 /*
2811 * We don't use heap_multi_insert for catalog tuples yet, but
2812 * better be prepared...
2813 */
2814 if (needwal && need_cids)
2815 log_heap_new_cid(relation, heaptup);
2816 }
2817
2818 if (PageIsAllVisible(page))
2819 {
2820 all_visible_cleared = true;
2821 PageClearAllVisible(page);
2822 visibilitymap_clear(relation,
2823 BufferGetBlockNumber(buffer),
2824 vmbuffer, VISIBILITYMAP_VALID_BITS);
2825 }
2826
2827 /*
2828 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2829 */
2830
2831 MarkBufferDirty(buffer);
2832
2833 /* XLOG stuff */
2834 if (needwal)
2835 {
2836 XLogRecPtr recptr;
2837 xl_heap_multi_insert *xlrec;
2838 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2839 char *tupledata;
2840 int totaldatalen;
2841 char *scratchptr = scratch.data;
2842 bool init;
2843 int bufflags = 0;
2844
2845 /*
2846 * If the page was previously empty, we can reinit the page
2847 * instead of restoring the whole thing.
2848 */
2849 init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2850 PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2851
2852 /* allocate xl_heap_multi_insert struct from the scratch area */
2853 xlrec = (xl_heap_multi_insert *) scratchptr;
2854 scratchptr += SizeOfHeapMultiInsert;
2855
2856 /*
2857 * Allocate the offsets array, unless we're reinitializing the page;
2858 * in that case the tuples are stored in order starting at
2859 * FirstOffsetNumber and we don't need to store the offsets
2860 * explicitly.
2861 */
2862 if (!init)
2863 scratchptr += nthispage * sizeof(OffsetNumber);
2864
2865 /* the rest of the scratch space is used for tuple data */
2866 tupledata = scratchptr;
2867
2868 xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2869 xlrec->ntuples = nthispage;
2870
2871 /*
2872 * Write out an xl_multi_insert_tuple and the tuple data itself
2873 * for each tuple.
2874 */
2875 for (i = 0; i < nthispage; i++)
2876 {
2877 HeapTuple heaptup = heaptuples[ndone + i];
2878 xl_multi_insert_tuple *tuphdr;
2879 int datalen;
2880
2881 if (!init)
2882 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2883 /* xl_multi_insert_tuple needs two-byte alignment. */
2884 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2885 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2886
2887 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2888 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2889 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2890
2891 /* write bitmap [+ padding] [+ oid] + data */
2892 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2893 memcpy(scratchptr,
2894 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2895 datalen);
2896 tuphdr->datalen = datalen;
2897 scratchptr += datalen;
2898 }
2899 totaldatalen = scratchptr - tupledata;
2900 Assert((scratchptr - scratch.data) < BLCKSZ);
2901
2902 if (need_tuple_data)
2903 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2904
2905 /*
2906 * Signal that this is the last xl_heap_multi_insert record
2907 * emitted by this call to heap_multi_insert(). Needed for logical
2908 * decoding so it knows when to clean up temporary data.
2909 */
2910 if (ndone + nthispage == ntuples)
2911 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2912
2913 if (init)
2914 {
2915 info |= XLOG_HEAP_INIT_PAGE;
2916 bufflags |= REGBUF_WILL_INIT;
2917 }
2918
2919 /*
2920 * If we're doing logical decoding, include the new tuple data
2921 * even if we take a full-page image of the page.
2922 */
2923 if (need_tuple_data)
2924 bufflags |= REGBUF_KEEP_DATA;
2925
2926 XLogBeginInsert();
2927 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2928 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2929
2930 XLogRegisterBufData(0, tupledata, totaldatalen);
2931
2932 /* filtering by origin on a row level is much more efficient */
2933 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2934
2935 recptr = XLogInsert(RM_HEAP2_ID, info);
2936
2937 PageSetLSN(page, recptr);
2938 }
2939
2940 END_CRIT_SECTION();
2941
2942 UnlockReleaseBuffer(buffer);
2943 if (vmbuffer != InvalidBuffer)
2944 ReleaseBuffer(vmbuffer);
2945
2946 ndone += nthispage;
2947 }
2948
2949 /*
2950 * We're done with the actual inserts. Check for conflicts again, to
2951 * ensure that all rw-conflicts in to these inserts are detected. Without
2952 * this final check, a sequential scan of the heap may have locked the
2953 * table after the "before" check, missing one opportunity to detect the
2954 * conflict, and then scanned the table before the new tuples were there,
2955 * missing the other chance to detect the conflict.
2956 *
2957 * For heap inserts, we only need to check for table-level SSI locks. Our
2958 * new tuples can't possibly conflict with existing tuple locks, and heap
2959 * page locks are only consolidated versions of tuple locks; they do not
2960 * lock "gaps" as index page locks do. So we don't need to specify a
2961 * buffer when making the call.
2962 */
2963 CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2964
2965 /*
2966 * If tuples are cachable, mark them for invalidation from the caches in
2967 * case we abort. Note it is OK to do this after releasing the buffer,
2968 * because the heaptuples data structure is all in local memory, not in
2969 * the shared buffer.
2970 */
2971 if (IsCatalogRelation(relation))
2972 {
2973 for (i = 0; i < ntuples; i++)
2974 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2975 }
2976
2977 /*
2978 * Copy t_self fields back to the caller's original tuples. This does
2979 * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2980 * probably faster to always copy than check.
2981 */
2982 for (i = 0; i < ntuples; i++)
2983 tuples[i]->t_self = heaptuples[i]->t_self;
2984
2985 pgstat_count_heap_insert(relation, ntuples);
2986 }
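/*
 * Usage sketch (illustration only): a caller such as COPY batches rows and
 * inserts each batch with one call, typically from a short-lived memory
 * context because of the leakage noted above.  "nbatch", "mycid", and the
 * tuple construction are placeholders:
 *
 *		HeapTuple  *batch = palloc(nbatch * sizeof(HeapTuple));
 *		(fill batch[0 .. nbatch-1])
 *		heap_multi_insert(rel, batch, nbatch, mycid, 0, bistate);
 */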
2987
2988 /*
2989 * simple_heap_insert - insert a tuple
2990 *
2991 * Currently, this routine differs from heap_insert only in supplying
2992 * a default command ID and not allowing access to the speedup options.
2993 *
2994 * This should be used rather than using heap_insert directly in most places
2995 * where we are modifying system catalogs.
2996 */
2997 Oid
2998 simple_heap_insert(Relation relation, HeapTuple tup)
2999 {
3000 return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
3001 }
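/*
 * For illustration: code that modifies system catalogs normally goes
 * through CatalogTupleInsert() (see catalog/indexing.c), which wraps this
 * routine and also creates the required catalog index entries; "catrel"
 * and "tup" are placeholders:
 *
 *		CatalogTupleInsert(catrel, tup);
 */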
3002
3003 /*
3004 * Given infomask/infomask2, compute the bits that must be saved in the
3005 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
3006 * xl_heap_lock_updated WAL records.
3007 *
3008 * See fix_infomask_from_infobits.
3009 */
3010 static uint8
3011 compute_infobits(uint16 infomask, uint16 infomask2)
3012 {
3013 return
3014 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
3015 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
3016 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
3017 /* note we ignore HEAP_XMAX_SHR_LOCK here */
3018 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
3019 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
3020 XLHL_KEYS_UPDATED : 0);
3021 }
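/*
 * For example (illustrative): a tuple locked FOR UPDATE by a single
 * transaction has HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_LOCK_ONLY set in its
 * infomask, which this maps to XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY
 * in the WAL record's infobits.
 */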
3022
3023 /*
3024 * Given two versions of the same t_infomask for a tuple, compare them and
3025 * return whether the relevant status for a tuple Xmax has changed. This is
3026 * used after a buffer lock has been released and reacquired: we want to ensure
3027 * that the tuple state continues to be the same it was when we previously
3028 * examined it.
3029 *
3030 * Note the Xmax field itself must be compared separately.
3031 */
3032 static inline bool
3033 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
3034 {
3035 const uint16 interesting =
3036 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
3037
3038 if ((new_infomask & interesting) != (old_infomask & interesting))
3039 return true;
3040
3041 return false;
3042 }
3043
3044 /*
3045 * heap_delete - delete a tuple
3046 *
3047 * NB: do not call this directly unless you are prepared to deal with
3048 * concurrent-update conditions. Use simple_heap_delete instead.
3049 *
3050 * relation - table to be modified (caller must hold suitable lock)
3051 * tid - TID of tuple to be deleted
3052 * cid - delete command ID (used for visibility test, and stored into
3053 * cmax if successful)
3054 * crosscheck - if not InvalidSnapshot, also check tuple against this
3055 * wait - true if should wait for any conflicting update to commit/abort
3056 * hufd - output parameter, filled in failure cases (see below)
3057 * changingPart - true iff the tuple is being moved to another partition
3058 * table due to an update of the partition key. Otherwise, false.
3059 *
3060 * Normal, successful return value is HeapTupleMayBeUpdated, which
3061 * actually means we did delete it. Failure return codes are
3062 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3063 * (the last only possible if wait == false).
3064 *
3065 * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3066 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3067 * (the last only for HeapTupleSelfUpdated, since we
3068 * cannot obtain cmax from a combocid generated by another transaction).
3069 * See comments for struct HeapUpdateFailureData for additional info.
3070 */
3071 HTSU_Result
3072 heap_delete(Relation relation, ItemPointer tid,
3073 CommandId cid, Snapshot crosscheck, bool wait,
3074 HeapUpdateFailureData *hufd, bool changingPart)
3075 {
3076 HTSU_Result result;
3077 TransactionId xid = GetCurrentTransactionId();
3078 ItemId lp;
3079 HeapTupleData tp;
3080 Page page;
3081 BlockNumber block;
3082 Buffer buffer;
3083 Buffer vmbuffer = InvalidBuffer;
3084 TransactionId new_xmax;
3085 uint16 new_infomask,
3086 new_infomask2;
3087 bool have_tuple_lock = false;
3088 bool iscombo;
3089 bool all_visible_cleared = false;
3090 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3091 bool old_key_copied = false;
3092
3093 Assert(ItemPointerIsValid(tid));
3094
3095 /*
3096 * Forbid this during a parallel operation, lest it allocate a combocid.
3097 * Other workers might need that combocid for visibility checks, and we
3098 * have no provision for broadcasting it to them.
3099 */
3100 if (IsInParallelMode())
3101 ereport(ERROR,
3102 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3103 errmsg("cannot delete tuples during a parallel operation")));
3104
3105 block = ItemPointerGetBlockNumber(tid);
3106 buffer = ReadBuffer(relation, block);
3107 page = BufferGetPage(buffer);
3108
3109 /*
3110 * Before locking the buffer, pin the visibility map page if it appears to
3111 * be necessary. Since we haven't got the lock yet, someone else might be
3112 * in the middle of changing this, so we'll need to recheck after we have
3113 * the lock.
3114 */
3115 if (PageIsAllVisible(page))
3116 visibilitymap_pin(relation, block, &vmbuffer);
3117
3118 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3119
3120 /*
3121 * If we didn't pin the visibility map page and the page has become all
3122 * visible while we were busy locking the buffer, we'll have to unlock and
3123 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3124 * unfortunate, but hopefully shouldn't happen often.
3125 */
3126 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3127 {
3128 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3129 visibilitymap_pin(relation, block, &vmbuffer);
3130 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3131 }
3132
3133 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3134 Assert(ItemIdIsNormal(lp));
3135
3136 tp.t_tableOid = RelationGetRelid(relation);
3137 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3138 tp.t_len = ItemIdGetLength(lp);
3139 tp.t_self = *tid;
3140
3141 l1:
3142 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3143
3144 if (result == HeapTupleInvisible)
3145 {
3146 UnlockReleaseBuffer(buffer);
3147 ereport(ERROR,
3148 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3149 errmsg("attempted to delete invisible tuple")));
3150 }
3151 else if (result == HeapTupleBeingUpdated && wait)
3152 {
3153 TransactionId xwait;
3154 uint16 infomask;
3155
3156 /* must copy state data before unlocking buffer */
3157 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3158 infomask = tp.t_data->t_infomask;
3159
3160 /*
3161 * Sleep until concurrent transaction ends -- except when there's a
3162 * single locker and it's our own transaction. Note we don't care
3163 * which lock mode the locker has, because we need the strongest one.
3164 *
3165 * Before sleeping, we need to acquire tuple lock to establish our
3166 * priority for the tuple (see heap_lock_tuple). LockTuple will
3167 * release us when we are next-in-line for the tuple.
3168 *
3169 * If we are forced to "start over" below, we keep the tuple lock;
3170 * this arranges that we stay at the head of the line while rechecking
3171 * tuple state.
3172 */
3173 if (infomask & HEAP_XMAX_IS_MULTI)
3174 {
3175 bool current_is_member = false;
3176
3177 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3178 									LockTupleExclusive, &current_is_member))
3179 {
3180 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3181
3182 /*
3183 * Acquire the lock, if necessary (but skip it when we're
3184 * requesting a lock and already have one; avoids deadlock).
3185 */
3186 if (!current_is_member)
3187 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3188 LockWaitBlock, &have_tuple_lock);
3189
3190 /* wait for multixact */
3191 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3192 relation, &(tp.t_self), XLTW_Delete,
3193 NULL);
3194 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3195
3196 /*
3197 * If xwait had just locked the tuple then some other xact
3198 * could update this tuple before we get to this point. Check
3199 * for xmax change, and start over if so.
3200 */
3201 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3202 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3203 xwait))
3204 goto l1;
3205 }
3206
3207 /*
3208 * You might think the multixact is necessarily done here, but not
3209 * so: it could have surviving members, namely our own xact or
3210 * other subxacts of this backend. It is legal for us to delete
3211 * the tuple in either case, however (the latter case is
3212 * essentially a situation of upgrading our former shared lock to
3213 * exclusive). We don't bother changing the on-disk hint bits
3214 * since we are about to overwrite the xmax altogether.
3215 */
3216 }
3217 else if (!TransactionIdIsCurrentTransactionId(xwait))
3218 {
3219 /*
3220 * Wait for regular transaction to end; but first, acquire tuple
3221 * lock.
3222 */
3223 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3224 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3225 LockWaitBlock, &have_tuple_lock);
3226 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3227 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3228
3229 /*
3230 * xwait is done, but if xwait had just locked the tuple then some
3231 * other xact could update this tuple before we get to this point.
3232 * Check for xmax change, and start over if so.
3233 */
3234 if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3235 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3236 xwait))
3237 goto l1;
3238
3239 /* Otherwise check if it committed or aborted */
3240 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3241 }
3242
3243 /*
3244 * We may overwrite if previous xmax aborted, or if it committed but
3245 * only locked the tuple without updating it.
3246 */
3247 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3248 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3249 HeapTupleHeaderIsOnlyLocked(tp.t_data))
3250 result = HeapTupleMayBeUpdated;
3251 else
3252 result = HeapTupleUpdated;
3253 }
3254
3255 if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3256 {
3257 /* Perform additional check for transaction-snapshot mode RI updates */
3258 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3259 result = HeapTupleUpdated;
3260 }
3261
3262 if (result != HeapTupleMayBeUpdated)
3263 {
3264 Assert(result == HeapTupleSelfUpdated ||
3265 result == HeapTupleUpdated ||
3266 result == HeapTupleBeingUpdated);
3267 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3268 hufd->ctid = tp.t_data->t_ctid;
3269 hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3270 if (result == HeapTupleSelfUpdated)
3271 hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3272 else
3273 hufd->cmax = InvalidCommandId;
3274 UnlockReleaseBuffer(buffer);
3275 if (have_tuple_lock)
3276 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3277 if (vmbuffer != InvalidBuffer)
3278 ReleaseBuffer(vmbuffer);
3279 return result;
3280 }
3281
3282 /*
3283 * We're about to do the actual delete -- check for conflict first, to
3284 * avoid possibly having to roll back work we've just done.
3285 *
3286 * This is safe without a recheck as long as there is no possibility of
3287 * another process scanning the page between this check and the delete
3288 * being visible to the scan (i.e., an exclusive buffer content lock is
3289 * continuously held from this point until the tuple delete is visible).
3290 */
3291 CheckForSerializableConflictIn(relation, &tp, buffer);
3292
3293 /* replace cid with a combo cid if necessary */
3294 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3295
3296 /*
3297 * Compute replica identity tuple before entering the critical section so
3298 * we don't PANIC upon a memory allocation failure.
3299 */
3300 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3301
3302 /*
3303 * If this is the first possibly-multixact-able operation in the current
3304 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3305 * certain that the transaction will never become a member of any older
3306 * MultiXactIds than that. (We have to do this even if we end up just
3307 * using our own TransactionId below, since some other backend could
3308 * incorporate our XID into a MultiXact immediately afterwards.)
3309 */
3310 MultiXactIdSetOldestMember();
3311
3312 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3313 tp.t_data->t_infomask, tp.t_data->t_infomask2,
3314 xid, LockTupleExclusive, true,
3315 &new_xmax, &new_infomask, &new_infomask2);
3316
3317 START_CRIT_SECTION();
3318
3319 /*
3320 * If this transaction commits, the tuple will become DEAD sooner or
3321 * later. Set flag that this page is a candidate for pruning once our xid
3322 * falls below the OldestXmin horizon. If the transaction finally aborts,
3323 * the subsequent page pruning will be a no-op and the hint will be
3324 * cleared.
3325 */
3326 PageSetPrunable(page, xid);
3327
3328 if (PageIsAllVisible(page))
3329 {
3330 all_visible_cleared = true;
3331 PageClearAllVisible(page);
3332 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3333 vmbuffer, VISIBILITYMAP_VALID_BITS);
3334 }
3335
3336 /* store transaction information of xact deleting the tuple */
3337 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3338 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3339 tp.t_data->t_infomask |= new_infomask;
3340 tp.t_data->t_infomask2 |= new_infomask2;
3341 HeapTupleHeaderClearHotUpdated(tp.t_data);
3342 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3343 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3344 /* Make sure there is no forward chain link in t_ctid */
3345 tp.t_data->t_ctid = tp.t_self;
3346
3347 /* Signal that this is actually a move into another partition */
3348 if (changingPart)
3349 HeapTupleHeaderSetMovedPartitions(tp.t_data);
3350
3351 MarkBufferDirty(buffer);
3352
3353 /*
3354 * XLOG stuff
3355 *
3356 * NB: heap_abort_speculative() uses the same xlog record and replay
3357 * routines.
3358 */
3359 if (RelationNeedsWAL(relation))
3360 {
3361 xl_heap_delete xlrec;
3362 xl_heap_header xlhdr;
3363 XLogRecPtr recptr;
3364
3365 		/* For logical decoding we need combocids to properly decode the catalog */
3366 if (RelationIsAccessibleInLogicalDecoding(relation))
3367 log_heap_new_cid(relation, &tp);
3368
3369 xlrec.flags = 0;
3370 if (all_visible_cleared)
3371 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
3372 if (changingPart)
3373 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3374 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3375 tp.t_data->t_infomask2);
3376 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3377 xlrec.xmax = new_xmax;
3378
3379 if (old_key_tuple != NULL)
3380 {
3381 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3382 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3383 else
3384 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3385 }
3386
3387 XLogBeginInsert();
3388 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3389
3390 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3391
3392 /*
3393 * Log replica identity of the deleted tuple if there is one
3394 */
3395 if (old_key_tuple != NULL)
3396 {
3397 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3398 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3399 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3400
3401 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3402 XLogRegisterData((char *) old_key_tuple->t_data
3403 + SizeofHeapTupleHeader,
3404 old_key_tuple->t_len
3405 - SizeofHeapTupleHeader);
3406 }
3407
3408 /* filtering by origin on a row level is much more efficient */
3409 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3410
3411 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3412
3413 PageSetLSN(page, recptr);
3414 }
3415
3416 END_CRIT_SECTION();
3417
3418 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3419
3420 if (vmbuffer != InvalidBuffer)
3421 ReleaseBuffer(vmbuffer);
3422
3423 /*
3424 * If the tuple has toasted out-of-line attributes, we need to delete
3425 * those items too. We have to do this before releasing the buffer
3426 * because we need to look at the contents of the tuple, but it's OK to
3427 * release the content lock on the buffer first.
3428 */
3429 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3430 relation->rd_rel->relkind != RELKIND_MATVIEW)
3431 {
3432 /* toast table entries should never be recursively toasted */
3433 Assert(!HeapTupleHasExternal(&tp));
3434 }
3435 else if (HeapTupleHasExternal(&tp))
3436 toast_delete(relation, &tp, false);
3437
3438 /*
3439 * Mark tuple for invalidation from system caches at next command
3440 * boundary. We have to do this before releasing the buffer because we
3441 * need to look at the contents of the tuple.
3442 */
3443 CacheInvalidateHeapTuple(relation, &tp, NULL);
3444
3445 /* Now we can release the buffer */
3446 ReleaseBuffer(buffer);
3447
3448 /*
3449 * Release the lmgr tuple lock, if we had it.
3450 */
3451 if (have_tuple_lock)
3452 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3453
3454 pgstat_count_heap_delete(relation);
3455
3456 if (old_key_tuple != NULL && old_key_copied)
3457 heap_freetuple(old_key_tuple);
3458
3459 return HeapTupleMayBeUpdated;
3460 }
3461
3462 /*
3463 * simple_heap_delete - delete a tuple
3464 *
3465 * This routine may be used to delete a tuple when concurrent updates of
3466 * the target tuple are not expected (for example, because we have a lock
3467 * on the relation associated with the tuple). Any failure is reported
3468 * via ereport().
3469 */
3470 void
3471 simple_heap_delete(Relation relation, ItemPointer tid)
3472 {
3473 HTSU_Result result;
3474 HeapUpdateFailureData hufd;
3475
3476 result = heap_delete(relation, tid,
3477 GetCurrentCommandId(true), InvalidSnapshot,
3478 true /* wait for commit */ ,
3479 &hufd, false /* changingPart */ );
3480 switch (result)
3481 {
3482 case HeapTupleSelfUpdated:
3483 /* Tuple was already updated in current command? */
3484 elog(ERROR, "tuple already updated by self");
3485 break;
3486
3487 case HeapTupleMayBeUpdated:
3488 /* done successfully */
3489 break;
3490
3491 case HeapTupleUpdated:
3492 elog(ERROR, "tuple concurrently updated");
3493 break;
3494
3495 default:
3496 elog(ERROR, "unrecognized heap_delete status: %u", result);
3497 break;
3498 }
3499 }
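/*
 * Illustrative sketch (not compiled): a typical caller of simple_heap_delete
 * already holds a lock on the relation strong enough to rule out concurrent
 * updates of the target tuple, so any failure is reported via elog/ereport
 * rather than as a result code.  The relation OID and tuple lookup below are
 * hypothetical placeholders.
 *
 *		Relation	rel = heap_open(SomeCatalogRelationId, RowExclusiveLock);
 *		HeapTuple	tup = ...;		// e.g. fetched via a systable scan
 *
 *		simple_heap_delete(rel, &tup->t_self);
 *		CommandCounterIncrement();	// make the deletion visible to later commands
 *		heap_close(rel, RowExclusiveLock);
 */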
3500
3501 /*
3502 * heap_update - replace a tuple
3503 *
3504 * NB: do not call this directly unless you are prepared to deal with
3505 * concurrent-update conditions. Use simple_heap_update instead.
3506 *
3507 * relation - table to be modified (caller must hold suitable lock)
3508 * otid - TID of old tuple to be replaced
3509 * newtup - newly constructed tuple data to store
3510 * cid - update command ID (used for visibility test, and stored into
3511 * cmax/cmin if successful)
3512 * crosscheck - if not InvalidSnapshot, also check old tuple against this
3513 * wait - true if should wait for any conflicting update to commit/abort
3514 * hufd - output parameter, filled in failure cases (see below)
3515 * lockmode - output parameter, filled with lock mode acquired on tuple
3516 *
3517 * Normal, successful return value is HeapTupleMayBeUpdated, which
3518 * actually means we *did* update it. Failure return codes are
3519 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3520 * (the last only possible if wait == false).
3521 *
3522 * On success, the header fields of *newtup are updated to match the new
3523 * stored tuple; in particular, newtup->t_self is set to the TID where the
3524 * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3525 * update was done. However, any TOAST changes in the new tuple's
3526 * data are not reflected into *newtup.
3527 *
3528 * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3529 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3530 * (the last only for HeapTupleSelfUpdated, since we
3531 * cannot obtain cmax from a combocid generated by another transaction).
3532 * See comments for struct HeapUpdateFailureData for additional info.
3533 */
3534 HTSU_Result
3535 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3536 CommandId cid, Snapshot crosscheck, bool wait,
3537 HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3538 {
3539 HTSU_Result result;
3540 TransactionId xid = GetCurrentTransactionId();
3541 Bitmapset *hot_attrs;
3542 Bitmapset *proj_idx_attrs;
3543 Bitmapset *key_attrs;
3544 Bitmapset *id_attrs;
3545 Bitmapset *interesting_attrs;
3546 Bitmapset *modified_attrs;
3547 ItemId lp;
3548 HeapTupleData oldtup;
3549 HeapTuple heaptup;
3550 HeapTuple old_key_tuple = NULL;
3551 bool old_key_copied = false;
3552 Page page;
3553 BlockNumber block;
3554 MultiXactStatus mxact_status;
3555 Buffer buffer,
3556 newbuf,
3557 vmbuffer = InvalidBuffer,
3558 vmbuffer_new = InvalidBuffer;
3559 bool need_toast;
3560 Size newtupsize,
3561 pagefree;
3562 bool have_tuple_lock = false;
3563 bool iscombo;
3564 bool use_hot_update = false;
3565 bool hot_attrs_checked = false;
3566 bool key_intact;
3567 bool all_visible_cleared = false;
3568 bool all_visible_cleared_new = false;
3569 bool checked_lockers;
3570 bool locker_remains;
3571 TransactionId xmax_new_tuple,
3572 xmax_old_tuple;
3573 uint16 infomask_old_tuple,
3574 infomask2_old_tuple,
3575 infomask_new_tuple,
3576 infomask2_new_tuple;
3577
3578 Assert(ItemPointerIsValid(otid));
3579
3580 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3581 Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3582 RelationGetNumberOfAttributes(relation));
3583
3584 /*
3585 * Forbid this during a parallel operation, lest it allocate a combocid.
3586 * Other workers might need that combocid for visibility checks, and we
3587 * have no provision for broadcasting it to them.
3588 */
3589 if (IsInParallelMode())
3590 ereport(ERROR,
3591 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3592 errmsg("cannot update tuples during a parallel operation")));
3593
3594 /*
3595 * Fetch the list of attributes to be checked for various operations.
3596 *
3597 * For HOT considerations, this is wasted effort if we fail to update or
3598 * have to put the new tuple on a different page. But we must compute the
3599 * list before obtaining buffer lock --- in the worst case, if we are
3600 * doing an update on one of the relevant system catalogs, we could
3601 * deadlock if we try to fetch the list later. In any case, the relcache
3602 * caches the data so this is usually pretty cheap.
3603 *
3604 * We also need columns used by the replica identity and columns that are
3605 * considered the "key" of rows in the table.
3606 *
3607 * Note that we get copies of each bitmap, so we need not worry about
3608 * relcache flush happening midway through.
3609 */
3610 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_HOT);
3611 proj_idx_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_PROJ);
3612 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3613 id_attrs = RelationGetIndexAttrBitmap(relation,
3614 INDEX_ATTR_BITMAP_IDENTITY_KEY);
3615 block = ItemPointerGetBlockNumber(otid);
3616 buffer = ReadBuffer(relation, block);
3617 page = BufferGetPage(buffer);
3618
3619 interesting_attrs = NULL;
3620
3621 /*
3622 * If the page is already full, there is hardly any chance of doing a HOT
3623 * update on this page. It might be wasteful effort to look for index
3624 * column updates only to later reject HOT updates for lack of space in
3625 	 * the same page. So we are conservative and only fetch hot_attrs if the
3626 	 * page is not already full. Since we are already holding a pin on the
3627 	 * buffer, there is no chance that the buffer can get cleaned up
3628 	 * concurrently; and even if that were possible, in the worst case we would
3629 	 * only lose a chance to do a HOT update.
3630 */
3631 if (!PageIsFull(page))
3632 {
3633 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3634 interesting_attrs = bms_add_members(interesting_attrs, proj_idx_attrs);
3635 hot_attrs_checked = true;
3636 }
3637 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3638 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3639
3640 /*
3641 * Before locking the buffer, pin the visibility map page if it appears to
3642 * be necessary. Since we haven't got the lock yet, someone else might be
3643 * in the middle of changing this, so we'll need to recheck after we have
3644 * the lock.
3645 */
3646 if (PageIsAllVisible(page))
3647 visibilitymap_pin(relation, block, &vmbuffer);
3648
3649 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3650
3651 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3652 Assert(ItemIdIsNormal(lp));
3653
3654 /*
3655 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3656 * properly.
3657 */
3658 oldtup.t_tableOid = RelationGetRelid(relation);
3659 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3660 oldtup.t_len = ItemIdGetLength(lp);
3661 oldtup.t_self = *otid;
3662
3663 /* the new tuple is ready, except for this: */
3664 newtup->t_tableOid = RelationGetRelid(relation);
3665
3666 /* Fill in OID for newtup */
3667 if (relation->rd_rel->relhasoids)
3668 {
3669 #ifdef NOT_USED
3670 /* this is redundant with an Assert in HeapTupleSetOid */
3671 Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3672 #endif
3673 HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3674 }
3675 else
3676 {
3677 		/* check that there is no space for an OID */
3678 Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3679 }
3680
3681 /* Determine columns modified by the update. */
3682 modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3683 &oldtup, newtup);
3684
3685 /*
3686 * If we're not updating any "key" column, we can grab a weaker lock type.
3687 * This allows for more concurrency when we are running simultaneously
3688 * with foreign key checks.
3689 *
3690 * Note that if a column gets detoasted while executing the update, but
3691 * the value ends up being the same, this test will fail and we will use
3692 * the stronger lock. This is acceptable; the important case to optimize
3693 * is updates that don't manipulate key columns, not those that
3694 	 * serendipitously arrive at the same key values.
3695 */
3696 if (!bms_overlap(modified_attrs, key_attrs))
3697 {
3698 *lockmode = LockTupleNoKeyExclusive;
3699 mxact_status = MultiXactStatusNoKeyUpdate;
3700 key_intact = true;
3701
3702 /*
3703 * If this is the first possibly-multixact-able operation in the
3704 * current transaction, set my per-backend OldestMemberMXactId
3705 * setting. We can be certain that the transaction will never become a
3706 * member of any older MultiXactIds than that. (We have to do this
3707 * even if we end up just using our own TransactionId below, since
3708 * some other backend could incorporate our XID into a MultiXact
3709 * immediately afterwards.)
3710 */
3711 MultiXactIdSetOldestMember();
3712 }
3713 else
3714 {
3715 *lockmode = LockTupleExclusive;
3716 mxact_status = MultiXactStatusUpdate;
3717 key_intact = false;
3718 }
3719
3720 /*
3721 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3722 * otid may very well point at newtup->t_self, which we will overwrite
3723 * with the new tuple's location, so there's great risk of confusion if we
3724 * use otid anymore.
3725 */
3726
3727 l2:
3728 checked_lockers = false;
3729 locker_remains = false;
3730 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3731
3732 /* see below about the "no wait" case */
3733 Assert(result != HeapTupleBeingUpdated || wait);
3734
3735 if (result == HeapTupleInvisible)
3736 {
3737 UnlockReleaseBuffer(buffer);
3738 ereport(ERROR,
3739 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3740 errmsg("attempted to update invisible tuple")));
3741 }
3742 else if (result == HeapTupleBeingUpdated && wait)
3743 {
3744 TransactionId xwait;
3745 uint16 infomask;
3746 bool can_continue = false;
3747
3748 /*
3749 * XXX note that we don't consider the "no wait" case here. This
3750 * isn't a problem currently because no caller uses that case, but it
3751 * should be fixed if such a caller is introduced. It wasn't a
3752 * problem previously because this code would always wait, but now
3753 * that some tuple locks do not conflict with one of the lock modes we
3754 * use, it is possible that this case is interesting to handle
3755 * specially.
3756 *
3757 * This may cause failures with third-party code that calls
3758 * heap_update directly.
3759 */
3760
3761 /* must copy state data before unlocking buffer */
3762 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3763 infomask = oldtup.t_data->t_infomask;
3764
3765 /*
3766 * Now we have to do something about the existing locker. If it's a
3767 * multi, sleep on it; we might be awakened before it is completely
3768 * gone (or even not sleep at all in some cases); we need to preserve
3769 * it as locker, unless it is gone completely.
3770 *
3771 * If it's not a multi, we need to check for sleeping conditions
3772 * before actually going to sleep. If the update doesn't conflict
3773 * with the locks, we just continue without sleeping (but making sure
3774 * it is preserved).
3775 *
3776 * Before sleeping, we need to acquire tuple lock to establish our
3777 * priority for the tuple (see heap_lock_tuple). LockTuple will
3778 * release us when we are next-in-line for the tuple. Note we must
3779 * not acquire the tuple lock until we're sure we're going to sleep;
3780 * otherwise we're open for race conditions with other transactions
3781 * holding the tuple lock which sleep on us.
3782 *
3783 * If we are forced to "start over" below, we keep the tuple lock;
3784 * this arranges that we stay at the head of the line while rechecking
3785 * tuple state.
3786 */
3787 if (infomask & HEAP_XMAX_IS_MULTI)
3788 {
3789 TransactionId update_xact;
3790 int remain;
3791 bool current_is_member = false;
3792
3793 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3794 										*lockmode, &current_is_member))
3795 {
3796 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3797
3798 /*
3799 * Acquire the lock, if necessary (but skip it when we're
3800 * requesting a lock and already have one; avoids deadlock).
3801 */
3802 if (!current_is_member)
3803 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3804 LockWaitBlock, &have_tuple_lock);
3805
3806 /* wait for multixact */
3807 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3808 relation, &oldtup.t_self, XLTW_Update,
3809 &remain);
3810 checked_lockers = true;
3811 locker_remains = remain != 0;
3812 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3813
3814 /*
3815 * If xwait had just locked the tuple then some other xact
3816 * could update this tuple before we get to this point. Check
3817 * for xmax change, and start over if so.
3818 */
3819 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3820 infomask) ||
3821 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3822 xwait))
3823 goto l2;
3824 }
3825
3826 /*
3827 * Note that the multixact may not be done by now. It could have
3828 * surviving members; our own xact or other subxacts of this
3829 * backend, and also any other concurrent transaction that locked
3830 * the tuple with LockTupleKeyShare if we only got
3831 * LockTupleNoKeyExclusive. If this is the case, we have to be
3832 * careful to mark the updated tuple with the surviving members in
3833 * Xmax.
3834 *
3835 * Note that there could have been another update in the
3836 * MultiXact. In that case, we need to check whether it committed
3837 * or aborted. If it aborted we are safe to update it again;
3838 * otherwise there is an update conflict, and we have to return
3839 * HeapTupleUpdated below.
3840 *
3841 * In the LockTupleExclusive case, we still need to preserve the
3842 * surviving members: those would include the tuple locks we had
3843 * before this one, which are important to keep in case this
3844 * subxact aborts.
3845 */
3846 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3847 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3848 else
3849 update_xact = InvalidTransactionId;
3850
3851 /*
3852 * There was no UPDATE in the MultiXact; or it aborted. No
3853 * TransactionIdIsInProgress() call needed here, since we called
3854 * MultiXactIdWait() above.
3855 */
3856 if (!TransactionIdIsValid(update_xact) ||
3857 TransactionIdDidAbort(update_xact))
3858 can_continue = true;
3859 }
3860 else if (TransactionIdIsCurrentTransactionId(xwait))
3861 {
3862 /*
3863 * The only locker is ourselves; we can avoid grabbing the tuple
3864 * lock here, but must preserve our locking information.
3865 */
3866 checked_lockers = true;
3867 locker_remains = true;
3868 can_continue = true;
3869 }
3870 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3871 {
3872 /*
3873 * If it's just a key-share locker, and we're not changing the key
3874 * columns, we don't need to wait for it to end; but we need to
3875 * preserve it as locker.
3876 */
3877 checked_lockers = true;
3878 locker_remains = true;
3879 can_continue = true;
3880 }
3881 else
3882 {
3883 /*
3884 * Wait for regular transaction to end; but first, acquire tuple
3885 * lock.
3886 */
3887 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3888 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3889 LockWaitBlock, &have_tuple_lock);
3890 XactLockTableWait(xwait, relation, &oldtup.t_self,
3891 XLTW_Update);
3892 checked_lockers = true;
3893 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3894
3895 /*
3896 * xwait is done, but if xwait had just locked the tuple then some
3897 * other xact could update this tuple before we get to this point.
3898 * Check for xmax change, and start over if so.
3899 */
3900 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3901 !TransactionIdEquals(xwait,
3902 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3903 goto l2;
3904
3905 /* Otherwise check if it committed or aborted */
3906 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3907 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3908 can_continue = true;
3909 }
3910
3911 result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3912 }
3913
3914 if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3915 {
3916 /* Perform additional check for transaction-snapshot mode RI updates */
3917 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3918 result = HeapTupleUpdated;
3919 }
3920
3921 if (result != HeapTupleMayBeUpdated)
3922 {
3923 Assert(result == HeapTupleSelfUpdated ||
3924 result == HeapTupleUpdated ||
3925 result == HeapTupleBeingUpdated);
3926 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3927 hufd->ctid = oldtup.t_data->t_ctid;
3928 hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3929 if (result == HeapTupleSelfUpdated)
3930 hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3931 else
3932 hufd->cmax = InvalidCommandId;
3933 UnlockReleaseBuffer(buffer);
3934 if (have_tuple_lock)
3935 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3936 if (vmbuffer != InvalidBuffer)
3937 ReleaseBuffer(vmbuffer);
3938 bms_free(hot_attrs);
3939 bms_free(proj_idx_attrs);
3940 bms_free(key_attrs);
3941 bms_free(id_attrs);
3942 bms_free(modified_attrs);
3943 bms_free(interesting_attrs);
3944 return result;
3945 }
3946
3947 /*
3948 * If we didn't pin the visibility map page and the page has become all
3949 * visible while we were busy locking the buffer, or during some
3950 * subsequent window during which we had it unlocked, we'll have to unlock
3951 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3952 * bit unfortunate, especially since we'll now have to recheck whether the
3953 * tuple has been locked or updated under us, but hopefully it won't
3954 * happen very often.
3955 */
3956 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3957 {
3958 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3959 visibilitymap_pin(relation, block, &vmbuffer);
3960 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3961 goto l2;
3962 }
3963
3964 /* Fill in transaction status data */
3965
3966 /*
3967 * If the tuple we're updating is locked, we need to preserve the locking
3968 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3969 */
3970 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3971 oldtup.t_data->t_infomask,
3972 oldtup.t_data->t_infomask2,
3973 xid, *lockmode, true,
3974 &xmax_old_tuple, &infomask_old_tuple,
3975 &infomask2_old_tuple);
3976
3977 /*
3978 * And also prepare an Xmax value for the new copy of the tuple. If there
3979 * was no xmax previously, or there was one but all lockers are now gone,
3980 * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3981 * rare cases that might also be InvalidXid and yet not have the
3982 * HEAP_XMAX_INVALID bit set; that's fine.)
3983 */
3984 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3985 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3986 (checked_lockers && !locker_remains))
3987 xmax_new_tuple = InvalidTransactionId;
3988 else
3989 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3990
3991 if (!TransactionIdIsValid(xmax_new_tuple))
3992 {
3993 infomask_new_tuple = HEAP_XMAX_INVALID;
3994 infomask2_new_tuple = 0;
3995 }
3996 else
3997 {
3998 /*
3999 * If we found a valid Xmax for the new tuple, then the infomask bits
4000 * to use on the new tuple depend on what was there on the old one.
4001 * Note that since we're doing an update, the only possibility is that
4002 * the lockers had FOR KEY SHARE lock.
4003 */
4004 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
4005 {
4006 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
4007 &infomask2_new_tuple);
4008 }
4009 else
4010 {
4011 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
4012 infomask2_new_tuple = 0;
4013 }
4014 }
4015
4016 /*
4017 * Prepare the new tuple with the appropriate initial values of Xmin and
4018 * Xmax, as well as initial infomask bits as computed above.
4019 */
4020 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
4021 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
4022 HeapTupleHeaderSetXmin(newtup->t_data, xid);
4023 HeapTupleHeaderSetCmin(newtup->t_data, cid);
4024 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
4025 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
4026 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
4027
4028 /*
4029 * Replace cid with a combo cid if necessary. Note that we already put
4030 * the plain cid into the new tuple.
4031 */
4032 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
4033
4034 /*
4035 * If the toaster needs to be activated, OR if the new tuple will not fit
4036 * on the same page as the old, then we need to release the content lock
4037 * (but not the pin!) on the old tuple's buffer while we are off doing
4038 * TOAST and/or table-file-extension work. We must mark the old tuple to
4039 * show that it's locked, else other processes may try to update it
4040 * themselves.
4041 *
4042 * We need to invoke the toaster if there are already any out-of-line
4043 * toasted values present, or if the new tuple is over-threshold.
4044 */
4045 if (relation->rd_rel->relkind != RELKIND_RELATION &&
4046 relation->rd_rel->relkind != RELKIND_MATVIEW)
4047 {
4048 /* toast table entries should never be recursively toasted */
4049 Assert(!HeapTupleHasExternal(&oldtup));
4050 Assert(!HeapTupleHasExternal(newtup));
4051 need_toast = false;
4052 }
4053 else
4054 need_toast = (HeapTupleHasExternal(&oldtup) ||
4055 HeapTupleHasExternal(newtup) ||
4056 newtup->t_len > TOAST_TUPLE_THRESHOLD);
4057
4058 pagefree = PageGetHeapFreeSpace(page);
4059
4060 newtupsize = MAXALIGN(newtup->t_len);
4061
4062 if (need_toast || newtupsize > pagefree)
4063 {
4064 TransactionId xmax_lock_old_tuple;
4065 uint16 infomask_lock_old_tuple,
4066 infomask2_lock_old_tuple;
4067 bool cleared_all_frozen = false;
4068
4069 /*
4070 * To prevent concurrent sessions from updating the tuple, we have to
4071 * temporarily mark it locked, while we release the page-level lock.
4072 *
4073 * To satisfy the rule that any xid potentially appearing in a buffer
4074 		 * written out to disk must be covered by WAL, we unfortunately have to
4075 		 * WAL log this temporary modification.  We can reuse xl_heap_lock for
4076 		 * this purpose.  If we crash/error before following through with the
4077 * actual update, xmax will be of an aborted transaction, allowing
4078 * other sessions to proceed.
4079 */
4080
4081 /*
4082 * Compute xmax / infomask appropriate for locking the tuple. This has
4083 * to be done separately from the combo that's going to be used for
4084 * updating, because the potentially created multixact would otherwise
4085 * be wrong.
4086 */
4087 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4088 oldtup.t_data->t_infomask,
4089 oldtup.t_data->t_infomask2,
4090 xid, *lockmode, false,
4091 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4092 &infomask2_lock_old_tuple);
4093
4094 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4095
4096 START_CRIT_SECTION();
4097
4098 /* Clear obsolete visibility flags ... */
4099 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4100 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4101 HeapTupleClearHotUpdated(&oldtup);
4102 /* ... and store info about transaction updating this tuple */
4103 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4104 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4105 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4106 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4107 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4108
4109 /* temporarily make it look not-updated, but locked */
4110 oldtup.t_data->t_ctid = oldtup.t_self;
4111
4112 /*
4113 * Clear all-frozen bit on visibility map if needed. We could
4114 * immediately reset ALL_VISIBLE, but given that the WAL logging
4115 * overhead would be unchanged, that doesn't seem necessarily
4116 * worthwhile.
4117 */
4118 if (PageIsAllVisible(BufferGetPage(buffer)) &&
4119 visibilitymap_clear(relation, block, vmbuffer,
4120 VISIBILITYMAP_ALL_FROZEN))
4121 cleared_all_frozen = true;
4122
4123 MarkBufferDirty(buffer);
4124
4125 if (RelationNeedsWAL(relation))
4126 {
4127 xl_heap_lock xlrec;
4128 XLogRecPtr recptr;
4129
4130 XLogBeginInsert();
4131 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4132
4133 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4134 xlrec.locking_xid = xmax_lock_old_tuple;
4135 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4136 oldtup.t_data->t_infomask2);
4137 xlrec.flags =
4138 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4139 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4140 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4141 PageSetLSN(page, recptr);
4142 }
4143
4144 END_CRIT_SECTION();
4145
4146 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4147
4148 /*
4149 * Let the toaster do its thing, if needed.
4150 *
4151 * Note: below this point, heaptup is the data we actually intend to
4152 * store into the relation; newtup is the caller's original untoasted
4153 * data.
4154 */
4155 if (need_toast)
4156 {
4157 /* Note we always use WAL and FSM during updates */
4158 heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4159 newtupsize = MAXALIGN(heaptup->t_len);
4160 }
4161 else
4162 heaptup = newtup;
4163
4164 /*
4165 * Now, do we need a new page for the tuple, or not? This is a bit
4166 * tricky since someone else could have added tuples to the page while
4167 * we weren't looking. We have to recheck the available space after
4168 * reacquiring the buffer lock. But don't bother to do that if the
4169 * former amount of free space is still not enough; it's unlikely
4170 * there's more free now than before.
4171 *
4172 * What's more, if we need to get a new page, we will need to acquire
4173 * buffer locks on both old and new pages. To avoid deadlock against
4174 * some other backend trying to get the same two locks in the other
4175 * order, we must be consistent about the order we get the locks in.
4176 * We use the rule "lock the lower-numbered page of the relation
4177 * first". To implement this, we must do RelationGetBufferForTuple
4178 * while not holding the lock on the old page, and we must rely on it
4179 * to get the locks on both pages in the correct order.
4180 */
4181 if (newtupsize > pagefree)
4182 {
4183 /* Assume there's no chance to put heaptup on same page. */
4184 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4185 buffer, 0, NULL,
4186 &vmbuffer_new, &vmbuffer);
4187 }
4188 else
4189 {
4190 /* Re-acquire the lock on the old tuple's page. */
4191 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4192 /* Re-check using the up-to-date free space */
4193 pagefree = PageGetHeapFreeSpace(page);
4194 if (newtupsize > pagefree)
4195 {
4196 /*
4197 * Rats, it doesn't fit anymore. We must now unlock and
4198 * relock to avoid deadlock. Fortunately, this path should
4199 * seldom be taken.
4200 */
4201 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4202 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4203 buffer, 0, NULL,
4204 &vmbuffer_new, &vmbuffer);
4205 }
4206 else
4207 {
4208 /* OK, it fits here, so we're done. */
4209 newbuf = buffer;
4210 }
4211 }
4212 }
4213 else
4214 {
4215 /* No TOAST work needed, and it'll fit on same page */
4216 newbuf = buffer;
4217 heaptup = newtup;
4218 }
4219
4220 /*
4221 * We're about to do the actual update -- check for conflict first, to
4222 * avoid possibly having to roll back work we've just done.
4223 *
4224 * This is safe without a recheck as long as there is no possibility of
4225 * another process scanning the pages between this check and the update
4226 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4227 * continuously held from this point until the tuple update is visible).
4228 *
4229 * For the new tuple the only check needed is at the relation level, but
4230 * since both tuples are in the same relation and the check for oldtup
4231 * will include checking the relation level, there is no benefit to a
4232 * separate check for the new tuple.
4233 */
4234 CheckForSerializableConflictIn(relation, &oldtup, buffer);
4235
4236 /*
4237 * At this point newbuf and buffer are both pinned and locked, and newbuf
4238 * has enough space for the new tuple. If they are the same buffer, only
4239 * one pin is held.
4240 */
4241
4242 if (newbuf == buffer)
4243 {
4244 /*
4245 * Since the new tuple is going into the same page, we might be able
4246 * to do a HOT update. A HOT update is possible if none of the index
4247 * columns, nor columns used in projection functional indexes, have
4248 * changed. If the page was already full, we may have skipped
4249 * checking for index columns, and also can't do a HOT update.
4250 */
4251 if (hot_attrs_checked
4252 && !bms_overlap(modified_attrs, hot_attrs)
4253 && (!bms_overlap(modified_attrs, proj_idx_attrs)
4254 || ProjIndexIsUnchanged(relation, &oldtup, newtup)))
4255 {
4256 use_hot_update = true;
4257 }
4258 }
4259 else
4260 {
4261 /* Set a hint that the old page could use prune/defrag */
4262 PageSetFull(page);
4263 }
4264
4265 /*
4266 * Compute replica identity tuple before entering the critical section so
4267 * we don't PANIC upon a memory allocation failure.
4268 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4269 * logged.
4270 */
4271 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4272 bms_overlap(modified_attrs, id_attrs),
4273 &old_key_copied);
4274
4275 /* NO EREPORT(ERROR) from here till changes are logged */
4276 START_CRIT_SECTION();
4277
4278 /*
4279 * If this transaction commits, the old tuple will become DEAD sooner or
4280 * later. Set flag that this page is a candidate for pruning once our xid
4281 * falls below the OldestXmin horizon. If the transaction finally aborts,
4282 * the subsequent page pruning will be a no-op and the hint will be
4283 * cleared.
4284 *
4285 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4286 * there would be a prunable tuple in the newbuf; but for now we choose
4287 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4288 * sync if this decision changes.
4289 */
4290 PageSetPrunable(page, xid);
4291
4292 if (use_hot_update)
4293 {
4294 /* Mark the old tuple as HOT-updated */
4295 HeapTupleSetHotUpdated(&oldtup);
4296 /* And mark the new tuple as heap-only */
4297 HeapTupleSetHeapOnly(heaptup);
4298 /* Mark the caller's copy too, in case different from heaptup */
4299 HeapTupleSetHeapOnly(newtup);
4300 }
4301 else
4302 {
4303 /* Make sure tuples are correctly marked as not-HOT */
4304 HeapTupleClearHotUpdated(&oldtup);
4305 HeapTupleClearHeapOnly(heaptup);
4306 HeapTupleClearHeapOnly(newtup);
4307 }
4308
4309 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4310
4311
4312 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4313 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4314 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4315 /* ... and store info about transaction updating this tuple */
4316 Assert(TransactionIdIsValid(xmax_old_tuple));
4317 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4318 oldtup.t_data->t_infomask |= infomask_old_tuple;
4319 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4320 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4321
4322 /* record address of new tuple in t_ctid of old one */
4323 oldtup.t_data->t_ctid = heaptup->t_self;
4324
4325 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4326 if (PageIsAllVisible(BufferGetPage(buffer)))
4327 {
4328 all_visible_cleared = true;
4329 PageClearAllVisible(BufferGetPage(buffer));
4330 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4331 vmbuffer, VISIBILITYMAP_VALID_BITS);
4332 }
4333 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4334 {
4335 all_visible_cleared_new = true;
4336 PageClearAllVisible(BufferGetPage(newbuf));
4337 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4338 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4339 }
4340
4341 if (newbuf != buffer)
4342 MarkBufferDirty(newbuf);
4343 MarkBufferDirty(buffer);
4344
4345 /* XLOG stuff */
4346 if (RelationNeedsWAL(relation))
4347 {
4348 XLogRecPtr recptr;
4349
4350 /*
4351 * For logical decoding we need combocids to properly decode the
4352 * catalog.
4353 */
4354 if (RelationIsAccessibleInLogicalDecoding(relation))
4355 {
4356 log_heap_new_cid(relation, &oldtup);
4357 log_heap_new_cid(relation, heaptup);
4358 }
4359
4360 recptr = log_heap_update(relation, buffer,
4361 newbuf, &oldtup, heaptup,
4362 old_key_tuple,
4363 all_visible_cleared,
4364 all_visible_cleared_new);
4365 if (newbuf != buffer)
4366 {
4367 PageSetLSN(BufferGetPage(newbuf), recptr);
4368 }
4369 PageSetLSN(BufferGetPage(buffer), recptr);
4370 }
4371
4372 END_CRIT_SECTION();
4373
4374 if (newbuf != buffer)
4375 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4376 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4377
4378 /*
4379 * Mark old tuple for invalidation from system caches at next command
4380 * boundary, and mark the new tuple for invalidation in case we abort. We
4381 * have to do this before releasing the buffer because oldtup is in the
4382 * buffer. (heaptup is all in local memory, but it's necessary to process
4383 * both tuple versions in one call to inval.c so we can avoid redundant
4384 * sinval messages.)
4385 */
4386 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4387
4388 /* Now we can release the buffer(s) */
4389 if (newbuf != buffer)
4390 ReleaseBuffer(newbuf);
4391 ReleaseBuffer(buffer);
4392 if (BufferIsValid(vmbuffer_new))
4393 ReleaseBuffer(vmbuffer_new);
4394 if (BufferIsValid(vmbuffer))
4395 ReleaseBuffer(vmbuffer);
4396
4397 /*
4398 * Release the lmgr tuple lock, if we had it.
4399 */
4400 if (have_tuple_lock)
4401 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4402
4403 pgstat_count_heap_update(relation, use_hot_update);
4404
4405 /*
4406 * If heaptup is a private copy, release it. Don't forget to copy t_self
4407 * back to the caller's image, too.
4408 */
4409 if (heaptup != newtup)
4410 {
4411 newtup->t_self = heaptup->t_self;
4412 heap_freetuple(heaptup);
4413 }
4414
4415 if (old_key_tuple != NULL && old_key_copied)
4416 heap_freetuple(old_key_tuple);
4417
4418 bms_free(hot_attrs);
4419 bms_free(proj_idx_attrs);
4420 bms_free(key_attrs);
4421 bms_free(id_attrs);
4422 bms_free(modified_attrs);
4423 bms_free(interesting_attrs);
4424
4425 return HeapTupleMayBeUpdated;
4426 }
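/*
 * Illustrative sketch (not compiled): a caller that is prepared to handle
 * concurrent-update conditions might invoke heap_update directly and act on
 * the result codes documented in the header comment above.  The variables
 * rel, otid and newtup are assumed to be set up by the caller; EvalPlanQual-
 * style retry logic is omitted.
 *
 *		HeapUpdateFailureData hufd;
 *		LockTupleMode	lockmode;
 *		HTSU_Result		result;
 *
 *		result = heap_update(rel, otid, newtup,
 *							 GetCurrentCommandId(true), InvalidSnapshot,
 *							 true,		// wait for conflicting xacts
 *							 &hufd, &lockmode);
 *		switch (result)
 *		{
 *			case HeapTupleMayBeUpdated:
 *				break;		// success; newtup->t_self is the new tuple's TID
 *			case HeapTupleSelfUpdated:
 *				// already updated by the current command; usually an error
 *				break;
 *			case HeapTupleUpdated:
 *				// concurrent update: hufd.ctid and hufd.xmax identify the
 *				// newer tuple version to follow or to report a conflict on
 *				break;
 *			default:
 *				elog(ERROR, "unrecognized heap_update status: %u", result);
 *		}
 */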
4427
4428 /*
4429 * Check if the specified attribute's value is same in both given tuples.
4430 * Subroutine for HeapDetermineModifiedColumns.
4431 */
4432 static bool
4433 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4434 HeapTuple tup1, HeapTuple tup2)
4435 {
4436 Datum value1,
4437 value2;
4438 bool isnull1,
4439 isnull2;
4440 Form_pg_attribute att;
4441
4442 /*
4443 * If it's a whole-tuple reference, say "not equal". It's not really
4444 * worth supporting this case, since it could only succeed after a no-op
4445 * update, which is hardly a case worth optimizing for.
4446 */
4447 if (attrnum == 0)
4448 return false;
4449
4450 /*
4451 * Likewise, automatically say "not equal" for any system attribute other
4452 * than OID and tableOID; we cannot expect these to be consistent in a HOT
4453 * chain, or even to be set correctly yet in the new tuple.
4454 */
4455 if (attrnum < 0)
4456 {
4457 if (attrnum != ObjectIdAttributeNumber &&
4458 attrnum != TableOidAttributeNumber)
4459 return false;
4460 }
4461
4462 /*
4463 * Extract the corresponding values. XXX this is pretty inefficient if
4464 * there are many indexed columns. Should HeapDetermineModifiedColumns do
4465 * a single heap_deform_tuple call on each tuple, instead? But that
4466 * doesn't work for system columns ...
4467 */
4468 value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4469 value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4470
4471 /*
4472 	 * If one value is NULL and the other is not, then they are certainly not
4473 	 * equal.
4474 */
4475 if (isnull1 != isnull2)
4476 return false;
4477
4478 /*
4479 * If both are NULL, they can be considered equal.
4480 */
4481 if (isnull1)
4482 return true;
4483
4484 /*
4485 * We do simple binary comparison of the two datums. This may be overly
4486 * strict because there can be multiple binary representations for the
4487 * same logical value. But we should be OK as long as there are no false
4488 * positives. Using a type-specific equality operator is messy because
4489 * there could be multiple notions of equality in different operator
4490 * classes; furthermore, we cannot safely invoke user-defined functions
4491 * while holding exclusive buffer lock.
4492 */
4493 if (attrnum <= 0)
4494 {
4495 /* The only allowed system columns are OIDs, so do this */
4496 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4497 }
4498 else
4499 {
4500 Assert(attrnum <= tupdesc->natts);
4501 att = TupleDescAttr(tupdesc, attrnum - 1);
4502 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4503 }
4504 }
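/*
 * Illustrative sketch (not compiled): the comparison above is purely binary,
 * so logically equal values can still be reported as "not equal", which is
 * the safe direction (it merely disables a HOT update).  A hypothetical
 * varlena example:
 *
 *		Datum	a = PointerGetDatum(cstring_to_text("abc"));	// plain, uncompressed
 *		Datum	b = ...;	// same logical value, but stored compressed/toasted
 *
 *		// datumIsEqual(a, b, false, -1) may return false even though the
 *		// detoasted contents are identical; heap_tuple_attr_equals then
 *		// simply treats the column as modified.
 */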
4505
4506 /*
4507  * Check whether the values computed by projection (expression) indexes are
4508  * unchanged by an update.  Compare the old and new values of each indexed
4509  * expression to decide whether a HOT update can be used.
4510 */
4511 static bool
4512 ProjIndexIsUnchanged(Relation relation, HeapTuple oldtup, HeapTuple newtup)
4513 {
4514 ListCell *l;
4515 List *indexoidlist = RelationGetIndexList(relation);
4516 EState *estate = CreateExecutorState();
4517 ExprContext *econtext = GetPerTupleExprContext(estate);
4518 TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(relation));
4519 bool equals = true;
4520 Datum old_values[INDEX_MAX_KEYS];
4521 bool old_isnull[INDEX_MAX_KEYS];
4522 Datum new_values[INDEX_MAX_KEYS];
4523 bool new_isnull[INDEX_MAX_KEYS];
4524 int indexno = 0;
4525
4526 econtext->ecxt_scantuple = slot;
4527
4528 foreach(l, indexoidlist)
4529 {
4530 if (bms_is_member(indexno, relation->rd_projidx))
4531 {
4532 Oid indexOid = lfirst_oid(l);
4533 Relation indexDesc = index_open(indexOid, AccessShareLock);
4534 IndexInfo *indexInfo = BuildIndexInfo(indexDesc);
4535 int i;
4536
4537 ResetExprContext(econtext);
4538 ExecStoreTuple(oldtup, slot, InvalidBuffer, false);
4539 FormIndexDatum(indexInfo,
4540 slot,
4541 estate,
4542 old_values,
4543 old_isnull);
4544
4545 ExecStoreTuple(newtup, slot, InvalidBuffer, false);
4546 FormIndexDatum(indexInfo,
4547 slot,
4548 estate,
4549 new_values,
4550 new_isnull);
4551
4552 for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
4553 {
4554 if (old_isnull[i] != new_isnull[i])
4555 {
4556 equals = false;
4557 break;
4558 }
4559 else if (!old_isnull[i])
4560 {
4561 Form_pg_attribute att = TupleDescAttr(RelationGetDescr(indexDesc), i);
4562
4563 if (!datumIsEqual(old_values[i], new_values[i], att->attbyval, att->attlen))
4564 {
4565 equals = false;
4566 break;
4567 }
4568 }
4569 }
4570 index_close(indexDesc, AccessShareLock);
4571
4572 if (!equals)
4573 {
4574 break;
4575 }
4576 }
4577 indexno += 1;
4578 }
4579 ExecDropSingleTupleTableSlot(slot);
4580 FreeExecutorState(estate);
4581
4582 return equals;
4583 }
4584
4585
4586 /*
4587 * Check which columns are being updated.
4588 *
4589 * Given an updated tuple, determine (and return into the output bitmapset),
4590 * from those listed as interesting, the set of columns that changed.
4591 *
4592 * The input bitmapset is destructively modified; that is OK since this is
4593 * invoked at most once in heap_update.
4594 */
4595 static Bitmapset *
4596 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4597 HeapTuple oldtup, HeapTuple newtup)
4598 {
4599 int attnum;
4600 Bitmapset *modified = NULL;
4601
4602 while ((attnum = bms_first_member(interesting_cols)) >= 0)
4603 {
4604 attnum += FirstLowInvalidHeapAttributeNumber;
4605
4606 if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4607 attnum, oldtup, newtup))
4608 modified = bms_add_member(modified,
4609 attnum - FirstLowInvalidHeapAttributeNumber);
4610 }
4611
4612 return modified;
4613 }
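/*
 * Illustrative sketch (not compiled): the column bitmapsets used here store
 * attribute numbers offset by FirstLowInvalidHeapAttributeNumber, so that
 * system attributes (which have negative attnums) can be represented.  For
 * example, to test whether user column 3 of some hypothetical relation was
 * changed by the update:
 *
 *		if (bms_is_member(3 - FirstLowInvalidHeapAttributeNumber,
 *						  modified_attrs))
 *			... column 3 differs between oldtup and newtup ...
 */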
4614
4615 /*
4616 * simple_heap_update - replace a tuple
4617 *
4618 * This routine may be used to update a tuple when concurrent updates of
4619 * the target tuple are not expected (for example, because we have a lock
4620 * on the relation associated with the tuple). Any failure is reported
4621 * via ereport().
4622 */
4623 void
4624 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4625 {
4626 HTSU_Result result;
4627 HeapUpdateFailureData hufd;
4628 LockTupleMode lockmode;
4629
4630 result = heap_update(relation, otid, tup,
4631 GetCurrentCommandId(true), InvalidSnapshot,
4632 true /* wait for commit */ ,
4633 &hufd, &lockmode);
4634 switch (result)
4635 {
4636 case HeapTupleSelfUpdated:
4637 /* Tuple was already updated in current command? */
4638 elog(ERROR, "tuple already updated by self");
4639 break;
4640
4641 case HeapTupleMayBeUpdated:
4642 /* done successfully */
4643 break;
4644
4645 case HeapTupleUpdated:
4646 elog(ERROR, "tuple concurrently updated");
4647 break;
4648
4649 default:
4650 elog(ERROR, "unrecognized heap_update status: %u", result);
4651 break;
4652 }
4653 }
4654
4655
4656 /*
4657 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4658 */
4659 static MultiXactStatus
4660 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4661 {
4662 int retval;
4663
4664 if (is_update)
4665 retval = tupleLockExtraInfo[mode].updstatus;
4666 else
4667 retval = tupleLockExtraInfo[mode].lockstatus;
4668
4669 if (retval == -1)
4670 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4671 is_update ? "true" : "false");
4672
4673 return (MultiXactStatus) retval;
4674 }
4675
4676 /*
4677 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4678 *
4679 * Note that this acquires a buffer pin, which the caller must release.
4680 *
4681 * Input parameters:
4682 * relation: relation containing tuple (caller must hold suitable lock)
4683 * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4684 * cid: current command ID (used for visibility test, and stored into
4685 * tuple's cmax if lock is successful)
4686 * mode: indicates if shared or exclusive tuple lock is desired
4687 * wait_policy: what to do if tuple lock is not available
4688 * follow_updates: if true, follow the update chain to also lock descendant
4689 * tuples.
4690 *
4691 * Output parameters:
4692 * *tuple: all fields filled in
4693 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4694 * *hufd: filled in failure cases (see below)
4695 *
4696 * Function result may be:
4697 * HeapTupleMayBeUpdated: lock was successfully acquired
4698 * HeapTupleInvisible: lock failed because tuple was never visible to us
4699 * HeapTupleSelfUpdated: lock failed because tuple updated by self
4700 * HeapTupleUpdated: lock failed because tuple updated by other xact
4701 * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4702 *
4703 * In the failure cases other than HeapTupleInvisible, the routine fills
4704 * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4705 * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4706 * since we cannot obtain cmax from a combocid generated by another
4707 * transaction).
4708 * See comments for struct HeapUpdateFailureData for additional info.
4709 *
4710 * See README.tuplock for a thorough explanation of this mechanism.
4711 */
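/*
 * Illustrative sketch (not compiled): a row-locking caller (for example, the
 * executor implementing SELECT ... FOR UPDATE) might use this function
 * roughly as follows.  Variable names are hypothetical, and the handling of
 * failure codes (such as chasing hufd.ctid to a newer row version) is
 * omitted.
 *
 *		HeapTupleData	tuple;
 *		Buffer			buffer;
 *		HeapUpdateFailureData hufd;
 *		HTSU_Result		test;
 *
 *		tuple.t_self = *tid;	// TID of the row version to lock
 *		test = heap_lock_tuple(relation, &tuple,
 *							   GetCurrentCommandId(true), LockTupleExclusive,
 *							   LockWaitBlock, true, &buffer, &hufd);
 *		ReleaseBuffer(buffer);	// the tuple's buffer comes back pinned
 *
 *		if (test != HeapTupleMayBeUpdated)
 *			... handle HeapTupleUpdated, HeapTupleSelfUpdated, etc. ...
 */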
4712 HTSU_Result
4713 heap_lock_tuple(Relation relation, HeapTuple tuple,
4714 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4715 bool follow_updates,
4716 Buffer *buffer, HeapUpdateFailureData *hufd)
4717 {
4718 HTSU_Result result;
4719 ItemPointer tid = &(tuple->t_self);
4720 ItemId lp;
4721 Page page;
4722 Buffer vmbuffer = InvalidBuffer;
4723 BlockNumber block;
4724 TransactionId xid,
4725 xmax;
4726 uint16 old_infomask,
4727 new_infomask,
4728 new_infomask2;
4729 bool first_time = true;
4730 bool skip_tuple_lock = false;
4731 bool have_tuple_lock = false;
4732 bool cleared_all_frozen = false;
4733
4734 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4735 block = ItemPointerGetBlockNumber(tid);
4736
4737 /*
4738 * Before locking the buffer, pin the visibility map page if it appears to
4739 * be necessary. Since we haven't got the lock yet, someone else might be
4740 * in the middle of changing this, so we'll need to recheck after we have
4741 * the lock.
4742 */
4743 if (PageIsAllVisible(BufferGetPage(*buffer)))
4744 visibilitymap_pin(relation, block, &vmbuffer);
4745
4746 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4747
4748 page = BufferGetPage(*buffer);
4749 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4750 Assert(ItemIdIsNormal(lp));
4751
4752 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4753 tuple->t_len = ItemIdGetLength(lp);
4754 tuple->t_tableOid = RelationGetRelid(relation);
4755
4756 l3:
4757 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4758
4759 if (result == HeapTupleInvisible)
4760 {
4761 /*
4762 * This is possible, but only when locking a tuple for ON CONFLICT
4763 * UPDATE. We return this value here rather than throwing an error in
4764 * order to give that case the opportunity to throw a more specific
4765 * error.
4766 */
4767 result = HeapTupleInvisible;
4768 goto out_locked;
4769 }
4770 else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4771 {
4772 TransactionId xwait;
4773 uint16 infomask;
4774 uint16 infomask2;
4775 bool require_sleep;
4776 ItemPointerData t_ctid;
4777
4778 /* must copy state data before unlocking buffer */
4779 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4780 infomask = tuple->t_data->t_infomask;
4781 infomask2 = tuple->t_data->t_infomask2;
4782 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4783
4784 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4785
4786 /*
4787 * If any subtransaction of the current top transaction already holds
4788 * a lock as strong as or stronger than what we're requesting, we
4789 * effectively hold the desired lock already. We *must* succeed
4790 * without trying to take the tuple lock, else we will deadlock
4791 * against anyone wanting to acquire a stronger lock.
4792 *
4793 * Note we only do this the first time we loop on the HTSU result;
4794 * there is no point in testing in subsequent passes, because
4795 * evidently our own transaction cannot have acquired a new lock after
4796 * the first time we checked.
4797 */
4798 if (first_time)
4799 {
4800 first_time = false;
4801
4802 if (infomask & HEAP_XMAX_IS_MULTI)
4803 {
4804 int i;
4805 int nmembers;
4806 MultiXactMember *members;
4807
4808 /*
4809 * We don't need to allow old multixacts here; if that had
4810 * been the case, HeapTupleSatisfiesUpdate would have returned
4811 * MayBeUpdated and we wouldn't be here.
4812 */
4813 nmembers =
4814 GetMultiXactIdMembers(xwait, &members, false,
4815 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4816
4817 for (i = 0; i < nmembers; i++)
4818 {
4819 /* only consider members of our own transaction */
4820 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4821 continue;
4822
4823 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4824 {
4825 pfree(members);
4826 result = HeapTupleMayBeUpdated;
4827 goto out_unlocked;
4828 }
4829 else
4830 {
4831 /*
4832 * Disable acquisition of the heavyweight tuple lock.
4833 * Otherwise, when promoting a weaker lock, we might
4834 * deadlock with another locker that has acquired the
4835 * heavyweight tuple lock and is waiting for our
4836 * transaction to finish.
4837 *
4838 * Note that in this case we still need to wait for
4839 * the multixact if required, to avoid acquiring
4840 * conflicting locks.
4841 */
4842 skip_tuple_lock = true;
4843 }
4844 }
4845
4846 if (members)
4847 pfree(members);
4848 }
4849 else if (TransactionIdIsCurrentTransactionId(xwait))
4850 {
4851 switch (mode)
4852 {
4853 case LockTupleKeyShare:
4854 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4855 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4856 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4857 result = HeapTupleMayBeUpdated;
4858 goto out_unlocked;
4859 case LockTupleShare:
4860 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4861 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4862 {
4863 result = HeapTupleMayBeUpdated;
4864 goto out_unlocked;
4865 }
4866 break;
4867 case LockTupleNoKeyExclusive:
4868 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4869 {
4870 result = HeapTupleMayBeUpdated;
4871 goto out_unlocked;
4872 }
4873 break;
4874 case LockTupleExclusive:
4875 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4876 infomask2 & HEAP_KEYS_UPDATED)
4877 {
4878 result = HeapTupleMayBeUpdated;
4879 goto out_unlocked;
4880 }
4881 break;
4882 }
4883 }
4884 }
4885
4886 /*
4887 * Initially assume that we will have to wait for the locking
4888 * transaction(s) to finish. We check various cases below in which
4889 * this can be turned off.
4890 */
4891 require_sleep = true;
4892 if (mode == LockTupleKeyShare)
4893 {
4894 /*
4895 * If we're requesting KeyShare, and there's no update present, we
4896 * don't need to wait. Even if there is an update, we can still
4897 * continue if the key hasn't been modified.
4898 *
4899 * However, if there are updates, we need to walk the update chain
4900 * to mark future versions of the row as locked, too. That way,
4901 * if somebody deletes that future version, we're protected
4902 * against the key going away. This locking of future versions
4903 * could block momentarily, if a concurrent transaction is
4904 * deleting a key; or it could return a value to the effect that
4905 * the transaction deleting the key has already committed. So we
4906 * do this before re-locking the buffer; otherwise this would be
4907 * prone to deadlocks.
4908 *
4909 * Note that the TID we're locking was grabbed before we unlocked
4910 * the buffer. For it to change while we're not looking, the
4911 * other properties we're testing for below after re-locking the
4912 * buffer would also change, in which case we would restart this
4913 * loop above.
4914 */
4915 if (!(infomask2 & HEAP_KEYS_UPDATED))
4916 {
4917 bool updated;
4918
4919 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4920
4921 /*
4922 * If there are updates, follow the update chain; bail out if
4923 * that cannot be done.
4924 */
4925 if (follow_updates && updated)
4926 {
4927 HTSU_Result res;
4928
4929 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4930 GetCurrentTransactionId(),
4931 mode);
4932 if (res != HeapTupleMayBeUpdated)
4933 {
4934 result = res;
4935 /* recovery code expects to have buffer lock held */
4936 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4937 goto failed;
4938 }
4939 }
4940
4941 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4942
4943 /*
4944 * Make sure it's still an appropriate lock, else start over.
4945 * Also, if it wasn't updated before we released the lock, but
4946 * is updated now, we start over too; the reason is that we
4947 * now need to follow the update chain to lock the new
4948 * versions.
4949 */
4950 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4951 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4952 !updated))
4953 goto l3;
4954
4955 /* Things look okay, so we can skip sleeping */
4956 require_sleep = false;
4957
4958 /*
4959 * Note we allow Xmax to change here; other updaters/lockers
4960 * could have modified it before we grabbed the buffer lock.
4961 * However, this is not a problem, because with the recheck we
4962 * just did we ensure that they still don't conflict with the
4963 * lock we want.
4964 */
4965 }
4966 }
4967 else if (mode == LockTupleShare)
4968 {
4969 /*
4970 * If we're requesting Share, we can similarly avoid sleeping if
4971 * there's no update and no exclusive lock present.
4972 */
4973 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4974 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4975 {
4976 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4977
4978 /*
4979 * Make sure it's still an appropriate lock, else start over.
4980 * See above about allowing xmax to change.
4981 */
4982 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4983 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4984 goto l3;
4985 require_sleep = false;
4986 }
4987 }
4988 else if (mode == LockTupleNoKeyExclusive)
4989 {
4990 /*
4991 * If we're requesting NoKeyExclusive, we might also be able to
4992 * avoid sleeping; just ensure that there is no conflicting lock
4993 * already acquired.
4994 */
4995 if (infomask & HEAP_XMAX_IS_MULTI)
4996 {
4997 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4998 mode, NULL))
4999 {
5000 /*
5001 * No conflict, but if the xmax changed under us in the
5002 * meantime, start over.
5003 */
5004 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5005 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5006 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5007 xwait))
5008 goto l3;
5009
5010 /* otherwise, we're good */
5011 require_sleep = false;
5012 }
5013 }
5014 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
5015 {
5016 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5017
5018 /* if the xmax changed in the meantime, start over */
5019 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5020 !TransactionIdEquals(
5021 HeapTupleHeaderGetRawXmax(tuple->t_data),
5022 xwait))
5023 goto l3;
5024 /* otherwise, we're good */
5025 require_sleep = false;
5026 }
5027 }
5028
5029 /*
5030 * As a check independent from those above, we can also avoid sleeping
5031 * if the current transaction is the sole locker of the tuple. Note
5032 * that the strength of the lock already held is irrelevant; this is
5033 * not about recording the lock in Xmax (which will be done regardless
5034 * of this optimization, below). Also, note that the cases where we
5035 * hold a lock stronger than we are requesting are already handled
5036 * above by not doing anything.
5037 *
5038 * Note we only deal with the non-multixact case here; MultiXactIdWait
5039 * is well equipped to deal with this situation on its own.
5040 */
5041 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
5042 TransactionIdIsCurrentTransactionId(xwait))
5043 {
5044 /* ... but if the xmax changed in the meantime, start over */
5045 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5046 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5047 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5048 xwait))
5049 goto l3;
5050 Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
5051 require_sleep = false;
5052 }
5053
5054 /*
5055 * Time to sleep on the other transaction/multixact, if necessary.
5056 *
5057 * If the other transaction is an update that's already committed,
5058 * then sleeping cannot possibly do any good: if we're required to
5059 * sleep, get out to raise an error instead.
5060 *
5061 * By here, we either have already acquired the buffer exclusive lock,
5062 * or we must wait for the locking transaction or multixact; so below
5063 * we ensure that we grab buffer lock after the sleep.
5064 */
5065 if (require_sleep && result == HeapTupleUpdated)
5066 {
5067 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5068 goto failed;
5069 }
5070 else if (require_sleep)
5071 {
5072 /*
5073 * Acquire tuple lock to establish our priority for the tuple, or
5074 * die trying. LockTuple will release us when we are next-in-line
5075 * for the tuple. We must do this even if we are share-locking,
5076 * but not if we already have a weaker lock on the tuple.
5077 *
5078 * If we are forced to "start over" below, we keep the tuple lock;
5079 * this arranges that we stay at the head of the line while
5080 * rechecking tuple state.
5081 */
5082 if (!skip_tuple_lock &&
5083 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5084 &have_tuple_lock))
5085 {
5086 /*
5087 * This can only happen if wait_policy is Skip and the lock
5088 * couldn't be obtained.
5089 */
5090 result = HeapTupleWouldBlock;
5091 /* recovery code expects to have buffer lock held */
5092 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5093 goto failed;
5094 }
5095
5096 if (infomask & HEAP_XMAX_IS_MULTI)
5097 {
5098 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
5099
5100 /* We only ever lock tuples, never update them */
5101 if (status >= MultiXactStatusNoKeyUpdate)
5102 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5103
5104 /* wait for multixact to end, or die trying */
5105 switch (wait_policy)
5106 {
5107 case LockWaitBlock:
5108 MultiXactIdWait((MultiXactId) xwait, status, infomask,
5109 relation, &tuple->t_self, XLTW_Lock, NULL);
5110 break;
5111 case LockWaitSkip:
5112 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
5113 status, infomask, relation,
5114 NULL))
5115 {
5116 result = HeapTupleWouldBlock;
5117 /* recovery code expects to have buffer lock held */
5118 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5119 goto failed;
5120 }
5121 break;
5122 case LockWaitError:
5123 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
5124 status, infomask, relation,
5125 NULL))
5126 ereport(ERROR,
5127 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5128 errmsg("could not obtain lock on row in relation \"%s\"",
5129 RelationGetRelationName(relation))));
5130
5131 break;
5132 }
5133
5134 /*
5135 * Of course, the multixact might not be done here: if we're
5136 * requesting a light lock mode, other transactions with light
5137 * locks could still be alive, as well as locks owned by our
5138 * own xact or other subxacts of this backend. We need to
5139 * preserve the surviving MultiXact members. Note that it
5140 * isn't absolutely necessary in the latter case, but doing so
5141 * is simpler.
5142 */
5143 }
5144 else
5145 {
5146 /* wait for regular transaction to end, or die trying */
5147 switch (wait_policy)
5148 {
5149 case LockWaitBlock:
5150 XactLockTableWait(xwait, relation, &tuple->t_self,
5151 XLTW_Lock);
5152 break;
5153 case LockWaitSkip:
5154 if (!ConditionalXactLockTableWait(xwait))
5155 {
5156 result = HeapTupleWouldBlock;
5157 /* recovery code expects to have buffer lock held */
5158 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5159 goto failed;
5160 }
5161 break;
5162 case LockWaitError:
5163 if (!ConditionalXactLockTableWait(xwait))
5164 ereport(ERROR,
5165 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5166 errmsg("could not obtain lock on row in relation \"%s\"",
5167 RelationGetRelationName(relation))));
5168 break;
5169 }
5170 }
5171
5172 /* if there are updates, follow the update chain */
5173 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
5174 {
5175 HTSU_Result res;
5176
5177 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5178 GetCurrentTransactionId(),
5179 mode);
5180 if (res != HeapTupleMayBeUpdated)
5181 {
5182 result = res;
5183 /* recovery code expects to have buffer lock held */
5184 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5185 goto failed;
5186 }
5187 }
5188
5189 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5190
5191 /*
5192 * xwait is done, but if xwait had just locked the tuple then some
5193 * other xact could update this tuple before we get to this point.
5194 * Check for xmax change, and start over if so.
5195 */
5196 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5197 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5198 xwait))
5199 goto l3;
5200
5201 if (!(infomask & HEAP_XMAX_IS_MULTI))
5202 {
5203 /*
5204 * Otherwise check if it committed or aborted. Note we cannot
5205 * be here if the tuple was only locked by somebody who didn't
5206 * conflict with us; that would have been handled above. So
5207 * that transaction must necessarily be gone by now. But
5208 * don't check for this in the multixact case, because some
5209 * locker transactions might still be running.
5210 */
5211 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5212 }
5213 }
5214
5215 /* By here, we're certain that we hold buffer exclusive lock again */
5216
5217 /*
5218 * We may lock if previous xmax aborted, or if it committed but only
5219 * locked the tuple without updating it; or if we didn't have to wait
5220 * at all for whatever reason.
5221 */
5222 if (!require_sleep ||
5223 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5224 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5225 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5226 result = HeapTupleMayBeUpdated;
5227 else
5228 result = HeapTupleUpdated;
5229 }
5230
5231 failed:
5232 if (result != HeapTupleMayBeUpdated)
5233 {
5234 Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5235 result == HeapTupleWouldBlock);
5236 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5237 hufd->ctid = tuple->t_data->t_ctid;
5238 hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5239 if (result == HeapTupleSelfUpdated)
5240 hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5241 else
5242 hufd->cmax = InvalidCommandId;
5243 goto out_locked;
5244 }
5245
5246 /*
5247 * If we didn't pin the visibility map page and the page has become all
5248 * visible while we were busy locking the buffer, or during some
5249 * subsequent window during which we had it unlocked, we'll have to unlock
5250 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5251 * unfortunate, especially since we'll now have to recheck whether the
5252 * tuple has been locked or updated under us, but hopefully it won't
5253 * happen very often.
5254 */
5255 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5256 {
5257 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5258 visibilitymap_pin(relation, block, &vmbuffer);
5259 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5260 goto l3;
5261 }
5262
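/*
 * All checks have passed and we are going to mark the tuple as locked by us.
 * Capture its current raw xmax and infomask; these are the inputs for
 * computing the replacement values below.
 */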
5263 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5264 old_infomask = tuple->t_data->t_infomask;
5265
5266 /*
5267 * If this is the first possibly-multixact-able operation in the current
5268 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5269 * certain that the transaction will never become a member of any older
5270 * MultiXactIds than that. (We have to do this even if we end up just
5271 * using our own TransactionId below, since some other backend could
5272 * incorporate our XID into a MultiXact immediately afterwards.)
5273 */
5274 MultiXactIdSetOldestMember();
5275
5276 /*
5277 * Compute the new xmax and infomask to store into the tuple. Note we do
5278 * not modify the tuple just yet, because that would leave it in the wrong
5279 * state if multixact.c elogs.
5280 */
5281 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5282 GetCurrentTransactionId(), mode, false,
5283 &xid, &new_infomask, &new_infomask2);
5284
5285 START_CRIT_SECTION();
5286
5287 /*
5288 * Store transaction information of xact locking the tuple.
5289 *
5290 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5291 * possibly generating a useless combo CID. Moreover, if we're locking a
5292 * previously updated tuple, it's important to preserve the Cmax.
5293 *
5294 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5295 * we would break the HOT chain.
5296 */
5297 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5298 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5299 tuple->t_data->t_infomask |= new_infomask;
5300 tuple->t_data->t_infomask2 |= new_infomask2;
5301 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5302 HeapTupleHeaderClearHotUpdated(tuple->t_data);
5303 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5304
5305 /*
5306 * Make sure there is no forward chain link in t_ctid. Note that in the
5307 * cases where the tuple has been updated, we must not overwrite t_ctid,
5308 * because it was set by the updater. Moreover, if the tuple has been
5309 * updated, we need to follow the update chain to lock the new versions of
5310 * the tuple as well.
5311 */
5312 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5313 tuple->t_data->t_ctid = *tid;
5314
5315 /* Clear only the all-frozen bit on visibility map if needed */
5316 if (PageIsAllVisible(page) &&
5317 visibilitymap_clear(relation, block, vmbuffer,
5318 VISIBILITYMAP_ALL_FROZEN))
5319 cleared_all_frozen = true;
5320
5321
5322 MarkBufferDirty(*buffer);
5323
5324 /*
5325 * XLOG stuff. You might think that we don't need an XLOG record because
5326 * there is no state change worth restoring after a crash. You would be
5327 * wrong however: we have just written either a TransactionId or a
5328 * MultiXactId that may never have been seen on disk before, and we need
5329 * to make sure that there are XLOG entries covering those ID numbers.
5330 * Else the same IDs might be re-used after a crash, which would be
5331 * disastrous if this page made it to disk before the crash. Essentially
5332 * we have to enforce the WAL log-before-data rule even in this case.
5333 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5334 * entries for everything anyway.)
5335 */
5336 if (RelationNeedsWAL(relation))
5337 {
5338 xl_heap_lock xlrec;
5339 XLogRecPtr recptr;
5340
5341 XLogBeginInsert();
5342 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5343
5344 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5345 xlrec.locking_xid = xid;
5346 xlrec.infobits_set = compute_infobits(new_infomask,
5347 tuple->t_data->t_infomask2);
5348 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5349 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5350
5351 /* we don't decode row locks atm, so no need to log the origin */
5352
5353 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5354
5355 PageSetLSN(page, recptr);
5356 }
5357
5358 END_CRIT_SECTION();
5359
5360 result = HeapTupleMayBeUpdated;
5361
5362 out_locked:
5363 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5364
5365 out_unlocked:
5366 if (BufferIsValid(vmbuffer))
5367 ReleaseBuffer(vmbuffer);
5368
5369 /*
5370 * Don't update the visibility map here. Locking a tuple doesn't change
5371 * visibility info.
5372 */
5373
5374 /*
5375 * Now that we have successfully marked the tuple as locked, we can
5376 * release the lmgr tuple lock, if we had it.
5377 */
5378 if (have_tuple_lock)
5379 UnlockTupleTuplock(relation, tid, mode);
5380
5381 return result;
5382 }
5383
5384 /*
5385 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5386 * its normal, Xmax-based tuple lock.
5387 *
5388 * have_tuple_lock is an input and output parameter: on input, it indicates
5389 * whether the lock has previously been acquired (and this function does
5390 * nothing in that case). If this function returns success, have_tuple_lock
5391 * has been flipped to true.
5392 *
5393 * Returns false if it was unable to obtain the lock; this can only happen if
5394 * wait_policy is Skip.
5395 */
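/*
 * Illustrative call pattern (a sketch distilled from heap_lock_tuple above,
 * not a verbatim excerpt):
 *
 *		bool	have_tuple_lock = false;
 *
 *		if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
 *								  &have_tuple_lock))
 *			... report HeapTupleWouldBlock; only possible with LockWaitSkip ...
 *
 *		... do the work that required being next-in-line for the tuple ...
 *
 *		if (have_tuple_lock)
 *			UnlockTupleTuplock(relation, tid, mode);
 */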
5396 static bool
5397 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5398 LockWaitPolicy wait_policy, bool *have_tuple_lock)
5399 {
5400 if (*have_tuple_lock)
5401 return true;
5402
5403 switch (wait_policy)
5404 {
5405 case LockWaitBlock:
5406 LockTupleTuplock(relation, tid, mode);
5407 break;
5408
5409 case LockWaitSkip:
5410 if (!ConditionalLockTupleTuplock(relation, tid, mode))
5411 return false;
5412 break;
5413
5414 case LockWaitError:
5415 if (!ConditionalLockTupleTuplock(relation, tid, mode))
5416 ereport(ERROR,
5417 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5418 errmsg("could not obtain lock on row in relation \"%s\"",
5419 RelationGetRelationName(relation))));
5420 break;
5421 }
5422 *have_tuple_lock = true;
5423
5424 return true;
5425 }
5426
5427 /*
5428 * Given an original set of Xmax and infomask, and a transaction (identified by
5429 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5430 * corresponding infomasks to use on the tuple.
5431 *
5432 * Note that this might have side effects such as creating a new MultiXactId.
5433 *
5434 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5435 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5436 * but it was not running anymore. There is a race condition, which is that the
5437 * MultiXactId may have finished since then, but that uncommon case is handled
5438 * either here, or within MultiXactIdExpand.
5439 *
5440 * There is a similar race condition possible when the old xmax was a regular
5441 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5442 * window, but it's still possible to end up creating an unnecessary
5443 * MultiXactId. Fortunately this is harmless.
5444 */
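/*
 * Typical invocation, a sketch of the lock-only case as used by
 * heap_lock_tuple above (updaters pass is_update = true instead):
 *
 *		compute_new_xmax_infomask(xmax, old_infomask, old_infomask2,
 *								  GetCurrentTransactionId(), mode, false,
 *								  &new_xmax, &new_infomask, &new_infomask2);
 *
 * The caller then stores new_xmax/new_infomask/new_infomask2 into the tuple
 * inside a critical section.
 */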
5445 static void
5446 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5447 uint16 old_infomask2, TransactionId add_to_xmax,
5448 LockTupleMode mode, bool is_update,
5449 TransactionId *result_xmax, uint16 *result_infomask,
5450 uint16 *result_infomask2)
5451 {
5452 TransactionId new_xmax;
5453 uint16 new_infomask,
5454 new_infomask2;
5455
5456 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5457
5458 l5:
5459 new_infomask = 0;
5460 new_infomask2 = 0;
5461 if (old_infomask & HEAP_XMAX_INVALID)
5462 {
5463 /*
5464 * No previous locker; we just insert our own TransactionId.
5465 *
5466 * Note that it's critical that this case be the first one checked,
5467 * because there are several blocks below that come back to this one
5468 * to implement certain optimizations; old_infomask might contain
5469 * other dirty bits in those cases, but we don't really care.
5470 */
5471 if (is_update)
5472 {
5473 new_xmax = add_to_xmax;
5474 if (mode == LockTupleExclusive)
5475 new_infomask2 |= HEAP_KEYS_UPDATED;
5476 }
5477 else
5478 {
5479 new_infomask |= HEAP_XMAX_LOCK_ONLY;
5480 switch (mode)
5481 {
5482 case LockTupleKeyShare:
5483 new_xmax = add_to_xmax;
5484 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5485 break;
5486 case LockTupleShare:
5487 new_xmax = add_to_xmax;
5488 new_infomask |= HEAP_XMAX_SHR_LOCK;
5489 break;
5490 case LockTupleNoKeyExclusive:
5491 new_xmax = add_to_xmax;
5492 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5493 break;
5494 case LockTupleExclusive:
5495 new_xmax = add_to_xmax;
5496 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5497 new_infomask2 |= HEAP_KEYS_UPDATED;
5498 break;
5499 default:
5500 new_xmax = InvalidTransactionId; /* silence compiler */
5501 elog(ERROR, "invalid lock mode");
5502 }
5503 }
5504 }
5505 else if (old_infomask & HEAP_XMAX_IS_MULTI)
5506 {
5507 MultiXactStatus new_status;
5508
5509 /*
5510 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5511 * cross-check.
5512 */
5513 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5514
5515 /*
5516 * A multixact together with LOCK_ONLY set but neither lock bit set
5517 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5518 * anymore. This check is critical for databases upgraded by
5519 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5520 * that such multis are never passed.
5521 */
5522 if (HEAP_LOCKED_UPGRADED(old_infomask))
5523 {
5524 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5525 old_infomask |= HEAP_XMAX_INVALID;
5526 goto l5;
5527 }
5528
5529 /*
5530 * If the XMAX is already a MultiXactId, then we need to expand it to
5531 * include add_to_xmax; but if all the members were lockers and are
5532 * all gone, we can do away with the IS_MULTI bit and just set
5533 * add_to_xmax as the only locker/updater. If all lockers are gone
5534 * and we have an updater that aborted, we can also do without a
5535 * multi.
5536 *
5537 * The cost of doing GetMultiXactIdMembers would be paid by
5538 * MultiXactIdExpand if we weren't to do this, so this check is not
5539 * incurring extra work anyhow.
5540 */
5541 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5542 {
5543 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5544 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5545 old_infomask)))
5546 {
5547 /*
5548 * Reset these bits and restart; otherwise fall through to
5549 * create a new multi below.
5550 */
5551 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5552 old_infomask |= HEAP_XMAX_INVALID;
5553 goto l5;
5554 }
5555 }
5556
5557 new_status = get_mxact_status_for_lock(mode, is_update);
5558
5559 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5560 new_status);
5561 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5562 }
5563 else if (old_infomask & HEAP_XMAX_COMMITTED)
5564 {
5565 /*
5566 * It's a committed update, so we need to preserve it as the updater of
5567 * the tuple.
5568 */
5569 MultiXactStatus status;
5570 MultiXactStatus new_status;
5571
5572 if (old_infomask2 & HEAP_KEYS_UPDATED)
5573 status = MultiXactStatusUpdate;
5574 else
5575 status = MultiXactStatusNoKeyUpdate;
5576
5577 new_status = get_mxact_status_for_lock(mode, is_update);
5578
5579 /*
5580 * since it's not running, it's obviously impossible for the old
5581 * updater to be identical to the current one, so we need not check
5582 * for that case as we do in the block above.
5583 */
5584 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5585 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5586 }
5587 else if (TransactionIdIsInProgress(xmax))
5588 {
5589 /*
5590 * If the XMAX is a valid, in-progress TransactionId, then we need to
5591 * create a new MultiXactId that includes both the old locker or
5592 * updater and our own TransactionId.
5593 */
5594 MultiXactStatus new_status;
5595 MultiXactStatus old_status;
5596 LockTupleMode old_mode;
5597
5598 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5599 {
5600 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5601 old_status = MultiXactStatusForKeyShare;
5602 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5603 old_status = MultiXactStatusForShare;
5604 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5605 {
5606 if (old_infomask2 & HEAP_KEYS_UPDATED)
5607 old_status = MultiXactStatusForUpdate;
5608 else
5609 old_status = MultiXactStatusForNoKeyUpdate;
5610 }
5611 else
5612 {
5613 /*
5614 * LOCK_ONLY can be present alone only when a page has been
5615 * upgraded by pg_upgrade. But in that case,
5616 * TransactionIdIsInProgress() should have returned false. We
5617 * assume it's no longer locked in this case.
5618 */
5619 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5620 old_infomask |= HEAP_XMAX_INVALID;
5621 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5622 goto l5;
5623 }
5624 }
5625 else
5626 {
5627 /* it's an update, but which kind? */
5628 if (old_infomask2 & HEAP_KEYS_UPDATED)
5629 old_status = MultiXactStatusUpdate;
5630 else
5631 old_status = MultiXactStatusNoKeyUpdate;
5632 }
5633
5634 old_mode = TUPLOCK_from_mxstatus(old_status);
5635
5636 /*
5637 * If the lock to be acquired is for the same TransactionId as the
5638 * existing lock, there's an optimization possible: consider only the
5639 * strongest of both locks as the only one present, and restart.
5640 */
5641 if (xmax == add_to_xmax)
5642 {
5643 /*
5644 * Note that it's not possible for the original tuple to be
5645 * updated: we wouldn't be here because the tuple would have been
5646 * invisible and we wouldn't try to update it. As a subtlety,
5647 * this code can also run when traversing an update chain to lock
5648 * future versions of a tuple. But we wouldn't be here either,
5649 * because the add_to_xmax would be different from the original
5650 * updater.
5651 */
5652 Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5653
5654 /* acquire the strongest of both */
5655 if (mode < old_mode)
5656 mode = old_mode;
5657 /* mustn't touch is_update */
5658
5659 old_infomask |= HEAP_XMAX_INVALID;
5660 goto l5;
5661 }
5662
5663 /* otherwise, just fall back to creating a new multixact */
5664 new_status = get_mxact_status_for_lock(mode, is_update);
5665 new_xmax = MultiXactIdCreate(xmax, old_status,
5666 add_to_xmax, new_status);
5667 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5668 }
5669 else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5670 TransactionIdDidCommit(xmax))
5671 {
5672 /*
5673 * It's a committed update, so we must preserve it as the updater of the
5674 * tuple.
5675 */
5676 MultiXactStatus status;
5677 MultiXactStatus new_status;
5678
5679 if (old_infomask2 & HEAP_KEYS_UPDATED)
5680 status = MultiXactStatusUpdate;
5681 else
5682 status = MultiXactStatusNoKeyUpdate;
5683
5684 new_status = get_mxact_status_for_lock(mode, is_update);
5685
5686 /*
5687 * since it's not running, it's obviously impossible for the old
5688 * updater to be identical to the current one, so we need not check
5689 * for that case as we do in the block above.
5690 */
5691 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5692 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5693 }
5694 else
5695 {
5696 /*
5697 * Can get here iff the locking/updating transaction was running when
5698 * the infomask was extracted from the tuple, but finished before
5699 * TransactionIdIsInProgress got to run. Deal with it as if there was
5700 * no locker at all in the first place.
5701 */
5702 old_infomask |= HEAP_XMAX_INVALID;
5703 goto l5;
5704 }
5705
5706 *result_infomask = new_infomask;
5707 *result_infomask2 = new_infomask2;
5708 *result_xmax = new_xmax;
5709 }
5710
5711 /*
5712 * Subroutine for heap_lock_updated_tuple_rec.
5713 *
5714 * Given a hypothetical multixact status held by the transaction identified
5715 * with the given xid, does the current transaction need to wait, fail, or can
5716 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5717 * is set to true if waiting is necessary; if it can continue, then
5718 * HeapTupleMayBeUpdated is returned. If the lock is already held by the
5719 * current transaction, return HeapTupleSelfUpdated. In case of a conflict
5720 * with another transaction, a different HeapTupleSatisfiesUpdate return code
5721 * is returned.
5722 *
5723 * The held status is said to be hypothetical because it might correspond to a
5724 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5725 * way for simplicity of API.
5726 */
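/*
 * Summary of the possible outcomes (derived from the cases handled below):
 *
 *		xid is our own transaction             -> HeapTupleSelfUpdated
 *		xid in progress, modes do not conflict -> HeapTupleMayBeUpdated, !*needwait
 *		xid in progress, modes conflict        -> HeapTupleMayBeUpdated, *needwait
 *		xid aborted or crashed                 -> HeapTupleMayBeUpdated
 *		xid committed, lock only / no conflict -> HeapTupleMayBeUpdated
 *		xid committed, conflicting update      -> HeapTupleUpdated
 */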
5727 static HTSU_Result
5728 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5729 LockTupleMode mode, bool *needwait)
5730 {
5731 MultiXactStatus wantedstatus;
5732
5733 *needwait = false;
5734 wantedstatus = get_mxact_status_for_lock(mode, false);
5735
5736 /*
5737 * Note: we *must* check TransactionIdIsInProgress before
5738 * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5739 * explanation.
5740 */
5741 if (TransactionIdIsCurrentTransactionId(xid))
5742 {
5743 /*
5744 * The tuple has already been locked by our own transaction. This is
5745 * very rare but can happen if multiple transactions are trying to
5746 * lock an ancient version of the same tuple.
5747 */
5748 return HeapTupleSelfUpdated;
5749 }
5750 else if (TransactionIdIsInProgress(xid))
5751 {
5752 /*
5753 * If the locking transaction is running, what we do depends on
5754 * whether the lock modes conflict: if they do, then we must wait for
5755 * it to finish; otherwise we can fall through to lock this tuple
5756 * version without waiting.
5757 */
5758 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5759 LOCKMODE_from_mxstatus(wantedstatus)))
5760 {
5761 *needwait = true;
5762 }
5763
5764 /*
5765 * If we set needwait above, then this value doesn't matter;
5766 * otherwise, this value signals to caller that it's okay to proceed.
5767 */
5768 return HeapTupleMayBeUpdated;
5769 }
5770 else if (TransactionIdDidAbort(xid))
5771 return HeapTupleMayBeUpdated;
5772 else if (TransactionIdDidCommit(xid))
5773 {
5774 /*
5775 * The other transaction committed. If it was only a locker, then the
5776 * lock is completely gone now and we can return success; but if it
5777 * was an update, then what we do depends on whether the two lock
5778 * modes conflict. If they conflict, then we must report error to
5779 * caller. But if they don't, we can fall through to allow the current
5780 * transaction to lock the tuple.
5781 *
5782 * Note: the reason we worry about ISUPDATE here is because as soon as
5783 * a transaction ends, all its locks are gone and meaningless, and
5784 * thus we can ignore them; whereas its updates persist. In the
5785 * TransactionIdIsInProgress case, above, we don't need to check
5786 * because we know the lock is still "alive" and thus a conflict always
5787 * needs to be checked.
5788 */
5789 if (!ISUPDATE_from_mxstatus(status))
5790 return HeapTupleMayBeUpdated;
5791
5792 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5793 LOCKMODE_from_mxstatus(wantedstatus)))
5794 /* bummer */
5795 return HeapTupleUpdated;
5796
5797 return HeapTupleMayBeUpdated;
5798 }
5799
5800 /* Not in progress, not aborted, not committed -- must have crashed */
5801 return HeapTupleMayBeUpdated;
5802 }
5803
5804
5805 /*
5806 * Recursive part of heap_lock_updated_tuple
5807 *
5808 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5809 * xid with the given mode; if this tuple is updated, recurse to lock the new
5810 * version as well.
5811 */
5812 static HTSU_Result
5813 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5814 LockTupleMode mode)
5815 {
5816 HTSU_Result result;
5817 ItemPointerData tupid;
5818 HeapTupleData mytup;
5819 Buffer buf;
5820 uint16 new_infomask,
5821 new_infomask2,
5822 old_infomask,
5823 old_infomask2;
5824 TransactionId xmax,
5825 new_xmax;
5826 TransactionId priorXmax = InvalidTransactionId;
5827 bool cleared_all_frozen = false;
5828 bool pinned_desired_page;
5829 Buffer vmbuffer = InvalidBuffer;
5830 BlockNumber block;
5831
5832 ItemPointerCopy(tid, &tupid);
5833
5834 for (;;)
5835 {
5836 new_infomask = 0;
5837 new_xmax = InvalidTransactionId;
5838 block = ItemPointerGetBlockNumber(&tupid);
5839 ItemPointerCopy(&tupid, &(mytup.t_self));
5840
5841 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5842 {
5843 /*
5844 * if we fail to find the updated version of the tuple, it's
5845 * because it was vacuumed/pruned away after its creator
5846 * transaction aborted. So behave as if we got to the end of the
5847 * chain, and there's no further tuple to lock: return success to
5848 * caller.
5849 */
5850 result = HeapTupleMayBeUpdated;
5851 goto out_unlocked;
5852 }
5853
5854 l4:
5855 CHECK_FOR_INTERRUPTS();
5856
5857 /*
5858 * Before locking the buffer, pin the visibility map page if it
5859 * appears to be necessary. Since we haven't got the lock yet,
5860 * someone else might be in the middle of changing this, so we'll need
5861 * to recheck after we have the lock.
5862 */
5863 if (PageIsAllVisible(BufferGetPage(buf)))
5864 {
5865 visibilitymap_pin(rel, block, &vmbuffer);
5866 pinned_desired_page = true;
5867 }
5868 else
5869 pinned_desired_page = false;
5870
5871 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5872
5873 /*
5874 * If we didn't pin the visibility map page and the page has become
5875 * all visible while we were busy locking the buffer, we'll have to
5876 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5877 * That's a bit unfortunate, but hopefully shouldn't happen often.
5878 *
5879 * Note: in some paths through this function, we will reach here
5880 * holding a pin on a vm page that may or may not be the one matching
5881 * this page. If this page isn't all-visible, we won't use the vm
5882 * page, but we hold onto such a pin till the end of the function.
5883 */
5884 if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5885 {
5886 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5887 visibilitymap_pin(rel, block, &vmbuffer);
5888 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5889 }
5890
5891 /*
5892 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5893 * end of the chain, we're done, so return success.
5894 */
5895 if (TransactionIdIsValid(priorXmax) &&
5896 !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5897 priorXmax))
5898 {
5899 result = HeapTupleMayBeUpdated;
5900 goto out_locked;
5901 }
5902
5903 /*
5904 * Also check Xmin: if this tuple was created by an aborted
5905 * (sub)transaction, then we already locked the last live one in the
5906 * chain, thus we're done, so return success.
5907 */
5908 if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5909 {
5910 result = HeapTupleMayBeUpdated;
5911 goto out_locked;
5912 }
5913
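/* Snapshot this version's infomask bits and raw xmax for the checks below. */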
5914 old_infomask = mytup.t_data->t_infomask;
5915 old_infomask2 = mytup.t_data->t_infomask2;
5916 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5917
5918 /*
5919 * If this tuple version has been updated or locked by some concurrent
5920 * transaction(s), what we do depends on whether our lock mode
5921 * conflicts with what those other transactions hold, and also on the
5922 * status of them.
5923 */
5924 if (!(old_infomask & HEAP_XMAX_INVALID))
5925 {
5926 TransactionId rawxmax;
5927 bool needwait;
5928
5929 rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5930 if (old_infomask & HEAP_XMAX_IS_MULTI)
5931 {
5932 int nmembers;
5933 int i;
5934 MultiXactMember *members;
5935
5936 /*
5937 * We don't need a test for pg_upgrade'd tuples: this is only
5938 * applied to tuples after the first in an update chain. Said
5939 * first tuple in the chain may well be locked-in-9.2-and-
5940 * pg_upgraded, but that one was already locked by our caller,
5941 * not us; and any subsequent ones cannot be because our
5942 * caller must necessarily have obtained a snapshot later than
5943 * the pg_upgrade itself.
5944 */
5945 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5946
5947 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5948 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5949 for (i = 0; i < nmembers; i++)
5950 {
5951 result = test_lockmode_for_conflict(members[i].status,
5952 members[i].xid,
5953 mode, &needwait);
5954
5955 /*
5956 * If the tuple was already locked by ourselves in a
5957 * previous iteration of this (say heap_lock_tuple was
5958 * forced to restart the locking loop because of a change
5959 * in xmax), then we hold the lock already on this tuple
5960 * version and we don't need to do anything; and this is
5961 * not an error condition either. We just need to skip
5962 * this tuple and continue locking the next version in the
5963 * update chain.
5964 */
5965 if (result == HeapTupleSelfUpdated)
5966 {
5967 pfree(members);
5968 goto next;
5969 }
5970
5971 if (needwait)
5972 {
5973 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5974 XactLockTableWait(members[i].xid, rel,
5975 &mytup.t_self,
5976 XLTW_LockUpdated);
5977 pfree(members);
5978 goto l4;
5979 }
5980 if (result != HeapTupleMayBeUpdated)
5981 {
5982 pfree(members);
5983 goto out_locked;
5984 }
5985 }
5986 if (members)
5987 pfree(members);
5988 }
5989 else
5990 {
5991 MultiXactStatus status;
5992
5993 /*
5994 * For a non-multi Xmax, we first need to compute the
5995 * corresponding MultiXactStatus by using the infomask bits.
5996 */
5997 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5998 {
5999 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
6000 status = MultiXactStatusForKeyShare;
6001 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
6002 status = MultiXactStatusForShare;
6003 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
6004 {
6005 if (old_infomask2 & HEAP_KEYS_UPDATED)
6006 status = MultiXactStatusForUpdate;
6007 else
6008 status = MultiXactStatusForNoKeyUpdate;
6009 }
6010 else
6011 {
6012 /*
6013 * LOCK_ONLY present alone (a pg_upgraded tuple marked
6014 * as share-locked in the old cluster) shouldn't be
6015 * seen in the middle of an update chain.
6016 */
6017 elog(ERROR, "invalid lock status in tuple");
6018 }
6019 }
6020 else
6021 {
6022 /* it's an update, but which kind? */
6023 if (old_infomask2 & HEAP_KEYS_UPDATED)
6024 status = MultiXactStatusUpdate;
6025 else
6026 status = MultiXactStatusNoKeyUpdate;
6027 }
6028
6029 result = test_lockmode_for_conflict(status, rawxmax, mode,
6030 &needwait);
6031
6032 /*
6033 * If the tuple was already locked by ourselves in a previous
6034 * iteration of this (say heap_lock_tuple was forced to
6035 * restart the locking loop because of a change in xmax), then
6036 * we hold the lock already on this tuple version and we don't
6037 * need to do anything; and this is not an error condition
6038 * either. We just need to skip this tuple and continue
6039 * locking the next version in the update chain.
6040 */
6041 if (result == HeapTupleSelfUpdated)
6042 goto next;
6043
6044 if (needwait)
6045 {
6046 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
6047 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6048 XLTW_LockUpdated);
6049 goto l4;
6050 }
6051 if (result != HeapTupleMayBeUpdated)
6052 {
6053 goto out_locked;
6054 }
6055 }
6056 }
6057
6058 /* compute the new Xmax and infomask values for the tuple ... */
6059 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6060 xid, mode, false,
6061 &new_xmax, &new_infomask, &new_infomask2);
6062
6063 if (PageIsAllVisible(BufferGetPage(buf)) &&
6064 visibilitymap_clear(rel, block, vmbuffer,
6065 VISIBILITYMAP_ALL_FROZEN))
6066 cleared_all_frozen = true;
6067
6068 START_CRIT_SECTION();
6069
6070 /* ... and set them */
6071 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6072 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6073 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6074 mytup.t_data->t_infomask |= new_infomask;
6075 mytup.t_data->t_infomask2 |= new_infomask2;
6076
6077 MarkBufferDirty(buf);
6078
6079 /* XLOG stuff */
6080 if (RelationNeedsWAL(rel))
6081 {
6082 xl_heap_lock_updated xlrec;
6083 XLogRecPtr recptr;
6084 Page page = BufferGetPage(buf);
6085
6086 XLogBeginInsert();
6087 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
6088
6089 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6090 xlrec.xmax = new_xmax;
6091 xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
6092 xlrec.flags =
6093 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
6094
6095 XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
6096
6097 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
6098
6099 PageSetLSN(page, recptr);
6100 }
6101
6102 END_CRIT_SECTION();
6103
6104 next:
6105 /* if we find the end of the update chain, we're done. */
6106 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6107 HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
6108 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6109 HeapTupleHeaderIsOnlyLocked(mytup.t_data))
6110 {
6111 result = HeapTupleMayBeUpdated;
6112 goto out_locked;
6113 }
6114
6115 /* tail recursion */
6116 priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
6117 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6118 UnlockReleaseBuffer(buf);
6119 }
6120
6121 result = HeapTupleMayBeUpdated;
6122
6123 out_locked:
6124 UnlockReleaseBuffer(buf);
6125
6126 out_unlocked:
6127 if (vmbuffer != InvalidBuffer)
6128 ReleaseBuffer(vmbuffer);
6129
6130 return result;
6131 }
6132
6133 /*
6134 * heap_lock_updated_tuple
6135 * Follow update chain when locking an updated tuple, acquiring locks (row
6136 * marks) on the updated versions.
6137 *
6138 * The initial tuple is assumed to be already locked.
6139 *
6140 * This function doesn't check visibility; it just unconditionally marks the
6141 * tuple(s) as locked. If any tuple in the update chain is being deleted
6142 * concurrently (or updated with the key being modified), sleep until the
6143 * transaction doing it is finished.
6144 *
6145 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6146 * when we have to wait for other transactions to release them, as opposed to
6147 * what heap_lock_tuple does. The reason is that having more than one
6148 * transaction walking the chain is probably uncommon enough that risk of
6149 * starvation is not likely: one of the preconditions for being here is that
6150 * the snapshot in use predates the update that created this tuple (because we
6151 * started at an earlier version of the tuple), but at the same time such a
6152 * transaction cannot be using repeatable read or serializable isolation
6153 * levels, because that would lead to a serializability failure.
6154 */
6155 static HTSU_Result
6156 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
6157 TransactionId xid, LockTupleMode mode)
6158 {
6159 /*
6160 * If the tuple has not been updated, or has moved into another partition
6161 * (effectively a delete), stop here.
6162 */
6163 if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
6164 !ItemPointerEquals(&tuple->t_self, ctid))
6165 {
6166 /*
6167 * If this is the first possibly-multixact-able operation in the
6168 * current transaction, set my per-backend OldestMemberMXactId
6169 * setting. We can be certain that the transaction will never become a
6170 * member of any older MultiXactIds than that. (We have to do this
6171 * even if we end up just using our own TransactionId below, since
6172 * some other backend could incorporate our XID into a MultiXact
6173 * immediately afterwards.)
6174 */
6175 MultiXactIdSetOldestMember();
6176
6177 return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
6178 }
6179
6180 /* nothing to lock */
6181 return HeapTupleMayBeUpdated;
6182 }
6183
6184 /*
6185 * heap_finish_speculative - mark speculative insertion as successful
6186 *
6187 * To successfully finish a speculative insertion we have to clear speculative
6188 * token from tuple. To do so the t_ctid field, which will contain a
6189 * speculative token value, is modified in place to point to the tuple itself,
6190 * which is characteristic of a newly inserted ordinary tuple.
6191 *
6192 * NB: It is not ok to commit without either finishing or aborting a
6193 * speculative insertion. We could treat speculative tuples of committed
6194 * transactions implicitly as completed, but then we would have to be prepared
6195 * to deal with speculative tokens on committed tuples. That wouldn't be
6196 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6197 * but clearing the token at completion isn't very expensive either.
6198 * An explicit confirmation WAL record also makes logical decoding simpler.
6199 */
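/*
 * Rough usage sketch (hedged; the actual caller lives in the executor's
 * ON CONFLICT code path, not in this file): the tuple is first inserted with
 * the speculative-insertion option, and then either confirmed or aborted:
 *
 *		heap_insert(rel, tuple, cid, HEAP_INSERT_SPECULATIVE, bistate);
 *		... check for conflicting tuples ...
 *		if (no conflict)
 *			heap_finish_speculative(rel, tuple);
 *		else
 *			heap_abort_speculative(rel, tuple);
 */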
6200 void
6201 heap_finish_speculative(Relation relation, HeapTuple tuple)
6202 {
6203 Buffer buffer;
6204 Page page;
6205 OffsetNumber offnum;
6206 ItemId lp = NULL;
6207 HeapTupleHeader htup;
6208
6209 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6210 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6211 page = (Page) BufferGetPage(buffer);
6212
6213 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6214 if (PageGetMaxOffsetNumber(page) >= offnum)
6215 lp = PageGetItemId(page, offnum);
6216
6217 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6218 elog(ERROR, "invalid lp");
6219
6220 htup = (HeapTupleHeader) PageGetItem(page, lp);
6221
6222 /* SpecTokenOffsetNumber should be distinguishable from any real offset */
6223 StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
6224 "invalid speculative token constant");
6225
6226 /* NO EREPORT(ERROR) from here till changes are logged */
6227 START_CRIT_SECTION();
6228
6229 Assert(HeapTupleHeaderIsSpeculative(tuple->t_data));
6230
6231 MarkBufferDirty(buffer);
6232
6233 /*
6234 * Replace the speculative insertion token with a real t_ctid, pointing to
6235 * itself like it does on regular tuples.
6236 */
6237 htup->t_ctid = tuple->t_self;
6238
6239 /* XLOG stuff */
6240 if (RelationNeedsWAL(relation))
6241 {
6242 xl_heap_confirm xlrec;
6243 XLogRecPtr recptr;
6244
6245 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6246
6247 XLogBeginInsert();
6248
6249 /* We want the same filtering on this as on a plain insert */
6250 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
6251
6252 XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
6253 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6254
6255 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
6256
6257 PageSetLSN(page, recptr);
6258 }
6259
6260 END_CRIT_SECTION();
6261
6262 UnlockReleaseBuffer(buffer);
6263 }
6264
6265 /*
6266 * heap_abort_speculative - kill a speculatively inserted tuple
6267 *
6268 * Marks a tuple that was speculatively inserted in the same command as dead,
6269 * by setting its xmin as invalid. That makes it immediately appear as dead
6270 * to all transactions, including our own. In particular, it makes
6271 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6272 * inserting a duplicate key value won't unnecessarily wait for our whole
6273 * transaction to finish (it'll just wait for our speculative insertion to
6274 * finish).
6275 *
6276 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6277 * that arise due to a mutual dependency that is not user visible. By
6278 * definition, unprincipled deadlocks cannot be prevented by the user
6279 * reordering lock acquisition in client code, because the implementation level
6280 * lock acquisitions are not under the user's direct control. If speculative
6281 * inserters did not take this precaution, then under high concurrency they
6282 * could deadlock with each other, which would not be acceptable.
6283 *
6284 * This is somewhat redundant with heap_delete, but we prefer to have a
6285 * dedicated routine with stripped down requirements. Note that this is also
6286 * used to delete the TOAST tuples created during speculative insertion.
6287 *
6288 * This routine does not affect logical decoding as it only looks at
6289 * confirmation records.
6290 */
6291 void
6292 heap_abort_speculative(Relation relation, HeapTuple tuple)
6293 {
6294 TransactionId xid = GetCurrentTransactionId();
6295 ItemPointer tid = &(tuple->t_self);
6296 ItemId lp;
6297 HeapTupleData tp;
6298 Page page;
6299 BlockNumber block;
6300 Buffer buffer;
6301 TransactionId prune_xid;
6302
6303 Assert(ItemPointerIsValid(tid));
6304
6305 block = ItemPointerGetBlockNumber(tid);
6306 buffer = ReadBuffer(relation, block);
6307 page = BufferGetPage(buffer);
6308
6309 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6310
6311 /*
6312 * Page can't be all visible; we just inserted into it, and we are still
6313 * running.
6314 */
6315 Assert(!PageIsAllVisible(page));
6316
6317 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
6318 Assert(ItemIdIsNormal(lp));
6319
6320 tp.t_tableOid = RelationGetRelid(relation);
6321 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6322 tp.t_len = ItemIdGetLength(lp);
6323 tp.t_self = *tid;
6324
6325 /*
6326 * Sanity check that the tuple really is a speculatively inserted tuple,
6327 * inserted by us.
6328 */
6329 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6330 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6331 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6332 elog(ERROR, "attempted to kill a non-speculative tuple");
6333 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
6334
6335 /*
6336 * No need to check for serializable conflicts here. There is never a
6337 * need for a combocid, either. No need to extract replica identity, or
6338 * do anything special with infomask bits.
6339 */
6340
6341 START_CRIT_SECTION();
6342
6343 /*
6344 * The tuple will become DEAD immediately. Flag that this page is a
6345 * candidate for pruning by setting xmin to TransactionXmin. While not
6346 * immediately prunable, it is the oldest xid we can cheaply determine
6347 * that's safe against wraparound / being older than the table's
6348 * relfrozenxid. To defend against the unlikely case of a new relation
6349 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6350 * if so (vacuum can't subsequently move relfrozenxid to beyond
6351 * TransactionXmin, so there's no race here).
6352 */
6353 Assert(TransactionIdIsValid(TransactionXmin));
6354 if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
6355 prune_xid = relation->rd_rel->relfrozenxid;
6356 else
6357 prune_xid = TransactionXmin;
6358 PageSetPrunable(page, prune_xid);
6359
6360 /* store transaction information of xact deleting the tuple */
6361 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
6362 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6363
6364 /*
6365 * Set the tuple header xmin to InvalidTransactionId. This makes the
6366 * tuple immediately invisible to everyone. (In particular, to any
6367 * transactions waiting on the speculative token, woken up later.)
6368 */
6369 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
6370
6371 /* Clear the speculative insertion token too */
6372 tp.t_data->t_ctid = tp.t_self;
6373
6374 MarkBufferDirty(buffer);
6375
6376 /*
6377 * XLOG stuff
6378 *
6379 * The WAL records generated here match heap_delete(). The same recovery
6380 * routines are used.
6381 */
6382 if (RelationNeedsWAL(relation))
6383 {
6384 xl_heap_delete xlrec;
6385 XLogRecPtr recptr;
6386
6387 xlrec.flags = XLH_DELETE_IS_SUPER;
6388 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6389 tp.t_data->t_infomask2);
6390 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
6391 xlrec.xmax = xid;
6392
6393 XLogBeginInsert();
6394 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
6395 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6396
6397 /* No replica identity & replication origin logged */
6398
6399 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
6400
6401 PageSetLSN(page, recptr);
6402 }
6403
6404 END_CRIT_SECTION();
6405
6406 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6407
6408 if (HeapTupleHasExternal(&tp))
6409 {
6410 Assert(!IsToastRelation(relation));
6411 toast_delete(relation, &tp, true);
6412 }
6413
6414 /*
6415 * Never need to mark tuple for invalidation, since catalogs don't support
6416 * speculative insertion
6417 */
6418
6419 /* Now we can release the buffer */
6420 ReleaseBuffer(buffer);
6421
6422 /* count deletion, as we counted the insertion too */
6423 pgstat_count_heap_delete(relation);
6424 }
6425
6426 /*
6427 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
6428 *
6429 * Overwriting violates both MVCC and transactional safety, so the uses
6430 * of this function in Postgres are extremely limited. Nonetheless we
6431 * find some places to use it.
6432 *
6433 * The tuple cannot change size, and therefore it's reasonable to assume
6434 * that its null bitmap (if any) doesn't change either. So we just
6435 * overwrite the data portion of the tuple without touching the null
6436 * bitmap or any of the header fields.
6437 *
6438 * tuple is an in-memory tuple structure containing the data to be written
6439 * over the target tuple. Also, tuple->t_self identifies the target tuple.
6440 */
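/*
 * Illustrative use (a hedged sketch; VACUUM's pg_class statistics updates
 * follow roughly this shape): take a copy of the target tuple, modify only
 * fixed-width fields so the length cannot change, then overwrite it:
 *
 *		HeapTuple	ctup = heap_copytuple(tuple);
 *		Form_pg_class pgcform = (Form_pg_class) GETSTRUCT(ctup);
 *
 *		pgcform->relpages = (int32) num_pages;
 *		heap_inplace_update(relation, ctup);
 */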
6441 void
6442 heap_inplace_update(Relation relation, HeapTuple tuple)
6443 {
6444 Buffer buffer;
6445 Page page;
6446 OffsetNumber offnum;
6447 ItemId lp = NULL;
6448 HeapTupleHeader htup;
6449 uint32 oldlen;
6450 uint32 newlen;
6451
6452 /*
6453 * For now, parallel operations are required to be strictly read-only.
6454 * Unlike a regular update, this should never create a combo CID, so it
6455 * might be possible to relax this restriction, but not without more
6456 * thought and testing. It's not clear that it would be useful, anyway.
6457 */
6458 if (IsInParallelMode())
6459 ereport(ERROR,
6460 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
6461 errmsg("cannot update tuples during a parallel operation")));
6462
6463 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6464 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6465 page = (Page) BufferGetPage(buffer);
6466
6467 offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6468 if (PageGetMaxOffsetNumber(page) >= offnum)
6469 lp = PageGetItemId(page, offnum);
6470
6471 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6472 elog(ERROR, "invalid lp");
6473
6474 htup = (HeapTupleHeader) PageGetItem(page, lp);
6475
6476 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
6477 newlen = tuple->t_len - tuple->t_data->t_hoff;
6478 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6479 elog(ERROR, "wrong tuple length");
6480
6481 /* NO EREPORT(ERROR) from here till changes are logged */
6482 START_CRIT_SECTION();
6483
6484 memcpy((char *) htup + htup->t_hoff,
6485 (char *) tuple->t_data + tuple->t_data->t_hoff,
6486 newlen);
6487
6488 MarkBufferDirty(buffer);
6489
6490 /* XLOG stuff */
6491 if (RelationNeedsWAL(relation))
6492 {
6493 xl_heap_inplace xlrec;
6494 XLogRecPtr recptr;
6495
6496 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6497
6498 XLogBeginInsert();
6499 XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
6500
6501 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6502 XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6503
6504 /* inplace updates aren't decoded atm, don't log the origin */
6505
6506 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6507
6508 PageSetLSN(page, recptr);
6509 }
6510
6511 END_CRIT_SECTION();
6512
6513 UnlockReleaseBuffer(buffer);
6514
6515 /*
6516 * Send out shared cache inval if necessary. Note that because we only
6517 * pass the new version of the tuple, this mustn't be used for any
6518 * operations that could change catcache lookup keys. But we aren't
6519 * bothering with index updates either, so that's true a fortiori.
6520 */
6521 if (!IsBootstrapProcessingMode())
6522 CacheInvalidateHeapTuple(relation, tuple, NULL);
6523 }
6524
6525 #define FRM_NOOP 0x0001
6526 #define FRM_INVALIDATE_XMAX 0x0002
6527 #define FRM_RETURN_IS_XID 0x0004
6528 #define FRM_RETURN_IS_MULTI 0x0008
6529 #define FRM_MARK_COMMITTED 0x0010
6530
6531 /*
6532 * FreezeMultiXactId
6533 * Determine what to do during freezing when a tuple is marked by a
6534 * MultiXactId.
6535 *
6536 * NB -- this might have the side-effect of creating a new MultiXactId!
6537 *
6538 * "flags" is an output value; it's used to tell caller what to do on return.
6539 * Possible flags are:
6540 * FRM_NOOP
6541 * don't do anything -- keep existing Xmax
6542 * FRM_INVALIDATE_XMAX
6543 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6544 * FRM_RETURN_IS_XID
6545 * The Xid return value is a single update Xid to set as xmax.
6546 * FRM_MARK_COMMITTED
6547 * Xmax can be marked as HEAP_XMAX_COMMITTED
6548 * FRM_RETURN_IS_MULTI
6549 * The return value is a new MultiXactId to set as new Xmax.
6550 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6551 */
6552 static TransactionId
6553 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6554 TransactionId relfrozenxid, TransactionId relminmxid,
6555 TransactionId cutoff_xid, MultiXactId cutoff_multi,
6556 uint16 *flags)
6557 {
6558 TransactionId xid = InvalidTransactionId;
6559 int i;
6560 MultiXactMember *members;
6561 int nmembers;
6562 bool need_replace;
6563 int nnewmembers;
6564 MultiXactMember *newmembers;
6565 bool has_lockers;
6566 TransactionId update_xid;
6567 bool update_committed;
6568
6569 *flags = 0;
6570
6571 	/* We should only be called on Multis */
6572 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6573
6574 if (!MultiXactIdIsValid(multi) ||
6575 HEAP_LOCKED_UPGRADED(t_infomask))
6576 {
6577 /* Ensure infomask bits are appropriately set/reset */
6578 *flags |= FRM_INVALIDATE_XMAX;
6579 return InvalidTransactionId;
6580 }
6581 else if (MultiXactIdPrecedes(multi, relminmxid))
6582 ereport(ERROR,
6583 (errcode(ERRCODE_DATA_CORRUPTED),
6584 errmsg_internal("found multixact %u from before relminmxid %u",
6585 multi, relminmxid)));
6586 else if (MultiXactIdPrecedes(multi, cutoff_multi))
6587 {
6588 /*
6589 * This old multi cannot possibly have members still running, but
6590 * verify just in case. If it was a locker only, it can be removed
6591 * without any further consideration; but if it contained an update,
6592 * we might need to preserve it.
6593 */
6594 if (MultiXactIdIsRunning(multi,
6595 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6596 ereport(ERROR,
6597 (errcode(ERRCODE_DATA_CORRUPTED),
6598 errmsg_internal("multixact %u from before cutoff %u found to be still running",
6599 multi, cutoff_multi)));
6600
6601 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6602 {
6603 *flags |= FRM_INVALIDATE_XMAX;
6604 xid = InvalidTransactionId; /* not strictly necessary */
6605 }
6606 else
6607 {
6608 /* replace multi by update xid */
6609 xid = MultiXactIdGetUpdateXid(multi, t_infomask);
6610
6611 /* wasn't only a lock, xid needs to be valid */
6612 Assert(TransactionIdIsValid(xid));
6613
6614 if (TransactionIdPrecedes(xid, relfrozenxid))
6615 ereport(ERROR,
6616 (errcode(ERRCODE_DATA_CORRUPTED),
6617 errmsg_internal("found update xid %u from before relfrozenxid %u",
6618 xid, relfrozenxid)));
6619
6620 /*
6621 * If the xid is older than the cutoff, it has to have aborted,
6622 * otherwise the tuple would have gotten pruned away.
6623 */
6624 if (TransactionIdPrecedes(xid, cutoff_xid))
6625 {
6626 if (TransactionIdDidCommit(xid))
6627 ereport(ERROR,
6628 (errcode(ERRCODE_DATA_CORRUPTED),
6629 errmsg_internal("cannot freeze committed update xid %u", xid)));
6630 *flags |= FRM_INVALIDATE_XMAX;
6631 xid = InvalidTransactionId; /* not strictly necessary */
6632 }
6633 else
6634 {
6635 *flags |= FRM_RETURN_IS_XID;
6636 }
6637 }
6638
6639 return xid;
6640 }
6641
6642 /*
6643 * This multixact might have or might not have members still running, but
6644 * we know it's valid and is newer than the cutoff point for multis.
6645 * However, some member(s) of it may be below the cutoff for Xids, so we
6646 * need to walk the whole members array to figure out what to do, if
6647 * anything.
6648 */
6649
6650 nmembers =
6651 GetMultiXactIdMembers(multi, &members, false,
6652 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6653 if (nmembers <= 0)
6654 {
6655 /* Nothing worth keeping */
6656 *flags |= FRM_INVALIDATE_XMAX;
6657 return InvalidTransactionId;
6658 }
6659
6660 /* is there anything older than the cutoff? */
6661 need_replace = false;
6662 for (i = 0; i < nmembers; i++)
6663 {
6664 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6665 {
6666 need_replace = true;
6667 break;
6668 }
6669 }
6670
6671 /*
6672 * In the simplest case, there is no member older than the cutoff; we can
6673 * keep the existing MultiXactId as is.
6674 */
6675 if (!need_replace)
6676 {
6677 *flags |= FRM_NOOP;
6678 pfree(members);
6679 return InvalidTransactionId;
6680 }
6681
6682 /*
6683 	 * If the multi needs to be updated, figure out which members we need
6684 * to keep.
6685 */
6686 nnewmembers = 0;
6687 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6688 has_lockers = false;
6689 update_xid = InvalidTransactionId;
6690 update_committed = false;
6691
6692 for (i = 0; i < nmembers; i++)
6693 {
6694 /*
6695 * Determine whether to keep this member or ignore it.
6696 */
6697 if (ISUPDATE_from_mxstatus(members[i].status))
6698 {
6699 TransactionId xid = members[i].xid;
6700
6701 Assert(TransactionIdIsValid(xid));
6702 if (TransactionIdPrecedes(xid, relfrozenxid))
6703 ereport(ERROR,
6704 (errcode(ERRCODE_DATA_CORRUPTED),
6705 errmsg_internal("found update xid %u from before relfrozenxid %u",
6706 xid, relfrozenxid)));
6707
6708 /*
6709 * It's an update; should we keep it? If the transaction is known
6710 * aborted or crashed then it's okay to ignore it, otherwise not.
6711 * Note that an updater older than cutoff_xid cannot possibly be
6712 * committed, because HeapTupleSatisfiesVacuum would have returned
6713 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
6714 *
6715 * As with all tuple visibility routines, it's critical to test
6716 * TransactionIdIsInProgress before TransactionIdDidCommit,
6717 * because of race conditions explained in detail in tqual.c.
6718 */
6719 if (TransactionIdIsCurrentTransactionId(xid) ||
6720 TransactionIdIsInProgress(xid))
6721 {
6722 Assert(!TransactionIdIsValid(update_xid));
6723 update_xid = xid;
6724 }
6725 else if (TransactionIdDidCommit(xid))
6726 {
6727 /*
6728 * The transaction committed, so we can tell caller to set
6729 * HEAP_XMAX_COMMITTED. (We can only do this because we know
6730 * the transaction is not running.)
6731 */
6732 Assert(!TransactionIdIsValid(update_xid));
6733 update_committed = true;
6734 update_xid = xid;
6735 }
6736 else
6737 {
6738 /*
6739 * Not in progress, not committed -- must be aborted or
6740 * crashed; we can ignore it.
6741 */
6742 }
6743
6744 /*
6745 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6746 * update Xid cannot possibly be older than the xid cutoff. The
6747 * presence of such a tuple would cause corruption, so be paranoid
6748 * and check.
6749 */
6750 if (TransactionIdIsValid(update_xid) &&
6751 TransactionIdPrecedes(update_xid, cutoff_xid))
6752 ereport(ERROR,
6753 (errcode(ERRCODE_DATA_CORRUPTED),
6754 errmsg_internal("found update xid %u from before xid cutoff %u",
6755 update_xid, cutoff_xid)));
6756
6757 /*
6758 * If we determined that it's an Xid corresponding to an update
6759 * that must be retained, additionally add it to the list of
6760 * members of the new Multi, in case we end up using that. (We
6761 * might still decide to use only an update Xid and not a multi,
6762 * but it's easier to maintain the list as we walk the old members
6763 * list.)
6764 */
6765 if (TransactionIdIsValid(update_xid))
6766 newmembers[nnewmembers++] = members[i];
6767 }
6768 else
6769 {
6770 /* We only keep lockers if they are still running */
6771 if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6772 TransactionIdIsInProgress(members[i].xid))
6773 {
6774 /* running locker cannot possibly be older than the cutoff */
6775 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6776 newmembers[nnewmembers++] = members[i];
6777 has_lockers = true;
6778 }
6779 }
6780 }
6781
6782 pfree(members);
6783
6784 if (nnewmembers == 0)
6785 {
6786 /* nothing worth keeping!? Tell caller to remove the whole thing */
6787 *flags |= FRM_INVALIDATE_XMAX;
6788 xid = InvalidTransactionId;
6789 }
6790 else if (TransactionIdIsValid(update_xid) && !has_lockers)
6791 {
6792 /*
6793 * If there's a single member and it's an update, pass it back alone
6794 * without creating a new Multi. (XXX we could do this when there's a
6795 * single remaining locker, too, but that would complicate the API too
6796 * much; moreover, the case with the single updater is more
6797 * interesting, because those are longer-lived.)
6798 */
6799 Assert(nnewmembers == 1);
6800 *flags |= FRM_RETURN_IS_XID;
6801 if (update_committed)
6802 *flags |= FRM_MARK_COMMITTED;
6803 xid = update_xid;
6804 }
6805 else
6806 {
6807 /*
6808 * Create a new multixact with the surviving members of the previous
6809 * one, to set as new Xmax in the tuple.
6810 */
6811 xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6812 *flags |= FRM_RETURN_IS_MULTI;
6813 }
6814
6815 pfree(newmembers);
6816
6817 return xid;
6818 }
6819
6820 /*
6821 * heap_prepare_freeze_tuple
6822 *
6823 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6824 * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6825 * setup enough state (in the *frz output argument) to later execute and
6826 * WAL-log what we would need to do, and return true. Return false if nothing
6827 * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6828 * will be totally frozen after these operations are performed and false if
6829 * more freezing will eventually be required.
6830 *
6831 * Caller is responsible for setting the offset field, if appropriate.
6832 *
6833 * It is assumed that the caller has checked the tuple with
6834 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6835 * (else we should be removing the tuple, not freezing it).
6836 *
6837 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6838 * XID older than it could neither be running nor seen as running by any
6839 * open transaction. This ensures that the replacement will not change
6840 * anyone's idea of the tuple state.
6841 * Similarly, cutoff_multi must be less than or equal to the smallest
6842 * MultiXactId used by any transaction currently open.
6843 *
6844 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6845 * that buffer.
6846 *
6847 * NB: It is not enough to set hint bits to indicate something is
6848 * committed/invalid -- they might not be set on a standby, or after crash
6849 * recovery. We really need to remove old xids.
6850 */
6851 bool
6852 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6853 TransactionId relfrozenxid, TransactionId relminmxid,
6854 TransactionId cutoff_xid, TransactionId cutoff_multi,
6855 xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6856 {
6857 bool changed = false;
6858 bool xmax_already_frozen = false;
6859 bool xmin_frozen;
6860 bool freeze_xmax;
6861 TransactionId xid;
6862
6863 frz->frzflags = 0;
6864 frz->t_infomask2 = tuple->t_infomask2;
6865 frz->t_infomask = tuple->t_infomask;
6866 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6867
6868 /* Process xmin */
6869 xid = HeapTupleHeaderGetXmin(tuple);
6870 xmin_frozen = ((xid == FrozenTransactionId) ||
6871 HeapTupleHeaderXminFrozen(tuple));
6872 if (TransactionIdIsNormal(xid))
6873 {
6874 if (TransactionIdPrecedes(xid, relfrozenxid))
6875 ereport(ERROR,
6876 (errcode(ERRCODE_DATA_CORRUPTED),
6877 errmsg_internal("found xmin %u from before relfrozenxid %u",
6878 xid, relfrozenxid)));
6879
6880 if (TransactionIdPrecedes(xid, cutoff_xid))
6881 {
6882 if (!TransactionIdDidCommit(xid))
6883 ereport(ERROR,
6884 (errcode(ERRCODE_DATA_CORRUPTED),
6885 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6886 xid, cutoff_xid)));
6887
6888 frz->t_infomask |= HEAP_XMIN_FROZEN;
6889 changed = true;
6890 xmin_frozen = true;
6891 }
6892 }
6893
6894 /*
6895 * Process xmax. To thoroughly examine the current Xmax value we need to
6896 * resolve a MultiXactId to its member Xids, in case some of them are
6897 * below the given cutoff for Xids. In that case, those values might need
6898 * freezing, too. Also, if a multi needs freezing, we cannot simply take
6899 * it out --- if there's a live updater Xid, it needs to be kept.
6900 *
6901 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6902 */
6903 xid = HeapTupleHeaderGetRawXmax(tuple);
6904
6905 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6906 {
6907 TransactionId newxmax;
6908 uint16 flags;
6909
6910 newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6911 relfrozenxid, relminmxid,
6912 cutoff_xid, cutoff_multi, &flags);
6913
6914 freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6915
6916 if (flags & FRM_RETURN_IS_XID)
6917 {
6918 /*
6919 * NB -- some of these transformations are only valid because we
6920 * know the return Xid is a tuple updater (i.e. not merely a
6921 * locker.) Also note that the only reason we don't explicitly
6922 * worry about HEAP_KEYS_UPDATED is because it lives in
6923 * t_infomask2 rather than t_infomask.
6924 */
6925 frz->t_infomask &= ~HEAP_XMAX_BITS;
6926 frz->xmax = newxmax;
6927 if (flags & FRM_MARK_COMMITTED)
6928 frz->t_infomask |= HEAP_XMAX_COMMITTED;
6929 changed = true;
6930 }
6931 else if (flags & FRM_RETURN_IS_MULTI)
6932 {
6933 uint16 newbits;
6934 uint16 newbits2;
6935
6936 /*
6937 * We can't use GetMultiXactIdHintBits directly on the new multi
6938 * here; that routine initializes the masks to all zeroes, which
6939 * would lose other bits we need. Doing it this way ensures all
6940 * unrelated bits remain untouched.
6941 */
6942 frz->t_infomask &= ~HEAP_XMAX_BITS;
6943 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6944 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6945 frz->t_infomask |= newbits;
6946 frz->t_infomask2 |= newbits2;
6947
6948 frz->xmax = newxmax;
6949
6950 changed = true;
6951 }
6952 }
6953 else if (TransactionIdIsNormal(xid))
6954 {
6955 if (TransactionIdPrecedes(xid, relfrozenxid))
6956 ereport(ERROR,
6957 (errcode(ERRCODE_DATA_CORRUPTED),
6958 errmsg_internal("found xmax %u from before relfrozenxid %u",
6959 xid, relfrozenxid)));
6960
6961 if (TransactionIdPrecedes(xid, cutoff_xid))
6962 {
6963 /*
6964 * If we freeze xmax, make absolutely sure that it's not an XID
6965 * that is important. (Note, a lock-only xmax can be removed
6966 * independent of committedness, since a committed lock holder has
6967 * released the lock).
6968 */
6969 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6970 TransactionIdDidCommit(xid))
6971 ereport(ERROR,
6972 (errcode(ERRCODE_DATA_CORRUPTED),
6973 errmsg_internal("cannot freeze committed xmax %u",
6974 xid)));
6975 freeze_xmax = true;
6976 }
6977 else
6978 freeze_xmax = false;
6979 }
6980 else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6981 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6982 {
6983 freeze_xmax = false;
6984 xmax_already_frozen = true;
6985 }
6986 else
6987 ereport(ERROR,
6988 (errcode(ERRCODE_DATA_CORRUPTED),
6989 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6990 xid, tuple->t_infomask)));
6991
6992 if (freeze_xmax)
6993 {
6994 Assert(!xmax_already_frozen);
6995
6996 frz->xmax = InvalidTransactionId;
6997
6998 /*
6999 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7000 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7001 * Also get rid of the HEAP_KEYS_UPDATED bit.
7002 */
7003 frz->t_infomask &= ~HEAP_XMAX_BITS;
7004 frz->t_infomask |= HEAP_XMAX_INVALID;
7005 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7006 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7007 changed = true;
7008 }
7009
7010 /*
7011 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
7012 * we support having MOVED_OFF/MOVED_IN tuples in the database.
7013 */
7014 if (tuple->t_infomask & HEAP_MOVED)
7015 {
7016 xid = HeapTupleHeaderGetXvac(tuple);
7017
7018 /*
7019 * For Xvac, we ignore the cutoff_xid and just always perform the
7020 * freeze operation. The oldest release in which such a value can
7021 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
7022 * was removed in PostgreSQL 9.0. Note that if we were to respect
7023 		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
7024 * when we skipped freezing on that basis.
7025 */
7026 if (TransactionIdIsNormal(xid))
7027 {
7028 /*
7029 * If a MOVED_OFF tuple is not dead, the xvac transaction must
7030 * have failed; whereas a non-dead MOVED_IN tuple must mean the
7031 * xvac transaction succeeded.
7032 */
7033 if (tuple->t_infomask & HEAP_MOVED_OFF)
7034 frz->frzflags |= XLH_INVALID_XVAC;
7035 else
7036 frz->frzflags |= XLH_FREEZE_XVAC;
7037
7038 /*
7039 * Might as well fix the hint bits too; usually XMIN_COMMITTED
7040 * will already be set here, but there's a small chance not.
7041 */
7042 Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
7043 frz->t_infomask |= HEAP_XMIN_COMMITTED;
7044 changed = true;
7045 }
7046 }
7047
7048 *totally_frozen_p = (xmin_frozen &&
7049 (freeze_xmax || xmax_already_frozen));
7050 return changed;
7051 }
7052
7053 /*
7054 * heap_execute_freeze_tuple
7055 * Execute the prepared freezing of a tuple.
7056 *
7057 * Caller is responsible for ensuring that no other backend can access the
7058 * storage underlying this tuple, either by holding an exclusive lock on the
7059 * buffer containing it (which is what lazy VACUUM does), or by having it be
7060 * in private storage (which is what CLUSTER and friends do).
7061 *
7062 * Note: it might seem we could make the changes without exclusive lock, since
7063 * TransactionId read/write is assumed atomic anyway. However there is a race
7064 * condition: someone who just fetched an old XID that we overwrite here could
7065 * conceivably not finish checking the XID against pg_xact before we finish
7066 * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
7067 * exclusive lock ensures no other backend is in process of checking the
7068 * tuple status. Also, getting exclusive lock makes it safe to adjust the
7069 * infomask bits.
7070 *
7071 * NB: All code in here must be safe to execute during crash recovery!
7072 */
7073 void
7074 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
7075 {
7076 HeapTupleHeaderSetXmax(tuple, frz->xmax);
7077
7078 if (frz->frzflags & XLH_FREEZE_XVAC)
7079 HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
7080
7081 if (frz->frzflags & XLH_INVALID_XVAC)
7082 HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
7083
7084 tuple->t_infomask = frz->t_infomask;
7085 tuple->t_infomask2 = frz->t_infomask2;
7086 }
7087
7088 /*
7089 * heap_freeze_tuple
7090 * Freeze tuple in place, without WAL logging.
7091 *
7092 * Useful for callers like CLUSTER that perform their own WAL logging.
7093 */
7094 bool
7095 heap_freeze_tuple(HeapTupleHeader tuple,
7096 TransactionId relfrozenxid, TransactionId relminmxid,
7097 TransactionId cutoff_xid, TransactionId cutoff_multi)
7098 {
7099 xl_heap_freeze_tuple frz;
7100 bool do_freeze;
7101 bool tuple_totally_frozen;
7102
7103 do_freeze = heap_prepare_freeze_tuple(tuple,
7104 relfrozenxid, relminmxid,
7105 cutoff_xid, cutoff_multi,
7106 &frz, &tuple_totally_frozen);
7107
7108 /*
7109 * Note that because this is not a WAL-logged operation, we don't need to
7110 * fill in the offset in the freeze record.
7111 */
7112
7113 if (do_freeze)
7114 heap_execute_freeze_tuple(tuple, &frz);
7115 return do_freeze;
7116 }
7117
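/*
 * Condensed, illustrative sketch of the WAL-logged freezing path used by
 * lazy VACUUM (cf. lazy_scan_heap in vacuumlazy.c): freeze plans are first
 * collected for a whole page, then executed and logged under a single
 * critical section.  Variable names below are illustrative only.
 *
 *	if (heap_prepare_freeze_tuple(tuple.t_data, relfrozenxid, relminmxid,
 *								  FreezeLimit, MultiXactCutoff,
 *								  &frozen[nfrozen], &tuple_totally_frozen))
 *		frozen[nfrozen++].offset = offnum;
 *
 *	... once the whole page has been scanned ...
 *
 *	START_CRIT_SECTION();
 *	for (i = 0; i < nfrozen; i++)
 *	{
 *		ItemId		itemid = PageGetItemId(page, frozen[i].offset);
 *		HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
 *
 *		heap_execute_freeze_tuple(htup, &frozen[i]);
 *	}
 *	MarkBufferDirty(buf);
 *	if (RelationNeedsWAL(onerel))
 *		PageSetLSN(page, log_heap_freeze(onerel, buf, FreezeLimit,
 *										 frozen, nfrozen));
 *	END_CRIT_SECTION();
 */
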
7118 /*
7119 * For a given MultiXactId, return the hint bits that should be set in the
7120 * tuple's infomask.
7121 *
7122 * Normally this should be called for a multixact that was just created, and
7123 * so is on our local cache, so the GetMembers call is fast.
7124 */
7125 static void
7126 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
7127 uint16 *new_infomask2)
7128 {
7129 int nmembers;
7130 MultiXactMember *members;
7131 int i;
7132 uint16 bits = HEAP_XMAX_IS_MULTI;
7133 uint16 bits2 = 0;
7134 bool has_update = false;
7135 LockTupleMode strongest = LockTupleKeyShare;
7136
7137 /*
7138 * We only use this in multis we just created, so they cannot be values
7139 * pre-pg_upgrade.
7140 */
7141 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7142
7143 for (i = 0; i < nmembers; i++)
7144 {
7145 LockTupleMode mode;
7146
7147 /*
7148 * Remember the strongest lock mode held by any member of the
7149 * multixact.
7150 */
7151 mode = TUPLOCK_from_mxstatus(members[i].status);
7152 if (mode > strongest)
7153 strongest = mode;
7154
7155 /* See what other bits we need */
7156 switch (members[i].status)
7157 {
7158 case MultiXactStatusForKeyShare:
7159 case MultiXactStatusForShare:
7160 case MultiXactStatusForNoKeyUpdate:
7161 break;
7162
7163 case MultiXactStatusForUpdate:
7164 bits2 |= HEAP_KEYS_UPDATED;
7165 break;
7166
7167 case MultiXactStatusNoKeyUpdate:
7168 has_update = true;
7169 break;
7170
7171 case MultiXactStatusUpdate:
7172 bits2 |= HEAP_KEYS_UPDATED;
7173 has_update = true;
7174 break;
7175 }
7176 }
7177
7178 if (strongest == LockTupleExclusive ||
7179 strongest == LockTupleNoKeyExclusive)
7180 bits |= HEAP_XMAX_EXCL_LOCK;
7181 else if (strongest == LockTupleShare)
7182 bits |= HEAP_XMAX_SHR_LOCK;
7183 else if (strongest == LockTupleKeyShare)
7184 bits |= HEAP_XMAX_KEYSHR_LOCK;
7185
7186 if (!has_update)
7187 bits |= HEAP_XMAX_LOCK_ONLY;
7188
7189 if (nmembers > 0)
7190 pfree(members);
7191
7192 *new_infomask = bits;
7193 *new_infomask2 = bits2;
7194 }
7195
7196 /*
7197 * MultiXactIdGetUpdateXid
7198 *
7199 * Given a multixact Xmax and corresponding infomask, which does not have the
7200 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7201 * transaction.
7202 *
7203 * Caller is expected to check the status of the updating transaction, if
7204 * necessary.
7205 */
7206 static TransactionId
7207 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
7208 {
7209 TransactionId update_xact = InvalidTransactionId;
7210 MultiXactMember *members;
7211 int nmembers;
7212
7213 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7214 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7215
7216 /*
7217 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7218 * pre-pg_upgrade.
7219 */
7220 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7221
7222 if (nmembers > 0)
7223 {
7224 int i;
7225
7226 for (i = 0; i < nmembers; i++)
7227 {
7228 /* Ignore lockers */
7229 if (!ISUPDATE_from_mxstatus(members[i].status))
7230 continue;
7231
7232 /* there can be at most one updater */
7233 Assert(update_xact == InvalidTransactionId);
7234 update_xact = members[i].xid;
7235 #ifndef USE_ASSERT_CHECKING
7236
7237 /*
7238 * in an assert-enabled build, walk the whole array to ensure
7239 * there's no other updater.
7240 */
7241 break;
7242 #endif
7243 }
7244
7245 pfree(members);
7246 }
7247
7248 return update_xact;
7249 }
7250
7251 /*
7252 * HeapTupleGetUpdateXid
7253 * As above, but use a HeapTupleHeader
7254 *
7255 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7256 * checking the hint bits.
7257 */
7258 TransactionId
7259 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
7260 {
7261 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
7262 tuple->t_infomask);
7263 }
7264
7265 /*
7266 * Does the given multixact conflict with the current transaction grabbing a
7267 * tuple lock of the given strength?
7268 *
7269 * The passed infomask pairs up with the given multixact in the tuple header.
7270 *
7271 * If current_is_member is not NULL, it is set to 'true' if the current
7272 * transaction is a member of the given multixact.
7273 */
7274 static bool
7275 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
7276 LockTupleMode lockmode, bool *current_is_member)
7277 {
7278 int nmembers;
7279 MultiXactMember *members;
7280 bool result = false;
7281 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7282
7283 if (HEAP_LOCKED_UPGRADED(infomask))
7284 return false;
7285
7286 nmembers = GetMultiXactIdMembers(multi, &members, false,
7287 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7288 if (nmembers >= 0)
7289 {
7290 int i;
7291
7292 for (i = 0; i < nmembers; i++)
7293 {
7294 TransactionId memxid;
7295 LOCKMODE memlockmode;
7296
7297 if (result && (current_is_member == NULL || *current_is_member))
7298 break;
7299
7300 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7301
7302 /* ignore members from current xact (but track their presence) */
7303 memxid = members[i].xid;
7304 if (TransactionIdIsCurrentTransactionId(memxid))
7305 {
7306 if (current_is_member != NULL)
7307 *current_is_member = true;
7308 continue;
7309 }
7310 else if (result)
7311 continue;
7312
7313 /* ignore members that don't conflict with the lock we want */
7314 if (!DoLockModesConflict(memlockmode, wanted))
7315 continue;
7316
7317 if (ISUPDATE_from_mxstatus(members[i].status))
7318 {
7319 /* ignore aborted updaters */
7320 if (TransactionIdDidAbort(memxid))
7321 continue;
7322 }
7323 else
7324 {
7325 /* ignore lockers-only that are no longer in progress */
7326 if (!TransactionIdIsInProgress(memxid))
7327 continue;
7328 }
7329
7330 /*
7331 * Whatever remains are either live lockers that conflict with our
7332 			 * wanted lock, or updaters that are not aborted.  Those conflict
7333 * with what we want. Set up to return true, but keep going to
7334 * look for the current transaction among the multixact members,
7335 * if needed.
7336 */
7337 result = true;
7338 }
7339 pfree(members);
7340 }
7341
7342 return result;
7343 }
7344
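/*
 * Illustrative sketch of the intended use (cf. heap_lock_tuple): check for a
 * conflict first, and only sleep on the multixact's members when one exists.
 * 'xwait', 'status', 'mode' and friends stand in for the caller's local
 * state and are not defined here.
 *
 *	if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
 *								mode, &current_is_member))
 *		MultiXactIdWait((MultiXactId) xwait, status, infomask,
 *						relation, &tuple->t_self, XLTW_Lock, &remain);
 */
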
7345 /*
7346 * Do_MultiXactIdWait
7347 * Actual implementation for the two functions below.
7348 *
7349 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7350 * needed to ensure we only sleep on conflicting members, and the infomask is
7351 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7352 * indicates whether to use conditional lock acquisition, to allow callers to
7353 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7354 * context information for error messages. 'remaining', if not NULL, receives
7355 * the number of members that are still running, including any (non-aborted)
7356 * subtransactions of our own transaction.
7357 *
7358 * We do this by sleeping on each member using XactLockTableWait. Any
7359 * members that belong to the current backend are *not* waited for, however;
7360 * this would not merely be useless but would lead to Assert failure inside
7361 * XactLockTableWait. By the time this returns, it is certain that all
7362 * transactions *of other backends* that were members of the MultiXactId
7363 * that conflict with the requested status are dead (and no new ones can have
7364 * been added, since it is not legal to add members to an existing
7365 * MultiXactId).
7366 *
7367 * But by the time we finish sleeping, someone else may have changed the Xmax
7368 * of the containing tuple, so the caller needs to iterate on us somehow.
7369 *
7370 * Note that in case we return false, the number of remaining members is
7371 * not to be trusted.
7372 */
7373 static bool
7374 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7375 uint16 infomask, bool nowait,
7376 Relation rel, ItemPointer ctid, XLTW_Oper oper,
7377 int *remaining)
7378 {
7379 bool result = true;
7380 MultiXactMember *members;
7381 int nmembers;
7382 int remain = 0;
7383
7384 /* for pre-pg_upgrade tuples, no need to sleep at all */
7385 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7386 GetMultiXactIdMembers(multi, &members, false,
7387 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7388
7389 if (nmembers >= 0)
7390 {
7391 int i;
7392
7393 for (i = 0; i < nmembers; i++)
7394 {
7395 TransactionId memxid = members[i].xid;
7396 MultiXactStatus memstatus = members[i].status;
7397
7398 if (TransactionIdIsCurrentTransactionId(memxid))
7399 {
7400 remain++;
7401 continue;
7402 }
7403
7404 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
7405 LOCKMODE_from_mxstatus(status)))
7406 {
7407 if (remaining && TransactionIdIsInProgress(memxid))
7408 remain++;
7409 continue;
7410 }
7411
7412 /*
7413 * This member conflicts with our multi, so we have to sleep (or
7414 * return failure, if asked to avoid waiting.)
7415 *
7416 * Note that we don't set up an error context callback ourselves,
7417 * but instead we pass the info down to XactLockTableWait. This
7418 * might seem a bit wasteful because the context is set up and
7419 			 * torn down for each member of the multixact, but in reality it
7420 * should be barely noticeable, and it avoids duplicate code.
7421 */
7422 if (nowait)
7423 {
7424 result = ConditionalXactLockTableWait(memxid);
7425 if (!result)
7426 break;
7427 }
7428 else
7429 XactLockTableWait(memxid, rel, ctid, oper);
7430 }
7431
7432 pfree(members);
7433 }
7434
7435 if (remaining)
7436 *remaining = remain;
7437
7438 return result;
7439 }
7440
7441 /*
7442 * MultiXactIdWait
7443 * Sleep on a MultiXactId.
7444 *
7445 * By the time we finish sleeping, someone else may have changed the Xmax
7446 * of the containing tuple, so the caller needs to iterate on us somehow.
7447 *
7448 * We return (in *remaining, if not NULL) the number of members that are still
7449 * running, including any (non-aborted) subtransactions of our own transaction.
7450 */
7451 static void
7452 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7453 Relation rel, ItemPointer ctid, XLTW_Oper oper,
7454 int *remaining)
7455 {
7456 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7457 rel, ctid, oper, remaining);
7458 }
7459
7460 /*
7461 * ConditionalMultiXactIdWait
7462 * As above, but only lock if we can get the lock without blocking.
7463 *
7464 * By the time we finish sleeping, someone else may have changed the Xmax
7465 * of the containing tuple, so the caller needs to iterate on us somehow.
7466 *
7467 * If the multixact is now all gone, return true. Returns false if some
7468 * transactions might still be running.
7469 *
7470 * We return (in *remaining, if not NULL) the number of members that are still
7471 * running, including any (non-aborted) subtransactions of our own transaction.
7472 */
7473 static bool
7474 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7475 uint16 infomask, Relation rel, int *remaining)
7476 {
7477 return Do_MultiXactIdWait(multi, status, infomask, true,
7478 rel, NULL, XLTW_None, remaining);
7479 }
7480
7481 /*
7482 * heap_tuple_needs_eventual_freeze
7483 *
7484 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7485 * will eventually require freezing. Similar to heap_tuple_needs_freeze,
7486 * but there's no cutoff, since we're trying to figure out whether freezing
7487 * will ever be needed, not whether it's needed now.
7488 */
7489 bool
7490 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7491 {
7492 TransactionId xid;
7493
7494 /*
7495 * If xmin is a normal transaction ID, this tuple is definitely not
7496 * frozen.
7497 */
7498 xid = HeapTupleHeaderGetXmin(tuple);
7499 if (TransactionIdIsNormal(xid))
7500 return true;
7501
7502 /*
7503 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7504 */
7505 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7506 {
7507 MultiXactId multi;
7508
7509 multi = HeapTupleHeaderGetRawXmax(tuple);
7510 if (MultiXactIdIsValid(multi))
7511 return true;
7512 }
7513 else
7514 {
7515 xid = HeapTupleHeaderGetRawXmax(tuple);
7516 if (TransactionIdIsNormal(xid))
7517 return true;
7518 }
7519
7520 if (tuple->t_infomask & HEAP_MOVED)
7521 {
7522 xid = HeapTupleHeaderGetXvac(tuple);
7523 if (TransactionIdIsNormal(xid))
7524 return true;
7525 }
7526
7527 return false;
7528 }
7529
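/*
 * Illustrative sketch: a page-level "will anything here ever need freezing"
 * check can be built by applying this routine to every normal item.  The
 * iteration idiom below is generic and not tied to any particular caller.
 *
 *	all_frozen = true;
 *	maxoff = PageGetMaxOffsetNumber(page);
 *	for (offnum = FirstOffsetNumber;
 *		 offnum <= maxoff && all_frozen;
 *		 offnum = OffsetNumberNext(offnum))
 *	{
 *		ItemId		itemid = PageGetItemId(page, offnum);
 *
 *		if (!ItemIdIsNormal(itemid))
 *			continue;
 *		if (heap_tuple_needs_eventual_freeze(
 *				(HeapTupleHeader) PageGetItem(page, itemid)))
 *			all_frozen = false;
 *	}
 */
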
7530 /*
7531 * heap_tuple_needs_freeze
7532 *
7533 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7534 * are older than the specified cutoff XID or MultiXactId. If so, return true.
7535 *
7536 * It doesn't matter whether the tuple is alive or dead, we are checking
7537 * to see if a tuple needs to be removed or frozen to avoid wraparound.
7538 *
7539 * NB: Cannot rely on hint bits here, they might not be set after a crash or
7540 * on a standby.
7541 */
7542 bool
7543 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
7544 MultiXactId cutoff_multi, Buffer buf)
7545 {
7546 TransactionId xid;
7547
7548 xid = HeapTupleHeaderGetXmin(tuple);
7549 if (TransactionIdIsNormal(xid) &&
7550 TransactionIdPrecedes(xid, cutoff_xid))
7551 return true;
7552
7553 /*
7554 * The considerations for multixacts are complicated; look at
7555 * heap_prepare_freeze_tuple for justifications. This routine had better
7556 * be in sync with that one!
7557 */
7558 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7559 {
7560 MultiXactId multi;
7561
7562 multi = HeapTupleHeaderGetRawXmax(tuple);
7563 if (!MultiXactIdIsValid(multi))
7564 {
7565 /* no xmax set, ignore */
7566 ;
7567 }
7568 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7569 return true;
7570 else if (MultiXactIdPrecedes(multi, cutoff_multi))
7571 return true;
7572 else
7573 {
7574 MultiXactMember *members;
7575 int nmembers;
7576 int i;
7577
7578 /* need to check whether any member of the mxact is too old */
7579
7580 nmembers = GetMultiXactIdMembers(multi, &members, false,
7581 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7582
7583 for (i = 0; i < nmembers; i++)
7584 {
7585 if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
7586 {
7587 pfree(members);
7588 return true;
7589 }
7590 }
7591 if (nmembers > 0)
7592 pfree(members);
7593 }
7594 }
7595 else
7596 {
7597 xid = HeapTupleHeaderGetRawXmax(tuple);
7598 if (TransactionIdIsNormal(xid) &&
7599 TransactionIdPrecedes(xid, cutoff_xid))
7600 return true;
7601 }
7602
7603 if (tuple->t_infomask & HEAP_MOVED)
7604 {
7605 xid = HeapTupleHeaderGetXvac(tuple);
7606 if (TransactionIdIsNormal(xid) &&
7607 TransactionIdPrecedes(xid, cutoff_xid))
7608 return true;
7609 }
7610
7611 return false;
7612 }
7613
7614 /*
7615 * If 'tuple' contains any visible XID greater than latestRemovedXid,
7616 * ratchet forwards latestRemovedXid to the greatest one found.
7617 * This is used as the basis for generating Hot Standby conflicts, so
7618 * if a tuple was never visible then removing it should not conflict
7619 * with queries.
7620 */
7621 void
7622 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
7623 TransactionId *latestRemovedXid)
7624 {
7625 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7626 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7627 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7628
7629 if (tuple->t_infomask & HEAP_MOVED)
7630 {
7631 if (TransactionIdPrecedes(*latestRemovedXid, xvac))
7632 *latestRemovedXid = xvac;
7633 }
7634
7635 /*
7636 * Ignore tuples inserted by an aborted transaction or if the tuple was
7637 * updated/deleted by the inserting transaction.
7638 *
7639 * Look for a committed hint bit, or if no xmin bit is set, check clog.
7640 * This needs to work on both master and standby, where it is used to
7641 * assess btree delete records.
7642 */
7643 if (HeapTupleHeaderXminCommitted(tuple) ||
7644 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7645 {
7646 if (xmax != xmin &&
7647 TransactionIdFollows(xmax, *latestRemovedXid))
7648 *latestRemovedXid = xmax;
7649 }
7650
7651 /* *latestRemovedXid may still be invalid at end */
7652 }
7653
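/*
 * Illustrative usage sketch: a caller removing several tuples starts with an
 * invalid latestRemovedXid and ratchets it forward over every victim before
 * emitting the WAL record that describes the removal.  'ntuples' and
 * 'tuples' are illustrative names for the caller's victim list.
 *
 *	TransactionId latestRemovedXid = InvalidTransactionId;
 *
 *	for (i = 0; i < ntuples; i++)
 *		HeapTupleHeaderAdvanceLatestRemovedXid(tuples[i]->t_data,
 *											   &latestRemovedXid);
 */
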
7654 /*
7655 * Perform XLogInsert to register a heap cleanup info message. These
7656 * messages are sent once per VACUUM and are required because
7657 * of the phasing of removal operations during a lazy VACUUM.
7658 * see comments for vacuum_log_cleanup_info().
7659 */
7660 XLogRecPtr
7661 log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7662 {
7663 xl_heap_cleanup_info xlrec;
7664 XLogRecPtr recptr;
7665
7666 xlrec.node = rnode;
7667 xlrec.latestRemovedXid = latestRemovedXid;
7668
7669 XLogBeginInsert();
7670 XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7671
7672 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7673
7674 return recptr;
7675 }
7676
7677 /*
7678 * Perform XLogInsert for a heap-clean operation. Caller must already
7679 * have modified the buffer and marked it dirty.
7680 *
7681 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7682 * zero-based tuple indexes. Now they are one-based like other uses
7683 * of OffsetNumber.
7684 *
7685 * We also include latestRemovedXid, which is the greatest XID present in
7686 * the removed tuples. That allows recovery processing to cancel or wait
7687 * for long standby queries that can still see these tuples.
7688 */
7689 XLogRecPtr
7690 log_heap_clean(Relation reln, Buffer buffer,
7691 OffsetNumber *redirected, int nredirected,
7692 OffsetNumber *nowdead, int ndead,
7693 OffsetNumber *nowunused, int nunused,
7694 TransactionId latestRemovedXid)
7695 {
7696 xl_heap_clean xlrec;
7697 XLogRecPtr recptr;
7698
7699 /* Caller should not call me on a non-WAL-logged relation */
7700 Assert(RelationNeedsWAL(reln));
7701
7702 xlrec.latestRemovedXid = latestRemovedXid;
7703 xlrec.nredirected = nredirected;
7704 xlrec.ndead = ndead;
7705
7706 XLogBeginInsert();
7707 XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7708
7709 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7710
7711 /*
7712 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7713 * that they are. When XLogInsert stores the whole buffer, the offset
7714 * arrays need not be stored too. Note that even if all three arrays are
7715 * empty, we want to expose the buffer as a candidate for whole-page
7716 * storage, since this record type implies a defragmentation operation
7717 * even if no item pointers changed state.
7718 */
7719 if (nredirected > 0)
7720 XLogRegisterBufData(0, (char *) redirected,
7721 nredirected * sizeof(OffsetNumber) * 2);
7722
7723 if (ndead > 0)
7724 XLogRegisterBufData(0, (char *) nowdead,
7725 ndead * sizeof(OffsetNumber));
7726
7727 if (nunused > 0)
7728 XLogRegisterBufData(0, (char *) nowunused,
7729 nunused * sizeof(OffsetNumber));
7730
7731 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7732
7733 return recptr;
7734 }
7735
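/*
 * Condensed sketch of the expected calling pattern (cf. heap_page_prune in
 * pruneheap.c): the page is modified and dirtied inside a critical section,
 * and only then is the clean record emitted and the page LSN advanced.
 *
 *	START_CRIT_SECTION();
 *	heap_page_prune_execute(buffer,
 *							redirected, nredirected,
 *							nowdead, ndead,
 *							nowunused, nunused);
 *	MarkBufferDirty(buffer);
 *	if (RelationNeedsWAL(relation))
 *	{
 *		XLogRecPtr	recptr;
 *
 *		recptr = log_heap_clean(relation, buffer,
 *								redirected, nredirected,
 *								nowdead, ndead,
 *								nowunused, nunused,
 *								latestRemovedXid);
 *		PageSetLSN(BufferGetPage(buffer), recptr);
 *	}
 *	END_CRIT_SECTION();
 */
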
7736 /*
7737 * Perform XLogInsert for a heap-freeze operation. Caller must have already
7738 * modified the buffer and marked it dirty.
7739 */
7740 XLogRecPtr
7741 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7742 xl_heap_freeze_tuple *tuples, int ntuples)
7743 {
7744 xl_heap_freeze_page xlrec;
7745 XLogRecPtr recptr;
7746
7747 /* Caller should not call me on a non-WAL-logged relation */
7748 Assert(RelationNeedsWAL(reln));
7749 /* nor when there are no tuples to freeze */
7750 Assert(ntuples > 0);
7751
7752 xlrec.cutoff_xid = cutoff_xid;
7753 xlrec.ntuples = ntuples;
7754
7755 XLogBeginInsert();
7756 XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7757
7758 /*
7759 * The freeze plan array is not actually in the buffer, but pretend that
7760 * it is. When XLogInsert stores the whole buffer, the freeze plan need
7761 * not be stored too.
7762 */
7763 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7764 XLogRegisterBufData(0, (char *) tuples,
7765 ntuples * sizeof(xl_heap_freeze_tuple));
7766
7767 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7768
7769 return recptr;
7770 }
7771
7772 /*
7773 * Perform XLogInsert for a heap-visible operation. 'block' is the block
7774 * being marked all-visible, and vm_buffer is the buffer containing the
7775 * corresponding visibility map block. Both should have already been modified
7776 * and dirtied.
7777 *
7778 * If checksums are enabled, we also generate a full-page image of
7779 * heap_buffer, if necessary.
7780 */
7781 XLogRecPtr
7782 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7783 TransactionId cutoff_xid, uint8 vmflags)
7784 {
7785 xl_heap_visible xlrec;
7786 XLogRecPtr recptr;
7787 uint8 flags;
7788
7789 Assert(BufferIsValid(heap_buffer));
7790 Assert(BufferIsValid(vm_buffer));
7791
7792 xlrec.cutoff_xid = cutoff_xid;
7793 xlrec.flags = vmflags;
7794 XLogBeginInsert();
7795 XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7796
7797 XLogRegisterBuffer(0, vm_buffer, 0);
7798
7799 flags = REGBUF_STANDARD;
7800 if (!XLogHintBitIsNeeded())
7801 flags |= REGBUF_NO_IMAGE;
7802 XLogRegisterBuffer(1, heap_buffer, flags);
7803
7804 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7805
7806 return recptr;
7807 }
7808
7809 /*
7810 * Perform XLogInsert for a heap-update operation. Caller must already
7811 * have modified the buffer(s) and marked them dirty.
7812 */
7813 static XLogRecPtr
7814 log_heap_update(Relation reln, Buffer oldbuf,
7815 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7816 HeapTuple old_key_tuple,
7817 bool all_visible_cleared, bool new_all_visible_cleared)
7818 {
7819 xl_heap_update xlrec;
7820 xl_heap_header xlhdr;
7821 xl_heap_header xlhdr_idx;
7822 uint8 info;
7823 uint16 prefix_suffix[2];
7824 uint16 prefixlen = 0,
7825 suffixlen = 0;
7826 XLogRecPtr recptr;
7827 Page page = BufferGetPage(newbuf);
7828 bool need_tuple_data = RelationIsLogicallyLogged(reln);
7829 bool init;
7830 int bufflags;
7831
7832 /* Caller should not call me on a non-WAL-logged relation */
7833 Assert(RelationNeedsWAL(reln));
7834
7835 XLogBeginInsert();
7836
7837 if (HeapTupleIsHeapOnly(newtup))
7838 info = XLOG_HEAP_HOT_UPDATE;
7839 else
7840 info = XLOG_HEAP_UPDATE;
7841
7842 /*
7843 * If the old and new tuple are on the same page, we only need to log the
7844 * parts of the new tuple that were changed. That saves on the amount of
7845 * WAL we need to write. Currently, we just count any unchanged bytes in
7846 * the beginning and end of the tuple. That's quick to check, and
7847 * perfectly covers the common case that only one field is updated.
7848 *
7849 * We could do this even if the old and new tuple are on different pages,
7850 * but only if we don't make a full-page image of the old page, which is
7851 * difficult to know in advance. Also, if the old tuple is corrupt for
7852 	 * some reason, it would allow the corruption to propagate to the new page,
7853 	 * so it seems best to avoid that.  Under the general assumption that most
7854 * updates tend to create the new tuple version on the same page, there
7855 * isn't much to be gained by doing this across pages anyway.
7856 *
7857 * Skip this if we're taking a full-page image of the new page, as we
7858 * don't include the new tuple in the WAL record in that case. Also
7859 * disable if wal_level='logical', as logical decoding needs to be able to
7860 * read the new tuple in whole from the WAL record alone.
7861 */
7862 if (oldbuf == newbuf && !need_tuple_data &&
7863 !XLogCheckBufferNeedsBackup(newbuf))
7864 {
7865 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7866 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7867 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7868 int newlen = newtup->t_len - newtup->t_data->t_hoff;
7869
7870 /* Check for common prefix between old and new tuple */
7871 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7872 {
7873 if (newp[prefixlen] != oldp[prefixlen])
7874 break;
7875 }
7876
7877 /*
7878 * Storing the length of the prefix takes 2 bytes, so we need to save
7879 * at least 3 bytes or there's no point.
7880 */
7881 if (prefixlen < 3)
7882 prefixlen = 0;
7883
7884 /* Same for suffix */
7885 for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7886 {
7887 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7888 break;
7889 }
7890 if (suffixlen < 3)
7891 suffixlen = 0;
7892 }
7893
7894 /* Prepare main WAL data chain */
7895 xlrec.flags = 0;
7896 if (all_visible_cleared)
7897 xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7898 if (new_all_visible_cleared)
7899 xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7900 if (prefixlen > 0)
7901 xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7902 if (suffixlen > 0)
7903 xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7904 if (need_tuple_data)
7905 {
7906 xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7907 if (old_key_tuple)
7908 {
7909 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7910 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7911 else
7912 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7913 }
7914 }
7915
7916 /* If new tuple is the single and first tuple on page... */
7917 if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7918 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7919 {
7920 info |= XLOG_HEAP_INIT_PAGE;
7921 init = true;
7922 }
7923 else
7924 init = false;
7925
7926 /* Prepare WAL data for the old page */
7927 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7928 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7929 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7930 oldtup->t_data->t_infomask2);
7931
7932 /* Prepare WAL data for the new page */
7933 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7934 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7935
7936 bufflags = REGBUF_STANDARD;
7937 if (init)
7938 bufflags |= REGBUF_WILL_INIT;
7939 if (need_tuple_data)
7940 bufflags |= REGBUF_KEEP_DATA;
7941
7942 XLogRegisterBuffer(0, newbuf, bufflags);
7943 if (oldbuf != newbuf)
7944 XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7945
7946 XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7947
7948 /*
7949 * Prepare WAL data for the new tuple.
7950 */
7951 if (prefixlen > 0 || suffixlen > 0)
7952 {
7953 if (prefixlen > 0 && suffixlen > 0)
7954 {
7955 prefix_suffix[0] = prefixlen;
7956 prefix_suffix[1] = suffixlen;
7957 XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7958 }
7959 else if (prefixlen > 0)
7960 {
7961 XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7962 }
7963 else
7964 {
7965 XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7966 }
7967 }
7968
7969 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7970 xlhdr.t_infomask = newtup->t_data->t_infomask;
7971 xlhdr.t_hoff = newtup->t_data->t_hoff;
7972 Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7973
7974 /*
7975 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7976 *
7977 * The 'data' doesn't include the common prefix or suffix.
7978 */
7979 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7980 if (prefixlen == 0)
7981 {
7982 XLogRegisterBufData(0,
7983 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7984 newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7985 }
7986 else
7987 {
7988 /*
7989 * Have to write the null bitmap and data after the common prefix as
7990 * two separate rdata entries.
7991 */
7992 /* bitmap [+ padding] [+ oid] */
7993 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7994 {
7995 XLogRegisterBufData(0,
7996 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7997 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7998 }
7999
8000 /* data after common prefix */
8001 XLogRegisterBufData(0,
8002 ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8003 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8004 }
8005
8006 /* We need to log a tuple identity */
8007 if (need_tuple_data && old_key_tuple)
8008 {
8009 		/* don't really need this, but it's more convenient to decode */
8010 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8011 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8012 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8013
8014 XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8015
8016 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8017 XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8018 old_key_tuple->t_len - SizeofHeapTupleHeader);
8019 }
8020
8021 /* filtering by origin on a row level is much more efficient */
8022 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
8023
8024 recptr = XLogInsert(RM_HEAP_ID, info);
8025
8026 return recptr;
8027 }
8028
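/*
 * For orientation, a rough, simplified sketch of how redo reverses the
 * prefix/suffix compression above (see heap_xlog_update for the real code,
 * which additionally handles the header and null bitmap): the unchanged
 * prefix and suffix bytes come from the old tuple version, and only the
 * middle comes from the WAL record.  Variable names are illustrative.
 *
 *	newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
 *	oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
 *	if (prefixlen > 0)
 *	{
 *		memcpy(newp, oldp, prefixlen);				(from old version)
 *		newp += prefixlen;
 *	}
 *	memcpy(newp, recorded_data, recorded_len);		(from WAL record)
 *	newp += recorded_len;
 *	if (suffixlen > 0)
 *		memcpy(newp, oldp + oldlen - suffixlen, suffixlen);
 */
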
8029 /*
8030 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8031 *
8032 * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8033 * tuples.
8034 */
8035 static XLogRecPtr
8036 log_heap_new_cid(Relation relation, HeapTuple tup)
8037 {
8038 xl_heap_new_cid xlrec;
8039
8040 XLogRecPtr recptr;
8041 HeapTupleHeader hdr = tup->t_data;
8042
8043 Assert(ItemPointerIsValid(&tup->t_self));
8044 Assert(tup->t_tableOid != InvalidOid);
8045
8046 xlrec.top_xid = GetTopTransactionId();
8047 xlrec.target_node = relation->rd_node;
8048 xlrec.target_tid = tup->t_self;
8049
8050 /*
8051 	 * If the tuple got inserted & deleted in the same TX we definitely have a
8052 	 * combocid; set both cmin and cmax.
8053 */
8054 if (hdr->t_infomask & HEAP_COMBOCID)
8055 {
8056 Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8057 Assert(!HeapTupleHeaderXminInvalid(hdr));
8058 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8059 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8060 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
8061 }
8062 /* No combocid, so only cmin or cmax can be set by this TX */
8063 else
8064 {
8065 /*
8066 * Tuple inserted.
8067 *
8068 * We need to check for LOCK ONLY because multixacts might be
8069 * transferred to the new tuple in case of FOR KEY SHARE updates in
8070 * which case there will be an xmax, although the tuple just got
8071 * inserted.
8072 */
8073 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8074 HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
8075 {
8076 xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8077 xlrec.cmax = InvalidCommandId;
8078 }
8079 /* Tuple from a different tx updated or deleted. */
8080 else
8081 {
8082 xlrec.cmin = InvalidCommandId;
8083 xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8084
8085 }
8086 xlrec.combocid = InvalidCommandId;
8087 }
8088
8089 /*
8090 * Note that we don't need to register the buffer here, because this
8091 * operation does not modify the page. The insert/update/delete that
8092 * called us certainly did, but that's WAL-logged separately.
8093 */
8094 XLogBeginInsert();
8095 XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8096
8097 /* will be looked at irrespective of origin */
8098
8099 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8100
8101 return recptr;
8102 }
8103
8104 /*
8105  * Build a heap tuple representing the configured REPLICA IDENTITY to log as
8106  * the old tuple in an UPDATE or DELETE.
8107  *
8108  * Returns NULL if there's no need to log an identity or if there's no suitable
8109  * key defined for the relation.
8110 */
8111 static HeapTuple
8112 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool *copy)
8113 {
8114 TupleDesc desc = RelationGetDescr(relation);
8115 Oid replidindex;
8116 Relation idx_rel;
8117 char replident = relation->rd_rel->relreplident;
8118 HeapTuple key_tuple = NULL;
8119 bool nulls[MaxHeapAttributeNumber];
8120 Datum values[MaxHeapAttributeNumber];
8121 int natt;
8122
8123 *copy = false;
8124
8125 if (!RelationIsLogicallyLogged(relation))
8126 return NULL;
8127
8128 if (replident == REPLICA_IDENTITY_NOTHING)
8129 return NULL;
8130
8131 if (replident == REPLICA_IDENTITY_FULL)
8132 {
8133 /*
8134 * When logging the entire old tuple, it very well could contain
8135 * toasted columns. If so, force them to be inlined.
8136 */
8137 if (HeapTupleHasExternal(tp))
8138 {
8139 *copy = true;
8140 tp = toast_flatten_tuple(tp, RelationGetDescr(relation));
8141 }
8142 return tp;
8143 }
8144
8145 /* if the key hasn't changed and we're only logging the key, we're done */
8146 if (!key_changed)
8147 return NULL;
8148
8149 /* find the replica identity index */
8150 replidindex = RelationGetReplicaIndex(relation);
8151 if (!OidIsValid(replidindex))
8152 {
8153 elog(DEBUG4, "could not find configured replica identity for table \"%s\"",
8154 RelationGetRelationName(relation));
8155 return NULL;
8156 }
8157
8158 idx_rel = RelationIdGetRelation(replidindex);
8159
8160 if (!RelationIsValid(idx_rel))
8161 elog(ERROR, "could not open relation with OID %u", replidindex);
8162
8163 /* deform tuple, so we have fast access to columns */
8164 heap_deform_tuple(tp, desc, values, nulls);
8165
8166 /* set all columns to NULL, regardless of whether they actually are */
8167 memset(nulls, 1, sizeof(nulls));
8168
8169 /*
8170 	 * Now set all columns contained in the index to NOT NULL; they cannot
8171 * currently be NULL.
8172 */
8173 for (natt = 0; natt < IndexRelationGetNumberOfKeyAttributes(idx_rel); natt++)
8174 {
8175 int attno = idx_rel->rd_index->indkey.values[natt];
8176
8177 if (attno < 0)
8178 {
8179 /*
8180 * The OID column can appear in an index definition, but that's
8181 * OK, because we always copy the OID if present (see below).
8182 * Other system columns may not.
8183 */
8184 if (attno == ObjectIdAttributeNumber)
8185 continue;
8186 elog(ERROR, "system column in index");
8187 }
8188 nulls[attno - 1] = false;
8189 }
8190
8191 key_tuple = heap_form_tuple(desc, values, nulls);
8192 *copy = true;
8193 RelationClose(idx_rel);
8194
8195 /*
8196 * Always copy oids if the table has them, even if not included in the
8197 * index. The space in the logged tuple is used anyway, so there's little
8198 * point in not including the information.
8199 */
8200 if (relation->rd_rel->relhasoids)
8201 HeapTupleSetOid(key_tuple, HeapTupleGetOid(tp));
8202
8203 /*
8204 * If the tuple, which by here only contains indexed columns, still has
8205 * toasted columns, force them to be inlined. This is somewhat unlikely
8206 	 * since there are limits on the size of indexed columns, so we don't
8207 	 * duplicate toast_flatten_tuple()'s functionality in the above loop over
8208 * the indexed columns, even if it would be more efficient.
8209 */
8210 if (HeapTupleHasExternal(key_tuple))
8211 {
8212 HeapTuple oldtup = key_tuple;
8213
8214 key_tuple = toast_flatten_tuple(oldtup, RelationGetDescr(relation));
8215 heap_freetuple(oldtup);
8216 }
8217
8218 return key_tuple;
8219 }
8220
8221 /*
8222 * Handles CLEANUP_INFO
8223 */
8224 static void
8225 heap_xlog_cleanup_info(XLogReaderState *record)
8226 {
8227 xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
8228
8229 if (InHotStandby)
8230 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
8231
8232 /*
8233 * Actual operation is a no-op. Record type exists to provide a means for
8234 	 * conflict processing to occur before we begin index vacuum actions.  See
8235 * vacuumlazy.c and also comments in btvacuumpage()
8236 */
8237
8238 /* Backup blocks are not used in cleanup_info records */
8239 Assert(!XLogRecHasAnyBlockRefs(record));
8240 }
8241
8242 /*
8243 * Handles XLOG_HEAP2_CLEAN record type
8244 */
8245 static void
8246 heap_xlog_clean(XLogReaderState *record)
8247 {
8248 XLogRecPtr lsn = record->EndRecPtr;
8249 xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
8250 Buffer buffer;
8251 RelFileNode rnode;
8252 BlockNumber blkno;
8253 XLogRedoAction action;
8254
8255 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8256
8257 /*
8258 	 * We're about to remove tuples. In Hot Standby mode, ensure that there are
8259 * no queries running for which the removed tuples are still visible.
8260 *
8261 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
8262 * conflict on the records that cause MVCC failures for user queries. If
8263 * latestRemovedXid is invalid, skip conflict processing.
8264 */
8265 if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
8266 ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
8267
8268 /*
8269 * If we have a full-page image, restore it (using a cleanup lock) and
8270 * we're done.
8271 */
8272 action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
8273 &buffer);
8274 if (action == BLK_NEEDS_REDO)
8275 {
8276 Page page = (Page) BufferGetPage(buffer);
8277 OffsetNumber *end;
8278 OffsetNumber *redirected;
8279 OffsetNumber *nowdead;
8280 OffsetNumber *nowunused;
8281 int nredirected;
8282 int ndead;
8283 int nunused;
8284 Size datalen;
8285
8286 redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8287
8288 nredirected = xlrec->nredirected;
8289 ndead = xlrec->ndead;
8290 end = (OffsetNumber *) ((char *) redirected + datalen);
8291 nowdead = redirected + (nredirected * 2);
8292 nowunused = nowdead + ndead;
8293 nunused = (end - nowunused);
8294 Assert(nunused >= 0);
8295
8296 /* Update all item pointers per the record, and repair fragmentation */
8297 heap_page_prune_execute(buffer,
8298 redirected, nredirected,
8299 nowdead, ndead,
8300 nowunused, nunused);
8301
8302 /*
8303 * Note: we don't worry about updating the page's prunability hints.
8304 * At worst this will cause an extra prune cycle to occur soon.
8305 */
8306
8307 PageSetLSN(page, lsn);
8308 MarkBufferDirty(buffer);
8309 }
8310
8311 if (BufferIsValid(buffer))
8312 {
8313 Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8314
8315 UnlockReleaseBuffer(buffer);
8316
8317 /*
8318 * After cleaning records from a page, it's useful to update the FSM
8319 		 * about it, as it may cause the page to become a target for insertions
8320 		 * later even if vacuum decides not to visit it (which is possible if it
8321 		 * gets marked all-visible).
8322 *
8323 * Do this regardless of a full-page image being applied, since the
8324 * FSM data is not in the page anyway.
8325 */
8326 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8327 }
8328 }
8329
8330 /*
8331 * Replay XLOG_HEAP2_VISIBLE record.
8332 *
8333 * The critical integrity requirement here is that we must never end up with
8334 * a situation where the visibility map bit is set, and the page-level
8335 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
8336 * page modification would fail to clear the visibility map bit.
8337 */
8338 static void
8339 heap_xlog_visible(XLogReaderState *record)
8340 {
8341 XLogRecPtr lsn = record->EndRecPtr;
8342 xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8343 Buffer vmbuffer = InvalidBuffer;
8344 Buffer buffer;
8345 Page page;
8346 RelFileNode rnode;
8347 BlockNumber blkno;
8348 XLogRedoAction action;
8349
8350 XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
8351
8352 /*
8353 * If there are any Hot Standby transactions running that have an xmin
8354 * horizon old enough that this page isn't all-visible for them, they
8355 * might incorrectly decide that an index-only scan can skip a heap fetch.
8356 *
8357 * NB: It might be better to throw some kind of "soft" conflict here that
8358 * forces any index-only scan that is in flight to perform heap fetches,
8359 * rather than killing the transaction outright.
8360 */
8361 if (InHotStandby)
8362 ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
8363
8364 /*
8365 	 * Read the heap page, if it still exists. If the heap file has been
8366 	 * dropped or truncated later in recovery, we don't need to update the
8367 	 * page, but we'd better still update the visibility map.
8368 */
8369 action = XLogReadBufferForRedo(record, 1, &buffer);
8370 if (action == BLK_NEEDS_REDO)
8371 {
8372 /*
8373 * We don't bump the LSN of the heap page when setting the visibility
8374 		 * map bit (unless checksums or wal_log_hints are enabled, in which
8375 * case we must), because that would generate an unworkable volume of
8376 * full-page writes. This exposes us to torn page hazards, but since
8377 * we're not inspecting the existing page contents in any way, we
8378 * don't care.
8379 *
8380 * However, all operations that clear the visibility map bit *do* bump
8381 * the LSN, and those operations will only be replayed if the XLOG LSN
8382 * follows the page LSN. Thus, if the page LSN has advanced past our
8383 * XLOG record's LSN, we mustn't mark the page all-visible, because
8384 * the subsequent update won't be replayed to clear the flag.
8385 */
8386 page = BufferGetPage(buffer);
8387
8388 PageSetAllVisible(page);
8389
8390 MarkBufferDirty(buffer);
8391 }
8392 else if (action == BLK_RESTORED)
8393 {
8394 /*
8395 * If heap block was backed up, we already restored it and there's
8396 * nothing more to do. (This can only happen with checksums or
8397 * wal_log_hints enabled.)
8398 */
8399 }
8400
8401 if (BufferIsValid(buffer))
8402 {
8403 Size space = PageGetFreeSpace(BufferGetPage(buffer));
8404
8405 UnlockReleaseBuffer(buffer);
8406
8407 /*
8408 * Since FSM is not WAL-logged and only updated heuristically, it
8409 * easily becomes stale in standbys. If the standby is later promoted
8410 * and runs VACUUM, it will skip updating individual free space
8411 * figures for pages that became all-visible (or all-frozen, depending
8412 * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
8413 * propagates too optimistic free space values to upper FSM layers;
8414 * later inserters try to use such pages only to find out that they
8415 * are unusable. This can cause long stalls when there are many such
8416 * pages.
8417 *
8418 * Forestall those problems by updating FSM's idea about a page that
8419 * is becoming all-visible or all-frozen.
8420 *
8421 * Do this regardless of a full-page image being applied, since the
8422 * FSM data is not in the page anyway.
8423 */
8424 if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
8425 XLogRecordPageWithFreeSpace(rnode, blkno, space);
8426 }
8427
8428 /*
8429 * Even if we skipped the heap page update due to the LSN interlock, it's
8430 * still safe to update the visibility map. Any WAL record that clears
8431 * the visibility map bit does so before checking the page LSN, so any
8432 * bits that need to be cleared will still be cleared.
8433 */
8434 if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
8435 &vmbuffer) == BLK_NEEDS_REDO)
8436 {
8437 Page vmpage = BufferGetPage(vmbuffer);
8438 Relation reln;
8439
8440 /* initialize the page if it was read as zeros */
8441 if (PageIsNew(vmpage))
8442 PageInit(vmpage, BLCKSZ, 0);
8443
8444 /*
8445 * XLogReadBufferForRedoExtended locked the buffer. But
8446 * visibilitymap_set will handle locking itself.
8447 */
8448 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
8449
8450 reln = CreateFakeRelcacheEntry(rnode);
8451 visibilitymap_pin(reln, blkno, &vmbuffer);
8452
8453 /*
8454 * Don't set the bit if replay has already passed this point.
8455 *
8456 * It might be safe to do this unconditionally; if replay has passed
8457 * this point, we'll replay at least as far this time as we did
8458 * before, and if this bit needs to be cleared, the record responsible
8459 		 * for doing so will be replayed again and will clear it. For right
8460 * now, out of an abundance of conservatism, we use the same test here
8461 * we did for the heap page. If this results in a dropped bit, no
8462 * real harm is done; and the next VACUUM will fix it.
8463 */
8464 if (lsn > PageGetLSN(vmpage))
8465 visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
8466 xlrec->cutoff_xid, xlrec->flags);
8467
8468 ReleaseBuffer(vmbuffer);
8469 FreeFakeRelcacheEntry(reln);
8470 }
8471 else if (BufferIsValid(vmbuffer))
8472 UnlockReleaseBuffer(vmbuffer);
8473 }
8474
8475 /*
8476 * Replay XLOG_HEAP2_FREEZE_PAGE records
8477 */
8478 static void
8479 heap_xlog_freeze_page(XLogReaderState *record)
8480 {
8481 XLogRecPtr lsn = record->EndRecPtr;
8482 xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
8483 TransactionId cutoff_xid = xlrec->cutoff_xid;
8484 Buffer buffer;
8485 int ntup;
8486
8487 /*
8488 	 * In Hot Standby mode, ensure that there are no queries running which still
8489 * consider the frozen xids as running.
8490 */
8491 if (InHotStandby)
8492 {
8493 RelFileNode rnode;
8494 TransactionId latestRemovedXid = cutoff_xid;
8495
8496 TransactionIdRetreat(latestRemovedXid);
8497
8498 XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
8499 ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
8500 }
8501
8502 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8503 {
8504 Page page = BufferGetPage(buffer);
8505 xl_heap_freeze_tuple *tuples;
8506
8507 tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
8508
8509 /* now execute freeze plan for each frozen tuple */
8510 for (ntup = 0; ntup < xlrec->ntuples; ntup++)
8511 {
8512 xl_heap_freeze_tuple *xlrec_tp;
8513 ItemId lp;
8514 HeapTupleHeader tuple;
8515
8516 xlrec_tp = &tuples[ntup];
8517 lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8518 tuple = (HeapTupleHeader) PageGetItem(page, lp);
8519
8520 heap_execute_freeze_tuple(tuple, xlrec_tp);
8521 }
8522
8523 PageSetLSN(page, lsn);
8524 MarkBufferDirty(buffer);
8525 }
8526 if (BufferIsValid(buffer))
8527 UnlockReleaseBuffer(buffer);
8528 }
8529
8530 /*
8531 * Given an "infobits" field from an XLog record, set the correct bits in the
8532 * given infomask and infomask2 for the tuple touched by the record.
8533 *
8534 * (This is the reverse of compute_infobits).
8535 */
8536 static void
8537 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8538 {
8539 *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8540 HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8541 *infomask2 &= ~HEAP_KEYS_UPDATED;
8542
8543 if (infobits & XLHL_XMAX_IS_MULTI)
8544 *infomask |= HEAP_XMAX_IS_MULTI;
8545 if (infobits & XLHL_XMAX_LOCK_ONLY)
8546 *infomask |= HEAP_XMAX_LOCK_ONLY;
8547 if (infobits & XLHL_XMAX_EXCL_LOCK)
8548 *infomask |= HEAP_XMAX_EXCL_LOCK;
8549 /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8550 if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8551 *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8552
8553 if (infobits & XLHL_KEYS_UPDATED)
8554 *infomask2 |= HEAP_KEYS_UPDATED;
8555 }
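#ifdef NOT_USED
/*
 * Illustrative sketch, not part of the original file: since
 * fix_infomask_from_infobits is meant to be the reverse of compute_infobits,
 * round-tripping the lock/update bits through an XLog "infobits" byte should
 * reproduce them.  The helper name is an assumption; the Asserts only cover
 * the bits the XLHL_* flags represent.
 */
static void
example_infobits_roundtrip(uint16 infomask, uint16 infomask2)
{
	uint8		infobits = compute_infobits(infomask, infomask2);
	uint16		new_infomask = 0;
	uint16		new_infomask2 = 0;

	fix_infomask_from_infobits(infobits, &new_infomask, &new_infomask2);

	Assert((new_infomask & (HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
							HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)) ==
		   (infomask & (HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
						HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)));
	Assert((new_infomask2 & HEAP_KEYS_UPDATED) ==
		   (infomask2 & HEAP_KEYS_UPDATED));
}
#endif							/* NOT_USED */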
8556
8557 static void
8558 heap_xlog_delete(XLogReaderState *record)
8559 {
8560 XLogRecPtr lsn = record->EndRecPtr;
8561 xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8562 Buffer buffer;
8563 Page page;
8564 ItemId lp = NULL;
8565 HeapTupleHeader htup;
8566 BlockNumber blkno;
8567 RelFileNode target_node;
8568 ItemPointerData target_tid;
8569
8570 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8571 ItemPointerSetBlockNumber(&target_tid, blkno);
8572 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8573
8574 /*
8575 * The visibility map may need to be fixed even if the heap page is
8576 * already up-to-date.
8577 */
8578 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8579 {
8580 Relation reln = CreateFakeRelcacheEntry(target_node);
8581 Buffer vmbuffer = InvalidBuffer;
8582
8583 visibilitymap_pin(reln, blkno, &vmbuffer);
8584 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8585 ReleaseBuffer(vmbuffer);
8586 FreeFakeRelcacheEntry(reln);
8587 }
8588
8589 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8590 {
8591 page = BufferGetPage(buffer);
8592
8593 if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8594 lp = PageGetItemId(page, xlrec->offnum);
8595
8596 if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8597 elog(PANIC, "invalid lp");
8598
8599 htup = (HeapTupleHeader) PageGetItem(page, lp);
8600
8601 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8602 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8603 HeapTupleHeaderClearHotUpdated(htup);
8604 fix_infomask_from_infobits(xlrec->infobits_set,
8605 &htup->t_infomask, &htup->t_infomask2);
8606 if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8607 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8608 else
8609 HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8610 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8611
8612 /* Mark the page as a candidate for pruning */
8613 PageSetPrunable(page, XLogRecGetXid(record));
8614
8615 if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8616 PageClearAllVisible(page);
8617
8618 /* Make sure t_ctid is set correctly */
8619 if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8620 HeapTupleHeaderSetMovedPartitions(htup);
8621 else
8622 htup->t_ctid = target_tid;
8623 PageSetLSN(page, lsn);
8624 MarkBufferDirty(buffer);
8625 }
8626 if (BufferIsValid(buffer))
8627 UnlockReleaseBuffer(buffer);
8628 }
8629
8630 static void
8631 heap_xlog_insert(XLogReaderState *record)
8632 {
8633 XLogRecPtr lsn = record->EndRecPtr;
8634 xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8635 Buffer buffer;
8636 Page page;
8637 union
8638 {
8639 HeapTupleHeaderData hdr;
8640 char data[MaxHeapTupleSize];
8641 } tbuf;
8642 HeapTupleHeader htup;
8643 xl_heap_header xlhdr;
8644 uint32 newlen;
8645 Size freespace = 0;
8646 RelFileNode target_node;
8647 BlockNumber blkno;
8648 ItemPointerData target_tid;
8649 XLogRedoAction action;
8650
8651 XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8652 ItemPointerSetBlockNumber(&target_tid, blkno);
8653 ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8654
8655 /*
8656 * The visibility map may need to be fixed even if the heap page is
8657 * already up-to-date.
8658 */
8659 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8660 {
8661 Relation reln = CreateFakeRelcacheEntry(target_node);
8662 Buffer vmbuffer = InvalidBuffer;
8663
8664 visibilitymap_pin(reln, blkno, &vmbuffer);
8665 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8666 ReleaseBuffer(vmbuffer);
8667 FreeFakeRelcacheEntry(reln);
8668 }
8669
8670 /*
8671 * If we inserted the first and only tuple on the page, re-initialize the
8672 * page from scratch.
8673 */
8674 if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8675 {
8676 buffer = XLogInitBufferForRedo(record, 0);
8677 page = BufferGetPage(buffer);
8678 PageInit(page, BufferGetPageSize(buffer), 0);
8679 action = BLK_NEEDS_REDO;
8680 }
8681 else
8682 action = XLogReadBufferForRedo(record, 0, &buffer);
8683 if (action == BLK_NEEDS_REDO)
8684 {
8685 Size datalen;
8686 char *data;
8687
8688 page = BufferGetPage(buffer);
8689
8690 if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8691 elog(PANIC, "invalid max offset number");
8692
8693 data = XLogRecGetBlockData(record, 0, &datalen);
8694
8695 newlen = datalen - SizeOfHeapHeader;
8696 Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8697 memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8698 data += SizeOfHeapHeader;
8699
8700 htup = &tbuf.hdr;
8701 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8702 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8703 memcpy((char *) htup + SizeofHeapTupleHeader,
8704 data,
8705 newlen);
8706 newlen += SizeofHeapTupleHeader;
8707 htup->t_infomask2 = xlhdr.t_infomask2;
8708 htup->t_infomask = xlhdr.t_infomask;
8709 htup->t_hoff = xlhdr.t_hoff;
8710 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8711 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8712 htup->t_ctid = target_tid;
8713
8714 if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8715 true, true) == InvalidOffsetNumber)
8716 elog(PANIC, "failed to add tuple");
8717
8718 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8719
8720 PageSetLSN(page, lsn);
8721
8722 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8723 PageClearAllVisible(page);
8724
8725 MarkBufferDirty(buffer);
8726 }
8727 if (BufferIsValid(buffer))
8728 UnlockReleaseBuffer(buffer);
8729
8730 /*
8731 * If the page is running low on free space, update the FSM as well.
8732 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8733 * better than that without knowing the fill-factor for the table.
8734 *
8735 	 * XXX: Don't do this if the page was restored from a full-page image. We
8736 * don't bother to update the FSM in that case, it doesn't need to be
8737 * totally accurate anyway.
8738 */
8739 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8740 XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8741 }
8742
8743 /*
8744 * Handles MULTI_INSERT record type.
8745 */
8746 static void
8747 heap_xlog_multi_insert(XLogReaderState *record)
8748 {
8749 XLogRecPtr lsn = record->EndRecPtr;
8750 xl_heap_multi_insert *xlrec;
8751 RelFileNode rnode;
8752 BlockNumber blkno;
8753 Buffer buffer;
8754 Page page;
8755 union
8756 {
8757 HeapTupleHeaderData hdr;
8758 char data[MaxHeapTupleSize];
8759 } tbuf;
8760 HeapTupleHeader htup;
8761 uint32 newlen;
8762 Size freespace = 0;
8763 int i;
8764 bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8765 XLogRedoAction action;
8766
8767 /*
8768 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8769 * required.
8770 */
8771 xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8772
8773 XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8774
8775 /*
8776 * The visibility map may need to be fixed even if the heap page is
8777 * already up-to-date.
8778 */
8779 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8780 {
8781 Relation reln = CreateFakeRelcacheEntry(rnode);
8782 Buffer vmbuffer = InvalidBuffer;
8783
8784 visibilitymap_pin(reln, blkno, &vmbuffer);
8785 visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8786 ReleaseBuffer(vmbuffer);
8787 FreeFakeRelcacheEntry(reln);
8788 }
8789
8790 if (isinit)
8791 {
8792 buffer = XLogInitBufferForRedo(record, 0);
8793 page = BufferGetPage(buffer);
8794 PageInit(page, BufferGetPageSize(buffer), 0);
8795 action = BLK_NEEDS_REDO;
8796 }
8797 else
8798 action = XLogReadBufferForRedo(record, 0, &buffer);
8799 if (action == BLK_NEEDS_REDO)
8800 {
8801 char *tupdata;
8802 char *endptr;
8803 Size len;
8804
8805 /* Tuples are stored as block data */
8806 tupdata = XLogRecGetBlockData(record, 0, &len);
8807 endptr = tupdata + len;
8808
8809 page = (Page) BufferGetPage(buffer);
8810
8811 for (i = 0; i < xlrec->ntuples; i++)
8812 {
8813 OffsetNumber offnum;
8814 xl_multi_insert_tuple *xlhdr;
8815
8816 /*
8817 * If we're reinitializing the page, the tuples are stored in
8818 * order from FirstOffsetNumber. Otherwise there's an array of
8819 * offsets in the WAL record, and the tuples come after that.
8820 */
8821 if (isinit)
8822 offnum = FirstOffsetNumber + i;
8823 else
8824 offnum = xlrec->offsets[i];
8825 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8826 elog(PANIC, "invalid max offset number");
8827
8828 xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8829 tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8830
8831 newlen = xlhdr->datalen;
8832 Assert(newlen <= MaxHeapTupleSize);
8833 htup = &tbuf.hdr;
8834 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8835 /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8836 memcpy((char *) htup + SizeofHeapTupleHeader,
8837 (char *) tupdata,
8838 newlen);
8839 tupdata += newlen;
8840
8841 newlen += SizeofHeapTupleHeader;
8842 htup->t_infomask2 = xlhdr->t_infomask2;
8843 htup->t_infomask = xlhdr->t_infomask;
8844 htup->t_hoff = xlhdr->t_hoff;
8845 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8846 HeapTupleHeaderSetCmin(htup, FirstCommandId);
8847 ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8848 ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8849
8850 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8851 if (offnum == InvalidOffsetNumber)
8852 elog(PANIC, "failed to add tuple");
8853 }
8854 if (tupdata != endptr)
8855 elog(PANIC, "total tuple length mismatch");
8856
8857 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8858
8859 PageSetLSN(page, lsn);
8860
8861 if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8862 PageClearAllVisible(page);
8863
8864 MarkBufferDirty(buffer);
8865 }
8866 if (BufferIsValid(buffer))
8867 UnlockReleaseBuffer(buffer);
8868
8869 /*
8870 * If the page is running low on free space, update the FSM as well.
8871 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8872 * better than that without knowing the fill-factor for the table.
8873 *
8874 	 * XXX: Don't do this if the page was restored from a full-page image. We
8875 * don't bother to update the FSM in that case, it doesn't need to be
8876 * totally accurate anyway.
8877 */
8878 if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8879 XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8880 }
8881
8882 /*
8883 * Handles UPDATE and HOT_UPDATE
8884 */
8885 static void
8886 heap_xlog_update(XLogReaderState *record, bool hot_update)
8887 {
8888 XLogRecPtr lsn = record->EndRecPtr;
8889 xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8890 RelFileNode rnode;
8891 BlockNumber oldblk;
8892 BlockNumber newblk;
8893 ItemPointerData newtid;
8894 Buffer obuffer,
8895 nbuffer;
8896 Page page;
8897 OffsetNumber offnum;
8898 ItemId lp = NULL;
8899 HeapTupleData oldtup;
8900 HeapTupleHeader htup;
8901 uint16 prefixlen = 0,
8902 suffixlen = 0;
8903 char *newp;
8904 union
8905 {
8906 HeapTupleHeaderData hdr;
8907 char data[MaxHeapTupleSize];
8908 } tbuf;
8909 xl_heap_header xlhdr;
8910 uint32 newlen;
8911 Size freespace = 0;
8912 XLogRedoAction oldaction;
8913 XLogRedoAction newaction;
8914
8915 /* initialize to keep the compiler quiet */
8916 oldtup.t_data = NULL;
8917 oldtup.t_len = 0;
8918
8919 XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8920 if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8921 {
8922 /* HOT updates are never done across pages */
8923 Assert(!hot_update);
8924 }
8925 else
8926 oldblk = newblk;
8927
8928 ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8929
8930 /*
8931 * The visibility map may need to be fixed even if the heap page is
8932 * already up-to-date.
8933 */
8934 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8935 {
8936 Relation reln = CreateFakeRelcacheEntry(rnode);
8937 Buffer vmbuffer = InvalidBuffer;
8938
8939 visibilitymap_pin(reln, oldblk, &vmbuffer);
8940 visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8941 ReleaseBuffer(vmbuffer);
8942 FreeFakeRelcacheEntry(reln);
8943 }
8944
8945 /*
8946 * In normal operation, it is important to lock the two pages in
8947 * page-number order, to avoid possible deadlocks against other update
8948 * operations going the other way. However, during WAL replay there can
8949 * be no other update happening, so we don't need to worry about that. But
8950 * we *do* need to worry that we don't expose an inconsistent state to Hot
8951 * Standby queries --- so the original page can't be unlocked before we've
8952 * added the new tuple to the new page.
8953 */
8954
8955 /* Deal with old tuple version */
8956 oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8957 &obuffer);
8958 if (oldaction == BLK_NEEDS_REDO)
8959 {
8960 page = BufferGetPage(obuffer);
8961 offnum = xlrec->old_offnum;
8962 if (PageGetMaxOffsetNumber(page) >= offnum)
8963 lp = PageGetItemId(page, offnum);
8964
8965 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8966 elog(PANIC, "invalid lp");
8967
8968 htup = (HeapTupleHeader) PageGetItem(page, lp);
8969
8970 oldtup.t_data = htup;
8971 oldtup.t_len = ItemIdGetLength(lp);
8972
8973 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8974 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8975 if (hot_update)
8976 HeapTupleHeaderSetHotUpdated(htup);
8977 else
8978 HeapTupleHeaderClearHotUpdated(htup);
8979 fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8980 &htup->t_infomask2);
8981 HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8982 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8983 /* Set forward chain link in t_ctid */
8984 htup->t_ctid = newtid;
8985
8986 /* Mark the page as a candidate for pruning */
8987 PageSetPrunable(page, XLogRecGetXid(record));
8988
8989 if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8990 PageClearAllVisible(page);
8991
8992 PageSetLSN(page, lsn);
8993 MarkBufferDirty(obuffer);
8994 }
8995
8996 /*
8997 * Read the page the new tuple goes into, if different from old.
8998 */
8999 if (oldblk == newblk)
9000 {
9001 nbuffer = obuffer;
9002 newaction = oldaction;
9003 }
9004 else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9005 {
9006 nbuffer = XLogInitBufferForRedo(record, 0);
9007 page = (Page) BufferGetPage(nbuffer);
9008 PageInit(page, BufferGetPageSize(nbuffer), 0);
9009 newaction = BLK_NEEDS_REDO;
9010 }
9011 else
9012 newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
9013
9014 /*
9015 * The visibility map may need to be fixed even if the heap page is
9016 * already up-to-date.
9017 */
9018 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9019 {
9020 Relation reln = CreateFakeRelcacheEntry(rnode);
9021 Buffer vmbuffer = InvalidBuffer;
9022
9023 visibilitymap_pin(reln, newblk, &vmbuffer);
9024 visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9025 ReleaseBuffer(vmbuffer);
9026 FreeFakeRelcacheEntry(reln);
9027 }
9028
9029 /* Deal with new tuple */
9030 if (newaction == BLK_NEEDS_REDO)
9031 {
9032 char *recdata;
9033 char *recdata_end;
9034 Size datalen;
9035 Size tuplen;
9036
9037 recdata = XLogRecGetBlockData(record, 0, &datalen);
9038 recdata_end = recdata + datalen;
9039
9040 page = BufferGetPage(nbuffer);
9041
9042 offnum = xlrec->new_offnum;
9043 if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9044 elog(PANIC, "invalid max offset number");
9045
9046 if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
9047 {
9048 Assert(newblk == oldblk);
9049 memcpy(&prefixlen, recdata, sizeof(uint16));
9050 recdata += sizeof(uint16);
9051 }
9052 if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
9053 {
9054 Assert(newblk == oldblk);
9055 memcpy(&suffixlen, recdata, sizeof(uint16));
9056 recdata += sizeof(uint16);
9057 }
9058
9059 memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
9060 recdata += SizeOfHeapHeader;
9061
9062 tuplen = recdata_end - recdata;
9063 Assert(tuplen <= MaxHeapTupleSize);
9064
9065 htup = &tbuf.hdr;
9066 MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9067
9068 /*
9069 * Reconstruct the new tuple using the prefix and/or suffix from the
9070 * old tuple, and the data stored in the WAL record.
9071 */
9072 newp = (char *) htup + SizeofHeapTupleHeader;
9073 if (prefixlen > 0)
9074 {
9075 int len;
9076
9077 /* copy bitmap [+ padding] [+ oid] from WAL record */
9078 len = xlhdr.t_hoff - SizeofHeapTupleHeader;
9079 memcpy(newp, recdata, len);
9080 recdata += len;
9081 newp += len;
9082
9083 /* copy prefix from old tuple */
9084 memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
9085 newp += prefixlen;
9086
9087 /* copy new tuple data from WAL record */
9088 len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
9089 memcpy(newp, recdata, len);
9090 recdata += len;
9091 newp += len;
9092 }
9093 else
9094 {
9095 /*
9096 * copy bitmap [+ padding] [+ oid] + data from record, all in one
9097 * go
9098 */
9099 memcpy(newp, recdata, tuplen);
9100 recdata += tuplen;
9101 newp += tuplen;
9102 }
9103 Assert(recdata == recdata_end);
9104
9105 /* copy suffix from old tuple */
9106 if (suffixlen > 0)
9107 memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
9108
9109 newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
9110 htup->t_infomask2 = xlhdr.t_infomask2;
9111 htup->t_infomask = xlhdr.t_infomask;
9112 htup->t_hoff = xlhdr.t_hoff;
9113
9114 HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9115 HeapTupleHeaderSetCmin(htup, FirstCommandId);
9116 HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
9117 /* Make sure there is no forward chain link in t_ctid */
9118 htup->t_ctid = newtid;
9119
9120 offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9121 if (offnum == InvalidOffsetNumber)
9122 elog(PANIC, "failed to add tuple");
9123
9124 if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9125 PageClearAllVisible(page);
9126
9127 freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9128
9129 PageSetLSN(page, lsn);
9130 MarkBufferDirty(nbuffer);
9131 }
9132
9133 if (BufferIsValid(nbuffer) && nbuffer != obuffer)
9134 UnlockReleaseBuffer(nbuffer);
9135 if (BufferIsValid(obuffer))
9136 UnlockReleaseBuffer(obuffer);
9137
9138 /*
9139 * If the new page is running low on free space, update the FSM as well.
9140 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9141 * better than that without knowing the fill-factor for the table.
9142 *
9143 * However, don't update the FSM on HOT updates, because after crash
9144 * recovery, either the old or the new tuple will certainly be dead and
9145 * prunable. After pruning, the page will have roughly as much free space
9146 * as it did before the update, assuming the new tuple is about the same
9147 * size as the old one.
9148 *
9149 	 * XXX: Don't do this if the page was restored from a full-page image. We
9150 * don't bother to update the FSM in that case, it doesn't need to be
9151 * totally accurate anyway.
9152 */
9153 if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
9154 XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
9155 }
9156
9157 static void
9158 heap_xlog_confirm(XLogReaderState *record)
9159 {
9160 XLogRecPtr lsn = record->EndRecPtr;
9161 xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
9162 Buffer buffer;
9163 Page page;
9164 OffsetNumber offnum;
9165 ItemId lp = NULL;
9166 HeapTupleHeader htup;
9167
9168 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9169 {
9170 page = BufferGetPage(buffer);
9171
9172 offnum = xlrec->offnum;
9173 if (PageGetMaxOffsetNumber(page) >= offnum)
9174 lp = PageGetItemId(page, offnum);
9175
9176 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9177 elog(PANIC, "invalid lp");
9178
9179 htup = (HeapTupleHeader) PageGetItem(page, lp);
9180
9181 /*
9182 * Confirm tuple as actually inserted
9183 */
9184 ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
9185
9186 PageSetLSN(page, lsn);
9187 MarkBufferDirty(buffer);
9188 }
9189 if (BufferIsValid(buffer))
9190 UnlockReleaseBuffer(buffer);
9191 }
9192
9193 static void
9194 heap_xlog_lock(XLogReaderState *record)
9195 {
9196 XLogRecPtr lsn = record->EndRecPtr;
9197 xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
9198 Buffer buffer;
9199 Page page;
9200 OffsetNumber offnum;
9201 ItemId lp = NULL;
9202 HeapTupleHeader htup;
9203
9204 /*
9205 * The visibility map may need to be fixed even if the heap page is
9206 * already up-to-date.
9207 */
9208 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9209 {
9210 RelFileNode rnode;
9211 Buffer vmbuffer = InvalidBuffer;
9212 BlockNumber block;
9213 Relation reln;
9214
9215 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9216 reln = CreateFakeRelcacheEntry(rnode);
9217
9218 visibilitymap_pin(reln, block, &vmbuffer);
9219 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9220
9221 ReleaseBuffer(vmbuffer);
9222 FreeFakeRelcacheEntry(reln);
9223 }
9224
9225 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9226 {
9227 page = (Page) BufferGetPage(buffer);
9228
9229 offnum = xlrec->offnum;
9230 if (PageGetMaxOffsetNumber(page) >= offnum)
9231 lp = PageGetItemId(page, offnum);
9232
9233 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9234 elog(PANIC, "invalid lp");
9235
9236 htup = (HeapTupleHeader) PageGetItem(page, lp);
9237
9238 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9239 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9240 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9241 &htup->t_infomask2);
9242
9243 /*
9244 * Clear relevant update flags, but only if the modified infomask says
9245 * there's no update.
9246 */
9247 if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
9248 {
9249 HeapTupleHeaderClearHotUpdated(htup);
9250 /* Make sure there is no forward chain link in t_ctid */
9251 ItemPointerSet(&htup->t_ctid,
9252 BufferGetBlockNumber(buffer),
9253 offnum);
9254 }
9255 HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
9256 HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9257 PageSetLSN(page, lsn);
9258 MarkBufferDirty(buffer);
9259 }
9260 if (BufferIsValid(buffer))
9261 UnlockReleaseBuffer(buffer);
9262 }
9263
9264 static void
9265 heap_xlog_lock_updated(XLogReaderState *record)
9266 {
9267 XLogRecPtr lsn = record->EndRecPtr;
9268 xl_heap_lock_updated *xlrec;
9269 Buffer buffer;
9270 Page page;
9271 OffsetNumber offnum;
9272 ItemId lp = NULL;
9273 HeapTupleHeader htup;
9274
9275 xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
9276
9277 /*
9278 * The visibility map may need to be fixed even if the heap page is
9279 * already up-to-date.
9280 */
9281 if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9282 {
9283 RelFileNode rnode;
9284 Buffer vmbuffer = InvalidBuffer;
9285 BlockNumber block;
9286 Relation reln;
9287
9288 XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9289 reln = CreateFakeRelcacheEntry(rnode);
9290
9291 visibilitymap_pin(reln, block, &vmbuffer);
9292 visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9293
9294 ReleaseBuffer(vmbuffer);
9295 FreeFakeRelcacheEntry(reln);
9296 }
9297
9298 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9299 {
9300 page = BufferGetPage(buffer);
9301
9302 offnum = xlrec->offnum;
9303 if (PageGetMaxOffsetNumber(page) >= offnum)
9304 lp = PageGetItemId(page, offnum);
9305
9306 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9307 elog(PANIC, "invalid lp");
9308
9309 htup = (HeapTupleHeader) PageGetItem(page, lp);
9310
9311 htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9312 htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9313 fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9314 &htup->t_infomask2);
9315 HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9316
9317 PageSetLSN(page, lsn);
9318 MarkBufferDirty(buffer);
9319 }
9320 if (BufferIsValid(buffer))
9321 UnlockReleaseBuffer(buffer);
9322 }
9323
9324 static void
9325 heap_xlog_inplace(XLogReaderState *record)
9326 {
9327 XLogRecPtr lsn = record->EndRecPtr;
9328 xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9329 Buffer buffer;
9330 Page page;
9331 OffsetNumber offnum;
9332 ItemId lp = NULL;
9333 HeapTupleHeader htup;
9334 uint32 oldlen;
9335 Size newlen;
9336
9337 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9338 {
9339 char *newtup = XLogRecGetBlockData(record, 0, &newlen);
9340
9341 page = BufferGetPage(buffer);
9342
9343 offnum = xlrec->offnum;
9344 if (PageGetMaxOffsetNumber(page) >= offnum)
9345 lp = PageGetItemId(page, offnum);
9346
9347 if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9348 elog(PANIC, "invalid lp");
9349
9350 htup = (HeapTupleHeader) PageGetItem(page, lp);
9351
9352 oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9353 if (oldlen != newlen)
9354 elog(PANIC, "wrong tuple length");
9355
9356 memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9357
9358 PageSetLSN(page, lsn);
9359 MarkBufferDirty(buffer);
9360 }
9361 if (BufferIsValid(buffer))
9362 UnlockReleaseBuffer(buffer);
9363 }
9364
9365 void
9366 heap_redo(XLogReaderState *record)
9367 {
9368 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9369
9370 /*
9371 	 * These operations don't overwrite MVCC data, so no conflict processing is
9372 	 * required. The ones in the heap2 rmgr do.
9373 */
9374
9375 switch (info & XLOG_HEAP_OPMASK)
9376 {
9377 case XLOG_HEAP_INSERT:
9378 heap_xlog_insert(record);
9379 break;
9380 case XLOG_HEAP_DELETE:
9381 heap_xlog_delete(record);
9382 break;
9383 case XLOG_HEAP_UPDATE:
9384 heap_xlog_update(record, false);
9385 break;
9386 case XLOG_HEAP_TRUNCATE:
9387
9388 /*
9389 * TRUNCATE is a no-op because the actions are already logged as
9390 * SMGR WAL records. TRUNCATE WAL record only exists for logical
9391 * decoding.
9392 */
9393 break;
9394 case XLOG_HEAP_HOT_UPDATE:
9395 heap_xlog_update(record, true);
9396 break;
9397 case XLOG_HEAP_CONFIRM:
9398 heap_xlog_confirm(record);
9399 break;
9400 case XLOG_HEAP_LOCK:
9401 heap_xlog_lock(record);
9402 break;
9403 case XLOG_HEAP_INPLACE:
9404 heap_xlog_inplace(record);
9405 break;
9406 default:
9407 elog(PANIC, "heap_redo: unknown op code %u", info);
9408 }
9409 }
9410
9411 void
9412 heap2_redo(XLogReaderState *record)
9413 {
9414 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9415
9416 switch (info & XLOG_HEAP_OPMASK)
9417 {
9418 case XLOG_HEAP2_CLEAN:
9419 heap_xlog_clean(record);
9420 break;
9421 case XLOG_HEAP2_FREEZE_PAGE:
9422 heap_xlog_freeze_page(record);
9423 break;
9424 case XLOG_HEAP2_CLEANUP_INFO:
9425 heap_xlog_cleanup_info(record);
9426 break;
9427 case XLOG_HEAP2_VISIBLE:
9428 heap_xlog_visible(record);
9429 break;
9430 case XLOG_HEAP2_MULTI_INSERT:
9431 heap_xlog_multi_insert(record);
9432 break;
9433 case XLOG_HEAP2_LOCK_UPDATED:
9434 heap_xlog_lock_updated(record);
9435 break;
9436 case XLOG_HEAP2_NEW_CID:
9437
9438 /*
9439 * Nothing to do on a real replay, only used during logical
9440 * decoding.
9441 */
9442 break;
9443 case XLOG_HEAP2_REWRITE:
9444 heap_xlog_logical_rewrite(record);
9445 break;
9446 default:
9447 elog(PANIC, "heap2_redo: unknown op code %u", info);
9448 }
9449 }
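/*
 * Note (added for illustration): heap_redo and heap2_redo are not called
 * directly; recovery dispatches to them through the resource-manager table.
 * The corresponding entries live in src/include/access/rmgrlist.h and look
 * roughly like the following (the exact argument order is an assumption):
 *
 *	PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify,
 *			NULL, NULL, heap_mask)
 *	PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify,
 *			NULL, NULL, heap_mask)
 */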
9450
9451 /*
9452 * heap_sync - sync a heap, for use when no WAL has been written
9453 *
9454 * This forces the heap contents (including TOAST heap if any) down to disk.
9455 * If we skipped using WAL, and WAL is otherwise needed, we must force the
9456 * relation down to disk before it's safe to commit the transaction. This
9457 * requires writing out any dirty buffers and then doing a forced fsync.
9458 *
9459 * Indexes are not touched. (Currently, index operations associated with
9460 * the commands that use this are WAL-logged and so do not need fsync.
9461 * That behavior might change someday, but in any case it's likely that
9462 * any fsync decisions required would be per-index and hence not appropriate
9463 * to be done here.)
9464 */
9465 void
9466 heap_sync(Relation rel)
9467 {
9468 /* non-WAL-logged tables never need fsync */
9469 if (!RelationNeedsWAL(rel))
9470 return;
9471
9472 /* main heap */
9473 FlushRelationBuffers(rel);
9474 /* FlushRelationBuffers will have opened rd_smgr */
9475 smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
9476
9477 /* FSM is not critical, don't bother syncing it */
9478
9479 /* toast heap, if any */
9480 if (OidIsValid(rel->rd_rel->reltoastrelid))
9481 {
9482 Relation toastrel;
9483
9484 toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
9485 FlushRelationBuffers(toastrel);
9486 smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
9487 heap_close(toastrel, AccessShareLock);
9488 }
9489 }
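#ifdef NOT_USED
/*
 * Illustrative sketch, not part of the original file: the bulk-load pattern
 * that makes heap_sync necessary.  When WAL can legitimately be skipped
 * (wal_level = minimal and the relation was created or truncated in the
 * current transaction), the loader passes HEAP_INSERT_SKIP_WAL and must call
 * heap_sync before commit, roughly as COPY does.  The helper name and the
 * assumption that skipping WAL is safe for "rel" are illustrative only.
 */
static void
example_bulk_load(Relation rel, HeapTuple tup, CommandId cid)
{
	int			options = 0;

	if (!XLogIsNeeded())
		options |= HEAP_INSERT_SKIP_WAL;

	heap_insert(rel, tup, cid, options, NULL);

	/* ... typically many more insertions here ... */

	/* if WAL was skipped, force the data to disk before commit */
	if (options & HEAP_INSERT_SKIP_WAL)
		heap_sync(rel);
}
#endif							/* NOT_USED */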
9490
9491 /*
9492 * Mask a heap page before performing consistency checks on it.
9493 */
9494 void
9495 heap_mask(char *pagedata, BlockNumber blkno)
9496 {
9497 Page page = (Page) pagedata;
9498 OffsetNumber off;
9499
9500 mask_page_lsn_and_checksum(page);
9501
9502 mask_page_hint_bits(page);
9503 mask_unused_space(page);
9504
9505 for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
9506 {
9507 ItemId iid = PageGetItemId(page, off);
9508 char *page_item;
9509
9510 page_item = (char *) (page + ItemIdGetOffset(iid));
9511
9512 if (ItemIdIsNormal(iid))
9513 {
9514 HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
9515
9516 /*
9517 * If xmin of a tuple is not yet frozen, we should ignore
9518 * differences in hint bits, since they can be set without
9519 * emitting WAL.
9520 */
9521 if (!HeapTupleHeaderXminFrozen(page_htup))
9522 page_htup->t_infomask &= ~HEAP_XACT_MASK;
9523 else
9524 {
9525 /* Still we need to mask xmax hint bits. */
9526 page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
9527 page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
9528 }
9529
9530 /*
9531 * During replay, we set Command Id to FirstCommandId. Hence, mask
9532 * it. See heap_xlog_insert() for details.
9533 */
9534 page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
9535
9536 /*
9537 * For a speculative tuple, heap_insert() does not set ctid in the
9538 * caller-passed heap tuple itself, leaving the ctid field to
9539 * contain a speculative token value - a per-backend monotonically
9540 * increasing identifier. Besides, it does not WAL-log ctid under
9541 * any circumstances.
9542 *
9543 * During redo, heap_xlog_insert() sets t_ctid to current block
9544 * number and self offset number. It doesn't care about any
9545 * speculative insertions in master. Hence, we set t_ctid to
9546 * current block number and self offset number to ignore any
9547 * inconsistency.
9548 */
9549 if (HeapTupleHeaderIsSpeculative(page_htup))
9550 ItemPointerSet(&page_htup->t_ctid, blkno, off);
9551
9552 /*
9553 * NB: Not ignoring ctid changes due to the tuple having moved
9554 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
9555 * important information that needs to be in-sync between primary
9556 * and standby, and thus is WAL logged.
9557 */
9558 }
9559
9560 /*
9561 * Ignore any padding bytes after the tuple, when the length of the
9562 * item is not MAXALIGNed.
9563 */
9564 if (ItemIdHasStorage(iid))
9565 {
9566 int len = ItemIdGetLength(iid);
9567 int padlen = MAXALIGN(len) - len;
9568
9569 if (padlen > 0)
9570 memset(page_item + len, MASK_MARKER, padlen);
9571 }
9572 }
9573 }
9574