1 /*-------------------------------------------------------------------------
2 *
3 * gistxlog.c
4 * WAL replay logic for GiST.
5 *
6 *
7 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gist/gistxlog.c
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "access/gist_private.h"
17 #include "access/heapam_xlog.h"
18 #include "access/transam.h"
19 #include "access/xloginsert.h"
20 #include "access/xlogutils.h"
21 #include "miscadmin.h"
22 #include "storage/procarray.h"
23 #include "utils/memutils.h"
24
25 static MemoryContext opCtx; /* working memory for operations */
26
27 /*
28 * Replay the clearing of F_FOLLOW_RIGHT flag on a child page.
29 *
30 * Even if the WAL record includes a full-page image, we have to update the
31 * follow-right flag, because that change is not included in the full-page
32 * image. To be sure that the intermediate state with the wrong flag value is
33 * not visible to concurrent Hot Standby queries, this function handles
34 * restoring the full-page image as well as updating the flag. (Note that
35 * we never need to do anything else to the child page in the current WAL
36 * action.)
37 */
38 static void
gistRedoClearFollowRight(XLogReaderState * record,uint8 block_id)39 gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
40 {
41 XLogRecPtr lsn = record->EndRecPtr;
42 Buffer buffer;
43 Page page;
44 XLogRedoAction action;
45
46 /*
47 * Note that we still update the page even if it was restored from a full
48 * page image, because the updated NSN is not included in the image.
49 */
50 action = XLogReadBufferForRedo(record, block_id, &buffer);
51 if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
52 {
53 page = BufferGetPage(buffer);
54
55 GistPageSetNSN(page, lsn);
56 GistClearFollowRight(page);
57
58 PageSetLSN(page, lsn);
59 MarkBufferDirty(buffer);
60 }
61 if (BufferIsValid(buffer))
62 UnlockReleaseBuffer(buffer);
63 }
64
/*
 * Get the latestRemovedXid from the heap pages pointed at by the index
 * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
 * on which this function is based.
 *
 * Returns InvalidTransactionId either on the fast path (no backends on the
 * standby) or when any referenced page cannot be read; the former causes no
 * conflicts, the latter conflicts with all HS transactions.
 */
static TransactionId
gistRedoPageUpdateRecordGetLatestRemovedXid(XLogReaderState *record)
{
	gistxlogPageUpdate *xlrec = (gistxlogPageUpdate *) XLogRecGetData(record);
	OffsetNumber *todelete;
	Buffer		ibuffer,
				hbuffer;
	Page		ipage,
				hpage;
	RelFileNode rnode,
			   *hnode;
	BlockNumber blkno;
	ItemId		iitemid,
				hitemid;
	IndexTuple	itup;
	HeapTupleHeader htuphdr;
	BlockNumber hblkno;
	OffsetNumber hoffnum;
	TransactionId latestRemovedXid = InvalidTransactionId;
	int			i;

	/*
	 * If there's nothing running on the standby we don't need to derive a
	 * full latestRemovedXid value, so use a fast path out of here. This
	 * returns InvalidTransactionId, and so will conflict with all HS
	 * transactions; but since we just worked out that that's zero people,
	 * it's OK.
	 *
	 * XXX There is a race condition here, which is that a new backend might
	 * start just after we look. If so, it cannot need to conflict, but this
	 * coding will result in throwing a conflict anyway.
	 */
	if (CountDBBackends(InvalidOid) == 0)
		return latestRemovedXid;

	/*
	 * In what follows, we have to examine the previous state of the index
	 * page, as well as the heap page(s) it points to. This is only valid if
	 * WAL replay has reached a consistent database state; which means that
	 * the preceding check is not just an optimization, but is *necessary*. We
	 * won't have let in any user sessions before we reach consistency.
	 */
	if (!reachedConsistency)
		elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");

	/*
	 * Get index page. If the DB is consistent, this should not fail, nor
	 * should any of the heap page fetches below. If one does, we return
	 * InvalidTransactionId to cancel all HS transactions. That's probably
	 * overkill, but it's safe, and certainly better than panicking here.
	 */
	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
	ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
	if (!BufferIsValid(ibuffer))
		return InvalidTransactionId;
	LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
	ipage = (Page) BufferGetPage(ibuffer);

	/*
	 * Loop through the deleted index items to obtain the TransactionId from
	 * the heap items they point to.
	 *
	 * The heap relfilenode and the array of to-be-deleted offset numbers
	 * were appended right after the fixed-size part of the WAL record by
	 * gistXLogUpdate(), so locate them with plain pointer arithmetic.
	 */
	hnode = (RelFileNode *) ((char *) xlrec + sizeof(gistxlogPageUpdate));
	todelete = (OffsetNumber *) ((char *) hnode + sizeof(RelFileNode));

	for (i = 0; i < xlrec->ntodelete; i++)
	{
		/*
		 * Identify the index tuple about to be deleted
		 */
		iitemid = PageGetItemId(ipage, todelete[i]);
		itup = (IndexTuple) PageGetItem(ipage, iitemid);

		/*
		 * Locate the heap page that the index tuple points at
		 */
		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		hbuffer = XLogReadBufferExtended(*hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
		if (!BufferIsValid(hbuffer))
		{
			/* don't leak the index-page lock on the error path */
			UnlockReleaseBuffer(ibuffer);
			return InvalidTransactionId;
		}
		LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
		hpage = (Page) BufferGetPage(hbuffer);

		/*
		 * Look up the heap tuple header that the index tuple points at by
		 * using the heap node supplied with the xlrec. We can't use
		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
		 * Note that we are not looking at tuple data here, just headers.
		 */
		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
		hitemid = PageGetItemId(hpage, hoffnum);

		/*
		 * Follow any redirections until we find something useful.
		 */
		while (ItemIdIsRedirected(hitemid))
		{
			hoffnum = ItemIdGetRedirect(hitemid);
			hitemid = PageGetItemId(hpage, hoffnum);
			CHECK_FOR_INTERRUPTS();
		}

		/*
		 * If the heap item has storage, then read the header and use that to
		 * set latestRemovedXid.
		 *
		 * Some LP_DEAD items may not be accessible, so we ignore them.
		 */
		if (ItemIdHasStorage(hitemid))
		{
			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
		}
		else if (ItemIdIsDead(hitemid))
		{
			/*
			 * Conjecture: if hitemid is dead then it had xids before the xids
			 * marked on LP_NORMAL items. So we just ignore this item and move
			 * onto the next, for the purposes of calculating
			 * latestRemovedxids.
			 */
		}
		else
			Assert(!ItemIdIsUsed(hitemid));

		UnlockReleaseBuffer(hbuffer);
	}

	UnlockReleaseBuffer(ibuffer);

	/*
	 * If all heap tuples were LP_DEAD then we will be returning
	 * InvalidTransactionId here, which avoids conflicts. This matches
	 * existing logic which assumes that LP_DEAD tuples must already be older
	 * than the latestRemovedXid on the cleanup record that set them as
	 * LP_DEAD, hence must already have generated a conflict.
	 */
	return latestRemovedXid;
}
213
/*
 * redo any page update (except page split)
 *
 * Block 0 is the target page; optional block 1 is the left child page
 * whose F_FOLLOW_RIGHT flag must be cleared in the same WAL action.
 */
static void
gistRedoPageUpdateRecord(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Support for conflict processing in GiST has been backpatched. This is
	 * why we have to use tricky way of saving WAL-compatibility between minor
	 * versions. Information required for conflict processing is just
	 * appended to data of XLOG_GIST_PAGE_UPDATE record. So, PostgreSQL
	 * version, which doesn't know about conflict processing, will just ignore
	 * that.
	 *
	 * GiST delete records can conflict with standby queries. You might think
	 * that vacuum records would conflict as well, but we've handled that
	 * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
	 * just once when that arrives. After that we know that no conflicts
	 * exist from individual gist vacuum records on that index.
	 */
	if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
	{
		/* extra payload present: derive the conflict horizon from it */
		TransactionId latestRemovedXid = gistRedoPageUpdateRecordGetLatestRemovedXid(record);
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
	}

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		char	   *begin;
		char	   *data;
		Size		datalen;
		int			ninserted = 0;

		/*
		 * Block data layout (written by gistXLogUpdate): an array of
		 * ntodelete OffsetNumbers, followed by the new index tuples
		 * packed back to back.
		 */
		data = begin = XLogRecGetBlockData(record, 0, &datalen);

		page = (Page) BufferGetPage(buffer);

		/* Delete old tuples */
		if (xldata->ntodelete > 0)
		{
			OffsetNumber *todelete = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntodelete;

			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
			if (GistPageIsLeaf(page))
				GistMarkTuplesDeleted(page);
		}

		/* add tuples */
		if (data - begin < datalen)
		{
			/* append after the current last item (or at start if empty) */
			OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
			OffsetNumberNext(PageGetMaxOffsetNumber(page));

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		sz = IndexTupleSize(itup);
				OffsetNumber l;

				data += sz;

				l = PageAddItem(page, (Item) itup, sz, off, false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "failed to add item to GiST index page, size %d bytes",
						 (int) sz);
				off++;
				ninserted++;
			}
		}

		/* cross-check against the count recorded at insert time */
		Assert(ninserted == xldata->ntoinsert);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}

	/*
	 * Fix follow-right data on left child page
	 *
	 * This must be done while still holding the lock on the target page. Note
	 * that even if the target page no longer exists, we still attempt to
	 * replay the change on the child page.
	 */
	if (XLogRecHasBlockRef(record, 1))
		gistRedoClearFollowRight(record, 1);

	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
318
319 /*
320 * Returns an array of index pointers.
321 */
322 static IndexTuple *
decodePageSplitRecord(char * begin,int len,int * n)323 decodePageSplitRecord(char *begin, int len, int *n)
324 {
325 char *ptr;
326 int i = 0;
327 IndexTuple *tuples;
328
329 /* extract the number of tuples */
330 memcpy(n, begin, sizeof(int));
331 ptr = begin + sizeof(int);
332
333 tuples = palloc(*n * sizeof(IndexTuple));
334
335 for (i = 0; i < *n; i++)
336 {
337 Assert(ptr - begin < len);
338 tuples[i] = (IndexTuple) ptr;
339 ptr += IndexTupleSize((IndexTuple) ptr);
340 }
341 Assert(ptr - begin == len);
342
343 return tuples;
344 }
345
/*
 * redo a page split
 *
 * Block 0 (optional) is the left child page whose follow-right flag must
 * be cleared; blocks 1..npage are the pages produced by the split, in the
 * order they were registered by gistXLogSplit().
 */
static void
gistRedoPageSplitRecord(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
	Buffer		firstbuffer = InvalidBuffer;
	Buffer		buffer;
	Page		page;
	int			i;
	bool		isrootsplit = false;

	/*
	 * We must hold lock on the first-listed page throughout the action,
	 * including while updating the left child page (if any). We can unlock
	 * remaining pages in the list as soon as they've been written, because
	 * there is no path for concurrent queries to reach those pages without
	 * first visiting the first-listed page.
	 */

	/* loop around all pages */
	for (i = 0; i < xldata->npage; i++)
	{
		int			flags;
		char	   *data;
		Size		datalen;
		int			num;
		BlockNumber blkno;
		IndexTuple *tuples;

		/* split pages are registered under block ids 1..npage */
		XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno);
		if (blkno == GIST_ROOT_BLKNO)
		{
			/* the root, if split, is always the first-listed page */
			Assert(i == 0);
			isrootsplit = true;
		}

		buffer = XLogInitBufferForRedo(record, i + 1);
		page = (Page) BufferGetPage(buffer);
		data = XLogRecGetBlockData(record, i + 1, &datalen);

		tuples = decodePageSplitRecord(data, datalen, &num);

		/* ok, clear buffer */
		if (xldata->origleaf && blkno != GIST_ROOT_BLKNO)
			flags = F_LEAF;
		else
			flags = 0;
		GISTInitBuffer(buffer, flags);

		/* and fill it */
		gistfillbuffer(page, tuples, num, FirstOffsetNumber);

		if (blkno == GIST_ROOT_BLKNO)
		{
			GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
			GistPageSetNSN(page, xldata->orignsn);
			GistClearFollowRight(page);
		}
		else
		{
			/* chain each page to the next one; the last inherits origrlink */
			if (i < xldata->npage - 1)
			{
				BlockNumber nextblkno;

				XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno);
				GistPageGetOpaque(page)->rightlink = nextblkno;
			}
			else
				GistPageGetOpaque(page)->rightlink = xldata->origrlink;
			GistPageSetNSN(page, xldata->orignsn);
			if (i < xldata->npage - 1 && !isrootsplit &&
				xldata->markfollowright)
				GistMarkFollowRight(page);
			else
				GistClearFollowRight(page);
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);

		if (i == 0)
			firstbuffer = buffer;
		else
			UnlockReleaseBuffer(buffer);
	}

	/* Fix follow-right data on left child page, if any */
	if (XLogRecHasBlockRef(record, 0))
		gistRedoClearFollowRight(record, 0);

	/* Finally, release lock on the first page */
	UnlockReleaseBuffer(firstbuffer);
}
439
440 static void
gistRedoCreateIndex(XLogReaderState * record)441 gistRedoCreateIndex(XLogReaderState *record)
442 {
443 XLogRecPtr lsn = record->EndRecPtr;
444 Buffer buffer;
445 Page page;
446
447 buffer = XLogInitBufferForRedo(record, 0);
448 Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
449 page = (Page) BufferGetPage(buffer);
450
451 GISTInitBuffer(buffer, F_LEAF);
452
453 PageSetLSN(page, lsn);
454
455 MarkBufferDirty(buffer);
456 UnlockReleaseBuffer(buffer);
457 }
458
/*
 * Dispatch a GiST WAL record to the appropriate redo routine.
 */
void
gist_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	MemoryContext oldCxt;

	/*
	 * Hot-standby conflict processing for tuple deletions is handled inside
	 * gistRedoPageUpdateRecord(), using the heap information that
	 * gistXLogUpdate() appends to XLOG_GIST_PAGE_UPDATE records.  NB: If we
	 * ever implement a similar optimization we have in b-tree, and remove
	 * killed tuples outside VACUUM, we'll need to handle that here.
	 */

	/* do all the work in a short-lived context, reset after each record */
	oldCxt = MemoryContextSwitchTo(opCtx);
	switch (info)
	{
		case XLOG_GIST_PAGE_UPDATE:
			gistRedoPageUpdateRecord(record);
			break;
		case XLOG_GIST_PAGE_SPLIT:
			gistRedoPageSplitRecord(record);
			break;
		case XLOG_GIST_CREATE_INDEX:
			gistRedoCreateIndex(record);
			break;
		default:
			elog(PANIC, "gist_redo: unknown op code %u", info);
	}

	MemoryContextSwitchTo(oldCxt);
	MemoryContextReset(opCtx);
}
490
/*
 * Set up the working memory context used by gist_redo().
 */
void
gist_xlog_startup(void)
{
	opCtx = createTempGistContext();
}
496
/*
 * Release the working memory context created by gist_xlog_startup().
 */
void
gist_xlog_cleanup(void)
{
	MemoryContextDelete(opCtx);
}
502
503 /*
504 * Write WAL record of a page split.
505 */
506 XLogRecPtr
gistXLogSplit(bool page_is_leaf,SplitedPageLayout * dist,BlockNumber origrlink,GistNSN orignsn,Buffer leftchildbuf,bool markfollowright)507 gistXLogSplit(bool page_is_leaf,
508 SplitedPageLayout *dist,
509 BlockNumber origrlink, GistNSN orignsn,
510 Buffer leftchildbuf, bool markfollowright)
511 {
512 gistxlogPageSplit xlrec;
513 SplitedPageLayout *ptr;
514 int npage = 0;
515 XLogRecPtr recptr;
516 int i;
517
518 for (ptr = dist; ptr; ptr = ptr->next)
519 npage++;
520
521 xlrec.origrlink = origrlink;
522 xlrec.orignsn = orignsn;
523 xlrec.origleaf = page_is_leaf;
524 xlrec.npage = (uint16) npage;
525 xlrec.markfollowright = markfollowright;
526
527 XLogBeginInsert();
528
529 /*
530 * Include a full page image of the child buf. (only necessary if a
531 * checkpoint happened since the child page was split)
532 */
533 if (BufferIsValid(leftchildbuf))
534 XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD);
535
536 /*
537 * NOTE: We register a lot of data. The caller must've called
538 * XLogEnsureRecordSpace() to prepare for that. We cannot do it here,
539 * because we're already in a critical section. If you change the number
540 * of buffer or data registrations here, make sure you modify the
541 * XLogEnsureRecordSpace() calls accordingly!
542 */
543 XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit));
544
545 i = 1;
546 for (ptr = dist; ptr; ptr = ptr->next)
547 {
548 XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT);
549 XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int));
550 XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist);
551 i++;
552 }
553
554 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT);
555
556 return recptr;
557 }
558
559 /*
560 * Write XLOG record describing a page update. The update can include any
561 * number of deletions and/or insertions of tuples on a single index page.
562 *
563 * If this update inserts a downlink for a split page, also record that
564 * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
565 *
566 * Note that both the todelete array and the tuples are marked as belonging
567 * to the target buffer; they need not be stored in XLOG if XLogInsert decides
568 * to log the whole buffer contents instead.
569 */
570 XLogRecPtr
gistXLogUpdate(Buffer buffer,OffsetNumber * todelete,int ntodelete,IndexTuple * itup,int ituplen,Buffer leftchildbuf,RelFileNode * hnode)571 gistXLogUpdate(Buffer buffer,
572 OffsetNumber *todelete, int ntodelete,
573 IndexTuple *itup, int ituplen,
574 Buffer leftchildbuf, RelFileNode *hnode)
575 {
576 gistxlogPageUpdate xlrec;
577 int i;
578 XLogRecPtr recptr;
579
580 xlrec.ntodelete = ntodelete;
581 xlrec.ntoinsert = ituplen;
582
583 XLogBeginInsert();
584 XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));
585
586 /*
587 * Append the information required for standby conflict processing if it
588 * is provided by caller.
589 */
590 if (hnode)
591 {
592 XLogRegisterData((char *) hnode, sizeof(RelFileNode));
593 XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);
594 }
595
596 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
597 XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete);
598
599 /* new tuples */
600 for (i = 0; i < ituplen; i++)
601 XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i]));
602
603 /*
604 * Include a full page image of the child buf. (only necessary if a
605 * checkpoint happened since the child page was split)
606 */
607 if (BufferIsValid(leftchildbuf))
608 XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD);
609
610 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE);
611
612 return recptr;
613 }
614