/*-------------------------------------------------------------------------
 *
 * gistxlog.c
 *	  WAL replay logic for GiST.
 *
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *			 src/backend/access/gist/gistxlog.c
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/gist_private.h"
#include "access/heapam_xlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"
#include "utils/memutils.h"

static MemoryContext opCtx;		/* working memory for operations */

/*
 * Replay the clearing of F_FOLLOW_RIGHT flag on a child page.
 *
 * Even if the WAL record includes a full-page image, we have to update the
 * follow-right flag, because that change is not included in the full-page
 * image.  To be sure that the intermediate state with the wrong flag value is
 * not visible to concurrent Hot Standby queries, this function handles
 * restoring the full-page image as well as updating the flag.  (Note that
 * we never need to do anything else to the child page in the current WAL
 * action.)
 */
static void
gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;
	XLogRedoAction action;

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the updated NSN is not included in the image.
	 */
	action = XLogReadBufferForRedo(record, block_id, &buffer);
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		page = BufferGetPage(buffer);

		GistPageSetNSN(page, lsn);
		GistClearFollowRight(page);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * Get the latestRemovedXid from the heap pages pointed at by the index
 * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
 * on which this function is based.
 */
static TransactionId
gistRedoPageUpdateRecordGetLatestRemovedXid(XLogReaderState *record)
{
	gistxlogPageUpdate *xlrec = (gistxlogPageUpdate *) XLogRecGetData(record);
	OffsetNumber *todelete;
	Buffer		ibuffer,
				hbuffer;
	Page		ipage,
				hpage;
	RelFileNode rnode,
			   *hnode;
	BlockNumber blkno;
	ItemId		iitemid,
				hitemid;
	IndexTuple	itup;
	HeapTupleHeader htuphdr;
	BlockNumber hblkno;
	OffsetNumber hoffnum;
	TransactionId latestRemovedXid = InvalidTransactionId;
	int			i;

	/*
	 * If there's nothing running on the standby we don't need to derive a
	 * full latestRemovedXid value, so use a fast path out of here.  This
	 * returns InvalidTransactionId, and so will conflict with all HS
	 * transactions; but since we just worked out that that's zero people,
	 * it's OK.
	 *
	 * XXX There is a race condition here, which is that a new backend might
	 * start just after we look.  If so, it cannot need to conflict, but this
	 * coding will result in throwing a conflict anyway.
	 */
	if (CountDBBackends(InvalidOid) == 0)
		return latestRemovedXid;

	/*
	 * In what follows, we have to examine the previous state of the index
	 * page, as well as the heap page(s) it points to.  This is only valid if
	 * WAL replay has reached a consistent database state, which means that
	 * the preceding check is not just an optimization, but is *necessary*. We
	 * won't have let in any user sessions before we reach consistency.
	 */
	if (!reachedConsistency)
		elog(PANIC, "gistRedoPageUpdateRecordGetLatestRemovedXid: cannot operate with inconsistent data");

	/*
	 * Get index page.  If the DB is consistent, this should not fail, nor
	 * should any of the heap page fetches below.  If one does, we return
	 * InvalidTransactionId to cancel all HS transactions.  That's probably
	 * overkill, but it's safe, and certainly better than panicking here.
	 */
	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
	ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
	if (!BufferIsValid(ibuffer))
		return InvalidTransactionId;
	LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
	ipage = (Page) BufferGetPage(ibuffer);

	/*
	 * Loop through the deleted index items to obtain the TransactionId from
	 * the heap items they point to.
	 */
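	/* These fields were appended to the record data by gistXLogUpdate. */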
	hnode = (RelFileNode *) ((char *) xlrec + sizeof(gistxlogPageUpdate));
	todelete = (OffsetNumber *) ((char *) hnode + sizeof(RelFileNode));

	for (i = 0; i < xlrec->ntodelete; i++)
	{
		/*
		 * Identify the index tuple about to be deleted
		 */
		iitemid = PageGetItemId(ipage, todelete[i]);
		itup = (IndexTuple) PageGetItem(ipage, iitemid);

		/*
		 * Locate the heap page that the index tuple points at
		 */
		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		hbuffer = XLogReadBufferExtended(*hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
		if (!BufferIsValid(hbuffer))
		{
			UnlockReleaseBuffer(ibuffer);
			return InvalidTransactionId;
		}
		LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
		hpage = (Page) BufferGetPage(hbuffer);

		/*
		 * Look up the heap tuple header that the index tuple points at by
		 * using the heap node supplied with the xlrec. We can't use
		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
		 * Note that we are not looking at tuple data here, just headers.
		 */
		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
		hitemid = PageGetItemId(hpage, hoffnum);

		/*
		 * Follow any redirections until we find something useful.
		 */
		while (ItemIdIsRedirected(hitemid))
		{
			hoffnum = ItemIdGetRedirect(hitemid);
			hitemid = PageGetItemId(hpage, hoffnum);
			CHECK_FOR_INTERRUPTS();
		}

		/*
		 * If the heap item has storage, then read the header and use that to
		 * set latestRemovedXid.
		 *
		 * Some LP_DEAD items may not be accessible, so we ignore them.
		 */
		if (ItemIdHasStorage(hitemid))
		{
			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
		}
		else if (ItemIdIsDead(hitemid))
		{
			/*
			 * Conjecture: if hitemid is dead then it had xids before the
			 * xids marked on LP_NORMAL items.  So we just ignore this item
			 * and move on to the next, for the purposes of calculating
			 * latestRemovedXid.
			 */
		}
		else
			Assert(!ItemIdIsUsed(hitemid));

		UnlockReleaseBuffer(hbuffer);
	}

	UnlockReleaseBuffer(ibuffer);

	/*
	 * If all heap tuples were LP_DEAD then we will be returning
	 * InvalidTransactionId here, which avoids conflicts. This matches
	 * existing logic which assumes that LP_DEAD tuples must already be older
	 * than the latestRemovedXid on the cleanup record that set them as
	 * LP_DEAD, hence must already have generated a conflict.
	 */
	return latestRemovedXid;
}

/*
 * redo any page update (except page split)
 */
static void
gistRedoPageUpdateRecord(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Support for conflict processing in GiST has been back-patched.  That
	 * is why we have to use a tricky way of preserving WAL compatibility
	 * between minor versions: the information required for conflict
	 * processing is simply appended to the data of the XLOG_GIST_PAGE_UPDATE
	 * record, so PostgreSQL versions that don't know about conflict
	 * processing will just ignore it.
	 *
	 * GiST delete records can conflict with standby queries.  You might think
	 * that vacuum records would conflict as well, but we've handled that
	 * already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
	 * just once when that arrives.  After that we know that no conflicts
	 * exist from individual gist vacuum records on that index.
	 */
	if (InHotStandby && XLogRecGetDataLen(record) > sizeof(gistxlogPageUpdate))
	{
		TransactionId latestRemovedXid = gistRedoPageUpdateRecordGetLatestRemovedXid(record);
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
	}

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		char	   *begin;
		char	   *data;
		Size		datalen;
		int			ninserted = 0;

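		/*
		 * The block data is the array of offsets to delete, followed by the
		 * new index tuples to insert (see gistXLogUpdate).
		 */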
		data = begin = XLogRecGetBlockData(record, 0, &datalen);

		page = (Page) BufferGetPage(buffer);

		/* Delete old tuples */
		if (xldata->ntodelete > 0)
		{
			OffsetNumber *todelete = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntodelete;

			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
			if (GistPageIsLeaf(page))
				GistMarkTuplesDeleted(page);
		}

		/* add tuples */
		if (data - begin < datalen)
		{
			OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
			OffsetNumberNext(PageGetMaxOffsetNumber(page));

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		sz = IndexTupleSize(itup);
				OffsetNumber l;

				data += sz;

				l = PageAddItem(page, (Item) itup, sz, off, false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "failed to add item to GiST index page, size %d bytes",
						 (int) sz);
				off++;
				ninserted++;
			}
		}

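		/* We should have consumed exactly the tuples the record claimed. */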
		Assert(ninserted == xldata->ntoinsert);

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}

	/*
	 * Fix follow-right data on left child page
	 *
	 * This must be done while still holding the lock on the target page. Note
	 * that even if the target page no longer exists, we still attempt to
	 * replay the change on the child page.
	 */
	if (XLogRecHasBlockRef(record, 1))
		gistRedoClearFollowRight(record, 1);

	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * Decode the tuples portion of a page-split record.  Returns a palloc'd
 * array of pointers to the tuples, which remain in place in the given
 * data chunk; *n is set to the number of tuples.
 */
static IndexTuple *
decodePageSplitRecord(char *begin, int len, int *n)
{
	char	   *ptr;
	int			i = 0;
	IndexTuple *tuples;

	/* extract the number of tuples */
	memcpy(n, begin, sizeof(int));
	ptr = begin + sizeof(int);

	tuples = palloc(*n * sizeof(IndexTuple));

	for (i = 0; i < *n; i++)
	{
		Assert(ptr - begin < len);
		tuples[i] = (IndexTuple) ptr;
		ptr += IndexTupleSize((IndexTuple) ptr);
	}
	Assert(ptr - begin == len);

	return tuples;
}

static void
gistRedoPageSplitRecord(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record);
	Buffer		firstbuffer = InvalidBuffer;
	Buffer		buffer;
	Page		page;
	int			i;
	bool		isrootsplit = false;

	/*
	 * We must hold lock on the first-listed page throughout the action,
	 * including while updating the left child page (if any).  We can unlock
	 * remaining pages in the list as soon as they've been written, because
	 * there is no path for concurrent queries to reach those pages without
	 * first visiting the first-listed page.
	 */

	/* loop around all pages */
	for (i = 0; i < xldata->npage; i++)
	{
		int			flags;
		char	   *data;
		Size		datalen;
		int			num;
		BlockNumber blkno;
		IndexTuple *tuples;

		XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno);
		if (blkno == GIST_ROOT_BLKNO)
		{
			Assert(i == 0);
			isrootsplit = true;
		}

		buffer = XLogInitBufferForRedo(record, i + 1);
		page = (Page) BufferGetPage(buffer);
		data = XLogRecGetBlockData(record, i + 1, &datalen);

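		/*
		 * Each block's data is the tuple count followed by the tuples
		 * themselves, as registered by gistXLogSplit.
		 */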
		tuples = decodePageSplitRecord(data, datalen, &num);

		/* ok, clear buffer */
		if (xldata->origleaf && blkno != GIST_ROOT_BLKNO)
			flags = F_LEAF;
		else
			flags = 0;
		GISTInitBuffer(buffer, flags);

		/* and fill it */
		gistfillbuffer(page, tuples, num, FirstOffsetNumber);

		if (blkno == GIST_ROOT_BLKNO)
		{
			GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
			GistPageSetNSN(page, xldata->orignsn);
			GistClearFollowRight(page);
		}
		else
		{
			if (i < xldata->npage - 1)
			{
				BlockNumber nextblkno;

				XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno);
				GistPageGetOpaque(page)->rightlink = nextblkno;
			}
			else
				GistPageGetOpaque(page)->rightlink = xldata->origrlink;
			GistPageSetNSN(page, xldata->orignsn);
			if (i < xldata->npage - 1 && !isrootsplit &&
				xldata->markfollowright)
				GistMarkFollowRight(page);
			else
				GistClearFollowRight(page);
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);

		if (i == 0)
			firstbuffer = buffer;
		else
			UnlockReleaseBuffer(buffer);
	}

	/* Fix follow-right data on left child page, if any */
	if (XLogRecHasBlockRef(record, 0))
		gistRedoClearFollowRight(record, 0);

	/* Finally, release lock on the first page */
	UnlockReleaseBuffer(firstbuffer);
}

static void
gistRedoCreateIndex(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;

	buffer = XLogInitBufferForRedo(record, 0);
	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
	page = (Page) BufferGetPage(buffer);

	GISTInitBuffer(buffer, F_LEAF);

	PageSetLSN(page, lsn);

	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}

void
gist_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	MemoryContext oldCxt;

	/*
	 * Conflict processing for GiST delete records is handled within
	 * gistRedoPageUpdateRecord.  NB: If we ever implement an optimization
	 * similar to the one we have in b-tree, and remove killed tuples outside
	 * VACUUM, we'll need to handle that here as well.
	 */

	oldCxt = MemoryContextSwitchTo(opCtx);
	switch (info)
	{
		case XLOG_GIST_PAGE_UPDATE:
			gistRedoPageUpdateRecord(record);
			break;
		case XLOG_GIST_PAGE_SPLIT:
			gistRedoPageSplitRecord(record);
			break;
		case XLOG_GIST_CREATE_INDEX:
			gistRedoCreateIndex(record);
			break;
		default:
			elog(PANIC, "gist_redo: unknown op code %u", info);
	}

	MemoryContextSwitchTo(oldCxt);
	MemoryContextReset(opCtx);
}

void
gist_xlog_startup(void)
{
	opCtx = createTempGistContext();
}

void
gist_xlog_cleanup(void)
{
	MemoryContextDelete(opCtx);
}

/*
 * Write WAL record of a page split.
 */
XLogRecPtr
gistXLogSplit(bool page_is_leaf,
			  SplitedPageLayout *dist,
			  BlockNumber origrlink, GistNSN orignsn,
			  Buffer leftchildbuf, bool markfollowright)
{
	gistxlogPageSplit xlrec;
	SplitedPageLayout *ptr;
	int			npage = 0;
	XLogRecPtr	recptr;
	int			i;

	for (ptr = dist; ptr; ptr = ptr->next)
		npage++;

	xlrec.origrlink = origrlink;
	xlrec.orignsn = orignsn;
	xlrec.origleaf = page_is_leaf;
	xlrec.npage = (uint16) npage;
	xlrec.markfollowright = markfollowright;

	XLogBeginInsert();

	/*
	 * Include a full page image of the child buf. (only necessary if a
	 * checkpoint happened since the child page was split)
	 */
	if (BufferIsValid(leftchildbuf))
		XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD);

	/*
	 * NOTE: We register a lot of data. The caller must've called
	 * XLogEnsureRecordSpace() to prepare for that. We cannot do it here,
	 * because we're already in a critical section. If you change the number
	 * of buffer or data registrations here, make sure you modify the
	 * XLogEnsureRecordSpace() calls accordingly!
	 */
	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit));

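	/*
	 * The left child is block 0 (if registered above); the pages produced
	 * by the split are registered as blocks 1 .. npage, each carrying its
	 * tuple count and tuples as block data.
	 */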
	i = 1;
	for (ptr = dist; ptr; ptr = ptr->next)
	{
		XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT);
		XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int));
		XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist);
		i++;
	}

	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT);

	return recptr;
}

/*
 * Write XLOG record describing a page update. The update can include any
 * number of deletions and/or insertions of tuples on a single index page.
 *
 * If this update inserts a downlink for a split page, also record that
 * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set.
 *
 * Note that both the todelete array and the tuples are marked as belonging
 * to the target buffer; they need not be stored in XLOG if XLogInsert decides
 * to log the whole buffer contents instead.
 */
XLogRecPtr
gistXLogUpdate(Buffer buffer,
			   OffsetNumber *todelete, int ntodelete,
			   IndexTuple *itup, int ituplen,
			   Buffer leftchildbuf, RelFileNode *hnode)
{
	gistxlogPageUpdate xlrec;
	int			i;
	XLogRecPtr	recptr;

	xlrec.ntodelete = ntodelete;
	xlrec.ntoinsert = ituplen;

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate));

	/*
	 * Append the information required for standby conflict processing if it
	 * is provided by the caller.
	 */
	if (hnode)
	{
		XLogRegisterData((char *) hnode, sizeof(RelFileNode));
		XLogRegisterData((char *) todelete, sizeof(OffsetNumber) * ntodelete);
	}

	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
	XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete);

	/* new tuples */
	for (i = 0; i < ituplen; i++)
		XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i]));

	/*
	 * Include a full page image of the child buf. (only necessary if a
	 * checkpoint happened since the child page was split)
	 */
	if (BufferIsValid(leftchildbuf))
		XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD);

	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE);

	return recptr;
}