/*-------------------------------------------------------------------------
 *
 * hash_xlog.c
 *	  WAL replay logic for hash index.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash_xlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"
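
/*
 * Each replay function below follows the usual redo pattern: re-read the
 * registered block with XLogReadBufferForRedo() (or XLogInitBufferForRedo()
 * for pages built from scratch), re-apply the change only when the result is
 * BLK_NEEDS_REDO (a full-page image may already have restored the page), and
 * finally set the page LSN and mark the buffer dirty.
 */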

/*
 * replay a hash index meta page
 */
static void
hash_xlog_init_meta_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Page		page;
	Buffer		metabuf;
	ForkNumber	forknum;

	xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record);

	/* create the index's metapage */
	metabuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(metabuf));
	_hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid,
						  xlrec->ffactor, true);
	page = (Page) BufferGetPage(metabuf);
	PageSetLSN(page, lsn);
	MarkBufferDirty(metabuf);

	/*
	 * Force the on-disk state of init forks to always be in sync with the
	 * state in shared buffers.  See XLogReadBufferForRedoExtended.  We need
	 * special handling for init forks as create index operations don't log a
	 * full page image of the metapage.
	 */
	XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
	if (forknum == INIT_FORKNUM)
		FlushOneBuffer(metabuf);

	/* all done */
	UnlockReleaseBuffer(metabuf);
}

/*
 * replay a hash index bitmap page
 */
static void
hash_xlog_init_bitmap_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		bitmapbuf;
	Buffer		metabuf;
	Page		page;
	HashMetaPage metap;
	uint32		num_buckets;
	ForkNumber	forknum;

	xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record);

	/*
	 * Initialize bitmap page
	 */
	bitmapbuf = XLogInitBufferForRedo(record, 0);
	_hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true);
	PageSetLSN(BufferGetPage(bitmapbuf), lsn);
	MarkBufferDirty(bitmapbuf);

	/*
	 * Force the on-disk state of init forks to always be in sync with the
	 * state in shared buffers.  See XLogReadBufferForRedoExtended.  We need
	 * special handling for init forks as create index operations don't log
	 * full page images of these pages.
	 */
	XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
	if (forknum == INIT_FORKNUM)
		FlushOneBuffer(bitmapbuf);
	UnlockReleaseBuffer(bitmapbuf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		/*
		 * Note: in normal operation, we'd update the metapage while still
		 * holding lock on the bitmap page.  But during replay it's not
		 * necessary to hold that lock, since nobody can see it yet; the
		 * creating transaction hasn't yet committed.
		 */
		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);

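		/*
		 * Block 0 holds the metapage and blocks 1 .. num_buckets hold the
		 * initial bucket pages, so the just-initialized bitmap page lives at
		 * block num_buckets + 1.
		 */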
		num_buckets = metap->hashm_maxbucket + 1;
		metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
		metap->hashm_nmaps++;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);

		XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
		if (forknum == INIT_FORKNUM)
			FlushOneBuffer(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay a hash index insert without split
 */
static void
hash_xlog_insert(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *datapos = XLogRecGetBlockData(record, 0, &datalen);

		page = BufferGetPage(buffer);

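		/*
		 * The block data is the new index tuple itself; re-add it at the
		 * offset recorded during the original insertion.
		 */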
		if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "hash_xlog_insert: failed to add item");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
	{
		/*
		 * Note: in normal operation, we'd update the metapage while still
		 * holding lock on the page we inserted into.  But during replay it's
		 * not necessary to hold that lock, since no other index updates can
		 * be happening concurrently.
		 */
		page = BufferGetPage(buffer);
		metap = HashPageGetMeta(page);
		metap->hashm_ntuples += 1;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * replay addition of overflow page for hash index
 */
static void
hash_xlog_add_ovfl_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record);
	Buffer		leftbuf;
	Buffer		ovflbuf;
	Buffer		metabuf;
	BlockNumber leftblk;
	BlockNumber rightblk;
	BlockNumber newmapblk = InvalidBlockNumber;
	Page		ovflpage;
	HashPageOpaque ovflopaque;
	uint32	   *num_bucket;
	char	   *data;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	bool		new_bmpage = false;

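	/*
	 * The record registers up to five blocks: 0 is the new overflow page, 1
	 * is the page that will link to it, 2 is the bitmap page on which the
	 * allocated page's bit is set (if any), 3 is a newly-added bitmap page
	 * (if any), and 4 is the metapage.
	 */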
	XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
	XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);

	ovflbuf = XLogInitBufferForRedo(record, 0);
	Assert(BufferIsValid(ovflbuf));

	data = XLogRecGetBlockData(record, 0, &datalen);
	num_bucket = (uint32 *) data;
	Assert(datalen == sizeof(uint32));
	_hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE,
				  true);
	/* update backlink */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = leftblk;

	PageSetLSN(ovflpage, lsn);
	MarkBufferDirty(ovflbuf);

	if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
	{
		Page		leftpage;
		HashPageOpaque leftopaque;

		leftpage = BufferGetPage(leftbuf);
		leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage);
		leftopaque->hasho_nextblkno = rightblk;

		PageSetLSN(leftpage, lsn);
		MarkBufferDirty(leftbuf);
	}

	if (BufferIsValid(leftbuf))
		UnlockReleaseBuffer(leftbuf);
	UnlockReleaseBuffer(ovflbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the overflow pages.  But during replay it's not
	 * necessary to hold those locks, since no other index updates can be
	 * happening concurrently.
	 */
	if (XLogRecHasBlockRef(record, 2))
	{
		Buffer		mapbuffer;

		if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO)
		{
			Page		mappage = (Page) BufferGetPage(mapbuffer);
			uint32	   *freep = NULL;
			char	   *data;
			uint32	   *bitmap_page_bit;

			freep = HashPageGetBitmap(mappage);

			data = XLogRecGetBlockData(record, 2, &datalen);
			bitmap_page_bit = (uint32 *) data;

			SETBIT(freep, *bitmap_page_bit);

			PageSetLSN(mappage, lsn);
			MarkBufferDirty(mapbuffer);
		}
		if (BufferIsValid(mapbuffer))
			UnlockReleaseBuffer(mapbuffer);
	}

	if (XLogRecHasBlockRef(record, 3))
	{
		Buffer		newmapbuf;

		newmapbuf = XLogInitBufferForRedo(record, 3);

		_hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true);

		new_bmpage = true;
		newmapblk = BufferGetBlockNumber(newmapbuf);

		MarkBufferDirty(newmapbuf);
		PageSetLSN(BufferGetPage(newmapbuf), lsn);

		UnlockReleaseBuffer(newmapbuf);
	}

	if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO)
	{
		HashMetaPage metap;
		Page		page;
		uint32	   *firstfree_ovflpage;

		data = XLogRecGetBlockData(record, 4, &datalen);
		firstfree_ovflpage = (uint32 *) data;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_firstfree = *firstfree_ovflpage;

		if (!xlrec->bmpage_found)
		{
			metap->hashm_spares[metap->hashm_ovflpoint]++;

			if (new_bmpage)
			{
				Assert(BlockNumberIsValid(newmapblk));

				metap->hashm_mapp[metap->hashm_nmaps] = newmapblk;
				metap->hashm_nmaps++;
				metap->hashm_spares[metap->hashm_ovflpoint]++;
			}
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay allocation of page for split operation
 */
static void
hash_xlog_split_allocate_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	Buffer		metabuf;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	char	   *data;
	XLogRedoAction action;

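	/*
	 * The record registers three blocks: 0 is the bucket being split, 1 is
	 * the new bucket, and 2 is the metapage.
	 */
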
	/*
	 * To be consistent with normal operation, here we take cleanup locks on
	 * both the old and new buckets even though there can't be any concurrent
	 * inserts.
	 */

	/* replay the record for old bucket */
	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the special space is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;
		oldopaque->hasho_prevblkno = xlrec->new_bucket;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}

	/* replay the record for new bucket */
	newbuf = XLogInitBufferForRedo(record, 1);
	_hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket,
				  xlrec->new_bucket_flag, true);
	if (!IsBufferCleanupOK(newbuf))
		elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
	MarkBufferDirty(newbuf);
	PageSetLSN(BufferGetPage(newbuf), lsn);

	/*
	 * We could release the lock on the old bucket early as well, but we do
	 * it here to be consistent with normal operation.
	 */
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);

	/*
	 * Note: in normal operation, we'd update the meta page while still
	 * holding lock on the old and new bucket pages.  But during replay it's
	 * not necessary to hold those locks, since no other bucket splits can be
	 * happening concurrently.
	 */

	/* replay the record for metapage changes */
	if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		HashMetaPage metap;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_maxbucket = xlrec->new_bucket;

		data = XLogRecGetBlockData(record, 2, &datalen);

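		/*
		 * The block data is variable-length: if XLH_SPLIT_META_UPDATE_MASKS
		 * is set it begins with the new low and high masks (two uint32s); if
		 * XLH_SPLIT_META_UPDATE_SPLITPOINT is set, the new ovflpoint and its
		 * spares count (two more uint32s) follow.
		 */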
		if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS)
		{
			uint32		lowmask;
			uint32	   *highmask;

			/* extract low and high masks. */
			memcpy(&lowmask, data, sizeof(uint32));
			highmask = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_lowmask = lowmask;
			metap->hashm_highmask = *highmask;

			data += sizeof(uint32) * 2;
		}

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT)
		{
			uint32		ovflpoint;
			uint32	   *ovflpages;

			/* extract information of overflow pages. */
			memcpy(&ovflpoint, data, sizeof(uint32));
			ovflpages = (uint32 *) ((char *) data + sizeof(uint32));

			/* update metapage */
			metap->hashm_spares[ovflpoint] = *ovflpages;
			metap->hashm_ovflpoint = ovflpoint;
		}

		MarkBufferDirty(metabuf);
		PageSetLSN(BufferGetPage(metabuf), lsn);
	}

	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay of split operation
 */
static void
hash_xlog_split_page(XLogReaderState *record)
{
	Buffer		buf;

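	/*
	 * Split operations log the data pages as full-page images, so replay
	 * only needs to restore the image; anything else indicates a broken
	 * record.
	 */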
	if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
		elog(ERROR, "Hash split record did not contain a full-page image");

	UnlockReleaseBuffer(buf);
}

/*
 * replay completion of split operation
 */
static void
hash_xlog_split_complete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	XLogRedoAction action;

	/* replay the record for old bucket */
	action = XLogReadBufferForRedo(record, 0, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);

	/* replay the record for new bucket */
	action = XLogReadBufferForRedo(record, 1, &newbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the bucket flag is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		newpage;
		HashPageOpaque nopaque;

		newpage = BufferGetPage(newbuf);
		nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage);

		nopaque->hasho_flag = xlrec->new_bucket_flag;

		PageSetLSN(newpage, lsn);
		MarkBufferDirty(newbuf);
	}
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);
}

/*
 * replay move of page contents for squeeze operation of hash index
 */
static void
hash_xlog_move_page_contents(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf = InvalidBuffer;
	Buffer		deletebuf = InvalidBuffer;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation.  This guarantees that no scan can
	 * start, and no scan can already be in progress, while we replay this
	 * operation.  If scans were allowed during replay, they could miss some
	 * records or see the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * We don't care about the return value here; the point of reading
		 * bucketbuf is just to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

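			/*
			 * The remainder of the block data is a stream of MAXALIGN'd
			 * index tuples; towrite[] supplies the target offset for each
			 * tuple in turn.
			 */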
			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * The number of tuples inserted must match the number requested by
		 * the REDO record.
		 */
		Assert(ninserted == xldata->ntups);

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for deleting entries from overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 2, &len);

		page = (Page) BufferGetPage(deletebuf);

		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}

	/*
	 * Replay is complete, so we can now release the buffers.  We release the
	 * locks at the end of the replay operation to ensure that we hold the
	 * lock on the primary bucket page until the whole operation is done.  We
	 * could release the lock on the write buffer as soon as we are done with
	 * it (when it is not the primary bucket page itself), but that doesn't
	 * seem worth complicating the code.
	 */
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}

/*
 * replay squeeze page operation of hash index
 */
static void
hash_xlog_squeeze_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf;
	Buffer		ovflbuf;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		mapbuf;
	XLogRedoAction action;

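	/*
	 * The record registers up to seven blocks: 0 is the primary bucket page,
	 * 1 is the page receiving the moved tuples, 2 is the overflow page being
	 * freed, 3 and 4 are the pages before and after it in the bucket chain
	 * (when they need updating), 5 is the bitmap page, and 6 is the metapage
	 * (when its first-free hint changes).
	 */
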
	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation.  This guarantees that no scan can
	 * start, and no scan can already be in progress, while we replay this
	 * operation.  If scans were allowed during replay, they could miss some
	 * records or see the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * We don't care about the return value here; the point of reading
		 * bucketbuf is just to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			OffsetNumber *towrite = (OffsetNumber *) data;

			data += sizeof(OffsetNumber) * xldata->ntups;

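			/*
			 * As in hash_xlog_move_page_contents(): the remainder of the
			 * block data is a stream of MAXALIGN'd index tuples, with
			 * towrite[] supplying the target offset for each one.
			 */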
			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleSize(itup);
				itemsz = MAXALIGN(itemsz);

				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * The number of tuples inserted must match the number requested by
		 * the REDO record.
		 */
		Assert(ninserted == xldata->ntups);

		/*
		 * If the page we are adding tuples to precedes the freed overflow
		 * page, update its hasho_nextblkno as well.
		 */
		if (xldata->is_prev_bucket_same_wrt)
		{
			HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage);

			writeopaque->hasho_nextblkno = xldata->nextblkno;
		}

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for initializing overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO)
	{
		Page		ovflpage;
		HashPageOpaque ovflopaque;

		ovflpage = BufferGetPage(ovflbuf);

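		/*
		 * Completely reinitialize the freed overflow page as an unused page
		 * rather than restoring an image of it, since the page is being
		 * returned to the index's free pool.
		 */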
		_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

		ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

		ovflopaque->hasho_prevblkno = InvalidBlockNumber;
		ovflopaque->hasho_nextblkno = InvalidBlockNumber;
		ovflopaque->hasho_bucket = -1;
		ovflopaque->hasho_flag = LH_UNUSED_PAGE;
		ovflopaque->hasho_page_id = HASHO_PAGE_ID;

		PageSetLSN(ovflpage, lsn);
		MarkBufferDirty(ovflbuf);
	}
	if (BufferIsValid(ovflbuf))
		UnlockReleaseBuffer(ovflbuf);

	/* replay the record for page previous to the freed overflow page */
	if (!xldata->is_prev_bucket_same_wrt &&
		XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO)
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		prevopaque->hasho_nextblkno = xldata->nextblkno;

		PageSetLSN(prevpage, lsn);
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(prevbuf))
		UnlockReleaseBuffer(prevbuf);

	/* replay the record for page next to the freed overflow page */
	if (XLogRecHasBlockRef(record, 4))
	{
		Buffer		nextbuf;

		if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO)
		{
			Page		nextpage = BufferGetPage(nextbuf);
			HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

			nextopaque->hasho_prevblkno = xldata->prevblkno;

			PageSetLSN(nextpage, lsn);
			MarkBufferDirty(nextbuf);
		}
		if (BufferIsValid(nextbuf))
			UnlockReleaseBuffer(nextbuf);
	}

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the primary bucket page and overflow pages.  But
	 * during replay it's not necessary to hold those locks, since no other
	 * index updates can be happening concurrently.
	 */
	/* replay the record for bitmap page */
	if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO)
	{
		Page		mappage = (Page) BufferGetPage(mapbuf);
		uint32	   *freep = NULL;
		char	   *data;
		uint32	   *bitmap_page_bit;
		Size		datalen;

		freep = HashPageGetBitmap(mappage);

		data = XLogRecGetBlockData(record, 5, &datalen);
		bitmap_page_bit = (uint32 *) data;

		CLRBIT(freep, *bitmap_page_bit);

		PageSetLSN(mappage, lsn);
		MarkBufferDirty(mapbuf);
	}
	if (BufferIsValid(mapbuf))
		UnlockReleaseBuffer(mapbuf);

	/* replay the record for meta page */
	if (XLogRecHasBlockRef(record, 6))
	{
		Buffer		metabuf;

		if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO)
		{
			HashMetaPage metap;
			Page		page;
			char	   *data;
			uint32	   *firstfree_ovflpage;
			Size		datalen;

			data = XLogRecGetBlockData(record, 6, &datalen);
			firstfree_ovflpage = (uint32 *) data;

			page = BufferGetPage(metabuf);
			metap = HashPageGetMeta(page);
			metap->hashm_firstfree = *firstfree_ovflpage;

			PageSetLSN(page, lsn);
			MarkBufferDirty(metabuf);
		}
		if (BufferIsValid(metabuf))
			UnlockReleaseBuffer(metabuf);
	}
}

/*
 * replay delete operation of hash index
 */
static void
hash_xlog_delete(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		deletebuf;
	Page		page;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on the primary bucket page before we
	 * start the actual replay operation.  This guarantees that no scan can
	 * start, and no scan can already be in progress, while we replay this
	 * operation.  If scans were allowed during replay, they could miss some
	 * records or see the same record multiple times.
	 */
	if (xldata->is_primary_bucket_page)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
	else
	{
		/*
		 * We don't care about the return value here; the point of reading
		 * bucketbuf is just to take a cleanup lock on the primary bucket
		 * page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &deletebuf);
	}

	/* replay the record for deleting entries in bucket page */
	if (action == BLK_NEEDS_REDO)
	{
		char	   *ptr;
		Size		len;

		ptr = XLogRecGetBlockData(record, 1, &len);

		page = (Page) BufferGetPage(deletebuf);

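		/*
		 * The block data, when present, is an array of OffsetNumbers
		 * identifying the tuples to remove.
		 */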
		if (len > 0)
		{
			OffsetNumber *unused;
			OffsetNumber *unend;

			unused = (OffsetNumber *) ptr;
			unend = (OffsetNumber *) ((char *) ptr + len);

			if ((unend - unused) > 0)
				PageIndexMultiDelete(page, unused, unend - unused);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items only if
		 * clear_dead_marking flag is set to true. See comments in
		 * hashbucketcleanup() for details.
		 */
		if (xldata->clear_dead_marking)
		{
			HashPageOpaque pageopaque;

			pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
			pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
		}

		PageSetLSN(page, lsn);
		MarkBufferDirty(deletebuf);
	}
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}

/*
 * replay split cleanup flag operation for primary bucket page.
 */
static void
hash_xlog_split_cleanup(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buffer;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		HashPageOpaque bucket_opaque;

		page = (Page) BufferGetPage(buffer);

		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

/*
 * replay for update meta page
 */
static void
hash_xlog_update_meta_page(XLogReaderState *record)
{
	HashMetaPage metap;
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record);
	Buffer		metabuf;
	Page		page;

	if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
	{
		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);

		metap->hashm_ntuples = xldata->ntuples;

		PageSetLSN(page, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

/*
 * replay delete operation in hash index to remove
 * tuples marked as DEAD during index tuple insertion.
 */
static void
hash_xlog_vacuum_one_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_vacuum_one_page *xldata;
	Buffer		buffer;
	Buffer		metabuf;
	Page		page;
	XLogRedoAction action;
	HashPageOpaque pageopaque;

	xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record);

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Hash index tuples that are marked LP_DEAD and removed during hash
	 * index tuple insertion can conflict with standby queries.  You might
	 * think that vacuum records would conflict as well, but we've handled
	 * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
	 * cleaned by the vacuum of the heap, so we can resolve any conflicts
	 * just once when that arrives.  After that we know that no conflicts
	 * exist from individual hash index vacuum records on that index.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
		ResolveRecoveryConflictWithSnapshot(xldata->latestRemovedXid, rnode);
	}

	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);

	if (action == BLK_NEEDS_REDO)
	{
		page = (Page) BufferGetPage(buffer);

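		/*
		 * The offsets of the tuples to be removed follow the fixed-size part
		 * of the WAL record.
		 */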
		if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage)
		{
			OffsetNumber *unused;

			unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage);

			PageIndexMultiDelete(page, unused, xldata->ntuples);
		}

		/*
		 * Mark the page as not containing any LP_DEAD items. See comments in
		 * _hash_vacuum_one_page() for details.
		 */
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		metapage;
		HashMetaPage metap;

		metapage = BufferGetPage(metabuf);
		metap = HashPageGetMeta(metapage);

		metap->hashm_ntuples -= xldata->ntuples;

		PageSetLSN(metapage, lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}

void
hash_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_HASH_INIT_META_PAGE:
			hash_xlog_init_meta_page(record);
			break;
		case XLOG_HASH_INIT_BITMAP_PAGE:
			hash_xlog_init_bitmap_page(record);
			break;
		case XLOG_HASH_INSERT:
			hash_xlog_insert(record);
			break;
		case XLOG_HASH_ADD_OVFL_PAGE:
			hash_xlog_add_ovfl_page(record);
			break;
		case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
			hash_xlog_split_allocate_page(record);
			break;
		case XLOG_HASH_SPLIT_PAGE:
			hash_xlog_split_page(record);
			break;
		case XLOG_HASH_SPLIT_COMPLETE:
			hash_xlog_split_complete(record);
			break;
		case XLOG_HASH_MOVE_PAGE_CONTENTS:
			hash_xlog_move_page_contents(record);
			break;
		case XLOG_HASH_SQUEEZE_PAGE:
			hash_xlog_squeeze_page(record);
			break;
		case XLOG_HASH_DELETE:
			hash_xlog_delete(record);
			break;
		case XLOG_HASH_SPLIT_CLEANUP:
			hash_xlog_split_cleanup(record);
			break;
		case XLOG_HASH_UPDATE_META_PAGE:
			hash_xlog_update_meta_page(record);
			break;
		case XLOG_HASH_VACUUM_ONE_PAGE:
			hash_xlog_vacuum_one_page(record);
			break;
		default:
			elog(PANIC, "hash_redo: unknown op code %u", info);
	}
}

/*
 * Mask a hash page before performing consistency checks on it.
 */
void
hash_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	HashPageOpaque opaque;
	int			pagetype;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
	if (pagetype == LH_UNUSED_PAGE)
	{
		/*
		 * Mask everything on an UNUSED page.
		 */
		mask_page_content(page);
	}
	else if (pagetype == LH_BUCKET_PAGE ||
			 pagetype == LH_OVERFLOW_PAGE)
	{
		/*
		 * In hash bucket and overflow pages, it is possible to modify the
		 * LP_FLAGS without emitting any WAL record. Hence, mask the line
		 * pointer flags. See hashgettuple(), _hash_kill_items() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * It is possible that the hint bit LH_PAGE_HAS_DEAD_TUPLES may remain
	 * unlogged. So, mask it. See _hash_kill_items() for details.
	 */
	opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
}