/*
 * brin_pageops.c
 *		Page-handling routines for BRIN indexes
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin_pageops.c
 */
#include "postgres.h"

#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_revmap.h"
#include "access/brin_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/rel.h"

/*
 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page.  We can tolerate
 * a single item per page, unlike other index AMs.
 */
#define BrinMaxItemSize \
	MAXALIGN_DOWN(BLCKSZ - \
				  (MAXALIGN(SizeOfPageHeaderData + \
							sizeof(ItemIdData)) + \
				   MAXALIGN(sizeof(BrinSpecialSpace))))
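
/*
 * For illustration (assuming the default 8 kB BLCKSZ and 8-byte MAXALIGN):
 * the page header plus one line pointer rounds up to 32 bytes and the BRIN
 * special space to 8, leaving MAXALIGN_DOWN(8192 - 40) = 8152 bytes for a
 * single tuple.  The exact figure varies with block size and platform
 * alignment.
 */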

static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
								   bool *extended);
static Size br_page_get_freespace(Page page);
static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);


/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
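 *
 * As a sketch (assuming a helper that re-reads the current summary tuple),
 * a typical caller's retry loop looks like:
 *
 *		do
 *			tup = ...fetch current summary tuple for heapBlk...;
 *		while (!brin_doupdate(idxrel, pagesPerRange, revmap, heapBlk,
 *							  buf, off, tup, sz, newtup, newsz, samepage));
 *
 * brin_evacuate_page() below retries in just this way when moving tuples.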
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/* need a page on which to put the item */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
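
	/*
	 * Whichever branch was taken above, oldbuf is now exclusively locked:
	 * brin_getinsertbuffer locks it (along with the new buffer, when that
	 * is a different page), and the samepage path locked it explicitly.
	 */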
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was. Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage. Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

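		/*
		 * PageIndexTupleDeleteNoCompact (below) marks the old slot unused
		 * without compacting the line pointer array, so the offsets of any
		 * other tuples on the page stay valid and the revmap entries
		 * pointing at them need no adjustment.
		 */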
		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}

/*
 * Return whether brin_doupdate can do a samepage update.
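 *
 * A same-page update is feasible when the new tuple is no larger than the
 * old one, or when the page has enough exact free space to absorb the size
 * difference.  PageGetExactFreeSpace() is the appropriate check here
 * (rather than PageGetFreeSpace()) because PageIndexTupleOverwrite reuses
 * the existing line pointer, so no room for a new ItemIdData is needed.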
 */
bool
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
{
	return
		((newsz <= origsz) ||
		 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
}

/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  The
 * caller must not hold a lock on the buffer on entry, and no buffer lock is
 * held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
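 *
 * *buffer is an in/out argument: on entry it may be InvalidBuffer or a
 * buffer pinned by a previous call; on exit it is a valid, pinned (but
 * unlocked) buffer, which the caller may pass back in for a subsequent
 * insertion and is responsible for releasing eventually.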
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.
	 */
	if (!BufferIsValid(*buffer))
	{
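		/*
		 * Note: when given InvalidBuffer as the old buffer,
		 * brin_getinsertbuffer can return InvalidBuffer only in code paths
		 * that require a valid old buffer, so this loop is expected to
		 * succeed on the first iteration; the loop is mere defensiveness.
		 */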
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, (char *) tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	if (extended)
	{
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}

/*
 * Initialize a page with the given type.
 *
 * Caller is responsible for marking it dirty, as appropriate.
 */
void
brin_page_init(Page page, uint16 type)
{
	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));

	BrinPageType(page) = type;
}

/*
 * Initialize a new BRIN index's metapage.
 */
void
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
{
	BrinMetaPageData *metadata;

	brin_page_init(page, BRIN_PAGETYPE_META);

	metadata = (BrinMetaPageData *) PageGetContents(page);

	metadata->brinMagic = BRIN_META_MAGIC;
	metadata->brinVersion = version;
	metadata->pagesPerRange = pagesPerRange;

	/*
	 * Note we cheat here a little.  0 is not a valid revmap block number
	 * (because it's the metapage buffer), but doing this enables the first
	 * revmap page to be created when the index is.
	 */
	metadata->lastRevmapPage = 0;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
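	 *
	 * (A full-page image omits the "hole" between pd_lower and pd_upper,
	 * so anything stored past pd_lower in that region would come back as
	 * zeroes on replay.)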
	 */
	((PageHeader) page)->pd_lower =
		((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
}

/*
 * Initiate page evacuation protocol.
 *
 * The page must be locked in exclusive mode by the caller.
 *
 * If the page is not yet initialized, or is empty, return false without
 * doing anything; it can be used for the revmap without any further changes.
 * If it contains tuples, mark it for evacuation and return true.
 */
bool
brin_start_evacuating_page(Relation idxRel, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;

	page = BufferGetPage(buf);

	if (PageIsNew(page))
		return false;

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		lp;

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/* prevent other backends from adding more stuff to this page */
			BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
			MarkBufferDirtyHint(buf, true);
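
			/*
			 * Setting the flag is not WAL-logged; it is only a hint to keep
			 * insertions away from this page, which is why
			 * MarkBufferDirtyHint suffices.  If the flag is lost in a
			 * crash, the revmap extension needing this page starts over.
			 */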

			return true;
		}
	}
	return false;
}

/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page. The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

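			/*
			 * brin_doupdate requires the old buffer to be unlocked on
			 * entry, so drop our lock; the pin we hold keeps the buffer
			 * alive, and brin_doupdate revalidates the tuple under lock.
			 */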
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}

/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page until after it's been initialized,
	 * we're sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}

/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens, since the caller certainly can't do it.
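 *
 * To avoid deadlock against concurrent updaters, the two buffers are always
 * locked in ascending block-number order: if the old block precedes the new
 * one, we lock it first; otherwise we lock the new page first and take the
 * old page's lock only after the new page's free space has been verified.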
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * then we need to check that it hasn't been turned into a revmap page
		 * concurrently.  If we detect that that happened, give up and tell
		 * caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}

/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to let the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * While we record the page in FSM here, caller is responsible for doing FSM
 * upper-page update if that seems appropriate.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}


/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
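 *
 * Reporting zero free space for evacuated pages means that neither the FSM
 * nor direct free-space checks will select them as insertion targets while
 * their tuples are being moved out.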
 */
static Size
br_page_get_freespace(Page page)
{
	if (!BRIN_IS_REGULAR_PAGE(page) ||
		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
		return 0;
	else
		return PageGetFreeSpace(page);
}