1 /*
2 * brin_pageops.c
3 * Page-handling routines for BRIN indexes
4 *
5 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/access/brin/brin_pageops.c
10 */
11 #include "postgres.h"
12
13 #include "access/brin_page.h"
14 #include "access/brin_pageops.h"
15 #include "access/brin_revmap.h"
16 #include "access/brin_xlog.h"
17 #include "access/xloginsert.h"
18 #include "miscadmin.h"
19 #include "storage/bufmgr.h"
20 #include "storage/freespace.h"
21 #include "storage/lmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/rel.h"
24
25 /*
26 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
27 * a single item per page, unlike other index AMs.
28 */
29 #define BrinMaxItemSize \
30 MAXALIGN_DOWN(BLCKSZ - \
31 (MAXALIGN(SizeOfPageHeaderData + \
32 sizeof(ItemIdData)) + \
Skini()33 MAXALIGN(sizeof(BrinSpecialSpace))))
34
35 static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
36 bool *extended);
~Skini()37 static Size br_page_get_freespace(Page page);
38 static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
39
40
/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/*
		 * Need a page on which to put the item.  On success,
		 * brin_getinsertbuffer returns with both the new buffer and oldbuf
		 * exclusively locked (in deadlock-free order).
		 */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was.  Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage.  Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}
319
320 /*
321 * Return whether brin_doupdate can do a samepage update.
322 */
323 bool
324 brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
325 {
326 return
327 ((newsz <= origsz) ||
328 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
329 }
330
/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.  With InvalidBuffer as "oldbuf" it cannot fail in
	 * the revmap-concurrency way, but loop anyway for safety.
	 */
	if (!BufferIsValid(*buffer))
	{
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/*
	 * Now obtain lock on revmap buffer.  Note lock ordering: the regular
	 * page is always locked before the revmap page.
	 */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, (char *) tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	if (extended)
	{
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}
469
470 /*
471 * Initialize a page with the given type.
472 *
473 * Caller is responsible for marking it dirty, as appropriate.
474 */
475 void
476 brin_page_init(Page page, uint16 type)
477 {
478 PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
479
480 BrinPageType(page) = type;
481 }
482
483 /*
484 * Initialize a new BRIN index's metapage.
485 */
486 void
487 brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
488 {
489 BrinMetaPageData *metadata;
490
491 brin_page_init(page, BRIN_PAGETYPE_META);
492
493 metadata = (BrinMetaPageData *) PageGetContents(page);
494
495 metadata->brinMagic = BRIN_META_MAGIC;
496 metadata->brinVersion = version;
497 metadata->pagesPerRange = pagesPerRange;
498
499 /*
500 * Note we cheat here a little. 0 is not a valid revmap block number
501 * (because it's the metapage buffer), but doing this enables the first
502 * revmap page to be created when the index is.
503 */
504 metadata->lastRevmapPage = 0;
505
506 /*
507 * Set pd_lower just past the end of the metadata. This is essential,
508 * because without doing so, metadata will be lost if xlog.c compresses
509 * the page.
510 */
511 ((PageHeader) page)->pd_lower =
512 ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
513 }
514
515 /*
516 * Initiate page evacuation protocol.
517 *
518 * The page must be locked in exclusive mode by the caller.
519 *
520 * If the page is not yet initialized or empty, return false without doing
521 * anything; it can be used for revmap without any further changes. If it
522 * contains tuples, mark it for evacuation and return true.
523 */
524 bool
525 brin_start_evacuating_page(Relation idxRel, Buffer buf)
526 {
527 OffsetNumber off;
528 OffsetNumber maxoff;
529 Page page;
530
531 page = BufferGetPage(buf);
532
533 if (PageIsNew(page))
534 return false;
535
536 maxoff = PageGetMaxOffsetNumber(page);
537 for (off = FirstOffsetNumber; off <= maxoff; off++)
538 {
539 ItemId lp;
540
541 lp = PageGetItemId(page, off);
542 if (ItemIdIsUsed(lp))
543 {
544 /* prevent other backends from adding more stuff to this page */
545 BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
546 MarkBufferDirtyHint(buf, true);
547
548 return true;
549 }
550 }
551 return false;
552 }
553
/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page.  The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/*
			 * Copy the tuple to local memory first: we must release the
			 * buffer lock before calling brin_doupdate, so the page
			 * contents could change under us.
			 */
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * Re-"update" the tuple to an identical copy, forcing it onto
			 * some other page (samepage = false).  On failure (concurrent
			 * change), back up one offset so the next iteration retries
			 * this same item.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
605
/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page and after its been initialized, we're
	 * sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		/* Wait out any in-progress extension, then release immediately */
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		/* Re-check under buffer lock; extender may have initialized it */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}
658
/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens, since the caller certainly can't do it.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * then we need to check that it hasn't been turned into a revmap page
		 * concurrently.  If we detect that that happened, give up and tell
		 * caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		/* Drop locks/pins before retrying with a different target page */
		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}
860
/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to leave the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * While we record the page in FSM here, caller is responsible for doing FSM
 * upper-page update if that seems appropriate.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	/* Init the page and WAL-log it as a full-page image, atomically */
	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}
899
900
901 /*
902 * Return the amount of free space on a regular BRIN index page.
903 *
904 * If the page is not a regular page, or has been marked with the
905 * BRIN_EVACUATE_PAGE flag, returns 0.
906 */
907 static Size
908 br_page_get_freespace(Page page)
909 {
910 if (!BRIN_IS_REGULAR_PAGE(page) ||
911 (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
912 return 0;
913 else
914 return PageGetFreeSpace(page);
915 }
916