1 /*
2 * brin_pageops.c
3 * Page-handling routines for BRIN indexes
4 *
5 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/access/brin/brin_pageops.c
10 */
11 #include "postgres.h"
12
13 #include "access/brin_pageops.h"
14 #include "access/brin_page.h"
15 #include "access/brin_revmap.h"
16 #include "access/brin_xlog.h"
17 #include "access/xloginsert.h"
18 #include "miscadmin.h"
19 #include "storage/bufmgr.h"
20 #include "storage/freespace.h"
21 #include "storage/lmgr.h"
22 #include "storage/smgr.h"
23 #include "utils/rel.h"
24
25
26 /*
27 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
28 * a single item per page, unlike other index AMs.
29 */
30 #define BrinMaxItemSize \
31 MAXALIGN_DOWN(BLCKSZ - \
32 (MAXALIGN(SizeOfPageHeaderData + \
33 sizeof(ItemIdData)) + \
34 MAXALIGN(sizeof(BrinSpecialSpace))))
35
36 static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
37 bool *extended);
38 static Size br_page_get_freespace(Page page);
39 static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
40
41
/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	bool		extended;

	/* caller is expected to pass a MAXALIGN'ed tuple size */
	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/*
		 * Need a page on which to put the item.  brin_getinsertbuffer
		 * returns the new buffer locked, and locks oldbuf too (in
		 * deadlock-free order); it may have extended the relation, which
		 * "extended" reports.
		 */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
	}
	else
	{
		/* samepage path: only the old buffer needs to be locked */
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuum(idxrel);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		/* same cleanup obligations as the previous bail-out path */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuum(idxrel);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		/*
		 * We don't need the new page after all, but if it was freshly
		 * extended we must still initialize, WAL-log, and record it.
		 */
		if (BufferIsValid(newbuf))
		{
			/* as above */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
		}

		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, (char *) newtup, newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (extended)
			FreeSpaceMapVacuum(idxrel);

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was.  Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage.  Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		BlockNumber newblk = InvalidBlockNumber;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);

		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) newtup, newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
		{
			newblk = BufferGetBlockNumber(newbuf);
			freespace = br_page_get_freespace(newpage);
		}

		ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, (char *) newtup, newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			Assert(BlockNumberIsValid(newblk));
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuum(idxrel);
		}

		return true;
	}
}
322
323 /*
324 * Return whether brin_doupdate can do a samepage update.
325 */
326 bool
brin_can_do_samepage_update(Buffer buffer,Size origsz,Size newsz)327 brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
328 {
329 return
330 ((newsz <= origsz) ||
331 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
332 }
333
/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	/* caller is expected to pass a MAXALIGN'ed tuple size */
	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber;		/* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.  (br_page_get_freespace returns 0 for
		 * non-regular pages, so such a page is rejected here.)
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.  With InvalidBuffer as oldbuf it is not expected to
	 * fail, but loop defensively in case it returns InvalidBuffer anyway.
	 */
	if (!BufferIsValid(*buffer))
	{
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "could not insert new index tuple to page");
	MarkBufferDirty(*buffer);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, (char *) tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	/* If we extended the relation, make the new page visible to the FSM */
	if (extended)
		FreeSpaceMapVacuum(idxrel);

	return off;
}
464
465 /*
466 * Initialize a page with the given type.
467 *
468 * Caller is responsible for marking it dirty, as appropriate.
469 */
470 void
brin_page_init(Page page,uint16 type)471 brin_page_init(Page page, uint16 type)
472 {
473 PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
474
475 BrinPageType(page) = type;
476 }
477
478 /*
479 * Initialize a new BRIN index' metapage.
480 */
481 void
brin_metapage_init(Page page,BlockNumber pagesPerRange,uint16 version)482 brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
483 {
484 BrinMetaPageData *metadata;
485
486 brin_page_init(page, BRIN_PAGETYPE_META);
487
488 metadata = (BrinMetaPageData *) PageGetContents(page);
489
490 metadata->brinMagic = BRIN_META_MAGIC;
491 metadata->brinVersion = version;
492 metadata->pagesPerRange = pagesPerRange;
493
494 /*
495 * Note we cheat here a little. 0 is not a valid revmap block number
496 * (because it's the metapage buffer), but doing this enables the first
497 * revmap page to be created when the index is.
498 */
499 metadata->lastRevmapPage = 0;
500 }
501
502 /*
503 * Initiate page evacuation protocol.
504 *
505 * The page must be locked in exclusive mode by the caller.
506 *
507 * If the page is not yet initialized or empty, return false without doing
508 * anything; it can be used for revmap without any further changes. If it
509 * contains tuples, mark it for evacuation and return true.
510 */
511 bool
brin_start_evacuating_page(Relation idxRel,Buffer buf)512 brin_start_evacuating_page(Relation idxRel, Buffer buf)
513 {
514 OffsetNumber off;
515 OffsetNumber maxoff;
516 Page page;
517
518 page = BufferGetPage(buf);
519
520 if (PageIsNew(page))
521 return false;
522
523 maxoff = PageGetMaxOffsetNumber(page);
524 for (off = FirstOffsetNumber; off <= maxoff; off++)
525 {
526 ItemId lp;
527
528 lp = PageGetItemId(page, off);
529 if (ItemIdIsUsed(lp))
530 {
531 /* prevent other backends from adding more stuff to this page */
532 BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
533 MarkBufferDirtyHint(buf, true);
534
535 return true;
536 }
537 }
538 return false;
539 }
540
/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page.  The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	/* caller must have marked the page for evacuation already */
	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/*
			 * Copy the tuple into our private buffer (btup is reused and
			 * grown as needed across iterations), because we must release
			 * the page lock before calling brin_doupdate, and the page
			 * contents could change once it's unlocked.
			 */
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * "Update" the tuple to an identical copy placed elsewhere
			 * (samepage=false); if a concurrent change made the update
			 * fail, decrement off so this offset is examined again.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
592
/*
 * Given a BRIN index page, initialize it if necessary, and record it into the
 * FSM if necessary.  Return value is true if the FSM itself needs "vacuuming".
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 */
bool
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);
	Size		freespace;

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page and after its been initialized, we're
	 * sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		/* wait out any in-progress extension, then release immediately */
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		/* re-check under the buffer lock: the extender may have won */
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return true;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return false;

	/* Measure free space and record it, if more than currently recorded */
	freespace = br_page_get_freespace(page);
	if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf)))
	{
		RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace);
		return true;
	}

	return false;
}
646
/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks.)
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.
 *
 * Note that in some corner cases it is possible for this routine to extend the
 * relation and then not return the buffer.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens.  Such a buffer may later be reused by this routine.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	*extended = false;

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither buffer is locked and buf is not a
	 * pinned buffer.
	 *
	 * Try the relation's cached target block first; fall back to asking the
	 * FSM for a page with enough space.
	 */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.  (The extension lock is unneeded for a backend-local
			 * relation, where no one else can extend concurrently.)
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * before we do, we need to check that it hasn't been turned into a
		 * revmap page concurrently; if we detect that it happened, give up
		 * and tell caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
				{
					brin_initialize_empty_new_buffer(irel, buf);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);
				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * Note that we allow for the FSM to be out of date here, and in that
		 * case we update it and move on.
		 *
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));

			/*
			 * Since the target block specification can get lost on cache
			 * invalidations, make sure we update the more permanent FSM with
			 * data about it before going away.
			 */
			if (*extended)
				RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
										freespace);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;		/* keep compiler quiet */
		}

		/* release the locks in reverse of the order acquired above */
		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/* update the FSM with this page's true free space and get another */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}
843
/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to leave the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * The buffer must be exclusively locked by the caller.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	/* init, dirty and WAL-log the page atomically w.r.t. checkpoints */
	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}
879
880
881 /*
882 * Return the amount of free space on a regular BRIN index page.
883 *
884 * If the page is not a regular page, or has been marked with the
885 * BRIN_EVACUATE_PAGE flag, returns 0.
886 */
887 static Size
br_page_get_freespace(Page page)888 br_page_get_freespace(Page page)
889 {
890 if (!BRIN_IS_REGULAR_PAGE(page) ||
891 (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
892 return 0;
893 else
894 return PageGetFreeSpace(page);
895 }
896