1 /*-------------------------------------------------------------------------
2 *
3 * freespace.c
4 * POSTGRES free space map for quickly finding free space in relations
5 *
6 *
7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/storage/freespace/freespace.c
12 *
13 *
14 * NOTES:
15 *
16 * Free Space Map keeps track of the amount of free space on pages, and
17 * allows quickly searching for a page with enough free space. The FSM is
18 * stored in a dedicated relation fork of all heap relations, and those
19 * index access methods that need it (see also indexfsm.c). See README for
20 * more information.
21 *
22 *-------------------------------------------------------------------------
23 */
24 #include "postgres.h"
25
26 #include "access/htup_details.h"
27 #include "access/xlogutils.h"
28 #include "miscadmin.h"
29 #include "storage/freespace.h"
30 #include "storage/fsm_internals.h"
31 #include "storage/lmgr.h"
32 #include "storage/smgr.h"
33
34
35 /*
36 * We use just one byte to store the amount of free space on a page, so we
37 * divide the amount of free space a page can have into 256 different
38 * categories. The highest category, 255, represents a page with at least
39 * MaxFSMRequestSize bytes of free space, and the second highest category
40 * represents the range from 254 * FSM_CAT_STEP, inclusive, to
41 * MaxFSMRequestSize, exclusive.
42 *
43 * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
44 * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
45 * categories look like this:
46 *
47 *
48 * Range Category
49 * 0 - 31 0
50 * 32 - 63 1
51 * ... ... ...
52 * 8096 - 8127 253
53 * 8128 - 8163 254
54 * 8164 - 8192 255
55 *
56 * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
57 * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
58 * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
59 * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
60 * completely empty page, that would mean that we could never satisfy a
61 * request of exactly MaxFSMRequestSize bytes.
62 */
63 #define FSM_CATEGORIES 256
64 #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
65 #define MaxFSMRequestSize MaxHeapTupleSize
66
67 /*
68 * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
69 * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
70 * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
71 * this means that 4096 bytes is the smallest BLCKSZ that we can get away
72 * with a 3-level tree, and 512 is the smallest we support.
73 */
74 #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
75
76 #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
77 #define FSM_BOTTOM_LEVEL 0
78
/*
 * The internal FSM routines work on a logical addressing scheme. Each
 * level of the tree can be thought of as a separately addressable file.
 * fsm_logical_to_physical() maps a logical address to the physical block
 * number within the FSM fork.
 */
typedef struct
{
	int			level;			/* level, 0 (FSM_BOTTOM_LEVEL) = leaf level */
	int			logpageno;		/* page number within the level */
} FSMAddress;

/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
91
92 /* functions to navigate the tree */
93 static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
94 static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
95 static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
96 static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
97 static BlockNumber fsm_logical_to_physical(FSMAddress addr);
98
99 static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
100 static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
101
102 /* functions to convert amount of free space to a FSM category */
103 static uint8 fsm_space_avail_to_cat(Size avail);
104 static uint8 fsm_space_needed_to_cat(Size needed);
105 static Size fsm_space_cat_to_avail(uint8 cat);
106
107 /* workhorse functions for various operations */
108 static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
109 uint8 newValue, uint8 minValue);
110 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
111 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
112 BlockNumber start, BlockNumber end,
113 bool *eof);
114
115
116 /******** Public API ********/
117
118 /*
119 * GetPageWithFreeSpace - try to find a page in the given relation with
120 * at least the specified amount of free space.
121 *
122 * If successful, return the block number; if not, return InvalidBlockNumber.
123 *
124 * The caller must be prepared for the possibility that the returned page
125 * will turn out to have too little space available by the time the caller
126 * gets a lock on it. In that case, the caller should report the actual
127 * amount of free space available on that page and then try again (see
128 * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
129 * extend the relation.
130 */
131 BlockNumber
GetPageWithFreeSpace(Relation rel,Size spaceNeeded)132 GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
133 {
134 uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
135
136 return fsm_search(rel, min_cat);
137 }
138
139 /*
140 * RecordAndGetPageWithFreeSpace - update info about a page and try again.
141 *
142 * We provide this combo form to save some locking overhead, compared to
143 * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
144 * also some effort to return a page close to the old page; if there's a
145 * page with enough free space on the same FSM page where the old one page
146 * is located, it is preferred.
147 */
148 BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel,BlockNumber oldPage,Size oldSpaceAvail,Size spaceNeeded)149 RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
150 Size oldSpaceAvail, Size spaceNeeded)
151 {
152 int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
153 int search_cat = fsm_space_needed_to_cat(spaceNeeded);
154 FSMAddress addr;
155 uint16 slot;
156 int search_slot;
157
158 /* Get the location of the FSM byte representing the heap block */
159 addr = fsm_get_location(oldPage, &slot);
160
161 search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
162
163 /*
164 * If fsm_set_and_search found a suitable new block, return that.
165 * Otherwise, search as usual.
166 */
167 if (search_slot != -1)
168 return fsm_get_heap_blk(addr, search_slot);
169 else
170 return fsm_search(rel, search_cat);
171 }
172
173 /*
174 * RecordPageWithFreeSpace - update info about a page.
175 *
176 * Note that if the new spaceAvail value is higher than the old value stored
177 * in the FSM, the space might not become visible to searchers until the next
178 * FreeSpaceMapVacuum call, which updates the upper level pages.
179 */
180 void
RecordPageWithFreeSpace(Relation rel,BlockNumber heapBlk,Size spaceAvail)181 RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
182 {
183 int new_cat = fsm_space_avail_to_cat(spaceAvail);
184 FSMAddress addr;
185 uint16 slot;
186
187 /* Get the location of the FSM byte representing the heap block */
188 addr = fsm_get_location(heapBlk, &slot);
189
190 fsm_set_and_search(rel, addr, slot, new_cat, 0);
191 }
192
/*
 * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
 *		WAL replay
 *
 * Works directly with a RelFileNode rather than a Relation, since this is
 * called from WAL replay code.
 */
void
XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
							Size spaceAvail)
{
	int			new_cat = fsm_space_avail_to_cat(spaceAvail);
	FSMAddress	addr;
	uint16		slot;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;

	/* Get the location of the FSM byte representing the heap block */
	addr = fsm_get_location(heapBlk, &slot);
	blkno = fsm_logical_to_physical(addr);

	/* If the page doesn't exist already, extend */
	buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* A zeroed (new, or read as corrupt) page must be initialized first */
	page = BufferGetPage(buf);
	if (PageIsNew(page))
		PageInit(page, BLCKSZ, 0);

	/* Dirty the buffer only if the stored category actually changed */
	if (fsm_set_avail(page, slot, new_cat))
		MarkBufferDirtyHint(buf, false);
	UnlockReleaseBuffer(buf);
}
224
225 /*
226 * GetRecordedFreeSpace - return the amount of free space on a particular page,
227 * according to the FSM.
228 */
229 Size
GetRecordedFreeSpace(Relation rel,BlockNumber heapBlk)230 GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
231 {
232 FSMAddress addr;
233 uint16 slot;
234 Buffer buf;
235 uint8 cat;
236
237 /* Get the location of the FSM byte representing the heap block */
238 addr = fsm_get_location(heapBlk, &slot);
239
240 buf = fsm_readbuf(rel, addr, false);
241 if (!BufferIsValid(buf))
242 return 0;
243 cat = fsm_get_avail(BufferGetPage(buf), slot);
244 ReleaseBuffer(buf);
245
246 return fsm_space_cat_to_avail(cat);
247 }
248
/*
 * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
 *
 * nblocks is the new size of the heap.
 *
 * Return the number of blocks of new FSM.
 * If it's InvalidBlockNumber, there is nothing to truncate;
 * otherwise the caller is responsible for calling smgrtruncate()
 * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
 * to update upper-level pages in the FSM.
 */
BlockNumber
FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return InvalidBlockNumber;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as the
	 * first slot on the FSM page that first_removed_address points to, we can
	 * just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return InvalidBlockNumber;	/* nothing to do; the FSM was already
										 * smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		/* NO EREPORT(ERROR) from here till changes are logged */
		START_CRIT_SECTION();

		/* Clear the slots for all removed heap blocks on this FSM page */
		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);

		/*
		 * Truncation of a relation is WAL-logged at a higher-level, and we
		 * will be called at WAL replay. But if checksums are enabled, we need
		 * to still write a WAL record to protect against a torn page, if the
		 * page is flushed to disk before the truncation WAL record. We cannot
		 * use MarkBufferDirtyHint here, because that will not dirty the page
		 * during recovery.
		 */
		MarkBufferDirty(buf);
		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
			log_newpage_buffer(buf, false);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(buf);

		/* The page we just cleaned is kept, so it counts toward the new size */
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
			return InvalidBlockNumber;	/* nothing to do; the FSM was already
										 * smaller */
	}

	return new_nfsmblocks;
}
327
328 /*
329 * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
330 *
331 * We assume that the bottom-level pages have already been updated with
332 * new free-space information.
333 */
334 void
FreeSpaceMapVacuum(Relation rel)335 FreeSpaceMapVacuum(Relation rel)
336 {
337 bool dummy;
338
339 /* Recursively scan the tree, starting at the root */
340 (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
341 (BlockNumber) 0, InvalidBlockNumber,
342 &dummy);
343 }
344
345 /*
346 * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
347 *
348 * As above, but assume that only heap pages between start and end-1 inclusive
349 * have new free-space information, so update only the upper-level slots
350 * covering that block range. end == InvalidBlockNumber is equivalent to
351 * "all the rest of the relation".
352 */
353 void
FreeSpaceMapVacuumRange(Relation rel,BlockNumber start,BlockNumber end)354 FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
355 {
356 bool dummy;
357
358 /* Recursively scan the tree, starting at the root */
359 if (end > start)
360 (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
361 }
362
363 /******** Internal routines ********/
364
365 /*
366 * Return category corresponding x bytes of free space
367 */
368 static uint8
fsm_space_avail_to_cat(Size avail)369 fsm_space_avail_to_cat(Size avail)
370 {
371 int cat;
372
373 Assert(avail < BLCKSZ);
374
375 if (avail >= MaxFSMRequestSize)
376 return 255;
377
378 cat = avail / FSM_CAT_STEP;
379
380 /*
381 * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
382 * more.
383 */
384 if (cat > 254)
385 cat = 254;
386
387 return (uint8) cat;
388 }
389
390 /*
391 * Return the lower bound of the range of free space represented by given
392 * category.
393 */
394 static Size
fsm_space_cat_to_avail(uint8 cat)395 fsm_space_cat_to_avail(uint8 cat)
396 {
397 /* The highest category represents exactly MaxFSMRequestSize bytes. */
398 if (cat == 255)
399 return MaxFSMRequestSize;
400 else
401 return cat * FSM_CAT_STEP;
402 }
403
404 /*
405 * Which category does a page need to have, to accommodate x bytes of data?
406 * While fsm_space_avail_to_cat() rounds down, this needs to round up.
407 */
408 static uint8
fsm_space_needed_to_cat(Size needed)409 fsm_space_needed_to_cat(Size needed)
410 {
411 int cat;
412
413 /* Can't ask for more space than the highest category represents */
414 if (needed > MaxFSMRequestSize)
415 elog(ERROR, "invalid FSM request size %zu", needed);
416
417 if (needed == 0)
418 return 1;
419
420 cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
421
422 if (cat > 255)
423 cat = 255;
424
425 return (uint8) cat;
426 }
427
/*
 * Returns the physical block number of a FSM page
 *
 * The tree is stored in the FSM fork in depth-first order, so the physical
 * location of a page is derived from how many pages precede it in that
 * ordering.
 */
static BlockNumber
fsm_logical_to_physical(FSMAddress addr)
{
	BlockNumber pages;
	int			leafno;
	int			l;

	/*
	 * Calculate the logical page number of the first leaf page below the
	 * given page.
	 */
	leafno = addr.logpageno;
	for (l = 0; l < addr.level; l++)
		leafno *= SlotsPerFSMPage;

	/* Count upper level nodes required to address the leaf page */
	pages = 0;
	for (l = 0; l < FSM_TREE_DEPTH; l++)
	{
		pages += leafno + 1;
		leafno /= SlotsPerFSMPage;
	}

	/*
	 * If the page we were asked for wasn't at the bottom level, subtract the
	 * additional lower level pages we counted above.
	 */
	pages -= addr.level;

	/* Turn the page count into 0-based block number */
	return pages - 1;
}
463
464 /*
465 * Return the FSM location corresponding to given heap block.
466 */
467 static FSMAddress
fsm_get_location(BlockNumber heapblk,uint16 * slot)468 fsm_get_location(BlockNumber heapblk, uint16 *slot)
469 {
470 FSMAddress addr;
471
472 addr.level = FSM_BOTTOM_LEVEL;
473 addr.logpageno = heapblk / SlotsPerFSMPage;
474 *slot = heapblk % SlotsPerFSMPage;
475
476 return addr;
477 }
478
479 /*
480 * Return the heap block number corresponding to given location in the FSM.
481 */
482 static BlockNumber
fsm_get_heap_blk(FSMAddress addr,uint16 slot)483 fsm_get_heap_blk(FSMAddress addr, uint16 slot)
484 {
485 Assert(addr.level == FSM_BOTTOM_LEVEL);
486 return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
487 }
488
489 /*
490 * Given a logical address of a child page, get the logical page number of
491 * the parent, and the slot within the parent corresponding to the child.
492 */
493 static FSMAddress
fsm_get_parent(FSMAddress child,uint16 * slot)494 fsm_get_parent(FSMAddress child, uint16 *slot)
495 {
496 FSMAddress parent;
497
498 Assert(child.level < FSM_ROOT_LEVEL);
499
500 parent.level = child.level + 1;
501 parent.logpageno = child.logpageno / SlotsPerFSMPage;
502 *slot = child.logpageno % SlotsPerFSMPage;
503
504 return parent;
505 }
506
507 /*
508 * Given a logical address of a parent page and a slot number, get the
509 * logical address of the corresponding child page.
510 */
511 static FSMAddress
fsm_get_child(FSMAddress parent,uint16 slot)512 fsm_get_child(FSMAddress parent, uint16 slot)
513 {
514 FSMAddress child;
515
516 Assert(parent.level > FSM_BOTTOM_LEVEL);
517
518 child.level = parent.level - 1;
519 child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
520
521 return child;
522 }
523
/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 *
 * The returned buffer is pinned but not locked; callers that need a stable
 * view of the page must lock it themselves (see comment below on why an
 * unlocked read is nevertheless safe for current callers).
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first. Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale. (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
	{
		/* Invalidate the cache so smgrnblocks asks the kernel. */
		rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
			smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
		else
			rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
	{
		if (extend)
			fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
	 * information is not accurate anyway, so it's better to clear corrupt
	 * pages than error out. Since the FSM changes are not WAL-logged, the
	 * so-called torn page problem on crash can lead to pages with corrupt
	 * headers, for example.
	 *
	 * The initialize-the-page part is trickier than it looks, because of the
	 * possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK. We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it. Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while someone
	 * else is still initializing the page (i.e., we might see pd_upper as set
	 * but other page header fields are still zeroes). This is harmless for
	 * callers that will take a buffer lock themselves, but some callers
	 * inspect the page without any lock at all. The latter is OK only so
	 * long as it doesn't depend on the page header having correct contents.
	 * Current usage is safe because PageGetContents() does not require that.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
	{
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}
595
/*
 * Ensure that the FSM fork is at least fsm_nblocks long, extending
 * it if necessary with empty pages. And by empty, I mean pages filled
 * with zeros, meaning there's no free space.
 */
static void
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
{
	BlockNumber fsm_nblocks_now;
	PGAlignedBlock pg;

	/* All new pages share this one initialized-but-empty image */
	PageInit((Page) pg.data, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the FSM at the same time. It also locks out extension of the
	 * main fork, unnecessarily, but extending the FSM happens seldom enough
	 * that it doesn't seem worthwhile to have a separate lock tag type for
	 * it.
	 *
	 * Note that another backend might have extended or created the relation
	 * by the time we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Might have to re-open if a cache flush happened */
	RelationOpenSmgr(rel);

	/*
	 * Create the FSM file first if it doesn't exist. If
	 * smgr_cached_nblocks[FSM_FORKNUM] is positive then it must exist, no
	 * need for an smgrexists call.
	 */
	if ((rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == 0 ||
		 rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);

	/* Invalidate cache so that smgrnblocks() asks the kernel. */
	rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);

	/* Add pages one at a time until the fork is long enough */
	while (fsm_nblocks_now < fsm_nblocks)
	{
		PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now);

		smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
				   pg.data, false);
		fsm_nblocks_now++;
	}

	UnlockRelationForExtension(rel, ExclusiveLock);
}
649
650 /*
651 * Set value in given FSM page and slot.
652 *
653 * If minValue > 0, the updated page is also searched for a page with at
654 * least minValue of free space. If one is found, its slot number is
655 * returned, -1 otherwise.
656 */
657 static int
fsm_set_and_search(Relation rel,FSMAddress addr,uint16 slot,uint8 newValue,uint8 minValue)658 fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
659 uint8 newValue, uint8 minValue)
660 {
661 Buffer buf;
662 Page page;
663 int newslot = -1;
664
665 buf = fsm_readbuf(rel, addr, true);
666 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
667
668 page = BufferGetPage(buf);
669
670 if (fsm_set_avail(page, slot, newValue))
671 MarkBufferDirtyHint(buf, false);
672
673 if (minValue != 0)
674 {
675 /* Search while we still hold the lock */
676 newslot = fsm_search_avail(buf, minValue,
677 addr.level == FSM_BOTTOM_LEVEL,
678 true);
679 }
680
681 UnlockReleaseBuffer(buf);
682
683 return newslot;
684 }
685
/*
 * Search the tree for a heap page with at least min_cat of free space
 *
 * Returns the heap block number, or InvalidBlockNumber if no suitable page
 * was found (including the case where the FSM fork doesn't exist at all).
 */
static BlockNumber
fsm_search(Relation rel, uint8 min_cat)
{
	int			restarts = 0;
	FSMAddress	addr = FSM_ROOT_ADDRESS;

	for (;;)
	{
		int			slot;
		Buffer		buf;
		uint8		max_avail = 0;

		/* Read the FSM page. */
		buf = fsm_readbuf(rel, addr, false);

		/* Search within the page */
		if (BufferIsValid(buf))
		{
			LockBuffer(buf, BUFFER_LOCK_SHARE);
			slot = fsm_search_avail(buf, min_cat,
									(addr.level == FSM_BOTTOM_LEVEL),
									false);
			/* Remember the page's max, in case we must fix the parent below */
			if (slot == -1)
				max_avail = fsm_get_max_avail(BufferGetPage(buf));
			UnlockReleaseBuffer(buf);
		}
		else
			slot = -1;

		if (slot != -1)
		{
			/*
			 * Descend the tree, or return the found block if we're at the
			 * bottom.
			 */
			if (addr.level == FSM_BOTTOM_LEVEL)
				return fsm_get_heap_blk(addr, slot);

			addr = fsm_get_child(addr, slot);
		}
		else if (addr.level == FSM_ROOT_LEVEL)
		{
			/*
			 * At the root, failure means there's no page with enough free
			 * space in the FSM. Give up.
			 */
			return InvalidBlockNumber;
		}
		else
		{
			uint16		parentslot;
			FSMAddress	parent;

			/*
			 * At lower level, failure can happen if the value in the upper-
			 * level node didn't reflect the value on the lower page. Update
			 * the upper node, to avoid falling into the same trap again, and
			 * start over.
			 *
			 * There's a race condition here, if another backend updates this
			 * page right after we release it, and gets the lock on the parent
			 * page before us. We'll then update the parent page with the now
			 * stale information we had. It's OK, because it should happen
			 * rarely, and will be fixed by the next vacuum.
			 */
			parent = fsm_get_parent(addr, &parentslot);
			fsm_set_and_search(rel, parent, parentslot, max_avail, 0);

			/*
			 * If the upper pages are badly out of date, we might need to loop
			 * quite a few times, updating them as we go. Any inconsistencies
			 * should eventually be corrected and the loop should end. Looping
			 * indefinitely is nevertheless scary, so provide an emergency
			 * valve.
			 */
			if (restarts++ > 10000)
				return InvalidBlockNumber;

			/* Start search all over from the root */
			addr = FSM_ROOT_ADDRESS;
		}
	}
}
772
773
/*
 * Recursive guts of FreeSpaceMapVacuum
 *
 * Examine the FSM page indicated by addr, as well as its children, updating
 * upper-level nodes that cover the heap block range from start to end-1.
 * (It's okay if end is beyond the actual end of the map.)
 * Return the maximum freespace value on this page.
 *
 * If addr is past the end of the FSM, set *eof_p to true and return 0.
 *
 * This traverses the tree in depth-first order. The tree is stored
 * physically in depth-first order, so this should be pretty I/O efficient.
 */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr,
				BlockNumber start, BlockNumber end,
				bool *eof_p)
{
	Buffer		buf;
	Page		page;
	uint8		max_avail;

	/* Read the page if it exists, or return EOF */
	buf = fsm_readbuf(rel, addr, false);
	if (!BufferIsValid(buf))
	{
		*eof_p = true;
		return 0;
	}
	else
		*eof_p = false;

	page = BufferGetPage(buf);

	/*
	 * If we're above the bottom level, recurse into children, and fix the
	 * information stored about them at this level.
	 */
	if (addr.level > FSM_BOTTOM_LEVEL)
	{
		FSMAddress	fsm_start,
					fsm_end;
		uint16		fsm_start_slot,
					fsm_end_slot;
		int			slot,
					start_slot,
					end_slot;
		bool		eof = false;

		/*
		 * Compute the range of slots we need to update on this page, given
		 * the requested range of heap blocks to consider. The first slot to
		 * update is the one covering the "start" block, and the last slot is
		 * the one covering "end - 1". (Some of this work will be duplicated
		 * in each recursive call, but it's cheap enough to not worry about.)
		 */
		fsm_start = fsm_get_location(start, &fsm_start_slot);
		fsm_end = fsm_get_location(end - 1, &fsm_end_slot);

		/* Walk both endpoints up from the leaf level to this page's level */
		while (fsm_start.level < addr.level)
		{
			fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
			fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
		}
		Assert(fsm_start.level == addr.level);

		/* Clamp the slot range to the part that falls on this page */
		if (fsm_start.logpageno == addr.logpageno)
			start_slot = fsm_start_slot;
		else if (fsm_start.logpageno > addr.logpageno)
			start_slot = SlotsPerFSMPage;	/* shouldn't get here... */
		else
			start_slot = 0;

		if (fsm_end.logpageno == addr.logpageno)
			end_slot = fsm_end_slot;
		else if (fsm_end.logpageno > addr.logpageno)
			end_slot = SlotsPerFSMPage - 1;
		else
			end_slot = -1;		/* shouldn't get here... */

		for (slot = start_slot; slot <= end_slot; slot++)
		{
			int			child_avail;

			CHECK_FOR_INTERRUPTS();

			/* After we hit end-of-file, just clear the rest of the slots */
			if (!eof)
				child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
											  start, end,
											  &eof);
			else
				child_avail = 0;

			/* Update information about the child */
			if (fsm_get_avail(page, slot) != child_avail)
			{
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				fsm_set_avail(page, slot, child_avail);
				MarkBufferDirtyHint(buf, false);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			}
		}
	}

	/* Now get the maximum value on the page, to return to caller */
	max_avail = fsm_get_max_avail(page);

	/*
	 * Reset the next slot pointer. This encourages the use of low-numbered
	 * pages, increasing the chances that a later vacuum can truncate the
	 * relation. We don't bother with a lock here, nor with marking the page
	 * dirty if it wasn't already, since this is just a hint.
	 */
	((FSMPage) PageGetContents(page))->fp_next_slot = 0;

	ReleaseBuffer(buf);

	return max_avail;
}
894