/*-------------------------------------------------------------------------
 *
 * freepage.c
 *	  Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * it is given to manage to store its bookkeeping data.  It keeps multiple
 * freelists of runs of pages, sorted by the size of the run; the head of
 * each freelist is stored in the FreePageManager itself, and the first
 * page of each run contains a relative pointer to the next run. See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */
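
/*
 * A minimal usage sketch (illustrative only; 'region', 'region_size', and
 * the choice to place the manager at the start of the region are
 * hypothetical).  A caller reserves room for the FreePageManager itself
 * within the memory it wants to manage, gives the remaining whole pages to
 * the manager, and thereafter allocates and frees runs of pages,
 * remembering the length of each allocation:
 *
 *		char	   *base = region;
 *		FreePageManager *fpm = (FreePageManager *) base;
 *		Size		metadata_pages = fpm_size_to_pages(sizeof(FreePageManager));
 *		Size		total_pages = region_size / FPM_PAGE_SIZE;
 *		Size		first_page;
 *
 *		FreePageManagerInitialize(fpm, base);
 *		FreePageManagerPut(fpm, metadata_pages, total_pages - metadata_pages);
 *
 *		if (FreePageManagerGet(fpm, 4, &first_page))
 *		{
 *			char	   *p = fpm_page_to_pointer(base, first_page);
 *
 *			... use the four pages starting at p ...
 *			FreePageManagerPut(fpm, first_page, 4);
 *		}
 */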

#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"

#include "utils/freepage.h"
#include "utils/relptr.h"

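/*
 * All cross-page links in this file are relative pointers (see
 * utils/relptr.h): a relptr stores an offset from 'base' rather than a raw
 * address, so the same data structure remains valid when a shared memory
 * segment is mapped at different addresses in different processes.  A rough
 * sketch of the semantics (simplified; see relptr.h for the actual macros):
 *
 *		relptr_store(base, rp, ptr)		rp holds ptr - base, or 0 for NULL
 *		relptr_access(base, rp)			returns base + offset, or NULL
 *		relptr_is_null(rp)				true if the stored offset is 0
 */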

/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
#define FREE_PAGE_LEAF_MAGIC			0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC		0x19aa32c9

/* Doubly linked list of spans of free pages; stored in first page of span. */
struct FreePageSpanLeader
{
	int			magic;			/* always FREE_PAGE_SPAN_LEADER_MAGIC */
	Size		npages;			/* number of pages in span */
	RelptrFreePageSpanLeader prev;
	RelptrFreePageSpanLeader next;
};

/* Common header for btree leaf and internal pages. */
typedef struct FreePageBtreeHeader
{
	int			magic;			/* FREE_PAGE_LEAF_MAGIC or
								 * FREE_PAGE_INTERNAL_MAGIC */
	Size		nused;			/* number of items used */
	RelptrFreePageBtree parent; /* uplink */
} FreePageBtreeHeader;

/* Internal key; points to next level of btree. */
typedef struct FreePageBtreeInternalKey
{
	Size		first_page;		/* low bound for keys on child page */
	RelptrFreePageBtree child;	/* downlink */
} FreePageBtreeInternalKey;

/* Leaf key; no payload data. */
typedef struct FreePageBtreeLeafKey
{
	Size		first_page;		/* first page in span */
	Size		npages;			/* number of pages in span */
} FreePageBtreeLeafKey;

/* Work out how many keys will fit on a page. */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
		sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
		sizeof(FreePageBtreeLeafKey))
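
/*
 * For a sense of scale (a back-of-the-envelope calculation, assuming a
 * typical 64-bit build with a 4kB FPM_PAGE_SIZE, 8-byte Size, and 8-byte
 * relative pointers): the header takes 24 bytes and each key 16 bytes, so
 * both page types hold roughly (4096 - 24) / 16 = 254 keys.  A single leaf
 * page can therefore describe up to 254 distinct free spans, i.e. roughly a
 * megabyte of free space even under maximal fragmentation.
 */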

/* A btree page of either sort */
struct FreePageBtree
{
	FreePageBtreeHeader hdr;
	union
	{
		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
	}			u;
};

/* Results of a btree search */
typedef struct FreePageBtreeSearchResult
{
	FreePageBtree *page;
	Size		index;
	bool		found;
	unsigned	split_pages;
} FreePageBtreeSearchResult;

/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
											FreePageBtree *btp);
static Size FreePageBtreeCleanup(FreePageManager *fpm);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
												   FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
													FreePageBtree *btp);
static Size FreePageBtreeFirstKey(FreePageBtree *btp);
static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
										Size index, Size first_page, FreePageBtree *child);
static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
									Size first_page, Size npages);
static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
								Size index);
static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
								FreePageBtreeSearchResult *result);
static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
											 FreePageBtree *btp);
static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
									 FreePageBtree *parent, int level, StringInfo buf);
static void FreePageManagerDumpSpans(FreePageManager *fpm,
									 FreePageSpanLeader *span, Size expected_pages,
									 StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
									   Size *first_page);
static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
									   Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
								   Size npages);
static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
static void FreePageManagerUpdateLargest(FreePageManager *fpm);

#ifdef FPM_EXTRA_ASSERTS
static Size sum_free_pages(FreePageManager *fpm);
#endif

/*
 * Initialize a new, empty free page manager.
 *
 * 'fpm' should reference caller-provided memory large enough to contain a
 * FreePageManager.  We'll initialize it here.
 *
 * 'base' is the address to which all pointers are relative.  When managing
 * a dynamic shared memory segment, it should normally be the base of the
 * segment.  When managing backend-private memory, it can be either NULL or,
 * if managing a single contiguous extent of memory, the start of that extent.
 */
void
FreePageManagerInitialize(FreePageManager *fpm, char *base)
{
	Size		f;

	relptr_store(base, fpm->self, fpm);
	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
	fpm->btree_depth = 0;
	fpm->btree_recycle_count = 0;
	fpm->singleton_first_page = 0;
	fpm->singleton_npages = 0;
	fpm->contiguous_pages = 0;
	fpm->contiguous_pages_dirty = true;
#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages = 0;
#endif

	for (f = 0; f < FPM_NUM_FREELISTS; f++)
		relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
}

/*
 * Allocate a run of pages of the given length from the free page manager.
 * The return value indicates whether we were able to satisfy the request;
 * if true, the first page of the allocation is stored in *first_page.
 */
bool
FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
{
	bool		result;
	Size		contiguous_pages;

	result = FreePageManagerGetInternal(fpm, npages, first_page);

	/*
	 * It's a bit counterintuitive, but allocating pages can actually create
	 * opportunities for cleanup that create larger ranges.  We might pull a
	 * key out of the btree that enables the item at the head of the btree
	 * recycle list to be inserted; and then if there are more items behind it
	 * one of those might cause two currently-separated ranges to merge,
	 * creating a single range of contiguous pages larger than any that
	 * existed previously.  It might be worth trying to improve the cleanup
	 * algorithm to avoid such corner cases, but for now we just notice the
	 * condition and do the appropriate reporting.
	 */
	contiguous_pages = FreePageBtreeCleanup(fpm);
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * FreePageManagerGetInternal may have set contiguous_pages_dirty.
	 * Recompute contiguous_pages if so.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	if (result)
	{
		Assert(fpm->free_pages >= npages);
		fpm->free_pages -= npages;
	}
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
	return result;
}

#ifdef FPM_EXTRA_ASSERTS
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
	char	   *base = fpm_segment_base(fpm);

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
		   btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	++*sum;
	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		Size		index;

		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			sum_free_pages_recurse(fpm, child, sum);
		}
	}
}
static Size
sum_free_pages(FreePageManager *fpm)
{
	FreePageSpanLeader *recycle;
	char	   *base = fpm_segment_base(fpm);
	Size		sum = 0;
	int			list;

	/* Count the spans by scanning the freelists. */
	for (list = 0; list < FPM_NUM_FREELISTS; ++list)
	{
		if (!relptr_is_null(fpm->freelist[list]))
		{
			FreePageSpanLeader *candidate =
			relptr_access(base, fpm->freelist[list]);

			do
			{
				sum += candidate->npages;
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
	}

	/* Count btree internal pages. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		sum_free_pages_recurse(fpm, root, &sum);
	}

	/* Count the recycle list. */
	for (recycle = relptr_access(base, fpm->btree_recycle);
		 recycle != NULL;
		 recycle = relptr_access(base, recycle->next))
	{
		Assert(recycle->npages == 1);
		++sum;
	}

	return sum;
}
#endif

/*
 * Compute the size of the largest run of pages that the user could
 * successfully get.
 */
static Size
FreePageManagerLargestContiguous(FreePageManager *fpm)
{
	char	   *base;
	Size		largest;

	base = fpm_segment_base(fpm);
	largest = 0;
	if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
	{
		FreePageSpanLeader *candidate;

		candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
		do
		{
			if (candidate->npages > largest)
				largest = candidate->npages;
			candidate = relptr_access(base, candidate->next);
		} while (candidate != NULL);
	}
	else
	{
		Size		f = FPM_NUM_FREELISTS - 1;

		do
		{
			--f;
			if (!relptr_is_null(fpm->freelist[f]))
			{
				largest = f + 1;
				break;
			}
		} while (f > 0);
	}

	return largest;
}

/*
 * Recompute the size of the largest run of pages that the user could
 * successfully get, if it has been marked dirty.
 */
static void
FreePageManagerUpdateLargest(FreePageManager *fpm)
{
	if (!fpm->contiguous_pages_dirty)
		return;

	fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
	fpm->contiguous_pages_dirty = false;
}

/*
 * Transfer a run of pages to the free page manager.
 */
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
	Size		contiguous_pages;

	Assert(npages > 0);

	/* Record the new pages. */
	contiguous_pages =
		FreePageManagerPutInternal(fpm, first_page, npages, false);

	/*
	 * If the new range we inserted into the page manager was contiguous with
	 * an existing range, it may have opened up cleanup opportunities.
	 */
	if (contiguous_pages > npages)
	{
		Size		cleanup_contiguous_pages;

		cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
		if (cleanup_contiguous_pages > contiguous_pages)
			contiguous_pages = cleanup_contiguous_pages;
	}

	/* See if we now have a new largest chunk. */
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * The earlier call to FreePageManagerPutInternal may have set
	 * contiguous_pages_dirty if it needed to allocate internal pages, so
	 * recompute contiguous_pages if necessary.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages += npages;
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
}
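
/*
 * As a concrete example of the consolidation behavior (hypothetical page
 * numbers): if pages [100, 110) are already free and a caller returns pages
 * [110, 120), FreePageManagerPutInternal extends the existing range and
 * reports a contiguous span of 20 pages; if [120, 130) was free as well,
 * the newly-freed run exactly bridges the gap and all three become one
 * 30-page span.
 */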

/*
 * Produce a debugging dump of the state of a free page manager.
 */
char *
FreePageManagerDump(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	StringInfoData buf;
	FreePageSpanLeader *recycle;
	bool		dumped_any_freelist = false;
	Size		f;

	/* Initialize output buffer. */
	initStringInfo(&buf);

	/* Dump general stuff. */
	appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
					 fpm->self.relptr_off, fpm->contiguous_pages);

	/* Dump btree. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root;

		appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
		root = relptr_access(base, fpm->btree_root);
		FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
	}
	else if (fpm->singleton_npages > 0)
	{
		appendStringInfo(&buf, "singleton: %zu(%zu)\n",
						 fpm->singleton_first_page, fpm->singleton_npages);
	}

	/* Dump btree recycle list. */
	recycle = relptr_access(base, fpm->btree_recycle);
	if (recycle != NULL)
	{
		appendStringInfoString(&buf, "btree recycle:");
		FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
	}

	/* Dump free lists. */
	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
	{
		FreePageSpanLeader *span;

		if (relptr_is_null(fpm->freelist[f]))
			continue;
		if (!dumped_any_freelist)
		{
			appendStringInfoString(&buf, "freelists:\n");
			dumped_any_freelist = true;
		}
		appendStringInfo(&buf, "  %zu:", f + 1);
		span = relptr_access(base, fpm->freelist[f]);
		FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
	}

	/* And return result to caller. */
	return buf.data;
}
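
/*
 * The output format is easiest to see from a small (fabricated) example,
 * showing one 3-page free span starting at page 5, stored as the singleton
 * range and kept on the freelist for 3-page spans:
 *
 *		metadata: self 64 max contiguous pages = 3
 *		singleton: 5(3)
 *		freelists:
 *		  3: 5
 *
 * In span lists, an entry whose length differs from the freelist's nominal
 * size is printed as first_page(npages); otherwise only the first page is
 * shown.
 */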


/*
 * The first_page value stored at index zero in any non-root page must match
 * the first_page value stored in its parent at the index which points to that
 * page.  So when the value stored at index zero in a btree page changes, we've
 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
 * where that key isn't index zero.  This function should be called after
 * updating the first key on the target page; it will propagate the change
 * upward as far as needed.
 *
 * We assume here that the first key on the page has not changed enough to
 * require changes in the ordering of keys on its ancestor pages.  Thus,
 * if we search the parent page for the first key greater than or equal to
 * the first key on the current page, the downlink to this page will be either
 * the exact index returned by the search (if the first key decreased)
 * or one less (if the first key increased).
 */
static void
FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	Size		first_page;
	FreePageBtree *parent;
	FreePageBtree *child;

	/* This might be either a leaf or an internal page. */
	Assert(btp->hdr.nused > 0);
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
		first_page = btp->u.leaf_key[0].first_page;
	}
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
		first_page = btp->u.internal_key[0].first_page;
	}
	child = btp;

	/* Loop until we find an ancestor that does not require adjustment. */
	for (;;)
	{
		Size		s;

		parent = relptr_access(base, child->hdr.parent);
		if (parent == NULL)
			break;
		s = FreePageBtreeSearchInternal(parent, first_page);

		/* Key is either at index s or index s-1; figure out which. */
		if (s >= parent->hdr.nused)
		{
			Assert(s == parent->hdr.nused);
			--s;
		}
		else
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			if (check != child)
			{
				Assert(s > 0);
				--s;
			}
		}

#ifdef USE_ASSERT_CHECKING
		/* Debugging double-check. */
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			Assert(s < parent->hdr.nused);
			Assert(child == check);
		}
#endif

		/* Update the parent key. */
		parent->u.internal_key[s].first_page = first_page;

		/*
		 * If this is the first key in the parent, go up another level; else
		 * done.
		 */
		if (s > 0)
			break;
		child = parent;
	}
}
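
/*
 * For example (hypothetical keys): suppose a leaf's first key drops from
 * page 200 to page 190 because a preceding span was consolidated into it.
 * If the leaf's downlink sits at index 0 of its parent, the parent's key 0
 * must become 190 as well, and the adjustment continues upward; as soon as
 * the affected downlink sits at a nonzero index, no higher ancestor stores
 * this boundary and the loop stops.
 */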

/*
 * Attempt to reclaim space from the free-page btree.  The return value is
 * the largest range of contiguous pages created by the cleanup operation.
 */
static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	Size		max_contiguous_pages = 0;

	/* Attempt to shrink the depth of the btree. */
	while (!relptr_is_null(fpm->btree_root))
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		/* If the root contains only one key, reduce depth by one. */
		if (root->hdr.nused == 1)
		{
			/* Shrink depth of tree by one. */
			Assert(fpm->btree_depth > 0);
			--fpm->btree_depth;
			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
			{
				/* If root is a leaf, convert only entry to singleton range. */
				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
				fpm->singleton_npages = root->u.leaf_key[0].npages;
			}
			else
			{
				FreePageBtree *newroot;

				/* If root is an internal page, make only child the root. */
				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
				newroot = relptr_access(base, fpm->btree_root);
				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
			}
			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
		}
		else if (root->hdr.nused == 2 &&
				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			Size		end_of_first;
			Size		start_of_second;

			end_of_first = root->u.leaf_key[0].first_page +
				root->u.leaf_key[0].npages;
			start_of_second = root->u.leaf_key[1].first_page;

			if (end_of_first + 1 == start_of_second)
			{
				Size		root_page = fpm_pointer_to_page(base, root);

				if (end_of_first == root_page)
				{
					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
					fpm->singleton_npages = root->u.leaf_key[0].npages +
						root->u.leaf_key[1].npages + 1;
					fpm->btree_depth = 0;
					relptr_store(base, fpm->btree_root,
								 (FreePageBtree *) NULL);
					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
										   fpm->singleton_npages);
					Assert(max_contiguous_pages == 0);
					max_contiguous_pages = fpm->singleton_npages;
				}
			}

			/* Whether it worked or not, it's time to stop. */
			break;
		}
		else
		{
			/* Nothing more to do.  Stop. */
			break;
		}
	}

	/*
	 * Attempt to free recycled btree pages.  We skip this if releasing the
	 * recycled page would require a btree page split, because the page we're
	 * trying to recycle would be consumed by the split, which would be
	 * counterproductive.
	 *
	 * We also currently only ever attempt to recycle the first page on the
	 * list; that could be made more aggressive, but it's not clear that the
	 * complexity would be worthwhile.
	 */
	while (fpm->btree_recycle_count > 0)
	{
		FreePageBtree *btp;
		Size		first_page;
		Size		contiguous_pages;

		btp = FreePageBtreeGetRecycled(fpm);
		first_page = fpm_pointer_to_page(base, btp);
		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
		if (contiguous_pages == 0)
		{
			FreePageBtreeRecycle(fpm, first_page);
			break;
		}
		else
		{
			if (contiguous_pages > max_contiguous_pages)
				max_contiguous_pages = contiguous_pages;
		}
	}

	return max_contiguous_pages;
}

/*
 * Consider consolidating the given page with its left or right sibling,
 * if it's fairly empty.
 */
static void
FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *np;
	Size		max;

	/*
	 * We only try to consolidate pages that are less than a third full. We
	 * could be more aggressive about this, but that might risk performing
	 * consolidation only to end up splitting again shortly thereafter.  Since
	 * the btree should be very small compared to the space under management,
	 * our goal isn't so much to ensure that it always occupies the absolutely
	 * smallest possible number of pages as to reclaim pages before things get
	 * too egregiously out of hand.
	 */
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		max = FPM_ITEMS_PER_LEAF_PAGE;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		max = FPM_ITEMS_PER_INTERNAL_PAGE;
	}
	if (btp->hdr.nused >= max / 3)
		return;

	/*
	 * If we can fit our right sibling's keys onto this page, consolidate.
	 */
	np = FreePageBtreeFindRightSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
		}
		else
		{
			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, btp);
		}
		FreePageBtreeRemovePage(fpm, np);
		return;
	}

	/*
	 * If we can fit our keys onto our left sibling's page, consolidate. In
	 * this case, we move our keys onto the other page rather than vice versa,
	 * to avoid having to adjust ancestor keys.
	 */
	np = FreePageBtreeFindLeftSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
		}
		else
		{
			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, np);
		}
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}
}
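
/*
 * With roughly 254 keys per page on a typical 64-bit build (see the note
 * near FPM_ITEMS_PER_LEAF_PAGE above), the "less than a third full" test
 * means consolidation is considered only when a page holds fewer than 84
 * keys, and it succeeds only if the two siblings' keys together fit on a
 * single page.
 */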

/*
 * Find the passed page's left sibling; that is, the page at the same level
 * of the tree whose keyspace immediately precedes ours.
 */
static FreePageBtree *
FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move left. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the leftmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index > 0)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index - 1].child);
			break;
		}
		Assert(index == 0);
		++levels;
	}

	/* Descend rightwards. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Find the passed page's right sibling; that is, the page at the same level
 * of the tree whose keyspace immediately follows ours.
 */
static FreePageBtree *
FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move right. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the rightmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index < p->hdr.nused - 1)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index + 1].child);
			break;
		}
		Assert(index == p->hdr.nused - 1);
		++levels;
	}

	/* Descend leftwards. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[0].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Get the first key on a btree page.
 */
static Size
FreePageBtreeFirstKey(FreePageBtree *btp)
{
	Assert(btp->hdr.nused > 0);

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		return btp->u.leaf_key[0].first_page;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		return btp->u.internal_key[0].first_page;
	}
}

/*
 * Get a page from the btree recycle list for use as a btree page.
 */
static FreePageBtree *
FreePageBtreeGetRecycled(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *newhead;

	Assert(victim != NULL);
	newhead = relptr_access(base, victim->next);
	if (newhead != NULL)
		relptr_copy(newhead->prev, victim->prev);
	relptr_store(base, fpm->btree_recycle, newhead);
	Assert(fpm_pointer_is_page_aligned(base, victim));
	fpm->btree_recycle_count--;
	return (FreePageBtree *) victim;
}

/*
 * Insert an item into an internal page.
 */
static void
FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
							Size first_page, FreePageBtree *child)
{
	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
			sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
	btp->u.internal_key[index].first_page = first_page;
	relptr_store(base, btp->u.internal_key[index].child, child);
	++btp->hdr.nused;
}

/*
 * Insert an item into a leaf page.
 */
static void
FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
						Size npages)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
			sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
	btp->u.leaf_key[index].first_page = first_page;
	btp->u.leaf_key[index].npages = npages;
	++btp->hdr.nused;
}

/*
 * Put a page on the btree recycle list.
 */
static void
FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = 1;
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->btree_recycle, span);
	fpm->btree_recycle_count++;
}

/*
 * Remove an item from the btree at the given position on the given page.
 */
static void
FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(index < btp->hdr.nused);

	/* When last item is removed, extirpate entire page from btree. */
	if (btp->hdr.nused == 1)
	{
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}

	/* Physically remove the key from the page. */
	--btp->hdr.nused;
	if (index < btp->hdr.nused)
		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));

	/* If we just removed the first key, adjust ancestor keys. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, btp);

	/* Consider whether to consolidate this page with a sibling. */
	FreePageBtreeConsolidate(fpm, btp);
}

/*
 * Remove a page from the btree.  Caller is responsible for having relocated
 * any keys from this page that are still wanted.  The page is placed on the
 * recycled list.
 */
static void
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *parent;
	Size		index;
	Size		first_page;

	for (;;)
	{
		/* Find parent page. */
		parent = relptr_access(base, btp->hdr.parent);
		if (parent == NULL)
		{
			/* We are removing the root page. */
			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
			fpm->btree_depth = 0;
			Assert(fpm->singleton_first_page == 0);
			Assert(fpm->singleton_npages == 0);
			return;
		}

		/*
		 * If the parent contains only one item, we need to remove it as well.
		 */
		if (parent->hdr.nused > 1)
			break;
		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
		btp = parent;
	}

	/* Find and remove the downlink. */
	first_page = FreePageBtreeFirstKey(btp);
	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		index = FreePageBtreeSearchLeaf(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.leaf_key[index],
					&parent->u.leaf_key[index + 1],
					sizeof(FreePageBtreeLeafKey)
					* (parent->hdr.nused - index - 1));
	}
	else
	{
		index = FreePageBtreeSearchInternal(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.internal_key[index],
					&parent->u.internal_key[index + 1],
					sizeof(FreePageBtreeInternalKey)
					* (parent->hdr.nused - index - 1));
	}
	parent->hdr.nused--;
	Assert(parent->hdr.nused > 0);

	/* Recycle the page. */
	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));

	/* Adjust ancestor keys if needed. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, parent);

	/* Consider whether to consolidate the parent with a sibling. */
	FreePageBtreeConsolidate(fpm, parent);
}

/*
 * Search the btree for an entry for the given first page and initialize
 * *result with the results of the search.  result->page and result->index
 * indicate either the position of an exact match or the position at which
 * the new key should be inserted.  result->found is true for an exact match,
 * otherwise false.  result->split_pages will contain the number of additional
 * btree pages that will be needed when performing a split to insert a key.
 * Except as described above, the contents of fields in the result object are
 * undefined on return.
 */
static void
FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
					FreePageBtreeSearchResult *result)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
	Size		index;

	result->split_pages = 1;

	/* If the btree is empty, there's nothing to find. */
	if (btp == NULL)
	{
		result->page = NULL;
		result->found = false;
		return;
	}

	/* Descend until we hit a leaf. */
	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		FreePageBtree *child;
		bool		found_exact;

		index = FreePageBtreeSearchInternal(btp, first_page);
		found_exact = index < btp->hdr.nused &&
			btp->u.internal_key[index].first_page == first_page;

		/*
		 * If we found an exact match we descend directly.  Otherwise, we
		 * descend into the child to the left if possible so that we can find
		 * the insertion point at that child's high end.
		 */
		if (!found_exact && index > 0)
			--index;

		/* Track required split depth for leaf insert. */
		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
		{
			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
			result->split_pages++;
		}
		else
			result->split_pages = 0;

		/* Descend to appropriate child page. */
		Assert(index < btp->hdr.nused);
		child = relptr_access(base, btp->u.internal_key[index].child);
		Assert(relptr_access(base, child->hdr.parent) == btp);
		btp = child;
	}

	/* Track required split depth for leaf insert. */
	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
	{
		Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
		result->split_pages++;
	}
	else
		result->split_pages = 0;

	/* Search leaf page. */
	index = FreePageBtreeSearchLeaf(btp, first_page);

	/* Assemble results. */
	result->page = btp;
	result->index = index;
	result->found = index < btp->hdr.nused &&
		first_page == btp->u.leaf_key[index].first_page;
}

/*
 * Search an internal page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.internal_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Search a leaf page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.leaf_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}
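
/*
 * Both search functions above return the position of the first key greater
 * than or equal to the probe.  For instance, with leaf keys for first_page
 * values {10, 20, 30} (a made-up example), searching for 20 returns index 1
 * (exact match), searching for 25 returns index 2 (the insertion point
 * before 30), and searching for 35 returns 3, one past the last key.
 */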

/*
 * Allocate a new btree page and move half the keys from the provided page
 * to the new page.  Caller is responsible for making sure that there's a
 * page available from fpm->btree_recycle.  Returns a pointer to the new page,
 * to which caller must add a downlink.
 */
static FreePageBtree *
FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
{
	FreePageBtree *newsibling;

	newsibling = FreePageBtreeGetRecycled(fpm);
	newsibling->hdr.magic = btp->hdr.magic;
	newsibling->hdr.nused = btp->hdr.nused / 2;
	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
	btp->hdr.nused -= newsibling->hdr.nused;

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		memcpy(&newsibling->u.leaf_key,
			   &btp->u.leaf_key[btp->hdr.nused],
			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		memcpy(&newsibling->u.internal_key,
			   &btp->u.internal_key[btp->hdr.nused],
			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
	}

	return newsibling;
}
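
/*
 * For example (again assuming roughly 254 keys per page): splitting a full
 * leaf moves the upper 127 keys onto the recycled page and leaves 127
 * behind, so both halves end up about half full.  For an internal page, the
 * moved keys' children must have their parent pointers redirected to the
 * new sibling, which is what FreePageBtreeUpdateParentPointers (below) does.
 */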

/*
 * When internal pages are split or merged, the parent pointers of their
 * children must be updated.
 */
static void
FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
{
	Size		i;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	for (i = 0; i < btp->hdr.nused; ++i)
	{
		FreePageBtree *child;

		child = relptr_access(base, btp->u.internal_key[i].child);
		relptr_store(base, child->hdr.parent, btp);
	}
}

/*
 * Debugging dump of btree data.
 */
static void
FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
						 FreePageBtree *parent, int level, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);
	Size		pageno = fpm_pointer_to_page(base, btp);
	Size		index;
	FreePageBtree *check_parent;

	check_stack_depth();
	check_parent = relptr_access(base, btp->hdr.parent);
	appendStringInfo(buf, "  %zu@%d %c", pageno, level,
					 btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
	if (parent != check_parent)
		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
						 fpm_pointer_to_page(base, check_parent),
						 fpm_pointer_to_page(base, parent));
	appendStringInfoChar(buf, ':');
	for (index = 0; index < btp->hdr.nused; ++index)
	{
		if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
			appendStringInfo(buf, " %zu->%zu",
							 btp->u.internal_key[index].first_page,
							 btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
		else
			appendStringInfo(buf, " %zu(%zu)",
							 btp->u.leaf_key[index].first_page,
							 btp->u.leaf_key[index].npages);
	}
	appendStringInfoChar(buf, '\n');

	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
		}
	}
}

/*
 * Debugging dump of free-span data.
 */
static void
FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
						 Size expected_pages, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);

	while (span != NULL)
	{
		if (span->npages != expected_pages)
			appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
							 span->npages);
		else
			appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
		span = relptr_access(base, span->next);
	}

	appendStringInfoChar(buf, '\n');
}

/*
 * This function allocates a run of pages of the given length from the free
 * page manager.
 */
static bool
FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = NULL;
	FreePageSpanLeader *prev;
	FreePageSpanLeader *next;
	FreePageBtreeSearchResult result;
	Size		victim_page = 0;	/* placate compiler */
	Size		f;

	/*
	 * Search for a free span.
	 *
	 * Right now, we use a simple best-fit policy here, but it's possible for
	 * this to result in memory fragmentation if we're repeatedly asked to
	 * allocate chunks just a little smaller than what we have available.
	 * Hopefully, this is unlikely, because we expect most requests to be
	 * single pages or superblock-sized chunks -- but no policy can be optimal
	 * under all circumstances unless it has knowledge of future allocation
	 * patterns.
	 */
	for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
	{
		/* Skip empty freelists. */
		if (relptr_is_null(fpm->freelist[f]))
			continue;

		/*
		 * All of the freelists except the last one contain only items of a
		 * single size, so we just take the first one.  But the final free
		 * list contains everything too big for any of the other lists, so we
		 * need to search the list.
		 */
		if (f < FPM_NUM_FREELISTS - 1)
			victim = relptr_access(base, fpm->freelist[f]);
		else
		{
			FreePageSpanLeader *candidate;

			candidate = relptr_access(base, fpm->freelist[f]);
			do
			{
				if (candidate->npages >= npages && (victim == NULL ||
													victim->npages > candidate->npages))
				{
					victim = candidate;
					if (victim->npages == npages)
						break;
				}
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
		break;
	}

	/* If we didn't find an allocatable span, return failure. */
	if (victim == NULL)
		return false;

	/* Remove span from free list. */
	Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
	prev = relptr_access(base, victim->prev);
	next = relptr_access(base, victim->next);
	if (prev != NULL)
		relptr_copy(prev->next, victim->next);
	else
		relptr_copy(fpm->freelist[f], victim->next);
	if (next != NULL)
		relptr_copy(next->prev, victim->prev);
	victim_page = fpm_pointer_to_page(base, victim);

	/* Decide whether we might be invalidating contiguous_pages. */
	if (f == FPM_NUM_FREELISTS - 1 &&
		victim->npages == fpm->contiguous_pages)
	{
		/*
		 * The victim span came from the oversized freelist, and had the same
		 * size as the longest span.  There may or may not be another one of
		 * the same size, so contiguous_pages must be recomputed just to be
		 * safe.
		 */
		fpm->contiguous_pages_dirty = true;
	}
	else if (f + 1 == fpm->contiguous_pages &&
			 relptr_is_null(fpm->freelist[f]))
	{
		/*
		 * The victim span came from a fixed sized freelist, and it was the
		 * list for spans of the same size as the current longest span, and
		 * the list is now empty after removing the victim.  So
		 * contiguous_pages must be recomputed without a doubt.
		 */
		fpm->contiguous_pages_dirty = true;
	}

	/*
	 * If we haven't initialized the btree yet, the victim must be the single
	 * span stored within the FreePageManager itself.  Otherwise, we need to
	 * update the btree.
	 */
	if (relptr_is_null(fpm->btree_root))
	{
		Assert(victim_page == fpm->singleton_first_page);
		Assert(victim->npages == fpm->singleton_npages);
		Assert(victim->npages >= npages);
		fpm->singleton_first_page += npages;
		fpm->singleton_npages -= npages;
		if (fpm->singleton_npages > 0)
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
	}
	else
	{
		/*
		 * If the span we found is exactly the right size, remove it from the
		 * btree completely.  Otherwise, adjust the btree entry to reflect the
		 * still-unallocated portion of the span, and put that portion on the
		 * appropriate free list.
		 */
		FreePageBtreeSearch(fpm, victim_page, &result);
		Assert(result.found);
		if (victim->npages == npages)
			FreePageBtreeRemove(fpm, result.page, result.index);
		else
		{
			FreePageBtreeLeafKey *key;

			/* Adjust btree to reflect remaining pages. */
			Assert(victim->npages > npages);
			key = &result.page->u.leaf_key[result.index];
			Assert(key->npages == victim->npages);
			key->first_page += npages;
			key->npages -= npages;
			if (result.index == 0)
				FreePageBtreeAdjustAncestorKeys(fpm, result.page);

			/* Put the unallocated pages back on the appropriate free list. */
			FreePagePushSpanLeader(fpm, victim_page + npages,
								   victim->npages - npages);
		}
	}

	/* Return results to caller. */
	*first_page = fpm_pointer_to_page(base, victim);
	return true;
}
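
/*
 * To illustrate the freelist bucketing used above (sizes here are just for
 * exposition): freelist[f] holds spans of exactly f + 1 pages for all but
 * the final list, which holds every span too large for the others.  A
 * request for 3 pages thus starts at freelist[2] and takes the head of the
 * first nonempty exact-fit list it reaches; only if the search falls
 * through to the final list do we walk it looking for the best (smallest
 * sufficient) fit.
 */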

/*
 * Put a range of pages into the btree and freelists, consolidating it with
 * existing free spans just before and/or after it.  If 'soft' is true,
 * only perform the insertion if it can be done without allocating new btree
 * pages; if false, do it always.  Returns 0 if the soft flag caused the
 * insertion to be skipped, or otherwise the size of the contiguous span
 * created by the insertion.  This may be larger than npages if we're able
 * to consolidate with an adjacent range.
 */
static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
						   bool soft)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtreeSearchResult result;
	FreePageBtreeLeafKey *prevkey = NULL;
	FreePageBtreeLeafKey *nextkey = NULL;
	FreePageBtree *np;
	Size		nindex;

	Assert(npages > 0);

	/* We can store a single free span without initializing the btree. */
	if (fpm->btree_depth == 0)
	{
		if (fpm->singleton_npages == 0)
		{
			/* Don't have a span yet; store this one. */
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages = npages;
			FreePagePushSpanLeader(fpm, first_page, npages);
			return fpm->singleton_npages;
		}
		else if (fpm->singleton_first_page + fpm->singleton_npages ==
				 first_page)
		{
			/* New span immediately follows sole existing span. */
			fpm->singleton_npages += npages;
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else if (first_page + npages == fpm->singleton_first_page)
		{
			/* New span immediately precedes sole existing span. */
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages += npages;
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else
		{
			/* Not contiguous; we need to initialize the btree. */
			Size		root_page;
			FreePageBtree *root;

			if (!relptr_is_null(fpm->btree_recycle))
				root = FreePageBtreeGetRecycled(fpm);
			else if (soft)
				return 0;		/* Should not allocate if soft. */
			else if (FreePageManagerGetInternal(fpm, 1, &root_page))
				root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
			else
			{
				/* We'd better be able to get a page from the existing range. */
				elog(FATAL, "free page manager btree is corrupt");
			}

			/* Create the btree and move the preexisting range into it. */
			root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
			root->hdr.nused = 1;
			relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
			root->u.leaf_key[0].first_page = fpm->singleton_first_page;
			root->u.leaf_key[0].npages = fpm->singleton_npages;
			relptr_store(base, fpm->btree_root, root);
			fpm->singleton_first_page = 0;
			fpm->singleton_npages = 0;
			fpm->btree_depth = 1;

			/*
			 * Corner case: it may be that the btree root took the very last
			 * free page.  In that case, the sole btree entry covers a zero
			 * page run, which is invalid.  Overwrite it with the entry we're
			 * trying to insert and get out.
			 */
			if (root->u.leaf_key[0].npages == 0)
			{
				root->u.leaf_key[0].first_page = first_page;
				root->u.leaf_key[0].npages = npages;
				FreePagePushSpanLeader(fpm, first_page, npages);
				return npages;
			}

			/* Fall through to insert the new key. */
		}
	}

	/* Search the btree. */
	FreePageBtreeSearch(fpm, first_page, &result);
	Assert(!result.found);
	if (result.index > 0)
		prevkey = &result.page->u.leaf_key[result.index - 1];
	if (result.index < result.page->hdr.nused)
	{
		np = result.page;
		nindex = result.index;
		nextkey = &result.page->u.leaf_key[result.index];
	}
	else
	{
		np = FreePageBtreeFindRightSibling(base, result.page);
		nindex = 0;
		if (np != NULL)
			nextkey = &np->u.leaf_key[0];
	}

	/* Consolidate with the previous entry if possible. */
	if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
	{
		bool		remove_next = false;
		Size		result;

		Assert(prevkey->first_page + prevkey->npages == first_page);
		prevkey->npages = (first_page - prevkey->first_page) + npages;

		/* Check whether we can *also* consolidate with the following entry. */
		if (nextkey != NULL &&
			prevkey->first_page + prevkey->npages >= nextkey->first_page)
		{
			Assert(prevkey->first_page + prevkey->npages ==
				   nextkey->first_page);
			prevkey->npages = (nextkey->first_page - prevkey->first_page)
				+ nextkey->npages;
			FreePagePopSpanLeader(fpm, nextkey->first_page);
			remove_next = true;
		}

		/* Put the span on the correct freelist and save size. */
		FreePagePopSpanLeader(fpm, prevkey->first_page);
		FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
		result = prevkey->npages;

		/*
		 * If we consolidated with both the preceding and following entries,
		 * we must remove the following entry.  We do this last, because
		 * removing an element from the btree may invalidate pointers we hold
		 * into the current data structure.
		 *
		 * NB: The btree is technically in an invalid state at this point
		 * because we've already updated prevkey to cover the same key space
		 * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
		 */
		if (remove_next)
			FreePageBtreeRemove(fpm, np, nindex);

		return result;
	}

	/* Consolidate with the next entry if possible. */
	if (nextkey != NULL && first_page + npages >= nextkey->first_page)
	{
		Size		newpages;

		/* Compute new size for span. */
		Assert(first_page + npages == nextkey->first_page);
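		/* Given the assertion above, this equals npages + nextkey->npages. */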
		newpages = (nextkey->first_page - first_page) + nextkey->npages;

		/* Put span on correct free list. */
		FreePagePopSpanLeader(fpm, nextkey->first_page);
		FreePagePushSpanLeader(fpm, first_page, newpages);

		/* Update key in place. */
		nextkey->first_page = first_page;
		nextkey->npages = newpages;

		/* If reducing first key on page, ancestors might need adjustment. */
		if (nindex == 0)
			FreePageBtreeAdjustAncestorKeys(fpm, np);

		return nextkey->npages;
	}

	/* Split leaf page and as many of its ancestors as necessary. */
	if (result.split_pages > 0)
	{
		/*
		 * NB: We could consider various coping strategies here to avoid a
		 * split; most obviously, if np != result.page, we could target that
		 * page instead.  More complicated shuffling strategies could be
		 * possible as well; basically, unless every single leaf page is 100%
		 * full, we can jam this key in there if we try hard enough.  It's
		 * unlikely that trying that hard is worthwhile, but it's possible we
		 * might need to make more than no effort.  For now, we just do the
		 * easy thing, which is nothing.
		 */

		/* If this is a soft insert, it's time to give up. */
		if (soft)
			return 0;

		/* Check whether we need to allocate more btree pages to split. */
		if (result.split_pages > fpm->btree_recycle_count)
		{
			Size		pages_needed;
			Size		recycle_page;
			Size		i;

			/*
			 * Allocate the required number of pages and recycle each one in
			 * turn.  This should never fail, because if we've got enough
			 * spans of free pages kicking around that we need additional
			 * storage space just to remember them all, then we should
			 * certainly have enough to expand the btree, which should only
			 * ever use a tiny number of pages compared to the number under
			 * management.  If it does, something's badly screwed up.
			 */
			pages_needed = result.split_pages - fpm->btree_recycle_count;
			for (i = 0; i < pages_needed; ++i)
			{
				if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
					elog(FATAL, "free page manager btree is corrupt");
				FreePageBtreeRecycle(fpm, recycle_page);
			}

			/*
			 * The act of allocating pages to recycle may have invalidated the
			 * results of our previous btree search, so repeat it. (We could
			 * recheck whether any of our split-avoidance strategies that were
			 * not viable before now are, but it hardly seems worthwhile, so
			 * we don't bother. Consolidation can't be possible now if it
			 * wasn't previously.)
			 */
			FreePageBtreeSearch(fpm, first_page, &result);

			/*
			 * The act of allocating pages for use in constructing our btree
			 * should never cause any page to become more full, so the new
			 * split depth should be no greater than the old one, and perhaps
			 * less if we fortuitously allocated a chunk that freed up a slot
			 * on the page we need to update.
			 */
			Assert(result.split_pages <= fpm->btree_recycle_count);
		}

		/* If we still need to perform a split, do it. */
		if (result.split_pages > 0)
		{
			FreePageBtree *split_target = result.page;
			FreePageBtree *child = NULL;
			Size		key = first_page;

			for (;;)
			{
				FreePageBtree *newsibling;
				FreePageBtree *parent;

				/* Identify parent page, which must receive downlink. */
				parent = relptr_access(base, split_target->hdr.parent);

				/* Split the page - downlink not added yet. */
				newsibling = FreePageBtreeSplitPage(fpm, split_target);

				/*
				 * At this point in the loop, we're always carrying a pending
				 * insertion.  On the first pass, it's the actual key we're
				 * trying to insert; on subsequent passes, it's the downlink
				 * that needs to be added as a result of the split performed
				 * during the previous loop iteration.  Since we've just split
				 * the page, there's definitely room on one of the two
				 * resulting pages.
				 */
				if (child == NULL)
				{
					Size		index;
					FreePageBtree *insert_into;

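					/*
					 * newsibling took over the upper portion of the key
					 * space, so insert into whichever of the two pages now
					 * covers key.
					 */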
					insert_into = key < newsibling->u.leaf_key[0].first_page ?
						split_target : newsibling;
					index = FreePageBtreeSearchLeaf(insert_into, key);
					FreePageBtreeInsertLeaf(insert_into, index, key, npages);
					if (index == 0 && insert_into == split_target)
						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
				}
				else
				{
					Size		index;
					FreePageBtree *insert_into;

					insert_into =
						key < newsibling->u.internal_key[0].first_page ?
						split_target : newsibling;
					index = FreePageBtreeSearchInternal(insert_into, key);
					FreePageBtreeInsertInternal(base, insert_into, index,
												key, child);
					relptr_store(base, child->hdr.parent, insert_into);
					if (index == 0 && insert_into == split_target)
						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
				}

				/* If the page we just split has no parent, split the root. */
				if (parent == NULL)
				{
					FreePageBtree *newroot;

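					/*
					 * Grow the tree by one level: the new root is an
					 * internal page with exactly two downlinks, one for each
					 * half of the page we just split.
					 */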
					newroot = FreePageBtreeGetRecycled(fpm);
					newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
					newroot->hdr.nused = 2;
					relptr_store(base, newroot->hdr.parent,
								 (FreePageBtree *) NULL);
					newroot->u.internal_key[0].first_page =
						FreePageBtreeFirstKey(split_target);
					relptr_store(base, newroot->u.internal_key[0].child,
								 split_target);
					relptr_store(base, split_target->hdr.parent, newroot);
					newroot->u.internal_key[1].first_page =
						FreePageBtreeFirstKey(newsibling);
					relptr_store(base, newroot->u.internal_key[1].child,
								 newsibling);
					relptr_store(base, newsibling->hdr.parent, newroot);
					relptr_store(base, fpm->btree_root, newroot);
					fpm->btree_depth++;

					break;
				}

				/* If the parent page isn't full, insert the downlink. */
				key = newsibling->u.internal_key[0].first_page;
				if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
				{
					Size		index;

					index = FreePageBtreeSearchInternal(parent, key);
					FreePageBtreeInsertInternal(base, parent, index,
												key, newsibling);
					relptr_store(base, newsibling->hdr.parent, parent);
					if (index == 0)
						FreePageBtreeAdjustAncestorKeys(fpm, parent);
					break;
				}

				/* The parent also needs to be split, so loop around. */
				child = newsibling;
				split_target = parent;
			}

			/*
			 * The loop above did the insert, so we just need to update the
			 * free list, and we're done.
			 */
			FreePagePushSpanLeader(fpm, first_page, npages);

			return npages;
		}
	}

	/* Physically add the key to the page. */
	Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
	FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);

	/* If new first key on page, ancestors might need adjustment. */
	if (result.index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, result.page);

	/* Put it on the free list. */
	FreePagePushSpanLeader(fpm, first_page, npages);

	return npages;
}

/*
 * Remove a FreePageSpanLeader from the linked list that contains it, either
 * because we're changing the size of the span, or because we're allocating it.
 */
static void
FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *span;
	FreePageSpanLeader *next;
	FreePageSpanLeader *prev;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);

	next = relptr_access(base, span->next);
	prev = relptr_access(base, span->prev);
	if (next != NULL)
		relptr_copy(next->prev, span->prev);
	if (prev != NULL)
		relptr_copy(prev->next, span->next);
	else
	{
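		/*
		 * No predecessor, so this span must be the head of its freelist;
		 * update the list head in the FreePageManager instead.  The
		 * assertion checks that the head's relative pointer (a byte offset
		 * from the segment base) really does point at this page.
		 */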
		Size		f = Min(span->npages, FPM_NUM_FREELISTS) - 1;

		Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
		relptr_copy(fpm->freelist[f], span->next);
	}
}

/*
 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
 */
static void
FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
{
	char	   *base = fpm_segment_base(fpm);
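	/*
	 * Freelist bucketing: a span of k < FPM_NUM_FREELISTS pages goes on
	 * freelist[k - 1] (e.g. a 3-page span on freelist[2]); all longer spans
	 * share the last freelist, freelist[FPM_NUM_FREELISTS - 1].
	 */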
	Size		f = Min(npages, FPM_NUM_FREELISTS) - 1;
	FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = npages;
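	/* Link the new leader in at the head of freelist f. */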
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->freelist[f], span);
}