1 /*-------------------------------------------------------------------------
2  *
3  * dsa.c
4  *	  Dynamic shared memory areas.
5  *
6  * This module provides dynamic shared memory areas which are built on top of
7  * DSM segments.  While dsm.c allows segments of shared memory to be created
8  * and shared between backends, it isn't designed to deal with small
9  * objects.  A DSA area is a shared memory heap usually backed by one or more
10  * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11  * Alternatively, it can be created in pre-existing shared memory, including a
12  * DSM segment, and then create extra DSM segments as required.  Unlike the
13  * regular system heap, it deals in pseudo-pointers which must be converted to
14  * backend-local pointers before they are dereferenced.  These pseudo-pointers
15  * can however be shared with other backends, and can be used to construct
16  * shared data structures.
17  *
18  * Each DSA area manages a set of DSM segments, adding new segments as
19  * required and detaching them when they are no longer needed.  Each segment
20  * contains a number of 4KB pages, a free page manager for tracking
21  * consecutive runs of free pages, and a page map for tracking the source of
22  * objects allocated on each page.  Allocation requests above 8KB are handled
23  * by choosing a segment and finding consecutive free pages in its free page
24  * manager.  Allocation requests for smaller sizes are handled using pools of
25  * objects of a selection of sizes.  Each pool consists of a number of 16 page
26  * (64KB) superblocks allocated in the same way as large objects.  Allocation
27  * of large objects and new superblocks is serialized by a single LWLock, but
28  * allocation of small objects from pre-existing superblocks uses one LWLock
29  * per pool.  Currently there is one pool, and therefore one lock, per size
30  * class.  Per-core pools to increase concurrency and strategies for reducing
31  * the resulting fragmentation are areas for future research.  Each superblock
32  * is managed with a 'span', which tracks the superblock's freelist.  Free
33  * requests are handled by looking in the page map to find which span an
34  * address was allocated from, so that small objects can be returned to the
35  * appropriate free list, and large object pages can be returned directly to
36  * the free page map.  When allocating, simple heuristics for selecting
37  * segments and superblocks try to encourage occupied memory to be
38  * concentrated, increasing the likelihood that whole superblocks can become
39  * empty and be returned to the free page manager, and whole segments can
40  * become empty and be returned to the operating system.
41  *
42  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * IDENTIFICATION
46  *	  src/backend/utils/mmgr/dsa.c
47  *
48  *-------------------------------------------------------------------------
49  */
50 
51 #include "postgres.h"
52 
53 #include "port/atomics.h"
54 #include "storage/dsm.h"
55 #include "storage/ipc.h"
56 #include "storage/lwlock.h"
57 #include "storage/shmem.h"
58 #include "utils/dsa.h"
59 #include "utils/freepage.h"
60 #include "utils/memutils.h"
61 
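/*
 * Example (editorial sketch, not part of the original module): the calling
 * pattern implied by the header comment above.  One backend allocates an
 * object and gets back a dsa_pointer; that pseudo-pointer can be stored in
 * shared memory and dereferenced by any backend attached to the same area.
 * The struct and function names here are invented for illustration only.
 */
#ifdef NOT_USED
typedef struct example_node
{
	int32		value;
	dsa_pointer next;			/* pseudo-pointer to another example_node */
} example_node;

static dsa_pointer
example_push(dsa_area *area, dsa_pointer head, int32 value)
{
	/* Allocate shared memory; the result is a pseudo-pointer, not an address. */
	dsa_pointer node_dp = dsa_allocate(area, sizeof(example_node));

	/* Convert to a backend-local pointer before dereferencing. */
	example_node *node = (example_node *) dsa_get_address(area, node_dp);

	node->value = value;
	node->next = head;			/* shared data structures link via dsa_pointers */
	return node_dp;
}
#endif
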
62 /*
63  * The size of the initial DSM segment that backs a dsa_area created by
64  * dsa_create.  After creating some number of segments of this size we'll
65  * double this size, and so on.  Larger segments may be created if necessary
66  * to satisfy large requests.
67  */
68 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
69 
70 /*
71  * How many segments to create before we double the segment size.  If this is
72  * low, then there is likely to be a lot of wasted space in the largest
73  * segment.  If it is high, then we risk running out of segment slots (see
74  * dsm.c's limits on total number of segments), or limiting the total size
75  * an area can manage when using small pointers.
76  */
77 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
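
/*
 * Editorial sketch (not part of the original module) of the size progression
 * these two constants are intended to produce: the segment size doubles once
 * per DSA_NUM_SEGMENTS_AT_EACH_SIZE segments, i.e. 1MB, 1MB, 2MB, 2MB, 4MB,
 * 4MB, ... with the current settings, clamped to DSA_MAX_SEGMENT_SIZE
 * (defined below).  The authoritative logic lives in make_new_segment; this
 * helper is illustrative only.
 */
#ifdef NOT_USED
static size_t
example_nth_segment_size(size_t new_index)
{
	return DSA_INITIAL_SEGMENT_SIZE *
		((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
}
#endif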
78 
79 /*
80  * The number of bits used to represent the offset part of a dsa_pointer.
81  * This controls the maximum size of a segment, the maximum possible
82  * allocation size and also the maximum number of segments per area.
83  */
84 #if SIZEOF_DSA_POINTER == 4
85 #define DSA_OFFSET_WIDTH 27		/* 32 segments of size up to 128MB */
86 #else
87 #define DSA_OFFSET_WIDTH 40		/* 1024 segments of size up to 1TB */
88 #endif
89 
90 /*
91  * The maximum number of DSM segments that an area can own, determined by
92  * the number of bits remaining (but capped at 1024).
93  */
94 #define DSA_MAX_SEGMENTS \
95 	Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
96 
97 /* The bitmask for extracting the offset from a dsa_pointer. */
98 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
99 
100 /* The maximum size of a DSM segment. */
101 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
102 
103 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
104 #define DSA_PAGES_PER_SUPERBLOCK		16
105 
106 /*
107  * A magic number used as a sanity check for following DSM segments belonging
108  * to a DSA area (this number will be XORed with the area handle and
109  * the segment index).
110  */
111 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
112 
113 /* Build a dsa_pointer given a segment number and offset. */
114 #define DSA_MAKE_POINTER(segment_number, offset) \
115 	(((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
116 
117 /* Extract the segment number from a dsa_pointer. */
118 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
119 
120 /* Extract the offset from a dsa_pointer. */
121 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
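
/*
 * Editorial worked example (not part of the original module): packing and
 * unpacking with the macros above round-trips the segment number and offset,
 * whichever DSA_OFFSET_WIDTH is in use.  The values are arbitrary.
 */
#ifdef NOT_USED
static void
example_pointer_roundtrip(void)
{
	dsa_pointer dp = DSA_MAKE_POINTER(3, 8192);

	Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
	Assert(DSA_EXTRACT_OFFSET(dp) == 8192);
}
#endif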
122 
123 /* The type used for segment indexes (zero based). */
124 typedef size_t dsa_segment_index;
125 
126 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
127 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
128 
129 /*
130  * How many bins of segments do we have?  The bins are used to categorize
131  * segments by their largest contiguous run of free pages.
132  */
133 #define DSA_NUM_SEGMENT_BINS 16
134 
135 /*
136  * What is the lowest bin that holds segments that *might* have n contiguous
137  * free pages?	There is no point in looking in segments in lower bins; they
138  * definitely can't service a request for n free pages.
139  */
140 #define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
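
/*
 * Editorial worked example (not part of the original module): fls() gives the
 * 1-based position of the most significant set bit, so a request for 1 page
 * starts searching at bin 1, 4 pages at bin 3, 16 pages at bin 5, and very
 * large requests land in the top bin.  Any lower bin holds segments whose
 * largest free run is necessarily smaller than the request.
 */
#ifdef NOT_USED
static void
example_segment_bins(void)
{
	Assert(contiguous_pages_to_segment_bin(1) == 1);
	Assert(contiguous_pages_to_segment_bin(4) == 3);
	Assert(contiguous_pages_to_segment_bin(16) == 5);
	Assert(contiguous_pages_to_segment_bin(1 << 20) == DSA_NUM_SEGMENT_BINS - 1);
}
#endif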
141 
142 /* Macros for access to locks. */
143 #define DSA_AREA_LOCK(area) (&area->control->lock)
144 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
145 
146 /*
147  * The header for an individual segment.  This lives at the start of each DSM
148  * segment owned by a DSA area including the first segment (where it appears
149  * as part of the dsa_area_control struct).
150  */
151 typedef struct
152 {
153 	/* Sanity check magic value. */
154 	uint32		magic;
155 	/* Total number of pages in this segment (excluding metadata area). */
156 	size_t		usable_pages;
157 	/* Total size of this segment in bytes. */
158 	size_t		size;
159 
160 	/*
161 	 * Index of the segment that precedes this one in the same segment bin, or
162 	 * DSA_SEGMENT_INDEX_NONE if this is the first one.
163 	 */
164 	dsa_segment_index prev;
165 
166 	/*
167 	 * Index of the segment that follows this one in the same segment bin, or
168 	 * DSA_SEGMENT_INDEX_NONE if this is the last one.
169 	 */
170 	dsa_segment_index next;
171 	/* The index of the bin that contains this segment. */
172 	size_t		bin;
173 
174 	/*
175 	 * A flag raised to indicate that this segment is being returned to the
176 	 * operating system and has been unpinned.
177 	 */
178 	bool		freed;
179 } dsa_segment_header;
180 
181 /*
182  * Metadata for one superblock.
183  *
184  * For most blocks, span objects are stored out-of-line; that is, the span
185  * object is not stored within the block itself.  But, as an exception, for a
186  * "span of spans", the span object is stored "inline".  The allocation is
187  * always exactly one page, and the dsa_area_span object is located at
188  * the beginning of that page.  The size class is DSA_SCLASS_BLOCK_OF_SPANS,
189  * and the remaining fields are used just as they would be in an ordinary
190  * block.  We can't allocate spans out of ordinary superblocks because
191  * creating an ordinary superblock requires us to be able to allocate a span
192  * *first*.  Doing it this way avoids that circularity.
193  */
194 typedef struct
195 {
196 	dsa_pointer pool;			/* Containing pool. */
197 	dsa_pointer prevspan;		/* Previous span. */
198 	dsa_pointer nextspan;		/* Next span. */
199 	dsa_pointer start;			/* Starting address. */
200 	size_t		npages;			/* Length of span in pages. */
201 	uint16		size_class;		/* Size class. */
202 	uint16		ninitialized;	/* Maximum number of objects ever allocated. */
203 	uint16		nallocatable;	/* Number of objects currently allocatable. */
204 	uint16		firstfree;		/* First object on free list. */
205 	uint16		nmax;			/* Maximum number of objects ever possible. */
206 	uint16		fclass;			/* Current fullness class. */
207 } dsa_area_span;
208 
209 /*
210  * Given a pointer to an object in a span, access the index of the next free
211  * object in the same span (ie in the span's freelist) as an L-value.
212  */
213 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
214 
215 /*
216  * Small allocations are handled by dividing a single block of memory into
217  * many small objects of equal size.  The possible allocation sizes are
218  * defined by the following array.  Larger size classes are spaced more widely
219  * than smaller size classes.  We fudge the spacing for size classes >1kB to
220  * avoid space wastage: based on the knowledge that we plan to allocate 64kB
221  * blocks, we bump the maximum object size up to the largest multiple of
222  * 8 bytes that still lets us fit the same number of objects into one block.
223  *
224  * NB: Because of this fudging, if we were ever to use differently-sized blocks
225  * for small allocations, these size classes would need to be reworked to be
226  * optimal for the new size.
227  *
228  * NB: The optimal spacing for size classes, as well as the size of the blocks
229  * out of which small objects are allocated, is not a question that has one
230  * right answer.  Some allocators (such as tcmalloc) use more closely-spaced
231  * size classes than we do here, while others (like aset.c) use more
232  * widely-spaced classes.  Spacing the classes more closely avoids wasting
233  * memory within individual chunks, but also means a larger number of
234  * potentially-unfilled blocks.
235  */
236 static const uint16 dsa_size_classes[] = {
237 	sizeof(dsa_area_span), 0,	/* special size classes */
238 	8, 16, 24, 32, 40, 48, 56, 64,	/* 8 classes separated by 8 bytes */
239 	80, 96, 112, 128,			/* 4 classes separated by 16 bytes */
240 	160, 192, 224, 256,			/* 4 classes separated by 32 bytes */
241 	320, 384, 448, 512,			/* 4 classes separated by 64 bytes */
242 	640, 768, 896, 1024,		/* 4 classes separated by 128 bytes */
243 	1280, 1560, 1816, 2048,		/* 4 classes separated by ~256 bytes */
244 	2616, 3120, 3640, 4096,		/* 4 classes separated by ~512 bytes */
245 	5456, 6552, 7280, 8192		/* 4 classes separated by ~1024 bytes */
246 };
247 #define DSA_NUM_SIZE_CLASSES				lengthof(dsa_size_classes)
248 
249 /* Special size classes. */
250 #define DSA_SCLASS_BLOCK_OF_SPANS		0
251 #define DSA_SCLASS_SPAN_LARGE			1
252 
253 /*
254  * The following lookup table is used to map the size of small objects
255  * (less than 1kB) onto the corresponding size class.  To use this table,
256  * round the size of the object up to the next multiple of 8 bytes, and then
257  * index into this array.
258  */
259 static char dsa_size_class_map[] = {
260 	2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
261 	14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
262 	18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
263 	20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
264 	22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
265 	23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
266 	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
267 	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
268 };
269 #define DSA_SIZE_CLASS_MAP_QUANTUM	8
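
/*
 * Editorial worked example (not part of the original module): a 28 byte
 * request rounds up to 32 bytes, giving map index (32 / 8) - 1 = 3;
 * dsa_size_class_map[3] is 5, and dsa_size_classes[5] is 32.  A 500 byte
 * request rounds up to 504 bytes, map index 62, size class 21, i.e. 512
 * bytes.  Requests of 1kB and over are instead resolved by binary search
 * over dsa_size_classes (see dsa_allocate_extended).  This helper covers
 * only the table-driven range and is illustrative.
 */
#ifdef NOT_USED
static uint16
example_size_to_class(size_t size)
{
	int			mapidx;

	Assert(size > 0 &&
		   size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM);
	mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
			  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
	return dsa_size_class_map[mapidx];
}
#endif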
270 
271 /*
272  * Superblocks are binned by how full they are.  Generally, each fullness
273  * class corresponds to one quartile, but the block being used for
274  * allocations is always at the head of the list for fullness class 1,
275  * regardless of how full it really is.
276  */
277 #define DSA_FULLNESS_CLASSES		4
278 
279 /*
280  * A dsa_area_pool represents a set of objects of a given size class.
281  *
282  * Perhaps there should be multiple pools for the same size class for
283  * contention avoidance, but for now there is just one!
284  */
285 typedef struct
286 {
287 	/* A lock protecting access to this pool. */
288 	LWLock		lock;
289 	/* A set of linked lists of spans, arranged by fullness. */
290 	dsa_pointer spans[DSA_FULLNESS_CLASSES];
291 	/* Should we pad this out to a cacheline boundary? */
292 } dsa_area_pool;
293 
294 /*
295  * The control block for an area.  This lives in shared memory, at the start of
296  * the first DSM segment controlled by this area.
297  */
298 typedef struct
299 {
300 	/* The segment header for the first segment. */
301 	dsa_segment_header segment_header;
302 	/* The handle for this area. */
303 	dsa_handle	handle;
304 	/* The handles of the segments owned by this area. */
305 	dsm_handle	segment_handles[DSA_MAX_SEGMENTS];
306 	/* Lists of segments, binned by maximum contiguous run of free pages. */
307 	dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
308 	/* The object pools for each size class. */
309 	dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
310 	/* The total size of all active segments. */
311 	size_t		total_segment_size;
312 	/* The maximum total size of backing storage we are allowed. */
313 	size_t		max_total_segment_size;
314 	/* Highest used segment index in the history of this area. */
315 	dsa_segment_index high_segment_index;
316 	/* The reference count for this area. */
317 	int			refcnt;
318 	/* A flag indicating that this area has been pinned. */
319 	bool		pinned;
320 	/* The number of times that segments have been freed. */
321 	size_t		freed_segment_counter;
322 	/* The LWLock tranche ID. */
323 	int			lwlock_tranche_id;
324 	/* The general lock (protects everything except object pools). */
325 	LWLock		lock;
326 } dsa_area_control;
327 
328 /* Given a pointer to a pool, find a dsa_pointer. */
329 #define DsaAreaPoolToDsaPointer(area, p)	\
330 	DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
331 
332 /*
333  * A dsa_segment_map is stored within the backend-private memory of each
334  * individual backend.  It holds the base address of the segment within that
335  * backend, plus the addresses of key objects within the segment.  Those
336  * could instead be derived from the base address but it's handy to have them
337  * around.
338  */
339 typedef struct
340 {
341 	dsm_segment *segment;		/* DSM segment */
342 	char	   *mapped_address; /* Address at which segment is mapped */
343 	dsa_segment_header *header; /* Header (same as mapped_address) */
344 	FreePageManager *fpm;		/* Free page manager within segment. */
345 	dsa_pointer *pagemap;		/* Page map within segment. */
346 } dsa_segment_map;
347 
348 /*
349  * Per-backend state for a storage area.  Backends obtain one of these by
350  * creating an area or attaching to an existing one using a handle.  Each
351  * process that needs to use an area uses its own object to track where the
352  * segments are mapped.
353  */
354 struct dsa_area
355 {
356 	/* Pointer to the control object in shared memory. */
357 	dsa_area_control *control;
358 
359 	/* Has the mapping been pinned? */
360 	bool		mapping_pinned;
361 
362 	/*
363 	 * This backend's array of segment maps, ordered by segment index
364 	 * corresponding to control->segment_handles.  Some of the area's segments
365 	 * may not be mapped in this backend yet, and some slots may have been
366 	 * freed and need to be detached; these operations happen on demand.
367 	 */
368 	dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
369 
370 	/* The highest segment index this backend has ever mapped. */
371 	dsa_segment_index high_segment_index;
372 
373 	/* The last observed freed_segment_counter. */
374 	size_t		freed_segment_counter;
375 };
376 
377 #define DSA_SPAN_NOTHING_FREE	((uint16) -1)
378 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
379 
380 /* Given a pointer to a segment_map, obtain a segment index number. */
381 #define get_segment_index(area, segment_map_ptr) \
382 	(segment_map_ptr - &area->segment_maps[0])
383 
384 static void init_span(dsa_area *area, dsa_pointer span_pointer,
385 		  dsa_area_pool *pool, dsa_pointer start, size_t npages,
386 		  uint16 size_class);
387 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
388 					int fromclass, int toclass);
389 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
390 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
391 						 int size_class);
392 static dsa_segment_map *get_segment_by_index(dsa_area *area,
393 					 dsa_segment_index index);
394 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
395 static void unlink_span(dsa_area *area, dsa_area_span *span);
396 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
397 						   dsa_pointer span_pointer, int fclass);
398 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
399 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
400 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
401 static dsa_area *create_internal(void *place, size_t size,
402 				int tranche_id,
403 				dsm_handle control_handle,
404 				dsm_segment *control_segment);
405 static dsa_area *attach_internal(void *place, dsm_segment *segment,
406 				dsa_handle handle);
407 static void check_for_freed_segments(dsa_area *area);
408 static void check_for_freed_segments_locked(dsa_area *area);
409 
410 /*
411  * Create a new shared area in a new DSM segment.  Further DSM segments will
412  * be allocated as required to extend the available space.
413  *
414  * We can't allocate a LWLock tranche_id within this function, because tranche
415  * IDs are a scarce resource; there are only 64k available, using low numbers
416  * when possible matters, and we have no provision for recycling them.  So,
417  * we require the caller to provide one.
418  */
419 dsa_area *
420 dsa_create(int tranche_id)
421 {
422 	dsm_segment *segment;
423 	dsa_area   *area;
424 
425 	/*
426 	 * Create the DSM segment that will hold the shared control object and the
427 	 * first segment of usable space.
428 	 */
429 	segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
430 
431 	/*
432 	 * All segments backing this area are pinned, so that DSA can explicitly
433 	 * control their lifetime (otherwise a newly created segment belonging to
434 	 * this area might be freed when the only backend that happens to have it
435 	 * mapped in ends, corrupting the area).
436 	 */
437 	dsm_pin_segment(segment);
438 
439 	/* Create a new DSA area with the control object in this segment. */
440 	area = create_internal(dsm_segment_address(segment),
441 						   DSA_INITIAL_SEGMENT_SIZE,
442 						   tranche_id,
443 						   dsm_segment_handle(segment), segment);
444 
445 	/* Clean up when the control segment detaches. */
446 	on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
447 				  PointerGetDatum(dsm_segment_address(segment)));
448 
449 	return area;
450 }
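
/*
 * Editorial sketch (not part of the original module) of a typical dsa_create
 * call, since the comment above requires the caller to supply the LWLock
 * tranche id.  The tranche name "example_dsa" is invented for illustration.
 */
#ifdef NOT_USED
static dsa_area *
example_create_area(void)
{
	int			tranche_id = LWLockNewTrancheId();

	LWLockRegisterTranche(tranche_id, "example_dsa");
	return dsa_create(tranche_id);
}
#endif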
451 
452 /*
453  * Create a new shared area in an existing shared memory space, which may be
454  * either DSM or Postmaster-initialized memory.  DSM segments will be
455  * allocated as required to extend the available space, though that can be
456  * prevented with dsa_set_size_limit(area, size) using the same size provided
457  * to dsa_create_in_place.
458  *
459  * Areas created in-place must eventually be released by the backend that
460  * created them and all backends that attach to them.  This can be done
461  * explicitly with dsa_release_in_place, or, in the special case that 'place'
462  * happens to be in a pre-existing DSM segment, by passing in a pointer to the
463  * segment so that a detach hook can be registered with the containing DSM
464  * segment.
465  *
466  * See dsa_create() for a note about the tranche arguments.
467  */
468 dsa_area *
469 dsa_create_in_place(void *place, size_t size,
470 					int tranche_id, dsm_segment *segment)
471 {
472 	dsa_area   *area;
473 
474 	area = create_internal(place, size, tranche_id,
475 						   DSM_HANDLE_INVALID, NULL);
476 
477 	/*
478 	 * Clean up when the control segment detaches, if a containing DSM segment
479 	 * was provided.
480 	 */
481 	if (segment != NULL)
482 		on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
483 					  PointerGetDatum(place));
484 
485 	return area;
486 }
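
/*
 * Editorial sketch (not part of the original module): an in-place area carved
 * out of traditional shared memory, sized as dsa_minimum_size() plus some
 * working space and then capped with dsa_set_size_limit() so it never creates
 * additional DSM segments.  The extra 64kB and the tranche id are
 * illustrative assumptions.
 */
#ifdef NOT_USED
static dsa_area *
example_create_fixed_area(int tranche_id)
{
	size_t		size = dsa_minimum_size() + 64 * 1024;
	void	   *place = ShmemAlloc(size);
	dsa_area   *area;

	area = dsa_create_in_place(place, size, tranche_id, NULL);
	dsa_set_size_limit(area, size);		/* stay within the space provided */
	return area;
}
#endif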
487 
488 /*
489  * Obtain a handle that can be passed to other processes so that they can
490  * attach to the given area.  Cannot be called for areas created with
491  * dsa_create_in_place.
492  */
493 dsa_handle
494 dsa_get_handle(dsa_area *area)
495 {
496 	Assert(area->control->handle != DSM_HANDLE_INVALID);
497 	return area->control->handle;
498 }
499 
500 /*
501  * Attach to an area given a handle generated (possibly in another process) by
502  * dsa_get_handle.  The area must have been created with dsa_create (not
503  * dsa_create_in_place).
504  */
505 dsa_area *
506 dsa_attach(dsa_handle handle)
507 {
508 	dsm_segment *segment;
509 	dsa_area   *area;
510 
511 	/*
512 	 * An area handle is really a DSM segment handle for the first segment, so
513 	 * we go ahead and attach to that.
514 	 */
515 	segment = dsm_attach(handle);
516 	if (segment == NULL)
517 		ereport(ERROR,
518 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
519 				 errmsg("could not attach to dynamic shared area")));
520 
521 	area = attach_internal(dsm_segment_address(segment), segment, handle);
522 
523 	/* Clean up when the control segment detaches. */
524 	on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
525 				  PointerGetDatum(dsm_segment_address(segment)));
526 
527 	return area;
528 }
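
/*
 * Editorial sketch (not part of the original module) of the receiving side:
 * a backend attaches using a handle published by the creator (how the handle
 * is communicated is up to the caller) and pins the mapping so it survives
 * the current resource owner.
 */
#ifdef NOT_USED
static dsa_area *
example_attach_area(dsa_handle handle)
{
	dsa_area   *area = dsa_attach(handle);

	dsa_pin_mapping(area);		/* keep it mapped until backend exit */
	return area;
}
#endif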
529 
530 /*
531  * Attach to an area that was created with dsa_create_in_place.  The caller
532  * must somehow know the location in memory that was used when the area was
533  * created, though it may be mapped at a different virtual address in this
534  * process.
535  *
536  * See dsa_create_in_place for note about releasing in-place areas, and the
537  * optional 'segment' argument which can be provided to allow automatic
538  * release if the containing memory happens to be a DSM segment.
539  */
540 dsa_area *
541 dsa_attach_in_place(void *place, dsm_segment *segment)
542 {
543 	dsa_area   *area;
544 
545 	area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
546 
547 	/*
548 	 * Clean up when the control segment detaches, if a containing DSM segment
549 	 * was provided.
550 	 */
551 	if (segment != NULL)
552 		on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
553 					  PointerGetDatum(place));
554 
555 	return area;
556 }
557 
558 /*
559  * Release a DSA area that was produced by dsa_create_in_place or
560  * dsa_attach_in_place.  The 'segment' argument is ignored but provides an
561  * interface suitable for on_dsm_detach, for the convenience of users who want
562  * to create a DSA segment inside an existing DSM segment and have it
563  * automatically released when the containing DSM segment is detached.
564  * 'place' should be the address of the place where the area was created.
565  *
566  * This callback is automatically registered for the DSM segment containing
567  * the control object of in-place areas when a segment is provided to
568  * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
569  * with dsa_create.
570  */
571 void
572 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
573 {
574 	dsa_release_in_place(DatumGetPointer(place));
575 }
576 
577 /*
578  * Release a DSA area that was produced by dsa_create_in_place or
579  * dsa_attach_in_place.  The 'code' argument is ignored but provides an
580  * interface suitable for on_shmem_exit or before_shmem_exit, for the
581  * convenience of users who want to create a DSA segment inside shared memory
582  * other than a DSM segment and have it automatically release at backend exit.
583  * 'place' should be the address of the place where the area was created.
584  */
585 void
586 dsa_on_shmem_exit_release_in_place(int code, Datum place)
587 {
588 	dsa_release_in_place(DatumGetPointer(place));
589 }
590 
591 /*
592  * Release a DSA area that was produced by dsa_create_in_place or
593  * dsa_attach_in_place.  It is preferable to use one of the 'dsa_on_XXX'
594  * callbacks so that this is managed automatically, because failure to release
595  * an area created in-place leaks its segments permanently.
596  *
597  * This is also called automatically for areas produced by dsa_create or
598  * dsa_attach as an implementation detail.
599  */
600 void
601 dsa_release_in_place(void *place)
602 {
603 	dsa_area_control *control = (dsa_area_control *) place;
604 	int			i;
605 
606 	LWLockAcquire(&control->lock, LW_EXCLUSIVE);
607 	Assert(control->segment_header.magic ==
608 		   (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
609 	Assert(control->refcnt > 0);
610 	if (--control->refcnt == 0)
611 	{
612 		for (i = 0; i <= control->high_segment_index; ++i)
613 		{
614 			dsm_handle	handle;
615 
616 			handle = control->segment_handles[i];
617 			if (handle != DSM_HANDLE_INVALID)
618 				dsm_unpin_segment(handle);
619 		}
620 	}
621 	LWLockRelease(&control->lock);
622 }
623 
624 /*
625  * Keep a DSA area attached until end of session or explicit detach.
626  *
627  * By default, areas are owned by the current resource owner, which means they
628  * are detached automatically when that scope ends.
629  */
630 void
631 dsa_pin_mapping(dsa_area *area)
632 {
633 	int			i;
634 
635 	Assert(!area->mapping_pinned);
636 	area->mapping_pinned = true;
637 
638 	for (i = 0; i <= area->high_segment_index; ++i)
639 		if (area->segment_maps[i].segment != NULL)
640 			dsm_pin_mapping(area->segment_maps[i].segment);
641 }
642 
643 /*
644  * Allocate memory in this storage area.  The return value is a dsa_pointer
645  * that can be passed to other processes, and converted to a local pointer
646  * with dsa_get_address.  'flags' is a bitmap which should be constructed
647  * from the following values:
648  *
649  * DSA_ALLOC_HUGE allows allocations >= 1GB.  Otherwise, such allocations
650  * will result in an ERROR.
651  *
652  * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
653  * no memory is available or a size limit established by dsa_set_size_limit
654  * would be exceeded.  Otherwise, such allocations will result in an ERROR.
655  *
656  * DSA_ALLOC_ZERO causes the allocated memory to be zeroed.  Otherwise, the
657  * contents of newly-allocated memory are indeterminate.
658  *
659  * These flags correspond to similarly named flags used by
660  * MemoryContextAllocExtended().  See also the macros dsa_allocate and
661  * dsa_allocate0 which expand to a call to this function with commonly used
662  * flags.
663  */
664 dsa_pointer
665 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
666 {
667 	uint16		size_class;
668 	dsa_pointer start_pointer;
669 	dsa_segment_map *segment_map;
670 	dsa_pointer result;
671 
672 	Assert(size > 0);
673 
674 	/* Sanity check on huge individual allocation size. */
675 	if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
676 		((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
677 		elog(ERROR, "invalid DSA memory alloc request size %zu", size);
678 
679 	/*
680 	 * If bigger than the largest size class, just grab a run of pages from
681 	 * the free page manager, instead of allocating an object from a pool.
682 	 * There will still be a span, but it's a special class of span that
683 	 * manages this whole allocation and simply gives all pages back to the
684 	 * free page manager when dsa_free is called.
685 	 */
686 	if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
687 	{
688 		size_t		npages = fpm_size_to_pages(size);
689 		size_t		first_page;
690 		dsa_pointer span_pointer;
691 		dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
692 
693 		/* Obtain a span object. */
694 		span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
695 		if (!DsaPointerIsValid(span_pointer))
696 		{
697 			/* Raise error unless asked not to. */
698 			if ((flags & DSA_ALLOC_NO_OOM) == 0)
699 				ereport(ERROR,
700 						(errcode(ERRCODE_OUT_OF_MEMORY),
701 						 errmsg("out of memory"),
702 						 errdetail("Failed on DSA request of size %zu.",
703 								   size)));
704 			return InvalidDsaPointer;
705 		}
706 
707 		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
708 
709 		/* Find a segment from which to allocate. */
710 		segment_map = get_best_segment(area, npages);
711 		if (segment_map == NULL)
712 			segment_map = make_new_segment(area, npages);
713 		if (segment_map == NULL)
714 		{
715 			/* Can't make any more segments: game over. */
716 			LWLockRelease(DSA_AREA_LOCK(area));
717 			dsa_free(area, span_pointer);
718 
719 			/* Raise error unless asked not to. */
720 			if ((flags & DSA_ALLOC_NO_OOM) == 0)
721 				ereport(ERROR,
722 						(errcode(ERRCODE_OUT_OF_MEMORY),
723 						 errmsg("out of memory"),
724 						 errdetail("Failed on DSA request of size %zu.",
725 								   size)));
726 			return InvalidDsaPointer;
727 		}
728 
729 		/*
730 		 * Ask the free page manager for a run of pages.  This should always
731 		 * succeed, since both get_best_segment and make_new_segment should
732 		 * only return a non-NULL pointer if it actually contains enough
733 		 * contiguous freespace.  If it does fail, something in our backend
734 		 * private state is out of whack, so use FATAL to kill the process.
735 		 */
736 		if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
737 			elog(FATAL,
738 				 "dsa_allocate could not find %zu free pages", npages);
739 		LWLockRelease(DSA_AREA_LOCK(area));
740 
741 		start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
742 										 first_page * FPM_PAGE_SIZE);
743 
744 		/* Initialize span and pagemap. */
745 		LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
746 					  LW_EXCLUSIVE);
747 		init_span(area, span_pointer, pool, start_pointer, npages,
748 				  DSA_SCLASS_SPAN_LARGE);
749 		segment_map->pagemap[first_page] = span_pointer;
750 		LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
751 
752 		/* Zero-initialize the memory if requested. */
753 		if ((flags & DSA_ALLOC_ZERO) != 0)
754 			memset(dsa_get_address(area, start_pointer), 0, size);
755 
756 		return start_pointer;
757 	}
758 
759 	/* Map allocation to a size class. */
760 	if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
761 	{
762 		int			mapidx;
763 
764 		/* For smaller sizes we have a lookup table... */
765 		mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
766 				  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
767 		size_class = dsa_size_class_map[mapidx];
768 	}
769 	else
770 	{
771 		uint16		min;
772 		uint16		max;
773 
774 		/* ... and for the rest we search by binary chop. */
775 		min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
776 		max = lengthof(dsa_size_classes) - 1;
777 
778 		while (min < max)
779 		{
780 			uint16		mid = (min + max) / 2;
781 			uint16		class_size = dsa_size_classes[mid];
782 
783 			if (class_size < size)
784 				min = mid + 1;
785 			else
786 				max = mid;
787 		}
788 
789 		size_class = min;
790 	}
791 	Assert(size <= dsa_size_classes[size_class]);
792 	Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
793 
794 	/* Attempt to allocate an object from the appropriate pool. */
795 	result = alloc_object(area, size_class);
796 
797 	/* Check for failure to allocate. */
798 	if (!DsaPointerIsValid(result))
799 	{
800 		/* Raise error unless asked not to. */
801 		if ((flags & DSA_ALLOC_NO_OOM) == 0)
802 			ereport(ERROR,
803 					(errcode(ERRCODE_OUT_OF_MEMORY),
804 					 errmsg("out of memory"),
805 					 errdetail("Failed on DSA request of size %zu.", size)));
806 		return InvalidDsaPointer;
807 	}
808 
809 	/* Zero-initialize the memory if requested. */
810 	if ((flags & DSA_ALLOC_ZERO) != 0)
811 		memset(dsa_get_address(area, result), 0, size);
812 
813 	return result;
814 }
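
/*
 * Editorial sketch (not part of the original module) combining the flags
 * described above: a zeroed allocation that reports failure by returning
 * InvalidDsaPointer rather than raising an error.  The wrapper function is
 * invented for illustration.
 */
#ifdef NOT_USED
static dsa_pointer
example_allocate_zeroed(dsa_area *area, size_t size)
{
	dsa_pointer dp;

	dp = dsa_allocate_extended(area, size,
							   DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
	if (!DsaPointerIsValid(dp))
		elog(LOG, "DSA allocation of %zu bytes failed", size);
	return dp;
}
#endif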
815 
816 /*
817  * Free memory obtained with dsa_allocate.
818  */
819 void
820 dsa_free(dsa_area *area, dsa_pointer dp)
821 {
822 	dsa_segment_map *segment_map;
823 	int			pageno;
824 	dsa_pointer span_pointer;
825 	dsa_area_span *span;
826 	char	   *superblock;
827 	char	   *object;
828 	size_t		size;
829 	int			size_class;
830 
831 	/* Make sure we don't have a stale segment in the slot 'dp' refers to. */
832 	check_for_freed_segments(area);
833 
834 	/* Locate the object, span and pool. */
835 	segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
836 	pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
837 	span_pointer = segment_map->pagemap[pageno];
838 	span = dsa_get_address(area, span_pointer);
839 	superblock = dsa_get_address(area, span->start);
840 	object = dsa_get_address(area, dp);
841 	size_class = span->size_class;
842 	size = dsa_size_classes[size_class];
843 
844 	/*
845 	 * Special case for large objects that live in a special span: we return
846 	 * those pages directly to the free page manager and free the span.
847 	 */
848 	if (span->size_class == DSA_SCLASS_SPAN_LARGE)
849 	{
850 
851 #ifdef CLOBBER_FREED_MEMORY
852 		memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
853 #endif
854 
855 		/* Give pages back to free page manager. */
856 		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
857 		FreePageManagerPut(segment_map->fpm,
858 						   DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
859 						   span->npages);
860 		LWLockRelease(DSA_AREA_LOCK(area));
861 		/* Unlink span. */
862 		LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
863 					  LW_EXCLUSIVE);
864 		unlink_span(area, span);
865 		LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
866 		/* Free the span object so it can be reused. */
867 		dsa_free(area, span_pointer);
868 		return;
869 	}
870 
871 #ifdef CLOBBER_FREED_MEMORY
872 	memset(object, 0x7f, size);
873 #endif
874 
875 	LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
876 
877 	/* Put the object on the span's freelist. */
878 	Assert(object >= superblock);
879 	Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
880 	Assert((object - superblock) % size == 0);
881 	NextFreeObjectIndex(object) = span->firstfree;
882 	span->firstfree = (object - superblock) / size;
883 	++span->nallocatable;
884 
885 	/*
886 	 * See if the span needs to be moved to a different fullness class, or be
887 	 * freed so its pages can be given back to the segment.
888 	 */
889 	if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
890 	{
891 		/*
892 		 * The block was completely full and is located in the
893 		 * highest-numbered fullness class, which is never scanned for free
894 		 * chunks.  We must move it to the next-lower fullness class.
895 		 */
896 		unlink_span(area, span);
897 		add_span_to_fullness_class(area, span, span_pointer,
898 								   DSA_FULLNESS_CLASSES - 2);
899 
900 		/*
901 		 * If this is the only span, and there is no active span, then we
902 		 * should probably move this span to fullness class 1.  (Otherwise if
903 		 * you allocate exactly all the objects in the only span, it moves to
904 		 * class 3, then you free them all, it moves to 2, and then is given
905 		 * back, leaving no active span).
906 		 */
907 	}
908 	else if (span->nallocatable == span->nmax &&
909 			 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
910 	{
911 		/*
912 		 * This entire block is free, and it's not the active block for this
913 		 * size class.  Return the memory to the free page manager. We don't
914 		 * do this for the active block to prevent hysteresis: if we
915 		 * repeatedly allocate and free the only chunk in the active block, it
916 		 * will be very inefficient if we deallocate and reallocate the block
917 		 * every time.
918 		 */
919 		destroy_superblock(area, span_pointer);
920 	}
921 
922 	LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
923 }
924 
925 /*
926  * Obtain a backend-local address for a dsa_pointer.  'dp' must point to
927  * memory allocated by the given area (possibly in another process) that
928  * hasn't yet been freed.  This may cause a segment to be mapped into the
929  * current process if required, and may cause freed segments to be unmapped.
930  */
931 void *
932 dsa_get_address(dsa_area *area, dsa_pointer dp)
933 {
934 	dsa_segment_index index;
935 	size_t		offset;
936 
937 	/* Convert InvalidDsaPointer to NULL. */
938 	if (!DsaPointerIsValid(dp))
939 		return NULL;
940 
941 	/* Process any requests to detach from freed segments. */
942 	check_for_freed_segments(area);
943 
944 	/* Break the dsa_pointer into its components. */
945 	index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
946 	offset = DSA_EXTRACT_OFFSET(dp);
947 	Assert(index < DSA_MAX_SEGMENTS);
948 
949 	/* Check if we need to cause this segment to be mapped in. */
950 	if (unlikely(area->segment_maps[index].mapped_address == NULL))
951 	{
952 		/* Call for effect (we don't need the result). */
953 		get_segment_by_index(area, index);
954 	}
955 
956 	return area->segment_maps[index].mapped_address + offset;
957 }
958 
959 /*
960  * Pin this area, so that it will continue to exist even if all backends
961  * detach from it.  In that case, the area can still be reattached to if a
962  * handle has been recorded somewhere.
963  */
964 void
965 dsa_pin(dsa_area *area)
966 {
967 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
968 	if (area->control->pinned)
969 	{
970 		LWLockRelease(DSA_AREA_LOCK(area));
971 		elog(ERROR, "dsa_area already pinned");
972 	}
973 	area->control->pinned = true;
974 	++area->control->refcnt;
975 	LWLockRelease(DSA_AREA_LOCK(area));
976 }
977 
978 /*
979  * Undo the effects of dsa_pin, so that the given area can be freed when no
980  * backends are attached to it.  May be called only if dsa_pin has been
981  * called.
982  */
983 void
984 dsa_unpin(dsa_area *area)
985 {
986 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
987 	Assert(area->control->refcnt > 1);
988 	if (!area->control->pinned)
989 	{
990 		LWLockRelease(DSA_AREA_LOCK(area));
991 		elog(ERROR, "dsa_area not pinned");
992 	}
993 	area->control->pinned = false;
994 	--area->control->refcnt;
995 	LWLockRelease(DSA_AREA_LOCK(area));
996 }
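
/*
 * Editorial sketch (not part of the original module): dsa_pin is typically
 * paired with dsa_get_handle so that an area can outlive its creator and be
 * reattached later via dsa_attach.  Where the handle is stored is up to the
 * caller.
 */
#ifdef NOT_USED
static dsa_handle
example_publish_area(dsa_area *area)
{
	dsa_pin(area);				/* keep the area alive with zero attachments */
	return dsa_get_handle(area);	/* store somewhere for later dsa_attach */
}
#endif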
997 
998 /*
999  * Set the total size limit for this area.  This limit is checked whenever new
1000  * segments need to be allocated from the operating system.  If the new size
1001  * limit is already exceeded, this has no immediate effect.
1002  *
1003  * Note that the total virtual memory usage may be temporarily larger than
1004  * this limit when segments have been freed, but not yet detached by all
1005  * backends that have attached to them.
1006  */
1007 void
1008 dsa_set_size_limit(dsa_area *area, size_t limit)
1009 {
1010 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1011 	area->control->max_total_segment_size = limit;
1012 	LWLockRelease(DSA_AREA_LOCK(area));
1013 }
1014 
1015 /*
1016  * Aggressively free all spare memory in the hope of returning DSM segments to
1017  * the operating system.
1018  */
1019 void
1020 dsa_trim(dsa_area *area)
1021 {
1022 	int			size_class;
1023 
1024 	/*
1025 	 * Trim in reverse pool order so we get to the spans-of-spans last, just
1026 	 * in case any become entirely free while processing all the other pools.
1027 	 */
1028 	for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1029 	{
1030 		dsa_area_pool *pool = &area->control->pools[size_class];
1031 		dsa_pointer span_pointer;
1032 
1033 		if (size_class == DSA_SCLASS_SPAN_LARGE)
1034 		{
1035 			/* Large object frees give back segments aggressively already. */
1036 			continue;
1037 		}
1038 
1039 		/*
1040 		 * Search fullness class 1 only.  That is where we expect to find an
1041 		 * entirely empty superblock (entirely empty superblocks in other
1042 		 * fullness classes are returned to the free page map by dsa_free).
1043 		 */
1044 		LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1045 		span_pointer = pool->spans[1];
1046 		while (DsaPointerIsValid(span_pointer))
1047 		{
1048 			dsa_area_span *span = dsa_get_address(area, span_pointer);
1049 			dsa_pointer next = span->nextspan;
1050 
1051 			if (span->nallocatable == span->nmax)
1052 				destroy_superblock(area, span_pointer);
1053 
1054 			span_pointer = next;
1055 		}
1056 		LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1057 	}
1058 }
1059 
1060 /*
1061  * Print out debugging information about the internal state of the shared
1062  * memory area.
1063  */
1064 void
1065 dsa_dump(dsa_area *area)
1066 {
1067 	size_t		i,
1068 				j;
1069 
1070 	/*
1071 	 * Note: This gives an inconsistent snapshot as it acquires and releases
1072 	 * individual locks as it goes...
1073 	 */
1074 
1075 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1076 	check_for_freed_segments_locked(area);
1077 	fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1078 	fprintf(stderr, "  max_total_segment_size: %zu\n",
1079 			area->control->max_total_segment_size);
1080 	fprintf(stderr, "  total_segment_size: %zu\n",
1081 			area->control->total_segment_size);
1082 	fprintf(stderr, "  refcnt: %d\n", area->control->refcnt);
1083 	fprintf(stderr, "  pinned: %c\n", area->control->pinned ? 't' : 'f');
1084 	fprintf(stderr, "  segment bins:\n");
1085 	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1086 	{
1087 		if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1088 		{
1089 			dsa_segment_index segment_index;
1090 
1091 			fprintf(stderr,
1092 					"    segment bin %zu (at least %d contiguous pages free):\n",
1093 					i, 1 << (i - 1));
1094 			segment_index = area->control->segment_bins[i];
1095 			while (segment_index != DSA_SEGMENT_INDEX_NONE)
1096 			{
1097 				dsa_segment_map *segment_map;
1098 
1099 				segment_map =
1100 					get_segment_by_index(area, segment_index);
1101 
1102 				fprintf(stderr,
1103 						"      segment index %zu, usable_pages = %zu, "
1104 						"contiguous_pages = %zu, mapped at %p\n",
1105 						segment_index,
1106 						segment_map->header->usable_pages,
1107 						fpm_largest(segment_map->fpm),
1108 						segment_map->mapped_address);
1109 				segment_index = segment_map->header->next;
1110 			}
1111 		}
1112 	}
1113 	LWLockRelease(DSA_AREA_LOCK(area));
1114 
1115 	fprintf(stderr, "  pools:\n");
1116 	for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1117 	{
1118 		bool		found = false;
1119 
1120 		LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1121 		for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1122 			if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1123 				found = true;
1124 		if (found)
1125 		{
1126 			if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1127 				fprintf(stderr, "    pool for blocks of span objects:\n");
1128 			else if (i == DSA_SCLASS_SPAN_LARGE)
1129 				fprintf(stderr, "    pool for large object spans:\n");
1130 			else
1131 				fprintf(stderr,
1132 						"    pool for size class %zu (object size %hu bytes):\n",
1133 						i, dsa_size_classes[i]);
1134 			for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1135 			{
1136 				if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1137 					fprintf(stderr, "      fullness class %zu is empty\n", j);
1138 				else
1139 				{
1140 					dsa_pointer span_pointer = area->control->pools[i].spans[j];
1141 
1142 					fprintf(stderr, "      fullness class %zu:\n", j);
1143 					while (DsaPointerIsValid(span_pointer))
1144 					{
1145 						dsa_area_span *span;
1146 
1147 						span = dsa_get_address(area, span_pointer);
1148 						fprintf(stderr,
1149 								"        span descriptor at "
1150 								DSA_POINTER_FORMAT ", superblock at "
1151 								DSA_POINTER_FORMAT
1152 								", pages = %zu, objects free = %hu/%hu\n",
1153 								span_pointer, span->start, span->npages,
1154 								span->nallocatable, span->nmax);
1155 						span_pointer = span->nextspan;
1156 					}
1157 				}
1158 			}
1159 		}
1160 		LWLockRelease(DSA_SCLASS_LOCK(area, i));
1161 	}
1162 }
1163 
1164 /*
1165  * Return the smallest size that you can successfully provide to
1166  * dsa_create_in_place.
1167  */
1168 size_t
1169 dsa_minimum_size(void)
1170 {
1171 	size_t		size;
1172 	int			pages = 0;
1173 
1174 	size = MAXALIGN(sizeof(dsa_area_control)) +
1175 		MAXALIGN(sizeof(FreePageManager));
1176 
1177 	/* Figure out how many pages we need, including the page map... */
1178 	while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1179 	{
1180 		++pages;
1181 		size += sizeof(dsa_pointer);
1182 	}
1183 
1184 	return pages * FPM_PAGE_SIZE;
1185 }
1186 
1187 /*
1188  * Workhorse function for dsa_create and dsa_create_in_place.
1189  */
1190 static dsa_area *
1191 create_internal(void *place, size_t size,
1192 				int tranche_id,
1193 				dsm_handle control_handle,
1194 				dsm_segment *control_segment)
1195 {
1196 	dsa_area_control *control;
1197 	dsa_area   *area;
1198 	dsa_segment_map *segment_map;
1199 	size_t		usable_pages;
1200 	size_t		total_pages;
1201 	size_t		metadata_bytes;
1202 	int			i;
1203 
1204 	/* Sanity check on the space we have to work in. */
1205 	if (size < dsa_minimum_size())
1206 		elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1207 			 dsa_minimum_size(), size);
1208 
1209 	/* Now figure out how much space is usable */
1210 	total_pages = size / FPM_PAGE_SIZE;
1211 	metadata_bytes =
1212 		MAXALIGN(sizeof(dsa_area_control)) +
1213 		MAXALIGN(sizeof(FreePageManager)) +
1214 		total_pages * sizeof(dsa_pointer);
1215 	/* Add padding up to next page boundary. */
1216 	if (metadata_bytes % FPM_PAGE_SIZE != 0)
1217 		metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1218 	Assert(metadata_bytes <= size);
1219 	usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1220 
1221 	/*
1222 	 * Initialize the dsa_area_control object located at the start of the
1223 	 * space.
1224 	 */
1225 	control = (dsa_area_control *) place;
1226 	control->segment_header.magic =
1227 		DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1228 	control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1229 	control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1230 	control->segment_header.usable_pages = usable_pages;
1231 	control->segment_header.freed = false;
1232 	control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
1233 	control->handle = control_handle;
1234 	control->max_total_segment_size = (size_t) -1;
1235 	control->total_segment_size = size;
1236 	memset(&control->segment_handles[0], 0,
1237 		   sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
1238 	control->segment_handles[0] = control_handle;
1239 	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1240 		control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1241 	control->high_segment_index = 0;
1242 	control->refcnt = 1;
1243 	control->freed_segment_counter = 0;
1244 	control->lwlock_tranche_id = tranche_id;
1245 
1246 	/*
1247 	 * Create the dsa_area object that this backend will use to access the
1248 	 * area.  Other backends will need to obtain their own dsa_area object by
1249 	 * attaching.
1250 	 */
1251 	area = palloc(sizeof(dsa_area));
1252 	area->control = control;
1253 	area->mapping_pinned = false;
1254 	memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1255 	area->high_segment_index = 0;
1256 	area->freed_segment_counter = 0;
1257 	LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1258 	for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1259 		LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1260 						 control->lwlock_tranche_id);
1261 
1262 	/* Set up the segment map for this process's mapping. */
1263 	segment_map = &area->segment_maps[0];
1264 	segment_map->segment = control_segment;
1265 	segment_map->mapped_address = place;
1266 	segment_map->header = (dsa_segment_header *) place;
1267 	segment_map->fpm = (FreePageManager *)
1268 		(segment_map->mapped_address +
1269 		 MAXALIGN(sizeof(dsa_area_control)));
1270 	segment_map->pagemap = (dsa_pointer *)
1271 		(segment_map->mapped_address +
1272 		 MAXALIGN(sizeof(dsa_area_control)) +
1273 		 MAXALIGN(sizeof(FreePageManager)));
1274 
1275 	/* Set up the free page map. */
1276 	FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1277 	/* There can be 0 usable pages if size is dsa_minimum_size(). */
1278 
1279 	if (usable_pages > 0)
1280 		FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1281 						   usable_pages);
1282 
1283 	/* Put this segment into the appropriate bin. */
1284 	control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1285 	segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1286 
1287 	return area;
1288 }
1289 
1290 /*
1291  * Workhorse function for dsa_attach and dsa_attach_in_place.
1292  */
1293 static dsa_area *
1294 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1295 {
1296 	dsa_area_control *control;
1297 	dsa_area   *area;
1298 	dsa_segment_map *segment_map;
1299 
1300 	control = (dsa_area_control *) place;
1301 	Assert(control->handle == handle);
1302 	Assert(control->segment_handles[0] == handle);
1303 	Assert(control->segment_header.magic ==
1304 		   (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1305 
1306 	/* Build the backend-local area object. */
1307 	area = palloc(sizeof(dsa_area));
1308 	area->control = control;
1309 	area->mapping_pinned = false;
1310 	memset(&area->segment_maps[0], 0,
1311 		   sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1312 	area->high_segment_index = 0;
1313 
1314 	/* Set up the segment map for this process's mapping. */
1315 	segment_map = &area->segment_maps[0];
1316 	segment_map->segment = segment; /* NULL for in-place */
1317 	segment_map->mapped_address = place;
1318 	segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1319 	segment_map->fpm = (FreePageManager *)
1320 		(segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1321 	segment_map->pagemap = (dsa_pointer *)
1322 		(segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1323 		 MAXALIGN(sizeof(FreePageManager)));
1324 
1325 	/* Bump the reference count. */
1326 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1327 	if (control->refcnt == 0)
1328 	{
1329 		/* We can't attach to a DSA area that has already been destroyed. */
1330 		ereport(ERROR,
1331 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1332 				 errmsg("could not attach to dynamic shared area")));
1333 	}
1334 	++control->refcnt;
1335 	area->freed_segment_counter = area->control->freed_segment_counter;
1336 	LWLockRelease(DSA_AREA_LOCK(area));
1337 
1338 	return area;
1339 }
1340 
1341 /*
1342  * Add a new span to fullness class 1 of the indicated pool.
1343  */
1344 static void
1345 init_span(dsa_area *area,
1346 		  dsa_pointer span_pointer,
1347 		  dsa_area_pool *pool, dsa_pointer start, size_t npages,
1348 		  uint16 size_class)
1349 {
1350 	dsa_area_span *span = dsa_get_address(area, span_pointer);
1351 	size_t		obsize = dsa_size_classes[size_class];
1352 
1353 	/*
1354 	 * The per-pool lock must be held because we manipulate the span list for
1355 	 * this pool.
1356 	 */
1357 	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1358 
1359 	/* Push this span onto the front of the span list for fullness class 1. */
1360 	if (DsaPointerIsValid(pool->spans[1]))
1361 	{
1362 		dsa_area_span *head = (dsa_area_span *)
1363 		dsa_get_address(area, pool->spans[1]);
1364 
1365 		head->prevspan = span_pointer;
1366 	}
1367 	span->pool = DsaAreaPoolToDsaPointer(area, pool);
1368 	span->nextspan = pool->spans[1];
1369 	span->prevspan = InvalidDsaPointer;
1370 	pool->spans[1] = span_pointer;
1371 
1372 	span->start = start;
1373 	span->npages = npages;
1374 	span->size_class = size_class;
1375 	span->ninitialized = 0;
1376 	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1377 	{
1378 		/*
1379 		 * A block-of-spans contains its own descriptor, so mark one object as
1380 		 * initialized and reduce the count of allocatable objects by one.
1381 		 * Doing this here has the side effect of also reducing nmax by one,
1382 		 * which is important to make sure we free this object at the correct
1383 		 * time.
1384 		 */
1385 		span->ninitialized = 1;
1386 		span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1387 	}
1388 	else if (size_class != DSA_SCLASS_SPAN_LARGE)
1389 		span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1390 	span->firstfree = DSA_SPAN_NOTHING_FREE;
1391 	span->nmax = span->nallocatable;
1392 	span->fclass = 1;
1393 }
1394 
1395 /*
1396  * Transfer the first span in one fullness class to the head of another
1397  * fullness class.
1398  */
1399 static bool
1400 transfer_first_span(dsa_area *area,
1401 					dsa_area_pool *pool, int fromclass, int toclass)
1402 {
1403 	dsa_pointer span_pointer;
1404 	dsa_area_span *span;
1405 	dsa_area_span *nextspan;
1406 
1407 	/* Can't do it if source list is empty. */
1408 	span_pointer = pool->spans[fromclass];
1409 	if (!DsaPointerIsValid(span_pointer))
1410 		return false;
1411 
1412 	/* Remove span from head of source list. */
1413 	span = dsa_get_address(area, span_pointer);
1414 	pool->spans[fromclass] = span->nextspan;
1415 	if (DsaPointerIsValid(span->nextspan))
1416 	{
1417 		nextspan = (dsa_area_span *)
1418 			dsa_get_address(area, span->nextspan);
1419 		nextspan->prevspan = InvalidDsaPointer;
1420 	}
1421 
1422 	/* Add span to head of target list. */
1423 	span->nextspan = pool->spans[toclass];
1424 	pool->spans[toclass] = span_pointer;
1425 	if (DsaPointerIsValid(span->nextspan))
1426 	{
1427 		nextspan = (dsa_area_span *)
1428 			dsa_get_address(area, span->nextspan);
1429 		nextspan->prevspan = span_pointer;
1430 	}
1431 	span->fclass = toclass;
1432 
1433 	return true;
1434 }
1435 
1436 /*
1437  * Allocate one object of the requested size class from the given area.
1438  */
1439 static inline dsa_pointer
1440 alloc_object(dsa_area *area, int size_class)
1441 {
1442 	dsa_area_pool *pool = &area->control->pools[size_class];
1443 	dsa_area_span *span;
1444 	dsa_pointer block;
1445 	dsa_pointer result;
1446 	char	   *object;
1447 	size_t		size;
1448 
1449 	/*
1450 	 * Even though ensure_active_superblock can in turn call alloc_object if
1451 	 * it needs to allocate a new span, that's always from a different pool,
1452 	 * and the order of lock acquisition is always the same, so it's OK that
1453 	 * we hold this lock for the duration of this function.
1454 	 */
1455 	Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1456 	LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1457 
1458 	/*
1459 	 * If there's no active superblock, we must successfully obtain one or
1460 	 * fail the request.
1461 	 */
1462 	if (!DsaPointerIsValid(pool->spans[1]) &&
1463 		!ensure_active_superblock(area, pool, size_class))
1464 	{
1465 		result = InvalidDsaPointer;
1466 	}
1467 	else
1468 	{
1469 		/*
1470 		 * There should be a block in fullness class 1 at this point, and it
1471 		 * should never be completely full.  Thus we can either pop an object
1472 		 * from the free list or, failing that, initialize a new object.
1473 		 */
1474 		Assert(DsaPointerIsValid(pool->spans[1]));
1475 		span = (dsa_area_span *)
1476 			dsa_get_address(area, pool->spans[1]);
1477 		Assert(span->nallocatable > 0);
1478 		block = span->start;
1479 		Assert(size_class < DSA_NUM_SIZE_CLASSES);
1480 		size = dsa_size_classes[size_class];
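		/*
		 * Free objects in the span are chained through the objects
		 * themselves: the first bytes of each free object hold the index of
		 * the next free one (see NextFreeObjectIndex), with
		 * DSA_SPAN_NOTHING_FREE marking the end of the chain.  If the chain
		 * is empty, we hand out the next never-initialized object instead.
		 */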
1481 		if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1482 		{
1483 			result = block + span->firstfree * size;
1484 			object = dsa_get_address(area, result);
1485 			span->firstfree = NextFreeObjectIndex(object);
1486 		}
1487 		else
1488 		{
1489 			result = block + span->ninitialized * size;
1490 			++span->ninitialized;
1491 		}
1492 		--span->nallocatable;
1493 
1494 		/* If it's now full, move it to the highest-numbered fullness class. */
1495 		if (span->nallocatable == 0)
1496 			transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1497 	}
1498 
1499 	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1500 	LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1501 
1502 	return result;
1503 }
1504 
1505 /*
1506  * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1507  * superblocks are completely full and no more can be allocated.
1508  *
1509  * Fullness classes K of 0..N are loosely intended to represent blocks whose
1510  * utilization percentage is at least K/N, but we only enforce this rigorously
1511  * for the highest-numbered fullness class, which always contains exactly
1512  * those blocks that are completely full.  It's otherwise acceptable for a
1513  * block to be in a higher-numbered fullness class than the one to which it
1514  * logically belongs.  In addition, the active block, which is always the
1515  * first block in fullness class 1, is permitted to have a higher allocation
1516  * percentage than would normally be allowable for that fullness class; we
1517  * don't move it until it's completely full, and then it goes to the
1518  * highest-numbered fullness class.
1519  *
1520  * It might seem odd that the active block is the head of fullness class 1
1521  * rather than fullness class 0, but experience with other allocators has
1522  * shown that it's usually better to allocate from a block that's moderately
1523  * full rather than one that's nearly empty.  Insofar as is reasonably
1524  * possible, we want to avoid performing new allocations in a block that would
1525  * otherwise become empty soon.
1526  */
1527 static bool
1528 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1529 						 int size_class)
1530 {
1531 	dsa_pointer span_pointer;
1532 	dsa_pointer start_pointer;
1533 	size_t		obsize = dsa_size_classes[size_class];
1534 	size_t		nmax;
1535 	int			fclass;
1536 	size_t		npages = 1;
1537 	size_t		first_page;
1538 	size_t		i;
1539 	dsa_segment_map *segment_map;
1540 
1541 	Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1542 
1543 	/*
1544 	 * Compute the number of objects that will fit in a block of this size
1545 	 * class.  Span-of-spans blocks are just a single page, and the first
1546 	 * object isn't available for use because it describes the block-of-spans
1547 	 * itself.
1548 	 */
1549 	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1550 		nmax = FPM_PAGE_SIZE / obsize - 1;
1551 	else
1552 		nmax = DSA_SUPERBLOCK_SIZE / obsize;
1553 
1554 	/*
1555 	 * If fullness class 1 is empty, try to find a span to put in it by
1556 	 * scanning higher-numbered fullness classes (excluding the last one,
1557 	 * whose blocks are certain to all be completely full).
1558 	 */
1559 	for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1560 	{
1561 		span_pointer = pool->spans[fclass];
1562 
1563 		while (DsaPointerIsValid(span_pointer))
1564 		{
1565 			int			tfclass;
1566 			dsa_area_span *span;
1567 			dsa_area_span *nextspan;
1568 			dsa_area_span *prevspan;
1569 			dsa_pointer next_span_pointer;
1570 
1571 			span = (dsa_area_span *)
1572 				dsa_get_address(area, span_pointer);
1573 			next_span_pointer = span->nextspan;
1574 
1575 			/* Figure out what fullness class should contain this span. */
1576 			tfclass = (nmax - span->nallocatable)
1577 				* (DSA_FULLNESS_CLASSES - 1) / nmax;
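			/*
			 * The expression above maps utilization onto the classes
			 * linearly; for example, if DSA_FULLNESS_CLASSES were 4, a span
			 * that is less than one third full would map to class 0 and a
			 * completely full span to class 3, the highest-numbered class.
			 */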
1578 
1579 			/* Look up next span. */
1580 			if (DsaPointerIsValid(span->nextspan))
1581 				nextspan = (dsa_area_span *)
1582 					dsa_get_address(area, span->nextspan);
1583 			else
1584 				nextspan = NULL;
1585 
1586 			/*
1587 			 * If utilization has dropped enough that this now belongs in some
1588 			 * other fullness class, move it there.
1589 			 */
1590 			if (tfclass < fclass)
1591 			{
1592 				/* Remove from the current fullness class list. */
1593 				if (pool->spans[fclass] == span_pointer)
1594 				{
1595 					/* It was the head; remove it. */
1596 					Assert(!DsaPointerIsValid(span->prevspan));
1597 					pool->spans[fclass] = span->nextspan;
1598 					if (nextspan != NULL)
1599 						nextspan->prevspan = InvalidDsaPointer;
1600 				}
1601 				else
1602 				{
1603 					/* It was not the head. */
1604 					Assert(DsaPointerIsValid(span->prevspan));
1605 					prevspan = (dsa_area_span *)
1606 						dsa_get_address(area, span->prevspan);
1607 					prevspan->nextspan = span->nextspan;
1608 				}
1609 				if (nextspan != NULL)
1610 					nextspan->prevspan = span->prevspan;
1611 
1612 				/* Push onto the head of the new fullness class list. */
1613 				span->nextspan = pool->spans[tfclass];
1614 				pool->spans[tfclass] = span_pointer;
1615 				span->prevspan = InvalidDsaPointer;
1616 				if (DsaPointerIsValid(span->nextspan))
1617 				{
1618 					nextspan = (dsa_area_span *)
1619 						dsa_get_address(area, span->nextspan);
1620 					nextspan->prevspan = span_pointer;
1621 				}
1622 				span->fclass = tfclass;
1623 			}
1624 
1625 			/* Advance to next span on list. */
1626 			span_pointer = next_span_pointer;
1627 		}
1628 
1629 		/* Stop now if we found a suitable block. */
1630 		if (DsaPointerIsValid(pool->spans[1]))
1631 			return true;
1632 	}
1633 
1634 	/*
1635 	 * If there are no blocks that properly belong in fullness class 1, pick
1636 	 * one from some other fullness class and move it there anyway, so that we
1637 	 * have an allocation target.  Our last choice is to transfer a block
1638 	 * that's almost empty (and might become completely empty soon if left
1639 	 * alone), but even that is better than failing, which is what we must do
1640 	 * if there are no blocks at all with freespace.
1641 	 */
1642 	Assert(!DsaPointerIsValid(pool->spans[1]));
1643 	for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1644 		if (transfer_first_span(area, pool, fclass, 1))
1645 			return true;
1646 	if (!DsaPointerIsValid(pool->spans[1]) &&
1647 		transfer_first_span(area, pool, 0, 1))
1648 		return true;
1649 
1650 	/*
1651 	 * We failed to find an existing span with free objects, so we need to
1652 	 * allocate a new superblock and construct a new span to manage it.
1653 	 *
1654 	 * First, get a dsa_area_span object to describe the new superblock
1655 	 * ... unless this allocation is for a dsa_area_span object, in which case
1656 	 * that's surely not going to work.  We handle that case by storing the
1657 	 * span describing a block-of-spans inline.
1658 	 */
1659 	if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1660 	{
1661 		span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1662 		if (!DsaPointerIsValid(span_pointer))
1663 			return false;
1664 		npages = DSA_PAGES_PER_SUPERBLOCK;
1665 	}
1666 
1667 	/* Find or create a segment and allocate the superblock. */
1668 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1669 	segment_map = get_best_segment(area, npages);
1670 	if (segment_map == NULL)
1671 	{
1672 		segment_map = make_new_segment(area, npages);
1673 		if (segment_map == NULL)
1674 		{
1675 			LWLockRelease(DSA_AREA_LOCK(area));
1676 			return false;
1677 		}
1678 	}
1679 	/*
1680 	 * This shouldn't happen: get_best_segment() or make_new_segment()
1681 	 * promised that we can successfully allocate npages.
1682 	 */
1683 	if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1684 		elog(FATAL,
1685 			 "dsa_allocate could not find %zu free pages for superblock",
1686 			 npages);
1687 	LWLockRelease(DSA_AREA_LOCK(area));
1688 
1689 	/* Compute the start of the superblock. */
1690 	start_pointer =
1691 		DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1692 						 first_page * FPM_PAGE_SIZE);
1693 
1694 	/*
1695 	 * If this is a block-of-spans, carve the descriptor right out of the
1696 	 * allocated space.
1697 	 */
1698 	if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1699 	{
1700 		/*
1701 		 * We have a pointer into the segment.  We need to build a dsa_pointer
1702 		 * from the segment index and offset into the segment.
1703 		 */
1704 		span_pointer = start_pointer;
1705 	}
1706 
1707 	/* Initialize span and pagemap. */
1708 	init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1709 	for (i = 0; i < npages; ++i)
1710 		segment_map->pagemap[first_page + i] = span_pointer;
1711 
1712 	return true;
1713 }
1714 
1715 /*
1716  * Return the segment map corresponding to a given segment index, mapping the
1717  * segment in if necessary.  For internal segment book-keeping, this is called
1718  * with the area lock held.  It is also called by dsa_free and dsa_get_address
1719  * without any locking, relying on the fact they have a known live segment
1720  * index and they always call check_for_freed_segments to ensure that any
1721  * freed segment occupying the same slot is detached first.
1722  */
1723 static dsa_segment_map *
1724 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1725 {
1726 	if (unlikely(area->segment_maps[index].mapped_address == NULL))
1727 	{
1728 		dsm_handle	handle;
1729 		dsm_segment *segment;
1730 		dsa_segment_map *segment_map;
1731 
1732 		/*
1733 		 * If we are reached by dsa_free or dsa_get_address, there must be at
1734 		 * least one object allocated in the referenced segment.  Otherwise,
1735 		 * their caller has a double-free or access-after-free bug, which we
1736 		 * have no hope of detecting.  So we know it's safe to access this
1737 		 * array slot without holding a lock; it won't change underneath us.
1738 		 * Furthermore, we know that we can see the latest contents of the
1739 		 * slot, as explained in check_for_freed_segments, which those
1740 		 * functions call before arriving here.
1741 		 */
1742 		handle = area->control->segment_handles[index];
1743 
1744 		/* It's an error to try to access an unused slot. */
1745 		if (handle == DSM_HANDLE_INVALID)
1746 			elog(ERROR,
1747 				 "dsa_area could not attach to a segment that has been freed");
1748 
1749 		segment = dsm_attach(handle);
1750 		if (segment == NULL)
1751 			elog(ERROR, "dsa_area could not attach to segment");
1752 		if (area->mapping_pinned)
1753 			dsm_pin_mapping(segment);
1754 		segment_map = &area->segment_maps[index];
1755 		segment_map->segment = segment;
1756 		segment_map->mapped_address = dsm_segment_address(segment);
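		/*
		 * The pointer arithmetic below mirrors the segment layout
		 * established by make_new_segment(): a dsa_segment_header first,
		 * then a FreePageManager, then the page map.
		 */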
1757 		segment_map->header =
1758 			(dsa_segment_header *) segment_map->mapped_address;
1759 		segment_map->fpm = (FreePageManager *)
1760 			(segment_map->mapped_address +
1761 			 MAXALIGN(sizeof(dsa_segment_header)));
1762 		segment_map->pagemap = (dsa_pointer *)
1763 			(segment_map->mapped_address +
1764 			 MAXALIGN(sizeof(dsa_segment_header)) +
1765 			 MAXALIGN(sizeof(FreePageManager)));
1766 
1767 		/* Remember the highest index this backend has ever mapped. */
1768 		if (area->high_segment_index < index)
1769 			area->high_segment_index = index;
1770 
1771 		Assert(segment_map->header->magic ==
1772 			   (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1773 	}
1774 
1775 	/*
1776 	 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1777 	 * but it's a bug in the calling code and undefined behavior if the
1778 	 * address is not live (ie if the segment might possibly have been freed,
1779 	 * they're trying to use a dangling pointer).
1780 	 *
1781 	 * For dsa.c code that holds the area lock to manipulate segment_bins
1782 	 * lists, it would be a bug if we ever reach a freed segment here.  After
1783 	 * it's marked as freed, the only thing any backend should do with it is
1784 	 * unmap it, and it should always have done that in
1785 	 * check_for_freed_segments_locked() before arriving here to resolve an
1786 	 * index to a segment_map.
1787 	 *
1788 	 * Either way we can assert that we aren't returning a freed segment.
1789 	 */
1790 	Assert(!area->segment_maps[index].header->freed);
1791 
1792 	return &area->segment_maps[index];
1793 }
1794 
1795 /*
1796  * Return a superblock to the free page manager.  If the underlying segment
1797  * has become entirely free, then return it to the operating system.
1798  *
1799  * The appropriate pool lock must be held.
1800  */
1801 static void
1802 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1803 {
1804 	dsa_area_span *span = dsa_get_address(area, span_pointer);
1805 	int			size_class = span->size_class;
1806 	dsa_segment_map *segment_map;
1807 
1808 
1809 	/* Remove it from its fullness class list. */
1810 	unlink_span(area, span);
1811 
1812 	/*
1813 	 * Note: Here we acquire the area lock while we already hold a per-pool
1814 	 * lock.  We never hold the area lock and then take a pool lock, or we
1815 	 * could deadlock.
1816 	 */
1817 	LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1818 	check_for_freed_segments_locked(area);
1819 	segment_map =
1820 		get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1821 	FreePageManagerPut(segment_map->fpm,
1822 					   DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1823 					   span->npages);
1824 	/* Check if the segment is now entirely free. */
1825 	if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1826 	{
1827 		dsa_segment_index index = get_segment_index(area, segment_map);
1828 
1829 		/* If it's not the segment with extra control data, free it. */
1830 		if (index != 0)
1831 		{
1832 			/*
1833 			 * Give it back to the OS, and allow other backends to detect that
1834 			 * they need to detach.
1835 			 */
1836 			unlink_segment(area, segment_map);
1837 			segment_map->header->freed = true;
1838 			Assert(area->control->total_segment_size >=
1839 				   segment_map->header->size);
1840 			area->control->total_segment_size -=
1841 				segment_map->header->size;
1842 			dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1843 			dsm_detach(segment_map->segment);
1844 			area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1845 			++area->control->freed_segment_counter;
1846 			segment_map->segment = NULL;
1847 			segment_map->header = NULL;
1848 			segment_map->mapped_address = NULL;
1849 		}
1850 	}
1851 	LWLockRelease(DSA_AREA_LOCK(area));
1852 
1853 	/*
1854 	 * Span-of-spans blocks store the span which describes them within the
1855 	 * block itself, so freeing the storage implicitly frees the descriptor
1856 	 * also.  If this is a block of any other type, we need to separately free
1857 	 * the span object also.  This recursive call to dsa_free will acquire the
1858 	 * span pool's lock.  We can't deadlock because the acquisition order is
1859 	 * always some other pool and then the span pool.
1860 	 */
1861 	if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1862 		dsa_free(area, span_pointer);
1863 }
1864 
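/*
 * Remove a span from the doubly-linked list of its current fullness class.
 */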
1865 static void
1866 unlink_span(dsa_area *area, dsa_area_span *span)
1867 {
1868 	if (DsaPointerIsValid(span->nextspan))
1869 	{
1870 		dsa_area_span *next = dsa_get_address(area, span->nextspan);
1871 
1872 		next->prevspan = span->prevspan;
1873 	}
1874 	if (DsaPointerIsValid(span->prevspan))
1875 	{
1876 		dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1877 
1878 		prev->nextspan = span->nextspan;
1879 	}
1880 	else
1881 	{
1882 		dsa_area_pool *pool = dsa_get_address(area, span->pool);
1883 
1884 		pool->spans[span->fclass] = span->nextspan;
1885 	}
1886 }
1887 
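/*
 * Push a span onto the head of the given fullness class list of its pool.
 */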
1888 static void
1889 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1890 						   dsa_pointer span_pointer,
1891 						   int fclass)
1892 {
1893 	dsa_area_pool *pool = dsa_get_address(area, span->pool);
1894 
1895 	if (DsaPointerIsValid(pool->spans[fclass]))
1896 	{
1897 		dsa_area_span *head = dsa_get_address(area,
1898 											  pool->spans[fclass]);
1899 
1900 		head->prevspan = span_pointer;
1901 	}
1902 	span->prevspan = InvalidDsaPointer;
1903 	span->nextspan = pool->spans[fclass];
1904 	pool->spans[fclass] = span_pointer;
1905 	span->fclass = fclass;
1906 }
1907 
1908 /*
1909  * Detach from an area that was either created or attached to by this process.
1910  */
1911 void
1912 dsa_detach(dsa_area *area)
1913 {
1914 	int			i;
1915 
1916 	/* Detach from all segments. */
1917 	for (i = 0; i <= area->high_segment_index; ++i)
1918 		if (area->segment_maps[i].segment != NULL)
1919 			dsm_detach(area->segment_maps[i].segment);
1920 
1921 	/*
1922 	 * Note that 'detaching' (= detaching from DSM segments) doesn't include
1923 	 * 'releasing' (= adjusting the reference count).  It would be nice to
1924 	 * combine these operations, but client code might never get around to
1925 	 * calling dsa_detach because of an error path, and a detach hook on any
1926 	 * particular segment is too late to detach other segments in the area
1927 	 * without risking a 'leak' warning in the non-error path.
1928 	 */
1929 
1930 	/* Free the backend-local area object. */
1931 	pfree(area);
1932 }
1933 
1934 /*
1935  * Unlink a segment from the bin that contains it.
1936  */
1937 static void
1938 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1939 {
1940 	if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1941 	{
1942 		dsa_segment_map *prev;
1943 
1944 		prev = get_segment_by_index(area, segment_map->header->prev);
1945 		prev->header->next = segment_map->header->next;
1946 	}
1947 	else
1948 	{
1949 		Assert(area->control->segment_bins[segment_map->header->bin] ==
1950 			   get_segment_index(area, segment_map));
1951 		area->control->segment_bins[segment_map->header->bin] =
1952 			segment_map->header->next;
1953 	}
1954 	if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1955 	{
1956 		dsa_segment_map *next;
1957 
1958 		next = get_segment_by_index(area, segment_map->header->next);
1959 		next->header->prev = segment_map->header->prev;
1960 	}
1961 }
1962 
1963 /*
1964  * Find a segment that could satisfy a request for 'npages' of contiguous
1965  * memory, or return NULL if none can be found.  This may involve attaching to
1966  * segments that weren't previously attached so that we can query their free
1967  * pages map.
1968  */
1969 static dsa_segment_map *
1970 get_best_segment(dsa_area *area, size_t npages)
1971 {
1972 	size_t		bin;
1973 
1974 	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1975 	check_for_freed_segments_locked(area);
1976 
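	/*
	 * Segments are binned by the largest run of contiguous free pages they
	 * contain: a segment in bin N (for N > 0) should have a largest run of
	 * at least 2^(N-1) pages, which is the threshold tested below.
	 */
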
1977 	/*
1978 	 * Start searching from the first bin that *might* have enough contiguous
1979 	 * pages.
1980 	 */
1981 	for (bin = contiguous_pages_to_segment_bin(npages);
1982 		 bin < DSA_NUM_SEGMENT_BINS;
1983 		 ++bin)
1984 	{
1985 		/*
1986 		 * The minimum contiguous size that any segment in this bin should
1987 		 * have.  We'll re-bin if we see segments with fewer.
1988 		 */
1989 		size_t		threshold = (size_t) 1 << (bin - 1);
1990 		dsa_segment_index segment_index;
1991 
1992 		/* Search this bin for a segment with enough contiguous space. */
1993 		segment_index = area->control->segment_bins[bin];
1994 		while (segment_index != DSA_SEGMENT_INDEX_NONE)
1995 		{
1996 			dsa_segment_map *segment_map;
1997 			dsa_segment_index next_segment_index;
1998 			size_t		contiguous_pages;
1999 
2000 			segment_map = get_segment_by_index(area, segment_index);
2001 			next_segment_index = segment_map->header->next;
2002 			contiguous_pages = fpm_largest(segment_map->fpm);
2003 
2004 			/* Not enough for the request, still enough for this bin. */
2005 			if (contiguous_pages >= threshold && contiguous_pages < npages)
2006 			{
2007 				segment_index = next_segment_index;
2008 				continue;
2009 			}
2010 
2011 			/* Re-bin it if it's no longer in the appropriate bin. */
2012 			if (contiguous_pages < threshold)
2013 			{
2014 				size_t		new_bin;
2015 
2016 				new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
2017 
2018 				/* Remove it from its current bin. */
2019 				unlink_segment(area, segment_map);
2020 
2021 				/* Push it onto the front of its new bin. */
2022 				segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2023 				segment_map->header->next =
2024 					area->control->segment_bins[new_bin];
2025 				segment_map->header->bin = new_bin;
2026 				area->control->segment_bins[new_bin] = segment_index;
2027 				if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2028 				{
2029 					dsa_segment_map *next;
2030 
2031 					next = get_segment_by_index(area,
2032 												segment_map->header->next);
2033 					Assert(next->header->bin == new_bin);
2034 					next->header->prev = segment_index;
2035 				}
2036 
2037 				/*
2038 				 * But fall through to see if it's enough to satisfy this
2039 				 * request anyway....
2040 				 */
2041 			}
2042 
2043 			/* Check if we are done. */
2044 			if (contiguous_pages >= npages)
2045 				return segment_map;
2046 
2047 			/* Continue searching the same bin. */
2048 			segment_index = next_segment_index;
2049 		}
2050 	}
2051 
2052 	/* Not found. */
2053 	return NULL;
2054 }
2055 
2056 /*
2057  * Create a new segment that can handle at least requested_pages.  Returns
2058  * NULL if the area's total size limit or the maximum allowed number of
2059  * segments would be exceeded.
2060  */
2061 static dsa_segment_map *
2062 make_new_segment(dsa_area *area, size_t requested_pages)
2063 {
2064 	dsa_segment_index new_index;
2065 	size_t		metadata_bytes;
2066 	size_t		total_size;
2067 	size_t		total_pages;
2068 	size_t		usable_pages;
2069 	dsa_segment_map *segment_map;
2070 	dsm_segment *segment;
2071 
2072 	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2073 
2074 	/* Find a segment slot that is not in use (linearly for now). */
2075 	for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2076 	{
2077 		if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2078 			break;
2079 	}
2080 	if (new_index == DSA_MAX_SEGMENTS)
2081 		return NULL;
2082 
2083 	/*
2084 	 * If the total size limit is already exceeded, then we exit early and
2085 	 * avoid arithmetic wraparound in the unsigned expressions below.
2086 	 */
2087 	if (area->control->total_segment_size >=
2088 		area->control->max_total_segment_size)
2089 		return NULL;
2090 
2091 	/*
2092 	 * The size should be at least as big as requested, and at least big
2093 	 * enough to follow a geometric series that approximately doubles the
2094 	 * total storage each time we create a new segment.  We use geometric
2095 	 * growth because the underlying DSM system isn't designed for large
2096 	 * numbers of segments (otherwise we might even consider just using one
2097 	 * DSM segment for each large allocation and for each superblock, and then
2098 	 * we wouldn't need to use FreePageManager).
2099 	 *
2100 	 * We decide on a total segment size first, so that we produce tidy
2101 	 * power-of-two sized segments.  This is a good property to have if we
2102 	 * move to huge pages in the future.  Then we work back to the number of
2103 	 * pages we can fit.
2104 	 */
2105 	total_size = DSA_INITIAL_SEGMENT_SIZE *
2106 		((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
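	/*
	 * That is, each successive run of DSA_NUM_SEGMENTS_AT_EACH_SIZE segment
	 * slots doubles the size used by the previous run, starting from
	 * DSA_INITIAL_SEGMENT_SIZE, before the caps below are applied.
	 */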
2107 	total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2108 	total_size = Min(total_size,
2109 					 area->control->max_total_segment_size -
2110 					 area->control->total_segment_size);
2111 
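	/*
	 * A segment is laid out as a dsa_segment_header, then a FreePageManager,
	 * then a page map with one dsa_pointer per page, followed (after padding
	 * to a page boundary) by the usable pages themselves.
	 */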
2112 	total_pages = total_size / FPM_PAGE_SIZE;
2113 	metadata_bytes =
2114 		MAXALIGN(sizeof(dsa_segment_header)) +
2115 		MAXALIGN(sizeof(FreePageManager)) +
2116 		sizeof(dsa_pointer) * total_pages;
2117 
2118 	/* Add padding up to next page boundary. */
2119 	if (metadata_bytes % FPM_PAGE_SIZE != 0)
2120 		metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2121 	if (total_size <= metadata_bytes)
2122 		return NULL;
2123 	usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2124 	Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2125 
2126 	/* See if that is enough... */
2127 	if (requested_pages > usable_pages)
2128 	{
2129 		/*
2130 		 * We'll make an odd-sized segment, working forward from the requested
2131 		 * number of pages.
2132 		 */
2133 		usable_pages = requested_pages;
2134 		metadata_bytes =
2135 			MAXALIGN(sizeof(dsa_segment_header)) +
2136 			MAXALIGN(sizeof(FreePageManager)) +
2137 			usable_pages * sizeof(dsa_pointer);
2138 
2139 		/* Add padding up to next page boundary. */
2140 		if (metadata_bytes % FPM_PAGE_SIZE != 0)
2141 			metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2142 		total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2143 
2144 		/* Is that too large for dsa_pointer's addressing scheme? */
2145 		if (total_size > DSA_MAX_SEGMENT_SIZE)
2146 			return NULL;
2147 
2148 		/* Would that exceed the limit? */
2149 		if (total_size > area->control->max_total_segment_size -
2150 			area->control->total_segment_size)
2151 			return NULL;
2152 	}
2153 
2154 	/* Create the segment. */
2155 	segment = dsm_create(total_size, 0);
2156 	if (segment == NULL)
2157 		return NULL;
2158 	dsm_pin_segment(segment);
2159 	if (area->mapping_pinned)
2160 		dsm_pin_mapping(segment);
2161 
2162 	/* Store the handle in shared memory to be found by index. */
2163 	area->control->segment_handles[new_index] =
2164 		dsm_segment_handle(segment);
2165 	/* Track the highest segment index in the history of the area. */
2166 	if (area->control->high_segment_index < new_index)
2167 		area->control->high_segment_index = new_index;
2168 	/* Track the highest segment index this backend has ever mapped. */
2169 	if (area->high_segment_index < new_index)
2170 		area->high_segment_index = new_index;
2171 	/* Track total size of all segments. */
2172 	area->control->total_segment_size += total_size;
2173 	Assert(area->control->total_segment_size <=
2174 		   area->control->max_total_segment_size);
2175 
2176 	/* Build a segment map for this segment in this backend. */
2177 	segment_map = &area->segment_maps[new_index];
2178 	segment_map->segment = segment;
2179 	segment_map->mapped_address = dsm_segment_address(segment);
2180 	segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2181 	segment_map->fpm = (FreePageManager *)
2182 		(segment_map->mapped_address +
2183 		 MAXALIGN(sizeof(dsa_segment_header)));
2184 	segment_map->pagemap = (dsa_pointer *)
2185 		(segment_map->mapped_address +
2186 		 MAXALIGN(sizeof(dsa_segment_header)) +
2187 		 MAXALIGN(sizeof(FreePageManager)));
2188 
2189 	/* Set up the free page map. */
2190 	FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2191 	FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2192 					   usable_pages);
2193 
2194 	/* Set up the segment header and put it in the appropriate bin. */
2195 	segment_map->header->magic =
2196 		DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2197 	segment_map->header->usable_pages = usable_pages;
2198 	segment_map->header->size = total_size;
2199 	segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2200 	segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2201 	segment_map->header->next =
2202 		area->control->segment_bins[segment_map->header->bin];
2203 	segment_map->header->freed = false;
2204 	area->control->segment_bins[segment_map->header->bin] = new_index;
2205 	if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2206 	{
2207 		dsa_segment_map *next =
2208 		get_segment_by_index(area, segment_map->header->next);
2209 
2210 		Assert(next->header->bin == segment_map->header->bin);
2211 		next->header->prev = new_index;
2212 	}
2213 
2214 	return segment_map;
2215 }
2216 
2217 /*
2218  * Check if any segments have been freed by destroy_superblock, so we can
2219  * detach from them in this backend.  This function is called by
2220  * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2221  * received can be resolved to the correct segment.
2222  *
2223  * The danger we want to defend against is that there could be an old segment
2224  * mapped into a given slot in this backend, and the dsa_pointer they have
2225  * might refer to some new segment in the same slot.  So those functions must
2226  * be sure to process all instructions to detach from a freed segment that had
2227  * been generated by the time this process received the dsa_pointer, before
2228  * they call get_segment_by_index.
2229  */
2230 static void
2231 check_for_freed_segments(dsa_area *area)
2232 {
2233 	size_t		freed_segment_counter;
2234 
2235 	/*
2236 	 * Any other process that has freed a segment has incremented
2237  * freed_segment_counter while holding an LWLock, and that must precede any
2238 	 * backend creating a new segment in the same slot while holding an
2239 	 * LWLock, and that must precede the creation of any dsa_pointer pointing
2240 	 * into the new segment which might reach us here, and the caller must
2241 	 * have sent the dsa_pointer to this process using appropriate memory
2242 	 * synchronization (some kind of locking or atomic primitive or system
2243 	 * call).  So all we need to do on the reading side is ask for the load of
2244 	 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2245 	 * has, and we can be sure to detect any segments that had been freed as
2246 	 * of the time that the dsa_pointer reached this process.
2247 	 */
2248 	pg_read_barrier();
2249 	freed_segment_counter = area->control->freed_segment_counter;
2250 	if (unlikely(area->freed_segment_counter != freed_segment_counter))
2251 	{
2252 		/* Check all currently mapped segments to find what's been freed. */
2253 		LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2254 		check_for_freed_segments_locked(area);
2255 		LWLockRelease(DSA_AREA_LOCK(area));
2256 	}
2257 }
2258 
2259 /*
2260  * Workhorse for check_for_freed_segments(), and also used directly in paths
2261  * where the area lock is already held.  This should be called after acquiring
2262  * the lock but before looking up any segment by index number, to make sure we
2263  * unmap any stale segments that might have previously had the same index as a
2264  * current segment.
2265  */
2266 static void
2267 check_for_freed_segments_locked(dsa_area *area)
2268 {
2269 	size_t		freed_segment_counter;
2270 	int		i;
2271 
2272 	Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2273 	freed_segment_counter = area->control->freed_segment_counter;
2274 	if (unlikely(area->freed_segment_counter != freed_segment_counter))
2275 	{
2276 		for (i = 0; i <= area->high_segment_index; ++i)
2277 		{
2278 			if (area->segment_maps[i].header != NULL &&
2279 				area->segment_maps[i].header->freed)
2280 			{
2281 				dsm_detach(area->segment_maps[i].segment);
2282 				area->segment_maps[i].segment = NULL;
2283 				area->segment_maps[i].header = NULL;
2284 				area->segment_maps[i].mapped_address = NULL;
2285 			}
2286 		}
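		/* Remember which freed segments we have now detached from. */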
2287 		area->freed_segment_counter = freed_segment_counter;
2288 	}
2289 }
2290