1 /*-------------------------------------------------------------------------
2  *
3  * dsm.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides a set of services to make programming with dynamic
7  * shared memory segments more convenient.  Unlike the low-level
8  * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
9  * created using this module will be cleaned up automatically.  Mappings
10  * will be removed when the resource owner under which they were created
11  * is cleaned up, unless dsm_pin_mapping() is used, in which case they
12  * have session lifespan.  Segments will be removed when there are no
13  * remaining mappings, or at postmaster shutdown in any case.  After a
14  * hard postmaster crash, remaining segments will be removed, if they
15  * still exist, at the next postmaster startup.
16  *
17  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  *
21  * IDENTIFICATION
22  *	  src/backend/storage/ipc/dsm.c
23  *
24  *-------------------------------------------------------------------------
25  */
26 
27 #include "postgres.h"
28 
29 #include <fcntl.h>
30 #include <string.h>
31 #include <unistd.h>
32 #ifndef WIN32
33 #include <sys/mman.h>
34 #endif
35 #include <sys/stat.h>
36 
37 #include "lib/ilist.h"
38 #include "miscadmin.h"
39 #include "storage/dsm.h"
40 #include "storage/ipc.h"
41 #include "storage/lwlock.h"
42 #include "storage/pg_shmem.h"
43 #include "utils/guc.h"
44 #include "utils/memutils.h"
45 #include "utils/resowner_private.h"
46 
47 #define PG_DYNSHMEM_CONTROL_MAGIC		0x9a503d32
48 
/*
 * There's no point in getting too cheap here, because the minimum allocation
 * is one OS page, which is probably at least 4KB and could easily be as high
 * as 64KB.  Each slot consumes sizeof(dsm_control_item), currently 8 bytes.
 */
54 #define PG_DYNSHMEM_FIXED_SLOTS			64
55 #define PG_DYNSHMEM_SLOTS_PER_BACKEND	2
56 
57 #define INVALID_CONTROL_SLOT		((uint32) -1)
58 
/* Backend-local tracking for on-detach callbacks. */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;	/* callback to invoke at detach time */
	Datum		arg;			/* argument to pass to it */
	slist_node	node;			/* link in dsm_segment->on_detach list */
} dsm_segment_detach_callback;
66 
/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner, or NULL if session-lifespan
								 * (see dsm_pin_mapping). */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment, or
								 * INVALID_CONTROL_SLOT if not registered. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks. */
};
79 
/* Shared-memory state for a dynamic shared memory segment. */
typedef struct dsm_control_item
{
	dsm_handle	handle;			/* segment identifier */
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
} dsm_control_item;
86 
/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;			/* always PG_DYNSHMEM_CONTROL_MAGIC */
	uint32		nitems;			/* high-water mark of slots ever used; slots
								 * below this may be free (refcnt == 0) */
	uint32		maxitems;		/* allocated length of item[] */
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];	/* one per segment */
} dsm_control_header;
95 
/* Forward declarations for functions defined later in this file. */
static void dsm_cleanup_for_mmap(void);
static void dsm_postmaster_shutdown(int code, Datum arg);
static dsm_segment *dsm_create_descriptor(void);
static bool dsm_control_segment_sane(dsm_control_header *control,
						 Size mapped_size);
static uint64 dsm_control_bytes_needed(uint32 nitems);

/* Has this backend initialized the dynamic shared memory system yet? */
static bool dsm_init_done = false;

/*
 * List of dynamic shared memory segments used by this backend.
 *
 * At process exit time, we must decrement the reference count of each
 * segment we have attached; this list makes it possible to find all such
 * segments.
 *
 * This list should always be empty in the postmaster.  We could probably
 * allow the postmaster to map dynamic shared memory segments before it
 * begins to start child processes, provided that each process adjusted
 * the reference counts for those segments in the control segment at
 * startup time, but there's no obvious need for such a facility, which
 * would also be complex to handle in the EXEC_BACKEND case.  Once the
 * postmaster has begun spawning children, there's an additional problem:
 * each new mapping would require an update to the control segment,
 * which requires locking, in which the postmaster must not be involved.
 */
static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);

/*
 * Control segment information.
 *
 * Unlike ordinary shared memory segments, the control segment is not
 * reference counted; instead, it lasts for the postmaster's entire
 * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
 */
static dsm_handle dsm_control_handle;	/* identifier; 0 means "none known" */
static dsm_control_header *dsm_control; /* current mapping, or NULL */
static Size dsm_control_mapped_size = 0;	/* size of that mapping */
static void *dsm_control_impl_private = NULL;	/* implementation state */
136 
137 /*
138  * Start up the dynamic shared memory system.
139  *
140  * This is called just once during each cluster lifetime, at postmaster
141  * startup time.
142  */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	/* This must run in the postmaster, before any children exist. */
	Assert(!IsUnderPostmaster);

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.
	 *
	 * Passing elevel = ERROR means dsm_impl_op reports hard failures itself;
	 * a false return here presumably indicates a handle collision, so we
	 * just retry with a new random handle.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == 0)
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	/* Arrange cleanup at postmaster exit, now that creation succeeded. */
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	/* Publish the handle in the main shmem header, for children to find. */
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}
202 
203 /*
204  * Determine whether the control segment from the previous postmaster
205  * invocation still exists.  If so, remove the dynamic shared memory
206  * segments to which it refers, and then the control segment itself.
207  */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * Try to attach the segment.  If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID.  So just fall
	 * out quietly.
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		/* Detach only; don't destroy a segment whose contents we can't trust. */
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = old_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/*
		 * Destroy the referenced segment.  Failures are logged rather than
		 * raised, so we clean up as much as we can.
		 */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}
279 
280 /*
281  * When we're using the mmap shared memory implementation, "shared memory"
282  * segments might even manage to survive an operating system reboot.
283  * But there's no guarantee as to exactly what will survive: some segments
284  * may survive, and others may not, and the contents of some may be out
285  * of date.  In particular, the control segment may be out of date, so we
286  * can't rely on it to figure out what to remove.  However, since we know
287  * what directory contains the files we used as shared memory, we can simply
288  * scan the directory and blow everything away that shouldn't be there.
289  */
static void
dsm_cleanup_for_mmap(void)
{
	DIR		   *dir;
	struct dirent *dent;

	/*
	 * Open the directory containing the mmap'd files.  NOTE(review): the
	 * original comment here claimed AllocateDir can't be used in the
	 * postmaster, yet AllocateDir is exactly what the code calls, and this
	 * function runs only from dsm_postmaster_startup; the comment appeared
	 * to be stale, so it has been corrected -- confirm against fd.c.
	 */
	if ((dir = AllocateDir(PG_DYNSHMEM_DIR)) == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open directory \"%s\": %m",
						PG_DYNSHMEM_DIR)));

	/* Scan for something with a name of the correct format. */
	while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
	{
		if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
		{
			char		buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];

			snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);

			elog(DEBUG2, "removing file \"%s\"", buf);

			/* We found a matching file; so remove it. */
			if (unlink(buf) != 0)
			{
				int			save_errno;

				/*
				 * Preserve unlink's errno across the closedir call, so the
				 * %m below reports the unlink failure, not closedir's.
				 *
				 * NOTE(review): this closes with closedir() although the
				 * handle came from AllocateDir(); FreeDir() (used on the
				 * success path below) would be the matching call -- confirm
				 * this is intentional.
				 */
				save_errno = errno;
				closedir(dir);
				errno = save_errno;

				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", buf)));
			}
		}
	}

	/* Cleanup complete. */
	FreeDir(dir);
}
334 
335 /*
336  * At shutdown time, we iterate over the control segment and remove all
337  * remaining dynamic shared memory segments.  We avoid throwing errors here;
338  * the postmaster is shutting down either way, and this is just non-critical
339  * resource cleanup.
340  */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying.  In that case, we warn and ignore
	 * the contents of the control segment.  This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = dsm_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/* Destroy the segment; failures are logged, never raised. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	/* Advertise that no control segment exists anymore. */
	shim->dsm_control = 0;
}
397 
398 /*
399  * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
400  * we must reread the state file and map the control segment; in other cases,
401  * we'll have inherited the postmaster's mapping and global variables.
402  */
static void
dsm_backend_startup(void)
{
	/* If dynamic shared memory is disabled, reject this. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("dynamic shared memory is disabled"),
				 errhint("Set dynamic_shared_memory_type to a value other than \"none\".")));

#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/*
		 * Attach control segment.  The handle was stashed for us by
		 * dsm_set_control_handle when the main shmem segment was reattached.
		 */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			/* Detach first so we don't exit holding a junk mapping. */
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
			  errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	/* Remember that initialization is complete, so callers skip this path. */
	dsm_init_done = true;
}
438 
439 #ifdef EXEC_BACKEND
440 /*
441  * When running under EXEC_BACKEND, we get a callback here when the main
442  * shared memory segment is re-attached, so that we can record the control
443  * handle retrieved from it.
444  */
445 void
dsm_set_control_handle(dsm_handle h)446 dsm_set_control_handle(dsm_handle h)
447 {
448 	Assert(dsm_control_handle == 0 && h != 0);
449 	dsm_control_handle = h;
450 }
451 #endif
452 
453 /*
454  * Create a new dynamic shared memory segment.
455  */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* On first use in this backend, set up backend-local DSM state. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/*
	 * Loop until we find an unused segment identifier.  With elevel = ERROR,
	 * dsm_impl_op reports hard failures itself; a false return presumably
	 * means a handle collision, so we retry with a fresh random handle.
	 */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		/*
		 * Back out everything done so far: release the lock, destroy the
		 * underlying segment, and discard the backend-local descriptor.
		 */
		LWLockRelease(DynamicSharedMemoryControlLock);
		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		/* Caller may prefer a NULL return to an error here. */
		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
528 
529 /*
530  * Attach a dynamic shared memory segment.
531  *
532  * See comments for dsm_segment_handle() for an explanation of how this
533  * is intended to be used.
534  *
535  * This function will return NULL if the segment isn't known to the system.
536  * This can happen if we're asked to attach the segment, but then everyone
537  * else detaches it (causing it to be destroyed) before we get around to
538  * attaching it.
539  */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* On first use in this backend, set up backend-local DSM state. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point. It's up to the
	 * caller to decide what to do about that.
	 *
	 * (control_slot is presumably initialized to INVALID_CONTROL_SLOT by
	 * dsm_create_descriptor -- that function is not visible here.)
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
619 
620 /*
621  * At backend shutdown time, detach any segments that are still attached.
622  * (This is similar to dsm_detach_all, except that there's no reason to
623  * unmap the control segment before exiting, so we don't bother.)
624  */
625 void
dsm_backend_shutdown(void)626 dsm_backend_shutdown(void)
627 {
628 	while (!dlist_is_empty(&dsm_segment_list))
629 	{
630 		dsm_segment *seg;
631 
632 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
633 		dsm_detach(seg);
634 	}
635 }
636 
637 /*
638  * Detach all shared memory segments, including the control segments.  This
639  * should be called, along with PGSharedMemoryDetach, in processes that
640  * might inherit mappings but are not intended to be connected to dynamic
641  * shared memory.
642  */
643 void
dsm_detach_all(void)644 dsm_detach_all(void)
645 {
646 	void	   *control_address = dsm_control;
647 
648 	while (!dlist_is_empty(&dsm_segment_list))
649 	{
650 		dsm_segment *seg;
651 
652 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
653 		dsm_detach(seg);
654 	}
655 
656 	if (control_address != NULL)
657 		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
658 					&dsm_control_impl_private, &control_address,
659 					&dsm_control_mapped_size, ERROR);
660 }
661 
662 /*
663  * Resize an existing shared memory segment.
664  *
665  * This may cause the shared memory segment to be remapped at a different
666  * address.  For the caller's convenience, we return the mapped address.
667  */
668 void *
dsm_resize(dsm_segment * seg,Size size)669 dsm_resize(dsm_segment *seg, Size size)
670 {
671 	Assert(seg->control_slot != INVALID_CONTROL_SLOT);
672 	dsm_impl_op(DSM_OP_RESIZE, seg->handle, size, &seg->impl_private,
673 				&seg->mapped_address, &seg->mapped_size, ERROR);
674 	return seg->mapped_address;
675 }
676 
677 /*
678  * Remap an existing shared memory segment.
679  *
680  * This is intended to be used when some other process has extended the
681  * mapping using dsm_resize(), but we've still only got the initial
682  * portion mapped.  Since this might change the address at which the
683  * segment is mapped, we return the new mapped address.
684  */
685 void *
dsm_remap(dsm_segment * seg)686 dsm_remap(dsm_segment *seg)
687 {
688 	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
689 				&seg->mapped_address, &seg->mapped_size, ERROR);
690 
691 	return seg->mapped_address;
692 }
693 
694 /*
695  * Detach from a shared memory segment, destroying the segment if we
696  * remove the last reference.
697  *
698  * This function should never fail.  It will often be invoked when aborting
699  * a transaction, and a further error won't serve any purpose.  It's not a
700  * complete disaster if we fail to unmap or destroy the segment; it means a
701  * resource leak, but that doesn't necessarily preclude further operations.
702  */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		/* Copy out the callback, then free its entry before calling it. */
		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		/* Mark our descriptor as no longer registered before unlocking. */
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed. If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				/* Destroy succeeded; only now mark the slot fully unused. */
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}
798 
799 /*
800  * Keep a dynamic shared memory mapping until end of session.
801  *
802  * By default, mappings are owned by the current resource owner, which
803  * typically means they stick around for the duration of the current query
804  * only.
805  */
806 void
dsm_pin_mapping(dsm_segment * seg)807 dsm_pin_mapping(dsm_segment *seg)
808 {
809 	if (seg->resowner != NULL)
810 	{
811 		ResourceOwnerForgetDSM(seg->resowner, seg);
812 		seg->resowner = NULL;
813 	}
814 }
815 
816 /*
817  * Arrange to remove a dynamic shared memory mapping at cleanup time.
818  *
819  * dsm_pin_mapping() can be used to preserve a mapping for the entire
820  * lifetime of a process; this function reverses that decision, making
821  * the segment owned by the current resource owner.  This may be useful
822  * just before performing some operation that will invalidate the segment
823  * for future use by this backend.
824  */
825 void
dsm_unpin_mapping(dsm_segment * seg)826 dsm_unpin_mapping(dsm_segment *seg)
827 {
828 	Assert(seg->resowner == NULL);
829 	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
830 	seg->resowner = CurrentResourceOwner;
831 	ResourceOwnerRememberDSM(seg->resowner, seg);
832 }
833 
834 /*
835  * Keep a dynamic shared memory segment until postmaster shutdown.
836  *
837  * This function should not be called more than once per segment;
838  * on Windows, doing so will create unnecessary handles which will
839  * consume system resources to no benefit.
840  *
841  * Note that this function does not arrange for the current process to
842  * keep the segment mapped indefinitely; if that behavior is desired,
843  * dsm_pin_mapping() should be used from each process that needs to
844  * retain the mapping.
845  */
846 void
dsm_pin_segment(dsm_segment * seg)847 dsm_pin_segment(dsm_segment *seg)
848 {
849 	/*
850 	 * Bump reference count for this segment in shared memory. This will
851 	 * ensure that even if there is no session which is attached to this
852 	 * segment, it will remain until postmaster shutdown.
853 	 */
854 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
855 	dsm_control->item[seg->control_slot].refcnt++;
856 	LWLockRelease(DynamicSharedMemoryControlLock);
857 
858 	dsm_impl_pin_segment(seg->handle, seg->impl_private);
859 }
860 
861 /*
862  * Find an existing mapping for a shared memory segment, if there is one.
863  */
864 dsm_segment *
dsm_find_mapping(dsm_handle h)865 dsm_find_mapping(dsm_handle h)
866 {
867 	dlist_iter	iter;
868 	dsm_segment *seg;
869 
870 	dlist_foreach(iter, &dsm_segment_list)
871 	{
872 		seg = dlist_container(dsm_segment, node, iter.cur);
873 		if (seg->handle == h)
874 			return seg;
875 	}
876 
877 	return NULL;
878 }
879 
880 /*
881  * Get the address at which a dynamic shared memory segment is mapped.
882  */
883 void *
dsm_segment_address(dsm_segment * seg)884 dsm_segment_address(dsm_segment *seg)
885 {
886 	Assert(seg->mapped_address != NULL);
887 	return seg->mapped_address;
888 }
889 
890 /*
891  * Get the size of a mapping.
892  */
893 Size
dsm_segment_map_length(dsm_segment * seg)894 dsm_segment_map_length(dsm_segment *seg)
895 {
896 	Assert(seg->mapped_address != NULL);
897 	return seg->mapped_size;
898 }
899 
900 /*
901  * Get a handle for a mapping.
902  *
903  * To establish communication via dynamic shared memory between two backends,
904  * one of them should first call dsm_create() to establish a new shared
905  * memory mapping.  That process should then call dsm_segment_handle() to
906  * obtain a handle for the mapping, and pass that handle to the
907  * coordinating backend via some means (e.g. bgw_main_arg, or via the
908  * main shared memory segment).  The recipient, once in possession of the
909  * handle, should call dsm_attach().
910  */
dsm_handle
dsm_segment_handle(dsm_segment *seg)
{
	/* Simply return the handle recorded in the backend-local descriptor. */
	return seg->handle;
}
916 
917 /*
918  * Register an on-detach callback for a dynamic shared memory segment.
919  */
920 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)921 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
922 {
923 	dsm_segment_detach_callback *cb;
924 
925 	cb = MemoryContextAlloc(TopMemoryContext,
926 							sizeof(dsm_segment_detach_callback));
927 	cb->function = function;
928 	cb->arg = arg;
929 	slist_push_head(&seg->on_detach, &cb->node);
930 }
931 
932 /*
933  * Unregister an on-detach callback for a dynamic shared memory segment.
934  */
935 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)936 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
937 					 Datum arg)
938 {
939 	slist_mutable_iter iter;
940 
941 	slist_foreach_modify(iter, &seg->on_detach)
942 	{
943 		dsm_segment_detach_callback *cb;
944 
945 		cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
946 		if (cb->function == function && cb->arg == arg)
947 		{
948 			slist_delete_current(&iter);
949 			pfree(cb);
950 			break;
951 		}
952 	}
953 }
954 
955 /*
956  * Discard all registered on-detach callbacks without executing them.
957  */
958 void
reset_on_dsm_detach(void)959 reset_on_dsm_detach(void)
960 {
961 	dlist_iter	iter;
962 
963 	dlist_foreach(iter, &dsm_segment_list)
964 	{
965 		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
966 
967 		/* Throw away explicit on-detach actions one by one. */
968 		while (!slist_is_empty(&seg->on_detach))
969 		{
970 			slist_node *node;
971 			dsm_segment_detach_callback *cb;
972 
973 			node = slist_pop_head_node(&seg->on_detach);
974 			cb = slist_container(dsm_segment_detach_callback, node, node);
975 			pfree(cb);
976 		}
977 
978 		/*
979 		 * Decrementing the reference count is a sort of implicit on-detach
980 		 * action; make sure we don't do that, either.
981 		 */
982 		seg->control_slot = INVALID_CONTROL_SLOT;
983 	}
984 }
985 
986 /*
987  * Create a segment descriptor.
988  */
static dsm_segment *
dsm_create_descriptor(void)
{
	dsm_segment *seg;

	/*
	 * Reserve space in the resource owner up front, so that the
	 * ResourceOwnerRememberDSM() call further down cannot fail after we
	 * have already allocated and linked the descriptor.
	 */
	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);

	/* The descriptor must survive memory-context resets; see file header. */
	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
	dlist_push_head(&dsm_segment_list, &seg->node);

	/* seg->handle must be initialized by the caller */
	seg->control_slot = INVALID_CONTROL_SLOT;
	seg->impl_private = NULL;
	seg->mapped_address = NULL;
	seg->mapped_size = 0;

	/* Tie the mapping's lifetime to the current resource owner. */
	seg->resowner = CurrentResourceOwner;
	ResourceOwnerRememberDSM(CurrentResourceOwner, seg);

	/* No on-detach callbacks registered yet. */
	slist_init(&seg->on_detach);

	return seg;
}
1012 
1013 /*
1014  * Sanity check a control segment.
1015  *
1016  * The goal here isn't to detect everything that could possibly be wrong with
1017  * the control segment; there's not enough information for that.  Rather, the
1018  * goal is to make sure that someone can iterate over the items in the segment
1019  * without overrunning the end of the mapping and crashing.  We also check
1020  * the magic number since, if that's messed up, this may not even be one of
1021  * our segments at all.
1022  */
1023 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1024 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1025 {
1026 	if (mapped_size < offsetof(dsm_control_header, item))
1027 		return false;			/* Mapped size too short to read header. */
1028 	if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1029 		return false;			/* Magic number doesn't match. */
1030 	if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1031 		return false;			/* Max item count won't fit in map. */
1032 	if (control->nitems > control->maxitems)
1033 		return false;			/* Overfull. */
1034 	return true;
1035 }
1036 
1037 /*
1038  * Compute the number of control-segment bytes needed to store a given
1039  * number of items.
1040  */
1041 static uint64
dsm_control_bytes_needed(uint32 nitems)1042 dsm_control_bytes_needed(uint32 nitems)
1043 {
1044 	return offsetof(dsm_control_header, item)
1045 		+sizeof(dsm_control_item) * (uint64) nitems;
1046 }
1047