/*-------------------------------------------------------------------------
 *
 * dsm.c
 *	  manage dynamic shared memory segments
 *
 * This file provides a set of services to make programming with dynamic
 * shared memory segments more convenient.  Unlike the low-level
 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
 * created using this module will be cleaned up automatically.  Mappings
 * will be removed when the resource owner under which they were created
 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
 * have session lifespan.  Segments will be removed when there are no
 * remaining mappings, or at postmaster shutdown in any case.  After a
 * hard postmaster crash, remaining segments will be removed, if they
 * still exist, at the next postmaster startup.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm.c
 *
 *-------------------------------------------------------------------------
 */
26 
27 #include "postgres.h"
28 
29 #include <fcntl.h>
30 #include <unistd.h>
31 #ifndef WIN32
32 #include <sys/mman.h>
33 #endif
34 #include <sys/stat.h>
35 
36 #include "lib/ilist.h"
37 #include "miscadmin.h"
38 #include "storage/dsm.h"
39 #include "storage/ipc.h"
40 #include "storage/lwlock.h"
41 #include "storage/pg_shmem.h"
42 #include "utils/guc.h"
43 #include "utils/memutils.h"
44 #include "utils/resowner_private.h"
45 
/* Magic number used to sanity-check the control segment's header. */
#define PG_DYNSHMEM_CONTROL_MAGIC		0x9a503d32

/*
 * Control-segment sizing: a fixed baseline of slots, plus a few more for
 * each backend that could exist concurrently.
 */
#define PG_DYNSHMEM_FIXED_SLOTS			64
#define PG_DYNSHMEM_SLOTS_PER_BACKEND	5

/* Sentinel meaning "this mapping holds no slot in the control segment". */
#define INVALID_CONTROL_SLOT		((uint32) -1)
52 
53 /* Backend-local tracking for on-detach callbacks. */
54 typedef struct dsm_segment_detach_callback
55 {
56 	on_dsm_detach_callback function;
57 	Datum		arg;
58 	slist_node	node;
59 } dsm_segment_detach_callback;
60 
61 /* Backend-local state for a dynamic shared memory segment. */
62 struct dsm_segment
63 {
64 	dlist_node	node;			/* List link in dsm_segment_list. */
65 	ResourceOwner resowner;		/* Resource owner. */
66 	dsm_handle	handle;			/* Segment name. */
67 	uint32		control_slot;	/* Slot in control segment. */
68 	void	   *impl_private;	/* Implementation-specific private data. */
69 	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
70 	Size		mapped_size;	/* Size of our mapping. */
71 	slist_head	on_detach;		/* On-detach callbacks. */
72 };
73 
74 /* Shared-memory state for a dynamic shared memory segment. */
75 typedef struct dsm_control_item
76 {
77 	dsm_handle	handle;
78 	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
79 	void	   *impl_private_pm_handle; /* only needed on Windows */
80 	bool		pinned;
81 } dsm_control_item;
82 
83 /* Layout of the dynamic shared memory control segment. */
84 typedef struct dsm_control_header
85 {
86 	uint32		magic;
87 	uint32		nitems;
88 	uint32		maxitems;
89 	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
90 } dsm_control_header;
91 
92 static void dsm_cleanup_for_mmap(void);
93 static void dsm_postmaster_shutdown(int code, Datum arg);
94 static dsm_segment *dsm_create_descriptor(void);
95 static bool dsm_control_segment_sane(dsm_control_header *control,
96 									 Size mapped_size);
97 static uint64 dsm_control_bytes_needed(uint32 nitems);
98 
99 /* Has this backend initialized the dynamic shared memory system yet? */
100 static bool dsm_init_done = false;
101 
102 /*
103  * List of dynamic shared memory segments used by this backend.
104  *
105  * At process exit time, we must decrement the reference count of each
106  * segment we have attached; this list makes it possible to find all such
107  * segments.
108  *
109  * This list should always be empty in the postmaster.  We could probably
110  * allow the postmaster to map dynamic shared memory segments before it
111  * begins to start child processes, provided that each process adjusted
112  * the reference counts for those segments in the control segment at
113  * startup time, but there's no obvious need for such a facility, which
114  * would also be complex to handle in the EXEC_BACKEND case.  Once the
115  * postmaster has begun spawning children, there's an additional problem:
116  * each new mapping would require an update to the control segment,
117  * which requires locking, in which the postmaster must not be involved.
118  */
119 static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
120 
121 /*
122  * Control segment information.
123  *
124  * Unlike ordinary shared memory segments, the control segment is not
125  * reference counted; instead, it lasts for the postmaster's entire
126  * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
127  */
128 static dsm_handle dsm_control_handle;
129 static dsm_control_header *dsm_control;
130 static Size dsm_control_mapped_size = 0;
131 static void *dsm_control_impl_private = NULL;
132 
133 /*
134  * Start up the dynamic shared memory system.
135  *
136  * This is called just once during each cluster lifetime, at postmaster
137  * startup time.
138  */
139 void
dsm_postmaster_startup(PGShmemHeader * shim)140 dsm_postmaster_startup(PGShmemHeader *shim)
141 {
142 	void	   *dsm_control_address = NULL;
143 	uint32		maxitems;
144 	Size		segsize;
145 
146 	Assert(!IsUnderPostmaster);
147 
148 	/*
149 	 * If we're using the mmap implementations, clean up any leftovers.
150 	 * Cleanup isn't needed on Windows, and happens earlier in startup for
151 	 * POSIX and System V shared memory, via a direct call to
152 	 * dsm_cleanup_using_control_segment.
153 	 */
154 	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
155 		dsm_cleanup_for_mmap();
156 
157 	/* Determine size for new control segment. */
158 	maxitems = PG_DYNSHMEM_FIXED_SLOTS
159 		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
160 	elog(DEBUG2, "dynamic shared memory system will support %u segments",
161 		 maxitems);
162 	segsize = dsm_control_bytes_needed(maxitems);
163 
164 	/*
165 	 * Loop until we find an unused identifier for the new control segment. We
166 	 * sometimes use 0 as a sentinel value indicating that no control segment
167 	 * is known to exist, so avoid using that value for a real control
168 	 * segment.
169 	 */
170 	for (;;)
171 	{
172 		Assert(dsm_control_address == NULL);
173 		Assert(dsm_control_mapped_size == 0);
174 		dsm_control_handle = random();
175 		if (dsm_control_handle == DSM_HANDLE_INVALID)
176 			continue;
177 		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
178 						&dsm_control_impl_private, &dsm_control_address,
179 						&dsm_control_mapped_size, ERROR))
180 			break;
181 	}
182 	dsm_control = dsm_control_address;
183 	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
184 	elog(DEBUG2,
185 		 "created dynamic shared memory control segment %u (%zu bytes)",
186 		 dsm_control_handle, segsize);
187 	shim->dsm_control = dsm_control_handle;
188 
189 	/* Initialize control segment. */
190 	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
191 	dsm_control->nitems = 0;
192 	dsm_control->maxitems = maxitems;
193 }
194 
195 /*
196  * Determine whether the control segment from the previous postmaster
197  * invocation still exists.  If so, remove the dynamic shared memory
198  * segments to which it refers, and then the control segment itself.
199  */
200 void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)201 dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
202 {
203 	void	   *mapped_address = NULL;
204 	void	   *junk_mapped_address = NULL;
205 	void	   *impl_private = NULL;
206 	void	   *junk_impl_private = NULL;
207 	Size		mapped_size = 0;
208 	Size		junk_mapped_size = 0;
209 	uint32		nitems;
210 	uint32		i;
211 	dsm_control_header *old_control;
212 
213 	/*
214 	 * Try to attach the segment.  If this fails, it probably just means that
215 	 * the operating system has been rebooted and the segment no longer
216 	 * exists, or an unrelated process has used the same shm ID.  So just fall
217 	 * out quietly.
218 	 */
219 	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
220 					 &mapped_address, &mapped_size, DEBUG1))
221 		return;
222 
223 	/*
224 	 * We've managed to reattach it, but the contents might not be sane. If
225 	 * they aren't, we disregard the segment after all.
226 	 */
227 	old_control = (dsm_control_header *) mapped_address;
228 	if (!dsm_control_segment_sane(old_control, mapped_size))
229 	{
230 		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
231 					&mapped_address, &mapped_size, LOG);
232 		return;
233 	}
234 
235 	/*
236 	 * OK, the control segment looks basically valid, so we can use it to get
237 	 * a list of segments that need to be removed.
238 	 */
239 	nitems = old_control->nitems;
240 	for (i = 0; i < nitems; ++i)
241 	{
242 		dsm_handle	handle;
243 		uint32		refcnt;
244 
245 		/* If the reference count is 0, the slot is actually unused. */
246 		refcnt = old_control->item[i].refcnt;
247 		if (refcnt == 0)
248 			continue;
249 
250 		/* Log debugging information. */
251 		handle = old_control->item[i].handle;
252 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
253 			 handle, refcnt);
254 
255 		/* Destroy the referenced segment. */
256 		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
257 					&junk_mapped_address, &junk_mapped_size, LOG);
258 	}
259 
260 	/* Destroy the old control segment, too. */
261 	elog(DEBUG2,
262 		 "cleaning up dynamic shared memory control segment with ID %u",
263 		 old_control_handle);
264 	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
265 				&mapped_address, &mapped_size, LOG);
266 }
267 
268 /*
269  * When we're using the mmap shared memory implementation, "shared memory"
270  * segments might even manage to survive an operating system reboot.
271  * But there's no guarantee as to exactly what will survive: some segments
272  * may survive, and others may not, and the contents of some may be out
273  * of date.  In particular, the control segment may be out of date, so we
274  * can't rely on it to figure out what to remove.  However, since we know
275  * what directory contains the files we used as shared memory, we can simply
276  * scan the directory and blow everything away that shouldn't be there.
277  */
278 static void
dsm_cleanup_for_mmap(void)279 dsm_cleanup_for_mmap(void)
280 {
281 	DIR		   *dir;
282 	struct dirent *dent;
283 
284 	/* Scan the directory for something with a name of the correct format. */
285 	dir = AllocateDir(PG_DYNSHMEM_DIR);
286 
287 	while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
288 	{
289 		if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
290 					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
291 		{
292 			char		buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
293 
294 			snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
295 
296 			elog(DEBUG2, "removing file \"%s\"", buf);
297 
298 			/* We found a matching file; so remove it. */
299 			if (unlink(buf) != 0)
300 				ereport(ERROR,
301 						(errcode_for_file_access(),
302 						 errmsg("could not remove file \"%s\": %m", buf)));
303 		}
304 	}
305 
306 	/* Cleanup complete. */
307 	FreeDir(dir);
308 }
309 
310 /*
311  * At shutdown time, we iterate over the control segment and remove all
312  * remaining dynamic shared memory segments.  We avoid throwing errors here;
313  * the postmaster is shutting down either way, and this is just non-critical
314  * resource cleanup.
315  */
316 static void
dsm_postmaster_shutdown(int code,Datum arg)317 dsm_postmaster_shutdown(int code, Datum arg)
318 {
319 	uint32		nitems;
320 	uint32		i;
321 	void	   *dsm_control_address;
322 	void	   *junk_mapped_address = NULL;
323 	void	   *junk_impl_private = NULL;
324 	Size		junk_mapped_size = 0;
325 	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);
326 
327 	/*
328 	 * If some other backend exited uncleanly, it might have corrupted the
329 	 * control segment while it was dying.  In that case, we warn and ignore
330 	 * the contents of the control segment.  This may end up leaving behind
331 	 * stray shared memory segments, but there's not much we can do about that
332 	 * if the metadata is gone.
333 	 */
334 	nitems = dsm_control->nitems;
335 	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
336 	{
337 		ereport(LOG,
338 				(errmsg("dynamic shared memory control segment is corrupt")));
339 		return;
340 	}
341 
342 	/* Remove any remaining segments. */
343 	for (i = 0; i < nitems; ++i)
344 	{
345 		dsm_handle	handle;
346 
347 		/* If the reference count is 0, the slot is actually unused. */
348 		if (dsm_control->item[i].refcnt == 0)
349 			continue;
350 
351 		/* Log debugging information. */
352 		handle = dsm_control->item[i].handle;
353 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
354 			 handle);
355 
356 		/* Destroy the segment. */
357 		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
358 					&junk_mapped_address, &junk_mapped_size, LOG);
359 	}
360 
361 	/* Remove the control segment itself. */
362 	elog(DEBUG2,
363 		 "cleaning up dynamic shared memory control segment with ID %u",
364 		 dsm_control_handle);
365 	dsm_control_address = dsm_control;
366 	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
367 				&dsm_control_impl_private, &dsm_control_address,
368 				&dsm_control_mapped_size, LOG);
369 	dsm_control = dsm_control_address;
370 	shim->dsm_control = 0;
371 }
372 
373 /*
374  * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
375  * we must reread the state file and map the control segment; in other cases,
376  * we'll have inherited the postmaster's mapping and global variables.
377  */
378 static void
dsm_backend_startup(void)379 dsm_backend_startup(void)
380 {
381 #ifdef EXEC_BACKEND
382 	{
383 		void	   *control_address = NULL;
384 
385 		/* Attach control segment. */
386 		Assert(dsm_control_handle != 0);
387 		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
388 					&dsm_control_impl_private, &control_address,
389 					&dsm_control_mapped_size, ERROR);
390 		dsm_control = control_address;
391 		/* If control segment doesn't look sane, something is badly wrong. */
392 		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
393 		{
394 			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
395 						&dsm_control_impl_private, &control_address,
396 						&dsm_control_mapped_size, WARNING);
397 			ereport(FATAL,
398 					(errcode(ERRCODE_INTERNAL_ERROR),
399 					 errmsg("dynamic shared memory control segment is not valid")));
400 		}
401 	}
402 #endif
403 
404 	dsm_init_done = true;
405 }
406 
#ifdef EXEC_BACKEND
/*
 * When running under EXEC_BACKEND, we get a callback here when the main
 * shared memory segment is re-attached, so that we can record the control
 * handle retrieved from it.
 */
void
dsm_set_control_handle(dsm_handle h)
{
	Assert(dsm_control_handle == 0 && h != 0);
	dsm_control_handle = h;
}
#endif
420 
421 /*
422  * Create a new dynamic shared memory segment.
423  *
424  * If there is a non-NULL CurrentResourceOwner, the new segment is associated
425  * with it and must be detached before the resource owner releases, or a
426  * warning will be logged.  If CurrentResourceOwner is NULL, the segment
427  * remains attached until explicitly detached or the session ends.
428  * Creating with a NULL CurrentResourceOwner is equivalent to creating
429  * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
430  */
431 dsm_segment *
dsm_create(Size size,int flags)432 dsm_create(Size size, int flags)
433 {
434 	dsm_segment *seg;
435 	uint32		i;
436 	uint32		nitems;
437 
438 	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
439 	Assert(IsUnderPostmaster);
440 
441 	if (!dsm_init_done)
442 		dsm_backend_startup();
443 
444 	/* Create a new segment descriptor. */
445 	seg = dsm_create_descriptor();
446 
447 	/* Loop until we find an unused segment identifier. */
448 	for (;;)
449 	{
450 		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
451 		seg->handle = random();
452 		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
453 			continue;
454 		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
455 						&seg->mapped_address, &seg->mapped_size, ERROR))
456 			break;
457 	}
458 
459 	/* Lock the control segment so we can register the new segment. */
460 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
461 
462 	/* Search the control segment for an unused slot. */
463 	nitems = dsm_control->nitems;
464 	for (i = 0; i < nitems; ++i)
465 	{
466 		if (dsm_control->item[i].refcnt == 0)
467 		{
468 			dsm_control->item[i].handle = seg->handle;
469 			/* refcnt of 1 triggers destruction, so start at 2 */
470 			dsm_control->item[i].refcnt = 2;
471 			dsm_control->item[i].impl_private_pm_handle = NULL;
472 			dsm_control->item[i].pinned = false;
473 			seg->control_slot = i;
474 			LWLockRelease(DynamicSharedMemoryControlLock);
475 			return seg;
476 		}
477 	}
478 
479 	/* Verify that we can support an additional mapping. */
480 	if (nitems >= dsm_control->maxitems)
481 	{
482 		LWLockRelease(DynamicSharedMemoryControlLock);
483 		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
484 					&seg->mapped_address, &seg->mapped_size, WARNING);
485 		if (seg->resowner != NULL)
486 			ResourceOwnerForgetDSM(seg->resowner, seg);
487 		dlist_delete(&seg->node);
488 		pfree(seg);
489 
490 		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
491 			return NULL;
492 		ereport(ERROR,
493 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
494 				 errmsg("too many dynamic shared memory segments")));
495 	}
496 
497 	/* Enter the handle into a new array slot. */
498 	dsm_control->item[nitems].handle = seg->handle;
499 	/* refcnt of 1 triggers destruction, so start at 2 */
500 	dsm_control->item[nitems].refcnt = 2;
501 	dsm_control->item[nitems].impl_private_pm_handle = NULL;
502 	dsm_control->item[nitems].pinned = false;
503 	seg->control_slot = nitems;
504 	dsm_control->nitems++;
505 	LWLockRelease(DynamicSharedMemoryControlLock);
506 
507 	return seg;
508 }
509 
510 /*
511  * Attach a dynamic shared memory segment.
512  *
513  * See comments for dsm_segment_handle() for an explanation of how this
514  * is intended to be used.
515  *
516  * This function will return NULL if the segment isn't known to the system.
517  * This can happen if we're asked to attach the segment, but then everyone
518  * else detaches it (causing it to be destroyed) before we get around to
519  * attaching it.
520  *
521  * If there is a non-NULL CurrentResourceOwner, the attached segment is
522  * associated with it and must be detached before the resource owner releases,
523  * or a warning will be logged.  Otherwise the segment remains attached until
524  * explicitly detached or the session ends.  See the note atop dsm_create().
525  */
526 dsm_segment *
dsm_attach(dsm_handle h)527 dsm_attach(dsm_handle h)
528 {
529 	dsm_segment *seg;
530 	dlist_iter	iter;
531 	uint32		i;
532 	uint32		nitems;
533 
534 	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
535 	Assert(IsUnderPostmaster);
536 
537 	if (!dsm_init_done)
538 		dsm_backend_startup();
539 
540 	/*
541 	 * Since this is just a debugging cross-check, we could leave it out
542 	 * altogether, or include it only in assert-enabled builds.  But since the
543 	 * list of attached segments should normally be very short, let's include
544 	 * it always for right now.
545 	 *
546 	 * If you're hitting this error, you probably want to attempt to find an
547 	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
548 	 * create a new one.
549 	 */
550 	dlist_foreach(iter, &dsm_segment_list)
551 	{
552 		seg = dlist_container(dsm_segment, node, iter.cur);
553 		if (seg->handle == h)
554 			elog(ERROR, "can't attach the same segment more than once");
555 	}
556 
557 	/* Create a new segment descriptor. */
558 	seg = dsm_create_descriptor();
559 	seg->handle = h;
560 
561 	/* Bump reference count for this segment in shared memory. */
562 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
563 	nitems = dsm_control->nitems;
564 	for (i = 0; i < nitems; ++i)
565 	{
566 		/*
567 		 * If the reference count is 0, the slot is actually unused.  If the
568 		 * reference count is 1, the slot is still in use, but the segment is
569 		 * in the process of going away; even if the handle matches, another
570 		 * slot may already have started using the same handle value by
571 		 * coincidence so we have to keep searching.
572 		 */
573 		if (dsm_control->item[i].refcnt <= 1)
574 			continue;
575 
576 		/* If the handle doesn't match, it's not the slot we want. */
577 		if (dsm_control->item[i].handle != seg->handle)
578 			continue;
579 
580 		/* Otherwise we've found a match. */
581 		dsm_control->item[i].refcnt++;
582 		seg->control_slot = i;
583 		break;
584 	}
585 	LWLockRelease(DynamicSharedMemoryControlLock);
586 
587 	/*
588 	 * If we didn't find the handle we're looking for in the control segment,
589 	 * it probably means that everyone else who had it mapped, including the
590 	 * original creator, died before we got to this point. It's up to the
591 	 * caller to decide what to do about that.
592 	 */
593 	if (seg->control_slot == INVALID_CONTROL_SLOT)
594 	{
595 		dsm_detach(seg);
596 		return NULL;
597 	}
598 
599 	/* Here's where we actually try to map the segment. */
600 	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
601 				&seg->mapped_address, &seg->mapped_size, ERROR);
602 
603 	return seg;
604 }
605 
606 /*
607  * At backend shutdown time, detach any segments that are still attached.
608  * (This is similar to dsm_detach_all, except that there's no reason to
609  * unmap the control segment before exiting, so we don't bother.)
610  */
611 void
dsm_backend_shutdown(void)612 dsm_backend_shutdown(void)
613 {
614 	while (!dlist_is_empty(&dsm_segment_list))
615 	{
616 		dsm_segment *seg;
617 
618 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
619 		dsm_detach(seg);
620 	}
621 }
622 
623 /*
624  * Detach all shared memory segments, including the control segments.  This
625  * should be called, along with PGSharedMemoryDetach, in processes that
626  * might inherit mappings but are not intended to be connected to dynamic
627  * shared memory.
628  */
629 void
dsm_detach_all(void)630 dsm_detach_all(void)
631 {
632 	void	   *control_address = dsm_control;
633 
634 	while (!dlist_is_empty(&dsm_segment_list))
635 	{
636 		dsm_segment *seg;
637 
638 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
639 		dsm_detach(seg);
640 	}
641 
642 	if (control_address != NULL)
643 		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
644 					&dsm_control_impl_private, &control_address,
645 					&dsm_control_mapped_size, ERROR);
646 }
647 
648 /*
649  * Detach from a shared memory segment, destroying the segment if we
650  * remove the last reference.
651  *
652  * This function should never fail.  It will often be invoked when aborting
653  * a transaction, and a further error won't serve any purpose.  It's not a
654  * complete disaster if we fail to unmap or destroy the segment; it means a
655  * resource leak, but that doesn't necessarily preclude further operations.
656  */
657 void
dsm_detach(dsm_segment * seg)658 dsm_detach(dsm_segment *seg)
659 {
660 	/*
661 	 * Invoke registered callbacks.  Just in case one of those callbacks
662 	 * throws a further error that brings us back here, pop the callback
663 	 * before invoking it, to avoid infinite error recursion.  Don't allow
664 	 * interrupts while running the individual callbacks in non-error code
665 	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
666 	 * a statement timeout or similar.
667 	 */
668 	HOLD_INTERRUPTS();
669 	while (!slist_is_empty(&seg->on_detach))
670 	{
671 		slist_node *node;
672 		dsm_segment_detach_callback *cb;
673 		on_dsm_detach_callback function;
674 		Datum		arg;
675 
676 		node = slist_pop_head_node(&seg->on_detach);
677 		cb = slist_container(dsm_segment_detach_callback, node, node);
678 		function = cb->function;
679 		arg = cb->arg;
680 		pfree(cb);
681 
682 		function(seg, arg);
683 	}
684 	RESUME_INTERRUPTS();
685 
686 	/*
687 	 * Try to remove the mapping, if one exists.  Normally, there will be, but
688 	 * maybe not, if we failed partway through a create or attach operation.
689 	 * We remove the mapping before decrementing the reference count so that
690 	 * the process that sees a zero reference count can be certain that no
691 	 * remaining mappings exist.  Even if this fails, we pretend that it
692 	 * works, because retrying is likely to fail in the same way.
693 	 */
694 	if (seg->mapped_address != NULL)
695 	{
696 		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
697 					&seg->mapped_address, &seg->mapped_size, WARNING);
698 		seg->impl_private = NULL;
699 		seg->mapped_address = NULL;
700 		seg->mapped_size = 0;
701 	}
702 
703 	/* Reduce reference count, if we previously increased it. */
704 	if (seg->control_slot != INVALID_CONTROL_SLOT)
705 	{
706 		uint32		refcnt;
707 		uint32		control_slot = seg->control_slot;
708 
709 		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
710 		Assert(dsm_control->item[control_slot].handle == seg->handle);
711 		Assert(dsm_control->item[control_slot].refcnt > 1);
712 		refcnt = --dsm_control->item[control_slot].refcnt;
713 		seg->control_slot = INVALID_CONTROL_SLOT;
714 		LWLockRelease(DynamicSharedMemoryControlLock);
715 
716 		/* If new reference count is 1, try to destroy the segment. */
717 		if (refcnt == 1)
718 		{
719 			/* A pinned segment should never reach 1. */
720 			Assert(!dsm_control->item[control_slot].pinned);
721 
722 			/*
723 			 * If we fail to destroy the segment here, or are killed before we
724 			 * finish doing so, the reference count will remain at 1, which
725 			 * will mean that nobody else can attach to the segment.  At
726 			 * postmaster shutdown time, or when a new postmaster is started
727 			 * after a hard kill, another attempt will be made to remove the
728 			 * segment.
729 			 *
730 			 * The main case we're worried about here is being killed by a
731 			 * signal before we can finish removing the segment.  In that
732 			 * case, it's important to be sure that the segment still gets
733 			 * removed. If we actually fail to remove the segment for some
734 			 * other reason, the postmaster may not have any better luck than
735 			 * we did.  There's not much we can do about that, though.
736 			 */
737 			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
738 							&seg->mapped_address, &seg->mapped_size, WARNING))
739 			{
740 				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
741 				Assert(dsm_control->item[control_slot].handle == seg->handle);
742 				Assert(dsm_control->item[control_slot].refcnt == 1);
743 				dsm_control->item[control_slot].refcnt = 0;
744 				LWLockRelease(DynamicSharedMemoryControlLock);
745 			}
746 		}
747 	}
748 
749 	/* Clean up our remaining backend-private data structures. */
750 	if (seg->resowner != NULL)
751 		ResourceOwnerForgetDSM(seg->resowner, seg);
752 	dlist_delete(&seg->node);
753 	pfree(seg);
754 }
755 
756 /*
757  * Keep a dynamic shared memory mapping until end of session.
758  *
759  * By default, mappings are owned by the current resource owner, which
760  * typically means they stick around for the duration of the current query
761  * only.
762  */
763 void
dsm_pin_mapping(dsm_segment * seg)764 dsm_pin_mapping(dsm_segment *seg)
765 {
766 	if (seg->resowner != NULL)
767 	{
768 		ResourceOwnerForgetDSM(seg->resowner, seg);
769 		seg->resowner = NULL;
770 	}
771 }
772 
773 /*
774  * Arrange to remove a dynamic shared memory mapping at cleanup time.
775  *
776  * dsm_pin_mapping() can be used to preserve a mapping for the entire
777  * lifetime of a process; this function reverses that decision, making
778  * the segment owned by the current resource owner.  This may be useful
779  * just before performing some operation that will invalidate the segment
780  * for future use by this backend.
781  */
782 void
dsm_unpin_mapping(dsm_segment * seg)783 dsm_unpin_mapping(dsm_segment *seg)
784 {
785 	Assert(seg->resowner == NULL);
786 	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
787 	seg->resowner = CurrentResourceOwner;
788 	ResourceOwnerRememberDSM(seg->resowner, seg);
789 }
790 
791 /*
792  * Keep a dynamic shared memory segment until postmaster shutdown, or until
793  * dsm_unpin_segment is called.
794  *
795  * This function should not be called more than once per segment, unless the
796  * segment is explicitly unpinned with dsm_unpin_segment in between calls.
797  *
798  * Note that this function does not arrange for the current process to
799  * keep the segment mapped indefinitely; if that behavior is desired,
800  * dsm_pin_mapping() should be used from each process that needs to
801  * retain the mapping.
802  */
803 void
dsm_pin_segment(dsm_segment * seg)804 dsm_pin_segment(dsm_segment *seg)
805 {
806 	void	   *handle;
807 
808 	/*
809 	 * Bump reference count for this segment in shared memory. This will
810 	 * ensure that even if there is no session which is attached to this
811 	 * segment, it will remain until postmaster shutdown or an explicit call
812 	 * to unpin.
813 	 */
814 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
815 	if (dsm_control->item[seg->control_slot].pinned)
816 		elog(ERROR, "cannot pin a segment that is already pinned");
817 	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
818 	dsm_control->item[seg->control_slot].pinned = true;
819 	dsm_control->item[seg->control_slot].refcnt++;
820 	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
821 	LWLockRelease(DynamicSharedMemoryControlLock);
822 }
823 
824 /*
825  * Unpin a dynamic shared memory segment that was previously pinned with
826  * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
827  * was previously called for this segment.
828  *
829  * The argument is a dsm_handle rather than a dsm_segment in case you want
830  * to unpin a segment to which you haven't attached.  This turns out to be
831  * useful if, for example, a reference to one shared memory segment is stored
832  * within another shared memory segment.  You might want to unpin the
833  * referenced segment before destroying the referencing segment.
834  */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/*
		 * Skip unused slots and segments that are concurrently going away.
		 * (refcnt == 0 means the slot is free; refcnt == 1 means the segment
		 * is being destroyed and holds no references beyond the bookkeeping
		 * one.)
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run.  We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	dsm_impl_unpin_segment(handle,
						   &dsm_control->item[control_slot].impl_private_pm_handle);

	/*
	 * Drop the reference held by the pin.  Note that 1 means no references
	 * (0 means unused slot), so reaching 1 here means the pin held the last
	 * reference and we must physically destroy the segment below.
	 */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach.  Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin.  The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			/* Reacquire the lock only to mark the slot free (refcnt = 0). */
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}
913 
914 /*
915  * Find an existing mapping for a shared memory segment, if there is one.
916  */
917 dsm_segment *
dsm_find_mapping(dsm_handle h)918 dsm_find_mapping(dsm_handle h)
919 {
920 	dlist_iter	iter;
921 	dsm_segment *seg;
922 
923 	dlist_foreach(iter, &dsm_segment_list)
924 	{
925 		seg = dlist_container(dsm_segment, node, iter.cur);
926 		if (seg->handle == h)
927 			return seg;
928 	}
929 
930 	return NULL;
931 }
932 
933 /*
934  * Get the address at which a dynamic shared memory segment is mapped.
935  */
936 void *
dsm_segment_address(dsm_segment * seg)937 dsm_segment_address(dsm_segment *seg)
938 {
939 	Assert(seg->mapped_address != NULL);
940 	return seg->mapped_address;
941 }
942 
943 /*
944  * Get the size of a mapping.
945  */
946 Size
dsm_segment_map_length(dsm_segment * seg)947 dsm_segment_map_length(dsm_segment *seg)
948 {
949 	Assert(seg->mapped_address != NULL);
950 	return seg->mapped_size;
951 }
952 
953 /*
954  * Get a handle for a mapping.
955  *
956  * To establish communication via dynamic shared memory between two backends,
957  * one of them should first call dsm_create() to establish a new shared
958  * memory mapping.  That process should then call dsm_segment_handle() to
959  * obtain a handle for the mapping, and pass that handle to the
960  * coordinating backend via some means (e.g. bgw_main_arg, or via the
961  * main shared memory segment).  The recipient, once in possession of the
962  * handle, should call dsm_attach().
963  */
964 dsm_handle
dsm_segment_handle(dsm_segment * seg)965 dsm_segment_handle(dsm_segment *seg)
966 {
967 	return seg->handle;
968 }
969 
970 /*
971  * Register an on-detach callback for a dynamic shared memory segment.
972  */
973 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)974 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
975 {
976 	dsm_segment_detach_callback *cb;
977 
978 	cb = MemoryContextAlloc(TopMemoryContext,
979 							sizeof(dsm_segment_detach_callback));
980 	cb->function = function;
981 	cb->arg = arg;
982 	slist_push_head(&seg->on_detach, &cb->node);
983 }
984 
985 /*
986  * Unregister an on-detach callback for a dynamic shared memory segment.
987  */
988 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)989 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
990 					 Datum arg)
991 {
992 	slist_mutable_iter iter;
993 
994 	slist_foreach_modify(iter, &seg->on_detach)
995 	{
996 		dsm_segment_detach_callback *cb;
997 
998 		cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
999 		if (cb->function == function && cb->arg == arg)
1000 		{
1001 			slist_delete_current(&iter);
1002 			pfree(cb);
1003 			break;
1004 		}
1005 	}
1006 }
1007 
1008 /*
1009  * Discard all registered on-detach callbacks without executing them.
1010  */
1011 void
reset_on_dsm_detach(void)1012 reset_on_dsm_detach(void)
1013 {
1014 	dlist_iter	iter;
1015 
1016 	dlist_foreach(iter, &dsm_segment_list)
1017 	{
1018 		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
1019 
1020 		/* Throw away explicit on-detach actions one by one. */
1021 		while (!slist_is_empty(&seg->on_detach))
1022 		{
1023 			slist_node *node;
1024 			dsm_segment_detach_callback *cb;
1025 
1026 			node = slist_pop_head_node(&seg->on_detach);
1027 			cb = slist_container(dsm_segment_detach_callback, node, node);
1028 			pfree(cb);
1029 		}
1030 
1031 		/*
1032 		 * Decrementing the reference count is a sort of implicit on-detach
1033 		 * action; make sure we don't do that, either.
1034 		 */
1035 		seg->control_slot = INVALID_CONTROL_SLOT;
1036 	}
1037 }
1038 
1039 /*
1040  * Create a segment descriptor.
1041  */
1042 static dsm_segment *
dsm_create_descriptor(void)1043 dsm_create_descriptor(void)
1044 {
1045 	dsm_segment *seg;
1046 
1047 	if (CurrentResourceOwner)
1048 		ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
1049 
1050 	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
1051 	dlist_push_head(&dsm_segment_list, &seg->node);
1052 
1053 	/* seg->handle must be initialized by the caller */
1054 	seg->control_slot = INVALID_CONTROL_SLOT;
1055 	seg->impl_private = NULL;
1056 	seg->mapped_address = NULL;
1057 	seg->mapped_size = 0;
1058 
1059 	seg->resowner = CurrentResourceOwner;
1060 	if (CurrentResourceOwner)
1061 		ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
1062 
1063 	slist_init(&seg->on_detach);
1064 
1065 	return seg;
1066 }
1067 
1068 /*
1069  * Sanity check a control segment.
1070  *
1071  * The goal here isn't to detect everything that could possibly be wrong with
1072  * the control segment; there's not enough information for that.  Rather, the
1073  * goal is to make sure that someone can iterate over the items in the segment
1074  * without overrunning the end of the mapping and crashing.  We also check
1075  * the magic number since, if that's messed up, this may not even be one of
1076  * our segments at all.
1077  */
1078 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1079 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1080 {
1081 	if (mapped_size < offsetof(dsm_control_header, item))
1082 		return false;			/* Mapped size too short to read header. */
1083 	if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1084 		return false;			/* Magic number doesn't match. */
1085 	if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1086 		return false;			/* Max item count won't fit in map. */
1087 	if (control->nitems > control->maxitems)
1088 		return false;			/* Overfull. */
1089 	return true;
1090 }
1091 
1092 /*
1093  * Compute the number of control-segment bytes needed to store a given
1094  * number of items.
1095  */
1096 static uint64
dsm_control_bytes_needed(uint32 nitems)1097 dsm_control_bytes_needed(uint32 nitems)
1098 {
1099 	return offsetof(dsm_control_header, item)
1100 		+ sizeof(dsm_control_item) * (uint64) nitems;
1101 }
1102