1 /*-------------------------------------------------------------------------
2  *
3  * dsm.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides a set of services to make programming with dynamic
7  * shared memory segments more convenient.  Unlike the low-level
8  * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
9  * created using this module will be cleaned up automatically.  Mappings
10  * will be removed when the resource owner under which they were created
11  * is cleaned up, unless dsm_pin_mapping() is used, in which case they
12  * have session lifespan.  Segments will be removed when there are no
13  * remaining mappings, or at postmaster shutdown in any case.  After a
14  * hard postmaster crash, remaining segments will be removed, if they
15  * still exist, at the next postmaster startup.
16  *
17  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  *
21  * IDENTIFICATION
22  *	  src/backend/storage/ipc/dsm.c
23  *
24  *-------------------------------------------------------------------------
25  */
26 
27 #include "postgres.h"
28 
29 #include <fcntl.h>
30 #include <unistd.h>
31 #ifndef WIN32
32 #include <sys/mman.h>
33 #endif
34 #include <sys/stat.h>
35 
36 #include "lib/ilist.h"
37 #include "miscadmin.h"
38 #include "storage/dsm.h"
39 #include "storage/ipc.h"
40 #include "storage/lwlock.h"
41 #include "storage/pg_shmem.h"
42 #include "utils/guc.h"
43 #include "utils/memutils.h"
44 #include "utils/resowner_private.h"
45 
46 #define PG_DYNSHMEM_CONTROL_MAGIC		0x9a503d32
47 
48 #define PG_DYNSHMEM_FIXED_SLOTS			64
49 #define PG_DYNSHMEM_SLOTS_PER_BACKEND	5
50 
51 #define INVALID_CONTROL_SLOT		((uint32) -1)
52 
53 /* Backend-local tracking for on-detach callbacks. */
54 typedef struct dsm_segment_detach_callback
55 {
56 	on_dsm_detach_callback function;
57 	Datum		arg;
58 	slist_node	node;
59 } dsm_segment_detach_callback;
60 
61 /* Backend-local state for a dynamic shared memory segment. */
62 struct dsm_segment
63 {
64 	dlist_node	node;			/* List link in dsm_segment_list. */
65 	ResourceOwner resowner;		/* Resource owner. */
66 	dsm_handle	handle;			/* Segment name. */
67 	uint32		control_slot;	/* Slot in control segment. */
68 	void	   *impl_private;	/* Implementation-specific private data. */
69 	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
70 	Size		mapped_size;	/* Size of our mapping. */
71 	slist_head	on_detach;		/* On-detach callbacks. */
72 };
73 
74 /* Shared-memory state for a dynamic shared memory segment. */
75 typedef struct dsm_control_item
76 {
77 	dsm_handle	handle;
78 	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
79 	void	   *impl_private_pm_handle; /* only needed on Windows */
80 	bool		pinned;
81 } dsm_control_item;
82 
83 /* Layout of the dynamic shared memory control segment. */
84 typedef struct dsm_control_header
85 {
86 	uint32		magic;
87 	uint32		nitems;
88 	uint32		maxitems;
89 	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
90 } dsm_control_header;
91 
92 static void dsm_cleanup_for_mmap(void);
93 static void dsm_postmaster_shutdown(int code, Datum arg);
94 static dsm_segment *dsm_create_descriptor(void);
95 static bool dsm_control_segment_sane(dsm_control_header *control,
96 						 Size mapped_size);
97 static uint64 dsm_control_bytes_needed(uint32 nitems);
98 
99 /* Has this backend initialized the dynamic shared memory system yet? */
100 static bool dsm_init_done = false;
101 
102 /*
103  * List of dynamic shared memory segments used by this backend.
104  *
105  * At process exit time, we must decrement the reference count of each
106  * segment we have attached; this list makes it possible to find all such
107  * segments.
108  *
109  * This list should always be empty in the postmaster.  We could probably
110  * allow the postmaster to map dynamic shared memory segments before it
111  * begins to start child processes, provided that each process adjusted
112  * the reference counts for those segments in the control segment at
113  * startup time, but there's no obvious need for such a facility, which
114  * would also be complex to handle in the EXEC_BACKEND case.  Once the
115  * postmaster has begun spawning children, there's an additional problem:
116  * each new mapping would require an update to the control segment,
117  * which requires locking, in which the postmaster must not be involved.
118  */
119 static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
120 
121 /*
122  * Control segment information.
123  *
124  * Unlike ordinary shared memory segments, the control segment is not
125  * reference counted; instead, it lasts for the postmaster's entire
126  * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
127  */
128 static dsm_handle dsm_control_handle;
129 static dsm_control_header *dsm_control;
130 static Size dsm_control_mapped_size = 0;
131 static void *dsm_control_impl_private = NULL;
132 
133 /*
134  * Start up the dynamic shared memory system.
135  *
136  * This is called just once during each cluster lifetime, at postmaster
137  * startup time.
138  */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	/* This runs in the postmaster itself, before any children are forked. */
	Assert(!IsUnderPostmaster);

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/*
	 * Determine size for new control segment: a fixed number of slots plus a
	 * per-backend allowance, converted to bytes by dsm_control_bytes_needed.
	 */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.  DSM_OP_CREATE presumably returns false when the handle is
	 * already in use, prompting a retry with a fresh random handle -- confirm
	 * against dsm_impl.c.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	/* Arrange for cleanup at postmaster exit, then advertise the handle. */
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment header; no slots are in use yet. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}
198 
199 /*
200  * Determine whether the control segment from the previous postmaster
201  * invocation still exists.  If so, remove the dynamic shared memory
202  * segments to which it refers, and then the control segment itself.
203  */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * Try to attach the segment.  If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID.  So just fall
	 * out quietly.  (DEBUG1 keeps a failure here from being user-visible.)
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all: just detach and leave
	 * whatever it pointed at alone, since we can't trust its slot table.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = old_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/*
		 * Destroy the referenced segment.  Use LOG elevel so that a failure
		 * on one segment doesn't abort cleanup of the rest.
		 */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}
275 
276 /*
277  * When we're using the mmap shared memory implementation, "shared memory"
278  * segments might even manage to survive an operating system reboot.
279  * But there's no guarantee as to exactly what will survive: some segments
280  * may survive, and others may not, and the contents of some may be out
281  * of date.  In particular, the control segment may be out of date, so we
282  * can't rely on it to figure out what to remove.  However, since we know
283  * what directory contains the files we used as shared memory, we can simply
284  * scan the directory and blow everything away that shouldn't be there.
285  */
286 static void
dsm_cleanup_for_mmap(void)287 dsm_cleanup_for_mmap(void)
288 {
289 	DIR		   *dir;
290 	struct dirent *dent;
291 
292 	/* Scan the directory for something with a name of the correct format. */
293 	dir = AllocateDir(PG_DYNSHMEM_DIR);
294 
295 	while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
296 	{
297 		if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
298 					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
299 		{
300 			char		buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
301 
302 			snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
303 
304 			elog(DEBUG2, "removing file \"%s\"", buf);
305 
306 			/* We found a matching file; so remove it. */
307 			if (unlink(buf) != 0)
308 				ereport(ERROR,
309 						(errcode_for_file_access(),
310 						 errmsg("could not remove file \"%s\": %m", buf)));
311 		}
312 	}
313 
314 	/* Cleanup complete. */
315 	FreeDir(dir);
316 }
317 
318 /*
319  * At shutdown time, we iterate over the control segment and remove all
320  * remaining dynamic shared memory segments.  We avoid throwing errors here;
321  * the postmaster is shutting down either way, and this is just non-critical
322  * resource cleanup.
323  */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying.  In that case, we warn and ignore
	 * the contents of the control segment.  This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.  (nitems is read before the sanity check, but
	 * it is only used after the check passes.)
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments; use LOG elevel so we never error out. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = dsm_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/* Destroy the segment. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself, and clear the advertised handle. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	shim->dsm_control = 0;
}
380 
381 /*
382  * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
383  * we must reread the state file and map the control segment; in other cases,
384  * we'll have inherited the postmaster's mapping and global variables.
385  */
static void
dsm_backend_startup(void)
{
	/* If dynamic shared memory is disabled, reject this. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("dynamic shared memory is disabled"),
				 errhint("Set dynamic_shared_memory_type to a value other than \"none\".")));

#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/*
		 * Attach control segment.  Under EXEC_BACKEND we did not inherit the
		 * postmaster's mapping, so we must map it ourselves using the handle
		 * previously recorded via dsm_set_control_handle().
		 */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			/* Detach before bailing out, so we don't leak the mapping. */
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	/* Remember that initialization succeeded, so we only do it once. */
	dsm_init_done = true;
}
421 
422 #ifdef EXEC_BACKEND
423 /*
424  * When running under EXEC_BACKEND, we get a callback here when the main
425  * shared memory segment is re-attached, so that we can record the control
426  * handle retrieved from it.
427  */
428 void
dsm_set_control_handle(dsm_handle h)429 dsm_set_control_handle(dsm_handle h)
430 {
431 	Assert(dsm_control_handle == 0 && h != 0);
432 	dsm_control_handle = h;
433 }
434 #endif
435 
436 /*
437  * Create a new dynamic shared memory segment.
438  *
439  * If there is a non-NULL CurrentResourceOwner, the new segment is associated
440  * with it and must be detached before the resource owner releases, or a
441  * warning will be logged.  If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
443  * Creating with a NULL CurrentResourceOwner is equivalent to creating
444  * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
445  */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* Map the control segment first, if we haven't already. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/*
	 * Loop until we find an unused segment identifier.  DSM_OP_CREATE
	 * presumably returns false when the randomly chosen handle is already in
	 * use, in which case we retry with a new one -- confirm against
	 * dsm_impl.c.
	 */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			/* Recycle this slot; nitems is unchanged on this path. */
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		/*
		 * Out of slots: back out everything we've done so far -- destroy the
		 * impl-level segment and discard the backend-local descriptor.
		 */
		LWLockRelease(DynamicSharedMemoryControlLock);
		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		/* Report the failure as the caller requested: NULL or ERROR. */
		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
524 
525 /*
526  * Attach a dynamic shared memory segment.
527  *
528  * See comments for dsm_segment_handle() for an explanation of how this
529  * is intended to be used.
530  *
531  * This function will return NULL if the segment isn't known to the system.
532  * This can happen if we're asked to attach the segment, but then everyone
533  * else detaches it (causing it to be destroyed) before we get around to
534  * attaching it.
535  *
536  * If there is a non-NULL CurrentResourceOwner, the attached segment is
537  * associated with it and must be detached before the resource owner releases,
538  * or a warning will be logged.  Otherwise the segment remains attached until
 * explicitly detached or the session ends.  See the note atop dsm_create().
540  */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* Map the control segment first, if we haven't already. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point. It's up to the
	 * caller to decide what to do about that.  (control_slot is presumably
	 * initialized to INVALID_CONTROL_SLOT by dsm_create_descriptor --
	 * confirm, since that function is defined elsewhere in this file.)
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
620 
621 /*
622  * At backend shutdown time, detach any segments that are still attached.
623  * (This is similar to dsm_detach_all, except that there's no reason to
624  * unmap the control segment before exiting, so we don't bother.)
625  */
626 void
dsm_backend_shutdown(void)627 dsm_backend_shutdown(void)
628 {
629 	while (!dlist_is_empty(&dsm_segment_list))
630 	{
631 		dsm_segment *seg;
632 
633 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
634 		dsm_detach(seg);
635 	}
636 }
637 
638 /*
639  * Detach all shared memory segments, including the control segments.  This
640  * should be called, along with PGSharedMemoryDetach, in processes that
641  * might inherit mappings but are not intended to be connected to dynamic
642  * shared memory.
643  */
644 void
dsm_detach_all(void)645 dsm_detach_all(void)
646 {
647 	void	   *control_address = dsm_control;
648 
649 	while (!dlist_is_empty(&dsm_segment_list))
650 	{
651 		dsm_segment *seg;
652 
653 		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
654 		dsm_detach(seg);
655 	}
656 
657 	if (control_address != NULL)
658 		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
659 					&dsm_control_impl_private, &control_address,
660 					&dsm_control_mapped_size, ERROR);
661 }
662 
663 /*
664  * Resize an existing shared memory segment.
665  *
666  * This may cause the shared memory segment to be remapped at a different
667  * address.  For the caller's convenience, we return the mapped address.
668  */
669 void *
dsm_resize(dsm_segment * seg,Size size)670 dsm_resize(dsm_segment *seg, Size size)
671 {
672 	Assert(seg->control_slot != INVALID_CONTROL_SLOT);
673 	dsm_impl_op(DSM_OP_RESIZE, seg->handle, size, &seg->impl_private,
674 				&seg->mapped_address, &seg->mapped_size, ERROR);
675 	return seg->mapped_address;
676 }
677 
678 /*
679  * Remap an existing shared memory segment.
680  *
681  * This is intended to be used when some other process has extended the
682  * mapping using dsm_resize(), but we've still only got the initial
683  * portion mapped.  Since this might change the address at which the
684  * segment is mapped, we return the new mapped address.
685  */
686 void *
dsm_remap(dsm_segment * seg)687 dsm_remap(dsm_segment *seg)
688 {
689 	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
690 				&seg->mapped_address, &seg->mapped_size, ERROR);
691 
692 	return seg->mapped_address;
693 }
694 
695 /*
696  * Detach from a shared memory segment, destroying the segment if we
697  * remove the last reference.
698  *
699  * This function should never fail.  It will often be invoked when aborting
700  * a transaction, and a further error won't serve any purpose.  It's not a
701  * complete disaster if we fail to unmap or destroy the segment; it means a
702  * resource leak, but that doesn't necessarily preclude further operations.
703  */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.  (Callbacks run in LIFO order, since
	 * they are popped from the head of the slist.)
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		/* Copy out what we need, then free the entry before calling it. */
		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/* A pinned segment should never reach 1. */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed. If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				/* Destruction succeeded; mark the slot free under the lock. */
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}
802 
803 /*
804  * Keep a dynamic shared memory mapping until end of session.
805  *
806  * By default, mappings are owned by the current resource owner, which
807  * typically means they stick around for the duration of the current query
808  * only.
809  */
810 void
dsm_pin_mapping(dsm_segment * seg)811 dsm_pin_mapping(dsm_segment *seg)
812 {
813 	if (seg->resowner != NULL)
814 	{
815 		ResourceOwnerForgetDSM(seg->resowner, seg);
816 		seg->resowner = NULL;
817 	}
818 }
819 
820 /*
821  * Arrange to remove a dynamic shared memory mapping at cleanup time.
822  *
823  * dsm_pin_mapping() can be used to preserve a mapping for the entire
824  * lifetime of a process; this function reverses that decision, making
825  * the segment owned by the current resource owner.  This may be useful
826  * just before performing some operation that will invalidate the segment
827  * for future use by this backend.
828  */
829 void
dsm_unpin_mapping(dsm_segment * seg)830 dsm_unpin_mapping(dsm_segment *seg)
831 {
832 	Assert(seg->resowner == NULL);
833 	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
834 	seg->resowner = CurrentResourceOwner;
835 	ResourceOwnerRememberDSM(seg->resowner, seg);
836 }
837 
838 /*
839  * Keep a dynamic shared memory segment until postmaster shutdown, or until
840  * dsm_unpin_segment is called.
841  *
842  * This function should not be called more than once per segment, unless the
843  * segment is explicitly unpinned with dsm_unpin_segment in between calls.
844  *
845  * Note that this function does not arrange for the current process to
846  * keep the segment mapped indefinitely; if that behavior is desired,
847  * dsm_pin_mapping() should be used from each process that needs to
848  * retain the mapping.
849  */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle;			/* impl-private state from pinning */

	/*
	 * Bump reference count for this segment in shared memory. This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.  The impl-level pin, the pinned flag, and the extra
	 * reference are all established while holding the control lock.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	/* Save impl-private handle; per dsm_control_item, only used on Windows. */
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}
870 
871 /*
872  * Unpin a dynamic shared memory segment that was previously pinned with
873  * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
874  * was previously called for this segment.
875  *
876  * The argument is a dsm_handle rather than a dsm_segment in case you want
877  * to unpin a segment to which you haven't attached.  This turns out to be
878  * useful if, for example, a reference to one shared memory segment is stored
879  * within another shared memory segment.  You might want to unpin the
880  * referenced segment before destroying the referencing segment.
881  */
882 void
dsm_unpin_segment(dsm_handle handle)883 dsm_unpin_segment(dsm_handle handle)
884 {
885 	uint32		control_slot = INVALID_CONTROL_SLOT;
886 	bool		destroy = false;
887 	uint32		i;
888 
889 	/* Find the control slot for the given handle. */
890 	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
891 	for (i = 0; i < dsm_control->nitems; ++i)
892 	{
893 		/* Skip unused slots and segments that are concurrently going away. */
894 		if (dsm_control->item[i].refcnt <= 1)
895 			continue;
896 
897 		/* If we've found our handle, we can stop searching. */
898 		if (dsm_control->item[i].handle == handle)
899 		{
900 			control_slot = i;
901 			break;
902 		}
903 	}
904 
905 	/*
906 	 * We should definitely have found the slot, and it should not already be
907 	 * in the process of going away, because this function should only be
908 	 * called on a segment which is pinned.
909 	 */
910 	if (control_slot == INVALID_CONTROL_SLOT)
911 		elog(ERROR, "cannot unpin unknown segment handle");
912 	if (!dsm_control->item[control_slot].pinned)
913 		elog(ERROR, "cannot unpin a segment that is not pinned");
914 	Assert(dsm_control->item[control_slot].refcnt > 1);
915 
916 	/*
917 	 * Allow implementation-specific code to run.  We have to do this before
918 	 * releasing the lock, because impl_private_pm_handle may get modified by
919 	 * dsm_impl_unpin_segment.
920 	 */
921 	dsm_impl_unpin_segment(handle,
922 						   &dsm_control->item[control_slot].impl_private_pm_handle);
923 
924 	/* Note that 1 means no references (0 means unused slot). */
925 	if (--dsm_control->item[control_slot].refcnt == 1)
926 		destroy = true;
927 	dsm_control->item[control_slot].pinned = false;
928 
929 	/* Now we can release the lock. */
930 	LWLockRelease(DynamicSharedMemoryControlLock);
931 
932 	/* Clean up resources if that was the last reference. */
933 	if (destroy)
934 	{
935 		void	   *junk_impl_private = NULL;
936 		void	   *junk_mapped_address = NULL;
937 		Size		junk_mapped_size = 0;
938 
939 		/*
940 		 * For an explanation of how error handling works in this case, see
941 		 * comments in dsm_detach.  Note that if we reach this point, the
942 		 * current process certainly does not have the segment mapped, because
943 		 * if it did, the reference count would have still been greater than 1
944 		 * even after releasing the reference count held by the pin.  The fact
945 		 * that there can't be a dsm_segment for this handle makes it OK to
946 		 * pass the mapped size, mapped address, and private data as NULL
947 		 * here.
948 		 */
949 		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
950 						&junk_mapped_address, &junk_mapped_size, WARNING))
951 		{
952 			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
953 			Assert(dsm_control->item[control_slot].handle == handle);
954 			Assert(dsm_control->item[control_slot].refcnt == 1);
955 			dsm_control->item[control_slot].refcnt = 0;
956 			LWLockRelease(DynamicSharedMemoryControlLock);
957 		}
958 	}
959 }
960 
961 /*
962  * Find an existing mapping for a shared memory segment, if there is one.
963  */
964 dsm_segment *
dsm_find_mapping(dsm_handle h)965 dsm_find_mapping(dsm_handle h)
966 {
967 	dlist_iter	iter;
968 	dsm_segment *seg;
969 
970 	dlist_foreach(iter, &dsm_segment_list)
971 	{
972 		seg = dlist_container(dsm_segment, node, iter.cur);
973 		if (seg->handle == h)
974 			return seg;
975 	}
976 
977 	return NULL;
978 }
979 
980 /*
981  * Get the address at which a dynamic shared memory segment is mapped.
982  */
983 void *
dsm_segment_address(dsm_segment * seg)984 dsm_segment_address(dsm_segment *seg)
985 {
986 	Assert(seg->mapped_address != NULL);
987 	return seg->mapped_address;
988 }
989 
990 /*
991  * Get the size of a mapping.
992  */
993 Size
dsm_segment_map_length(dsm_segment * seg)994 dsm_segment_map_length(dsm_segment *seg)
995 {
996 	Assert(seg->mapped_address != NULL);
997 	return seg->mapped_size;
998 }
999 
1000 /*
1001  * Get a handle for a mapping.
1002  *
1003  * To establish communication via dynamic shared memory between two backends,
1004  * one of them should first call dsm_create() to establish a new shared
1005  * memory mapping.  That process should then call dsm_segment_handle() to
1006  * obtain a handle for the mapping, and pass that handle to the
1007  * coordinating backend via some means (e.g. bgw_main_arg, or via the
1008  * main shared memory segment).  The recipient, once in possession of the
1009  * handle, should call dsm_attach().
1010  */
1011 dsm_handle
dsm_segment_handle(dsm_segment * seg)1012 dsm_segment_handle(dsm_segment *seg)
1013 {
1014 	return seg->handle;
1015 }
1016 
1017 /*
1018  * Register an on-detach callback for a dynamic shared memory segment.
1019  */
1020 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1021 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
1022 {
1023 	dsm_segment_detach_callback *cb;
1024 
1025 	cb = MemoryContextAlloc(TopMemoryContext,
1026 							sizeof(dsm_segment_detach_callback));
1027 	cb->function = function;
1028 	cb->arg = arg;
1029 	slist_push_head(&seg->on_detach, &cb->node);
1030 }
1031 
1032 /*
1033  * Unregister an on-detach callback for a dynamic shared memory segment.
1034  */
1035 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1036 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
1037 					 Datum arg)
1038 {
1039 	slist_mutable_iter iter;
1040 
1041 	slist_foreach_modify(iter, &seg->on_detach)
1042 	{
1043 		dsm_segment_detach_callback *cb;
1044 
1045 		cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
1046 		if (cb->function == function && cb->arg == arg)
1047 		{
1048 			slist_delete_current(&iter);
1049 			pfree(cb);
1050 			break;
1051 		}
1052 	}
1053 }
1054 
1055 /*
1056  * Discard all registered on-detach callbacks without executing them.
1057  */
1058 void
reset_on_dsm_detach(void)1059 reset_on_dsm_detach(void)
1060 {
1061 	dlist_iter	iter;
1062 
1063 	dlist_foreach(iter, &dsm_segment_list)
1064 	{
1065 		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
1066 
1067 		/* Throw away explicit on-detach actions one by one. */
1068 		while (!slist_is_empty(&seg->on_detach))
1069 		{
1070 			slist_node *node;
1071 			dsm_segment_detach_callback *cb;
1072 
1073 			node = slist_pop_head_node(&seg->on_detach);
1074 			cb = slist_container(dsm_segment_detach_callback, node, node);
1075 			pfree(cb);
1076 		}
1077 
1078 		/*
1079 		 * Decrementing the reference count is a sort of implicit on-detach
1080 		 * action; make sure we don't do that, either.
1081 		 */
1082 		seg->control_slot = INVALID_CONTROL_SLOT;
1083 	}
1084 }
1085 
1086 /*
1087  * Create a segment descriptor.
1088  */
1089 static dsm_segment *
dsm_create_descriptor(void)1090 dsm_create_descriptor(void)
1091 {
1092 	dsm_segment *seg;
1093 
1094 	if (CurrentResourceOwner)
1095 		ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
1096 
1097 	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
1098 	dlist_push_head(&dsm_segment_list, &seg->node);
1099 
1100 	/* seg->handle must be initialized by the caller */
1101 	seg->control_slot = INVALID_CONTROL_SLOT;
1102 	seg->impl_private = NULL;
1103 	seg->mapped_address = NULL;
1104 	seg->mapped_size = 0;
1105 
1106 	seg->resowner = CurrentResourceOwner;
1107 	if (CurrentResourceOwner)
1108 		ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
1109 
1110 	slist_init(&seg->on_detach);
1111 
1112 	return seg;
1113 }
1114 
1115 /*
1116  * Sanity check a control segment.
1117  *
1118  * The goal here isn't to detect everything that could possibly be wrong with
1119  * the control segment; there's not enough information for that.  Rather, the
1120  * goal is to make sure that someone can iterate over the items in the segment
1121  * without overrunning the end of the mapping and crashing.  We also check
1122  * the magic number since, if that's messed up, this may not even be one of
1123  * our segments at all.
1124  */
1125 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1126 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1127 {
1128 	if (mapped_size < offsetof(dsm_control_header, item))
1129 		return false;			/* Mapped size too short to read header. */
1130 	if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1131 		return false;			/* Magic number doesn't match. */
1132 	if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1133 		return false;			/* Max item count won't fit in map. */
1134 	if (control->nitems > control->maxitems)
1135 		return false;			/* Overfull. */
1136 	return true;
1137 }
1138 
1139 /*
1140  * Compute the number of control-segment bytes needed to store a given
1141  * number of items.
1142  */
1143 static uint64
dsm_control_bytes_needed(uint32 nitems)1144 dsm_control_bytes_needed(uint32 nitems)
1145 {
1146 	return offsetof(dsm_control_header, item)
1147 		+ sizeof(dsm_control_item) * (uint64) nitems;
1148 }
1149