1 /*-------------------------------------------------------------------------
2 *
3 * dsm.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides a set of services to make programming with dynamic
7 * shared memory segments more convenient. Unlike the low-level
8 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
9 * created using this module will be cleaned up automatically. Mappings
10 * will be removed when the resource owner under which they were created
11 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
12 * have session lifespan. Segments will be removed when there are no
13 * remaining mappings, or at postmaster shutdown in any case. After a
14 * hard postmaster crash, remaining segments will be removed, if they
15 * still exist, at the next postmaster startup.
16 *
17 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 *
21 * IDENTIFICATION
22 * src/backend/storage/ipc/dsm.c
23 *
24 *-------------------------------------------------------------------------
25 */
26
27 #include "postgres.h"
28
29 #include <fcntl.h>
30 #include <unistd.h>
31 #ifndef WIN32
32 #include <sys/mman.h>
33 #endif
34 #include <sys/stat.h>
35
36 #include "lib/ilist.h"
37 #include "miscadmin.h"
38 #include "storage/dsm.h"
39 #include "storage/ipc.h"
40 #include "storage/lwlock.h"
41 #include "storage/pg_shmem.h"
42 #include "utils/guc.h"
43 #include "utils/memutils.h"
44 #include "utils/resowner_private.h"
45
46 #define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32
47
48 #define PG_DYNSHMEM_FIXED_SLOTS 64
49 #define PG_DYNSHMEM_SLOTS_PER_BACKEND 5
50
51 #define INVALID_CONTROL_SLOT ((uint32) -1)
52
53 /* Backend-local tracking for on-detach callbacks. */
/*
 * Backend-local tracking for on-detach callbacks.
 *
 * Each attached segment carries a list of these in seg->on_detach;
 * dsm_detach pops and invokes each one before unmapping the segment.
 */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;	/* callback to invoke at detach time */
	Datum		arg;			/* argument passed through to the callback */
	slist_node	node;			/* list link in dsm_segment->on_detach */
} dsm_segment_detach_callback;
60
61 /* Backend-local state for a dynamic shared memory segment. */
/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner, or NULL if the mapping has
								 * session lifespan (see dsm_pin_mapping). */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment, or
								 * INVALID_CONTROL_SLOT if not registered. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks. */
};
73
74 /* Shared-memory state for a dynamic shared memory segment. */
/* Shared-memory state for a dynamic shared memory segment. */
typedef struct dsm_control_item
{
	dsm_handle	handle;			/* identifier of the segment this slot tracks */
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
	void	   *impl_private_pm_handle; /* only needed on Windows */
	bool		pinned;			/* true after dsm_pin_segment; holds one extra
								 * reference so the segment outlives mappings */
} dsm_control_item;
82
83 /* Layout of the dynamic shared memory control segment. */
/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;			/* always PG_DYNSHMEM_CONTROL_MAGIC, checked by
								 * dsm_control_segment_sane */
	uint32		nitems;			/* number of slots ever handed out; freed slots
								 * (refcnt == 0) are reused without decrement */
	uint32		maxitems;		/* allocated length of item[] */
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
} dsm_control_header;
91
/* Forward declarations of functions private to this module. */
static void dsm_cleanup_for_mmap(void);
static void dsm_postmaster_shutdown(int code, Datum arg);
static dsm_segment *dsm_create_descriptor(void);
static bool dsm_control_segment_sane(dsm_control_header *control,
						 Size mapped_size);
static uint64 dsm_control_bytes_needed(uint32 nitems);

/* Has this backend initialized the dynamic shared memory system yet? */
static bool dsm_init_done = false;

/*
 * List of dynamic shared memory segments used by this backend.
 *
 * At process exit time, we must decrement the reference count of each
 * segment we have attached; this list makes it possible to find all such
 * segments.
 *
 * This list should always be empty in the postmaster.  We could probably
 * allow the postmaster to map dynamic shared memory segments before it
 * begins to start child processes, provided that each process adjusted
 * the reference counts for those segments in the control segment at
 * startup time, but there's no obvious need for such a facility, which
 * would also be complex to handle in the EXEC_BACKEND case.  Once the
 * postmaster has begun spawning children, there's an additional problem:
 * each new mapping would require an update to the control segment,
 * which requires locking, in which the postmaster must not be involved.
 */
static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);

/*
 * Control segment information.
 *
 * Unlike ordinary shared memory segments, the control segment is not
 * reference counted; instead, it lasts for the postmaster's entire
 * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
 */
static dsm_handle dsm_control_handle;	/* handle of control segment, or 0 */
static dsm_control_header *dsm_control; /* this process's mapping of it */
static Size dsm_control_mapped_size = 0;	/* size of that mapping */
static void *dsm_control_impl_private = NULL;	/* impl-specific state */
132
133 /*
134 * Start up the dynamic shared memory system.
135 *
136 * This is called just once during each cluster lifetime, at postmaster
137 * startup time.
138 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	/* Runs only in the postmaster, before any children are launched. */
	Assert(!IsUnderPostmaster);

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		/* DSM_OP_CREATE fails (returns false) on handle collision; retry. */
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;

	/* Arrange for the control segment to be removed at postmaster exit. */
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);

	/*
	 * Record the handle in the main shared memory header, where a future
	 * postmaster invocation can find it for cleanup (see
	 * dsm_cleanup_using_control_segment).
	 */
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}
198
199 /*
200 * Determine whether the control segment from the previous postmaster
201 * invocation still exists. If so, remove the dynamic shared memory
202 * segments to which it refers, and then the control segment itself.
203 */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * Try to attach the segment.  If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID.  So just fall
	 * out quietly.
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = old_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/*
		 * Destroy the referenced segment.  The junk_* variables merely
		 * absorb outputs we have no use for: we never mapped this segment
		 * ourselves, and destruction errors are only logged, not raised.
		 */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}
275
276 /*
277 * When we're using the mmap shared memory implementation, "shared memory"
278 * segments might even manage to survive an operating system reboot.
279 * But there's no guarantee as to exactly what will survive: some segments
280 * may survive, and others may not, and the contents of some may be out
281 * of date. In particular, the control segment may be out of date, so we
282 * can't rely on it to figure out what to remove. However, since we know
283 * what directory contains the files we used as shared memory, we can simply
284 * scan the directory and blow everything away that shouldn't be there.
285 */
286 static void
dsm_cleanup_for_mmap(void)287 dsm_cleanup_for_mmap(void)
288 {
289 DIR *dir;
290 struct dirent *dent;
291
292 /* Open the directory; can't use AllocateDir in postmaster. */
293 if ((dir = AllocateDir(PG_DYNSHMEM_DIR)) == NULL)
294 ereport(ERROR,
295 (errcode_for_file_access(),
296 errmsg("could not open directory \"%s\": %m",
297 PG_DYNSHMEM_DIR)));
298
299 /* Scan for something with a name of the correct format. */
300 while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
301 {
302 if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
303 strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
304 {
305 char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
306
307 snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
308
309 elog(DEBUG2, "removing file \"%s\"", buf);
310
311 /* We found a matching file; so remove it. */
312 if (unlink(buf) != 0)
313 {
314 int save_errno;
315
316 save_errno = errno;
317 closedir(dir);
318 errno = save_errno;
319
320 ereport(ERROR,
321 (errcode_for_file_access(),
322 errmsg("could not remove file \"%s\": %m", buf)));
323 }
324 }
325 }
326
327 /* Cleanup complete. */
328 FreeDir(dir);
329 }
330
331 /*
332 * At shutdown time, we iterate over the control segment and remove all
333 * remaining dynamic shared memory segments. We avoid throwing errors here;
334 * the postmaster is shutting down either way, and this is just non-critical
335 * resource cleanup.
336 */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying.  In that case, we warn and ignore
	 * the contents of the control segment.  This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = dsm_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/*
		 * Destroy the segment.  The junk_* variables absorb outputs we
		 * don't need; any failure is merely logged, since the postmaster is
		 * exiting anyway.
		 */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	/* Clear the advertised handle so nobody tries to reuse it. */
	shim->dsm_control = 0;
}
393
394 /*
395 * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND,
396 * we must reread the state file and map the control segment; in other cases,
397 * we'll have inherited the postmaster's mapping and global variables.
398 */
static void
dsm_backend_startup(void)
{
	/* If dynamic shared memory is disabled, reject this. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("dynamic shared memory is disabled"),
				 errhint("Set dynamic_shared_memory_type to a value other than \"none\".")));

#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/*
		 * Attach control segment.  Under EXEC_BACKEND we did not inherit the
		 * postmaster's mapping, so we must map it ourselves using the handle
		 * recorded via dsm_set_control_handle.
		 */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			/* Detach (WARNING only) before erroring out of the backend. */
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	/* Remember that startup is complete so we only do this once. */
	dsm_init_done = true;
}
434
435 #ifdef EXEC_BACKEND
436 /*
437 * When running under EXEC_BACKEND, we get a callback here when the main
438 * shared memory segment is re-attached, so that we can record the control
439 * handle retrieved from it.
440 */
441 void
dsm_set_control_handle(dsm_handle h)442 dsm_set_control_handle(dsm_handle h)
443 {
444 Assert(dsm_control_handle == 0 && h != 0);
445 dsm_control_handle = h;
446 }
447 #endif
448
449 /*
450 * Create a new dynamic shared memory segment.
451 *
452 * If there is a non-NULL CurrentResourceOwner, the new segment is associated
453 * with it and must be detached before the resource owner releases, or a
454 * warning will be logged. If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
456 * Creating with a NULL CurrentResourceOwner is equivalent to creating
457 * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
458 */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/* Loop until we find an unused segment identifier. */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
			continue;
		/* DSM_OP_CREATE returns false on handle collision; pick another. */
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused (recyclable) slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		/*
		 * Control table is full: unwind everything done above — destroy the
		 * just-created segment and discard the backend-local descriptor —
		 * before reporting (or, if the caller asked, returning NULL).
		 */
		LWLockRelease(DynamicSharedMemoryControlLock);
		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
537
538 /*
539 * Attach a dynamic shared memory segment.
540 *
541 * See comments for dsm_segment_handle() for an explanation of how this
542 * is intended to be used.
543 *
544 * This function will return NULL if the segment isn't known to the system.
545 * This can happen if we're asked to attach the segment, but then everyone
546 * else detaches it (causing it to be destroyed) before we get around to
547 * attaching it.
548 *
549 * If there is a non-NULL CurrentResourceOwner, the attached segment is
550 * associated with it and must be detached before the resource owner releases,
551 * or a warning will be logged. Otherwise the segment remains attached until
 * explicitly detached or the session ends.  See the note atop dsm_create().
553 */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point.  It's up to the
	 * caller to decide what to do about that.
	 *
	 * (This relies on dsm_create_descriptor initializing control_slot to
	 * INVALID_CONTROL_SLOT; its definition is not in view here — confirm.)
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
633
634 /*
635 * At backend shutdown time, detach any segments that are still attached.
636 * (This is similar to dsm_detach_all, except that there's no reason to
637 * unmap the control segment before exiting, so we don't bother.)
638 */
639 void
dsm_backend_shutdown(void)640 dsm_backend_shutdown(void)
641 {
642 while (!dlist_is_empty(&dsm_segment_list))
643 {
644 dsm_segment *seg;
645
646 seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
647 dsm_detach(seg);
648 }
649 }
650
651 /*
652 * Detach all shared memory segments, including the control segments. This
653 * should be called, along with PGSharedMemoryDetach, in processes that
654 * might inherit mappings but are not intended to be connected to dynamic
655 * shared memory.
656 */
657 void
dsm_detach_all(void)658 dsm_detach_all(void)
659 {
660 void *control_address = dsm_control;
661
662 while (!dlist_is_empty(&dsm_segment_list))
663 {
664 dsm_segment *seg;
665
666 seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
667 dsm_detach(seg);
668 }
669
670 if (control_address != NULL)
671 dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
672 &dsm_control_impl_private, &control_address,
673 &dsm_control_mapped_size, ERROR);
674 }
675
676 /*
677 * Resize an existing shared memory segment.
678 *
679 * This may cause the shared memory segment to be remapped at a different
680 * address. For the caller's convenience, we return the mapped address.
681 */
void *
dsm_resize(dsm_segment *seg, Size size)
{
	/* Only segments registered in the control segment may be resized. */
	Assert(seg->control_slot != INVALID_CONTROL_SLOT);
	dsm_impl_op(DSM_OP_RESIZE, seg->handle, size, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);
	/* mapped_address may have changed; hand the caller the new location. */
	return seg->mapped_address;
}
690
691 /*
692 * Remap an existing shared memory segment.
693 *
694 * This is intended to be used when some other process has extended the
695 * mapping using dsm_resize(), but we've still only got the initial
696 * portion mapped. Since this might change the address at which the
697 * segment is mapped, we return the new mapped address.
698 */
void *
dsm_remap(dsm_segment *seg)
{
	/* Re-attach to pick up the segment's current full size. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	/* mapped_address may have changed; hand the caller the new location. */
	return seg->mapped_address;
}
707
708 /*
709 * Detach from a shared memory segment, destroying the segment if we
710 * remove the last reference.
711 *
712 * This function should never fail. It will often be invoked when aborting
713 * a transaction, and a further error won't serve any purpose. It's not a
714 * complete disaster if we fail to unmap or destroy the segment; it means a
715 * resource leak, but that doesn't necessarily preclude further operations.
716 */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		/* Copy out the callback and free its node before calling it. */
		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/*
			 * A pinned segment should never reach 1, because the pin itself
			 * holds a reference (see dsm_pin_segment).
			 */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed.  If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				/* Destruction succeeded; now mark the slot free for reuse. */
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}
815
816 /*
817 * Keep a dynamic shared memory mapping until end of session.
818 *
819 * By default, mappings are owned by the current resource owner, which
820 * typically means they stick around for the duration of the current query
821 * only.
822 */
823 void
dsm_pin_mapping(dsm_segment * seg)824 dsm_pin_mapping(dsm_segment *seg)
825 {
826 if (seg->resowner != NULL)
827 {
828 ResourceOwnerForgetDSM(seg->resowner, seg);
829 seg->resowner = NULL;
830 }
831 }
832
833 /*
834 * Arrange to remove a dynamic shared memory mapping at cleanup time.
835 *
836 * dsm_pin_mapping() can be used to preserve a mapping for the entire
837 * lifetime of a process; this function reverses that decision, making
838 * the segment owned by the current resource owner. This may be useful
839 * just before performing some operation that will invalidate the segment
840 * for future use by this backend.
841 */
842 void
dsm_unpin_mapping(dsm_segment * seg)843 dsm_unpin_mapping(dsm_segment *seg)
844 {
845 Assert(seg->resowner == NULL);
846 ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
847 seg->resowner = CurrentResourceOwner;
848 ResourceOwnerRememberDSM(seg->resowner, seg);
849 }
850
851 /*
852 * Keep a dynamic shared memory segment until postmaster shutdown, or until
853 * dsm_unpin_segment is called.
854 *
855 * This function should not be called more than once per segment, unless the
856 * segment is explicitly unpinned with dsm_unpin_segment in between calls.
857 *
858 * Note that this function does not arrange for the current process to
859 * keep the segment mapped indefinitely; if that behavior is desired,
860 * dsm_pin_mapping() should be used from each process that needs to
861 * retain the mapping.
862 */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle;

	/*
	 * Bump reference count for this segment in shared memory.  This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	/* Implementation hook runs while we hold the lock. */
	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}
883
/*
 * Unpin a dynamic shared memory segment that was previously pinned with
 * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
 * was previously called for this segment.
 *
 * The argument is a dsm_handle rather than a dsm_segment in case you want
 * to unpin a segment to which you haven't attached.  This turns out to be
 * useful if, for example, a reference to one shared memory segment is stored
 * within another shared memory segment.  You might want to unpin the
 * referenced segment before destroying the referencing segment.
 */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/*
		 * Skip unused slots and segments that are concurrently going away.
		 * (refcnt == 0 means the slot is unused; refcnt == 1 means the
		 * segment is being destroyed; see the comment further down.)
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run.  We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	dsm_impl_unpin_segment(handle,
						   &dsm_control->item[control_slot].impl_private_pm_handle);

	/* Note that 1 means no references (0 means unused slot). */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach.  Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin.  The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			/* Mark the slot free only after the segment is really gone. */
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}
973
974 /*
975 * Find an existing mapping for a shared memory segment, if there is one.
976 */
977 dsm_segment *
dsm_find_mapping(dsm_handle h)978 dsm_find_mapping(dsm_handle h)
979 {
980 dlist_iter iter;
981 dsm_segment *seg;
982
983 dlist_foreach(iter, &dsm_segment_list)
984 {
985 seg = dlist_container(dsm_segment, node, iter.cur);
986 if (seg->handle == h)
987 return seg;
988 }
989
990 return NULL;
991 }
992
993 /*
994 * Get the address at which a dynamic shared memory segment is mapped.
995 */
996 void *
dsm_segment_address(dsm_segment * seg)997 dsm_segment_address(dsm_segment *seg)
998 {
999 Assert(seg->mapped_address != NULL);
1000 return seg->mapped_address;
1001 }
1002
1003 /*
1004 * Get the size of a mapping.
1005 */
1006 Size
dsm_segment_map_length(dsm_segment * seg)1007 dsm_segment_map_length(dsm_segment *seg)
1008 {
1009 Assert(seg->mapped_address != NULL);
1010 return seg->mapped_size;
1011 }
1012
/*
 * Get a handle for a mapping.
 *
 * To establish communication via dynamic shared memory between two backends,
 * one of them should first call dsm_create() to establish a new shared
 * memory mapping.  That process should then call dsm_segment_handle() to
 * obtain a handle for the mapping, and pass that handle to the
 * coordinating backend via some means (e.g. bgw_main_arg, or via the
 * main shared memory segment).  The recipient, once in possession of the
 * handle, should call dsm_attach().
 */
dsm_handle
dsm_segment_handle(dsm_segment *seg)
{
	return seg->handle;
}
1029
1030 /*
1031 * Register an on-detach callback for a dynamic shared memory segment.
1032 */
1033 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1034 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
1035 {
1036 dsm_segment_detach_callback *cb;
1037
1038 cb = MemoryContextAlloc(TopMemoryContext,
1039 sizeof(dsm_segment_detach_callback));
1040 cb->function = function;
1041 cb->arg = arg;
1042 slist_push_head(&seg->on_detach, &cb->node);
1043 }
1044
1045 /*
1046 * Unregister an on-detach callback for a dynamic shared memory segment.
1047 */
1048 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1049 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
1050 Datum arg)
1051 {
1052 slist_mutable_iter iter;
1053
1054 slist_foreach_modify(iter, &seg->on_detach)
1055 {
1056 dsm_segment_detach_callback *cb;
1057
1058 cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
1059 if (cb->function == function && cb->arg == arg)
1060 {
1061 slist_delete_current(&iter);
1062 pfree(cb);
1063 break;
1064 }
1065 }
1066 }
1067
1068 /*
1069 * Discard all registered on-detach callbacks without executing them.
1070 */
1071 void
reset_on_dsm_detach(void)1072 reset_on_dsm_detach(void)
1073 {
1074 dlist_iter iter;
1075
1076 dlist_foreach(iter, &dsm_segment_list)
1077 {
1078 dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
1079
1080 /* Throw away explicit on-detach actions one by one. */
1081 while (!slist_is_empty(&seg->on_detach))
1082 {
1083 slist_node *node;
1084 dsm_segment_detach_callback *cb;
1085
1086 node = slist_pop_head_node(&seg->on_detach);
1087 cb = slist_container(dsm_segment_detach_callback, node, node);
1088 pfree(cb);
1089 }
1090
1091 /*
1092 * Decrementing the reference count is a sort of implicit on-detach
1093 * action; make sure we don't do that, either.
1094 */
1095 seg->control_slot = INVALID_CONTROL_SLOT;
1096 }
1097 }
1098
/*
 * Create a segment descriptor.
 *
 * Allocates a backend-local dsm_segment in TopMemoryContext, links it into
 * dsm_segment_list, and registers it with the current resource owner (if
 * any) so the mapping is cleaned up when that owner is released.  The
 * caller must fill in seg->handle.
 */
static dsm_segment *
dsm_create_descriptor(void)
{
	/*
	 * Reserve space in the resource owner up front, so that the
	 * ResourceOwnerRememberDSM call below cannot fail after we've already
	 * pushed the descriptor onto the segment list.
	 */
	if (CurrentResourceOwner)
		ResourceOwnerEnlargeDSMs(CurrentResourceOwner);

	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
	dlist_push_head(&dsm_segment_list, &seg->node);

	/* seg->handle must be initialized by the caller */
	seg->control_slot = INVALID_CONTROL_SLOT;
	seg->impl_private = NULL;
	seg->mapped_address = NULL;
	seg->mapped_size = 0;

	seg->resowner = CurrentResourceOwner;
	if (CurrentResourceOwner)
		ResourceOwnerRememberDSM(CurrentResourceOwner, seg);

	slist_init(&seg->on_detach);

	return seg;
}
1127
1128 /*
1129 * Sanity check a control segment.
1130 *
1131 * The goal here isn't to detect everything that could possibly be wrong with
1132 * the control segment; there's not enough information for that. Rather, the
1133 * goal is to make sure that someone can iterate over the items in the segment
1134 * without overrunning the end of the mapping and crashing. We also check
1135 * the magic number since, if that's messed up, this may not even be one of
1136 * our segments at all.
1137 */
1138 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1139 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1140 {
1141 if (mapped_size < offsetof(dsm_control_header, item))
1142 return false; /* Mapped size too short to read header. */
1143 if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1144 return false; /* Magic number doesn't match. */
1145 if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1146 return false; /* Max item count won't fit in map. */
1147 if (control->nitems > control->maxitems)
1148 return false; /* Overfull. */
1149 return true;
1150 }
1151
1152 /*
1153 * Compute the number of control-segment bytes needed to store a given
1154 * number of items.
1155 */
1156 static uint64
dsm_control_bytes_needed(uint32 nitems)1157 dsm_control_bytes_needed(uint32 nitems)
1158 {
1159 return offsetof(dsm_control_header, item)
1160 + sizeof(dsm_control_item) * (uint64) nitems;
1161 }
1162