1 /*-------------------------------------------------------------------------
2 *
3 * dsm.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides a set of services to make programming with dynamic
7 * shared memory segments more convenient. Unlike the low-level
8 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
9 * created using this module will be cleaned up automatically. Mappings
10 * will be removed when the resource owner under which they were created
11 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
12 * have session lifespan. Segments will be removed when there are no
13 * remaining mappings, or at postmaster shutdown in any case. After a
14 * hard postmaster crash, remaining segments will be removed, if they
15 * still exist, at the next postmaster startup.
16 *
17 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 *
21 * IDENTIFICATION
22 * src/backend/storage/ipc/dsm.c
23 *
24 *-------------------------------------------------------------------------
25 */
26
27 #include "postgres.h"
28
29 #include <fcntl.h>
30 #include <unistd.h>
31 #ifndef WIN32
32 #include <sys/mman.h>
33 #endif
34 #include <sys/stat.h>
35
36 #include "lib/ilist.h"
37 #include "miscadmin.h"
38 #include "storage/dsm.h"
39 #include "storage/ipc.h"
40 #include "storage/lwlock.h"
41 #include "storage/pg_shmem.h"
42 #include "utils/guc.h"
43 #include "utils/memutils.h"
44 #include "utils/resowner_private.h"
45
46 #define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32
47
48 #define PG_DYNSHMEM_FIXED_SLOTS 64
49 #define PG_DYNSHMEM_SLOTS_PER_BACKEND 5
50
51 #define INVALID_CONTROL_SLOT ((uint32) -1)
52
/*
 * Backend-local tracking for on-detach callbacks.
 *
 * dsm_detach() pops these off the segment's on_detach list, frees the node,
 * and then invokes the callback with its saved argument.
 */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;	/* callback to invoke at detach time */
	Datum		arg;			/* opaque argument passed to the callback */
	slist_node	node;			/* link in dsm_segment->on_detach */
} dsm_segment_detach_callback;
60
/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner, or NULL after
								 * dsm_pin_mapping (session lifespan). */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment, or
								 * INVALID_CONTROL_SLOT if not registered. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks. */
};
73
/* Shared-memory state for a dynamic shared memory segment. */
typedef struct dsm_control_item
{
	dsm_handle	handle;			/* name of the segment this slot tracks */
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
	void	   *impl_private_pm_handle; /* only needed on Windows */
	bool		pinned;			/* pinned via dsm_pin_segment? */
} dsm_control_item;
82
/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;			/* always PG_DYNSHMEM_CONTROL_MAGIC */
	uint32		nitems;			/* number of slots used so far */
	uint32		maxitems;		/* allocated length of item[] */
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];	/* per-segment slots */
} dsm_control_header;
91
92 static void dsm_cleanup_for_mmap(void);
93 static void dsm_postmaster_shutdown(int code, Datum arg);
94 static dsm_segment *dsm_create_descriptor(void);
95 static bool dsm_control_segment_sane(dsm_control_header *control,
96 Size mapped_size);
97 static uint64 dsm_control_bytes_needed(uint32 nitems);
98
99 /* Has this backend initialized the dynamic shared memory system yet? */
100 static bool dsm_init_done = false;
101
102 /*
103 * List of dynamic shared memory segments used by this backend.
104 *
105 * At process exit time, we must decrement the reference count of each
106 * segment we have attached; this list makes it possible to find all such
107 * segments.
108 *
109 * This list should always be empty in the postmaster. We could probably
110 * allow the postmaster to map dynamic shared memory segments before it
111 * begins to start child processes, provided that each process adjusted
112 * the reference counts for those segments in the control segment at
113 * startup time, but there's no obvious need for such a facility, which
114 * would also be complex to handle in the EXEC_BACKEND case. Once the
115 * postmaster has begun spawning children, there's an additional problem:
116 * each new mapping would require an update to the control segment,
117 * which requires locking, in which the postmaster must not be involved.
118 */
119 static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
120
121 /*
122 * Control segment information.
123 *
124 * Unlike ordinary shared memory segments, the control segment is not
125 * reference counted; instead, it lasts for the postmaster's entire
126 * life cycle. For simplicity, it doesn't have a dsm_segment object either.
127 */
128 static dsm_handle dsm_control_handle;
129 static dsm_control_header *dsm_control;
130 static Size dsm_control_mapped_size = 0;
131 static void *dsm_control_impl_private = NULL;
132
/*
 * Start up the dynamic shared memory system.
 *
 * This is called just once during each cluster lifetime, at postmaster
 * startup time.  It creates and initializes the control segment and
 * publishes its handle through *shim so child processes can find it.
 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	Assert(!IsUnderPostmaster);

	/* If dynamic shared memory is disabled, there's nothing to do. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		return;

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.  (DSM_OP_CREATE presumably returns false on a handle
	 * collision rather than erroring, so retrying with a new random handle
	 * is safe — see dsm_impl_op.)
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	/* Register cleanup before publishing the handle in shared memory. */
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}
198
199 /*
200 * Determine whether the control segment from the previous postmaster
201 * invocation still exists. If so, remove the dynamic shared memory
202 * segments to which it refers, and then the control segment itself.
203 */
204 void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)205 dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
206 {
207 void *mapped_address = NULL;
208 void *junk_mapped_address = NULL;
209 void *impl_private = NULL;
210 void *junk_impl_private = NULL;
211 Size mapped_size = 0;
212 Size junk_mapped_size = 0;
213 uint32 nitems;
214 uint32 i;
215 dsm_control_header *old_control;
216
217 /* If dynamic shared memory is disabled, there's nothing to do. */
218 if (dynamic_shared_memory_type == DSM_IMPL_NONE)
219 return;
220
221 /*
222 * Try to attach the segment. If this fails, it probably just means that
223 * the operating system has been rebooted and the segment no longer
224 * exists, or an unrelated process has used the same shm ID. So just fall
225 * out quietly.
226 */
227 if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
228 &mapped_address, &mapped_size, DEBUG1))
229 return;
230
231 /*
232 * We've managed to reattach it, but the contents might not be sane. If
233 * they aren't, we disregard the segment after all.
234 */
235 old_control = (dsm_control_header *) mapped_address;
236 if (!dsm_control_segment_sane(old_control, mapped_size))
237 {
238 dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
239 &mapped_address, &mapped_size, LOG);
240 return;
241 }
242
243 /*
244 * OK, the control segment looks basically valid, so we can use it to get
245 * a list of segments that need to be removed.
246 */
247 nitems = old_control->nitems;
248 for (i = 0; i < nitems; ++i)
249 {
250 dsm_handle handle;
251 uint32 refcnt;
252
253 /* If the reference count is 0, the slot is actually unused. */
254 refcnt = old_control->item[i].refcnt;
255 if (refcnt == 0)
256 continue;
257
258 /* Log debugging information. */
259 handle = old_control->item[i].handle;
260 elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
261 handle, refcnt);
262
263 /* Destroy the referenced segment. */
264 dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
265 &junk_mapped_address, &junk_mapped_size, LOG);
266 }
267
268 /* Destroy the old control segment, too. */
269 elog(DEBUG2,
270 "cleaning up dynamic shared memory control segment with ID %u",
271 old_control_handle);
272 dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
273 &mapped_address, &mapped_size, LOG);
274 }
275
276 /*
277 * When we're using the mmap shared memory implementation, "shared memory"
278 * segments might even manage to survive an operating system reboot.
279 * But there's no guarantee as to exactly what will survive: some segments
280 * may survive, and others may not, and the contents of some may be out
281 * of date. In particular, the control segment may be out of date, so we
282 * can't rely on it to figure out what to remove. However, since we know
283 * what directory contains the files we used as shared memory, we can simply
284 * scan the directory and blow everything away that shouldn't be there.
285 */
286 static void
dsm_cleanup_for_mmap(void)287 dsm_cleanup_for_mmap(void)
288 {
289 DIR *dir;
290 struct dirent *dent;
291
292 /* Scan the directory for something with a name of the correct format. */
293 dir = AllocateDir(PG_DYNSHMEM_DIR);
294
295 while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
296 {
297 if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
298 strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
299 {
300 char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
301
302 snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
303
304 elog(DEBUG2, "removing file \"%s\"", buf);
305
306 /* We found a matching file; so remove it. */
307 if (unlink(buf) != 0)
308 ereport(ERROR,
309 (errcode_for_file_access(),
310 errmsg("could not remove file \"%s\": %m", buf)));
311 }
312 }
313
314 /* Cleanup complete. */
315 FreeDir(dir);
316 }
317
318 /*
319 * At shutdown time, we iterate over the control segment and remove all
320 * remaining dynamic shared memory segments. We avoid throwing errors here;
321 * the postmaster is shutting down either way, and this is just non-critical
322 * resource cleanup.
323 */
324 static void
dsm_postmaster_shutdown(int code,Datum arg)325 dsm_postmaster_shutdown(int code, Datum arg)
326 {
327 uint32 nitems;
328 uint32 i;
329 void *dsm_control_address;
330 void *junk_mapped_address = NULL;
331 void *junk_impl_private = NULL;
332 Size junk_mapped_size = 0;
333 PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);
334
335 /*
336 * If some other backend exited uncleanly, it might have corrupted the
337 * control segment while it was dying. In that case, we warn and ignore
338 * the contents of the control segment. This may end up leaving behind
339 * stray shared memory segments, but there's not much we can do about that
340 * if the metadata is gone.
341 */
342 nitems = dsm_control->nitems;
343 if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
344 {
345 ereport(LOG,
346 (errmsg("dynamic shared memory control segment is corrupt")));
347 return;
348 }
349
350 /* Remove any remaining segments. */
351 for (i = 0; i < nitems; ++i)
352 {
353 dsm_handle handle;
354
355 /* If the reference count is 0, the slot is actually unused. */
356 if (dsm_control->item[i].refcnt == 0)
357 continue;
358
359 /* Log debugging information. */
360 handle = dsm_control->item[i].handle;
361 elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
362 handle);
363
364 /* Destroy the segment. */
365 dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
366 &junk_mapped_address, &junk_mapped_size, LOG);
367 }
368
369 /* Remove the control segment itself. */
370 elog(DEBUG2,
371 "cleaning up dynamic shared memory control segment with ID %u",
372 dsm_control_handle);
373 dsm_control_address = dsm_control;
374 dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
375 &dsm_control_impl_private, &dsm_control_address,
376 &dsm_control_mapped_size, LOG);
377 dsm_control = dsm_control_address;
378 shim->dsm_control = 0;
379 }
380
/*
 * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
 * we must reread the state file and map the control segment; in other cases,
 * we'll have inherited the postmaster's mapping and global variables.
 *
 * Raises an error if dynamic shared memory is disabled; on success, sets
 * dsm_init_done so this work is only performed once per backend.
 */
static void
dsm_backend_startup(void)
{
	/* If dynamic shared memory is disabled, reject this. */
	if (dynamic_shared_memory_type == DSM_IMPL_NONE)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("dynamic shared memory is disabled"),
				 errhint("Set dynamic_shared_memory_type to a value other than \"none\".")));

#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/*
		 * Attach control segment.  The handle was recorded earlier via
		 * dsm_set_control_handle() when the main shmem was re-attached.
		 */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			/* Detach first so we don't keep a mapping of the junk segment. */
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	/* Remember that startup is complete so we don't repeat it. */
	dsm_init_done = true;
}
421
422 #ifdef EXEC_BACKEND
423 /*
424 * When running under EXEC_BACKEND, we get a callback here when the main
425 * shared memory segment is re-attached, so that we can record the control
426 * handle retrieved from it.
427 */
428 void
dsm_set_control_handle(dsm_handle h)429 dsm_set_control_handle(dsm_handle h)
430 {
431 Assert(dsm_control_handle == 0 && h != 0);
432 dsm_control_handle = h;
433 }
434 #endif
435
/*
 * Create a new dynamic shared memory segment.
 *
 * If there is a non-NULL CurrentResourceOwner, the new segment is associated
 * with it and must be detached before the resource owner releases, or a
 * warning will be logged.  If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
 * Creating with a NULL CurrentResourceOwner is equivalent to creating
 * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
 *
 * 'flags' may include DSM_CREATE_NULL_IF_MAXSEGMENTS, in which case NULL is
 * returned (instead of raising an error) when all control slots are in use.
 */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* Make sure we're attached to the control segment first. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/*
	 * Loop until we find an unused segment identifier.  Presumably
	 * DSM_OP_CREATE returns false (rather than erroring) when the chosen
	 * handle is already taken, so we simply retry with a new random handle.
	 */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		/*
		 * Back out everything done so far: release the lock, destroy the
		 * just-created segment, and dispose of the descriptor.
		 */
		LWLockRelease(DynamicSharedMemoryControlLock);
		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		/* Report failure softly or loudly, as the caller requested. */
		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
524
/*
 * Attach a dynamic shared memory segment.
 *
 * See comments for dsm_segment_handle() for an explanation of how this
 * is intended to be used.
 *
 * This function will return NULL if the segment isn't known to the system.
 * This can happen if we're asked to attach the segment, but then everyone
 * else detaches it (causing it to be destroyed) before we get around to
 * attaching it.
 *
 * If there is a non-NULL CurrentResourceOwner, the attached segment is
 * associated with it and must be detached before the resource owner releases,
 * or a warning will be logged.  Otherwise the segment remains attached until
 * explicitly detached or the session ends.  See the note atop dsm_create().
 */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	/* Make sure we're attached to the control segment first. */
	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point.  It's up to the
	 * caller to decide what to do about that.
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/*
	 * Here's where we actually try to map the segment.  We already hold a
	 * reference, so the segment can't disappear concurrently.
	 */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
620
621 /*
622 * At backend shutdown time, detach any segments that are still attached.
623 * (This is similar to dsm_detach_all, except that there's no reason to
624 * unmap the control segment before exiting, so we don't bother.)
625 */
626 void
dsm_backend_shutdown(void)627 dsm_backend_shutdown(void)
628 {
629 while (!dlist_is_empty(&dsm_segment_list))
630 {
631 dsm_segment *seg;
632
633 seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
634 dsm_detach(seg);
635 }
636 }
637
638 /*
639 * Detach all shared memory segments, including the control segments. This
640 * should be called, along with PGSharedMemoryDetach, in processes that
641 * might inherit mappings but are not intended to be connected to dynamic
642 * shared memory.
643 */
644 void
dsm_detach_all(void)645 dsm_detach_all(void)
646 {
647 void *control_address = dsm_control;
648
649 while (!dlist_is_empty(&dsm_segment_list))
650 {
651 dsm_segment *seg;
652
653 seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
654 dsm_detach(seg);
655 }
656
657 if (control_address != NULL)
658 dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
659 &dsm_control_impl_private, &control_address,
660 &dsm_control_mapped_size, ERROR);
661 }
662
663 /*
664 * Resize an existing shared memory segment.
665 *
666 * This may cause the shared memory segment to be remapped at a different
667 * address. For the caller's convenience, we return the mapped address.
668 */
669 void *
dsm_resize(dsm_segment * seg,Size size)670 dsm_resize(dsm_segment *seg, Size size)
671 {
672 Assert(seg->control_slot != INVALID_CONTROL_SLOT);
673 dsm_impl_op(DSM_OP_RESIZE, seg->handle, size, &seg->impl_private,
674 &seg->mapped_address, &seg->mapped_size, ERROR);
675 return seg->mapped_address;
676 }
677
678 /*
679 * Remap an existing shared memory segment.
680 *
681 * This is intended to be used when some other process has extended the
682 * mapping using dsm_resize(), but we've still only got the initial
683 * portion mapped. Since this might change the address at which the
684 * segment is mapped, we return the new mapped address.
685 */
686 void *
dsm_remap(dsm_segment * seg)687 dsm_remap(dsm_segment *seg)
688 {
689 dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
690 &seg->mapped_address, &seg->mapped_size, ERROR);
691
692 return seg->mapped_address;
693 }
694
/*
 * Detach from a shared memory segment, destroying the segment if we
 * remove the last reference.
 *
 * This function should never fail.  It will often be invoked when aborting
 * a transaction, and a further error won't serve any purpose.  It's not a
 * complete disaster if we fail to unmap or destroy the segment; it means a
 * resource leak, but that doesn't necessarily preclude further operations.
 */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		/* Copy the callback info out of the node, then free it. */
		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/* A pinned segment should never reach 1. */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed.  If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				/* Destruction succeeded; mark the slot free under the lock. */
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}
802
803 /*
804 * Keep a dynamic shared memory mapping until end of session.
805 *
806 * By default, mappings are owned by the current resource owner, which
807 * typically means they stick around for the duration of the current query
808 * only.
809 */
810 void
dsm_pin_mapping(dsm_segment * seg)811 dsm_pin_mapping(dsm_segment *seg)
812 {
813 if (seg->resowner != NULL)
814 {
815 ResourceOwnerForgetDSM(seg->resowner, seg);
816 seg->resowner = NULL;
817 }
818 }
819
820 /*
821 * Arrange to remove a dynamic shared memory mapping at cleanup time.
822 *
823 * dsm_pin_mapping() can be used to preserve a mapping for the entire
824 * lifetime of a process; this function reverses that decision, making
825 * the segment owned by the current resource owner. This may be useful
826 * just before performing some operation that will invalidate the segment
827 * for future use by this backend.
828 */
829 void
dsm_unpin_mapping(dsm_segment * seg)830 dsm_unpin_mapping(dsm_segment *seg)
831 {
832 Assert(seg->resowner == NULL);
833 ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
834 seg->resowner = CurrentResourceOwner;
835 ResourceOwnerRememberDSM(seg->resowner, seg);
836 }
837
/*
 * Keep a dynamic shared memory segment until postmaster shutdown, or until
 * dsm_unpin_segment is called.
 *
 * This function should not be called more than once per segment, unless the
 * segment is explicitly unpinned with dsm_unpin_segment in between calls.
 *
 * Note that this function does not arrange for the current process to
 * keep the segment mapped indefinitely; if that behavior is desired,
 * dsm_pin_mapping() should be used from each process that needs to
 * retain the mapping.
 */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle;

	/*
	 * Bump reference count for this segment in shared memory.  This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	/*
	 * Let the implementation do whatever it needs to keep the segment alive;
	 * it may hand back a handle, which we record in the control slot (only
	 * needed on Windows, per dsm_control_item).  Done while holding the lock
	 * so the pinned flag, refcnt, and handle change together.
	 */
	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}
870
871 /*
872 * Unpin a dynamic shared memory segment that was previously pinned with
873 * dsm_pin_segment. This function should not be called unless dsm_pin_segment
874 * was previously called for this segment.
875 *
876 * The argument is a dsm_handle rather than a dsm_segment in case you want
877 * to unpin a segment to which you haven't attached. This turns out to be
878 * useful if, for example, a reference to one shared memory segment is stored
879 * within another shared memory segment. You might want to unpin the
880 * referenced segment before destroying the referencing segment.
881 */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/* Skip unused slots and segments that are concurrently going away. */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	/* A pinned segment holds one reference beyond the "in use" baseline. */
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run.  We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	dsm_impl_unpin_segment(handle,
						   &dsm_control->item[control_slot].impl_private_pm_handle);

	/* Note that 1 means no references (0 means unused slot). */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach.  Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin.  The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			/*
			 * Physical destruction succeeded; re-take the lock just long
			 * enough to mark the control slot as unused (refcnt 0).
			 */
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}
960
961 /*
962 * Find an existing mapping for a shared memory segment, if there is one.
963 */
964 dsm_segment *
dsm_find_mapping(dsm_handle h)965 dsm_find_mapping(dsm_handle h)
966 {
967 dlist_iter iter;
968 dsm_segment *seg;
969
970 dlist_foreach(iter, &dsm_segment_list)
971 {
972 seg = dlist_container(dsm_segment, node, iter.cur);
973 if (seg->handle == h)
974 return seg;
975 }
976
977 return NULL;
978 }
979
980 /*
981 * Get the address at which a dynamic shared memory segment is mapped.
982 */
983 void *
dsm_segment_address(dsm_segment * seg)984 dsm_segment_address(dsm_segment *seg)
985 {
986 Assert(seg->mapped_address != NULL);
987 return seg->mapped_address;
988 }
989
990 /*
991 * Get the size of a mapping.
992 */
993 Size
dsm_segment_map_length(dsm_segment * seg)994 dsm_segment_map_length(dsm_segment *seg)
995 {
996 Assert(seg->mapped_address != NULL);
997 return seg->mapped_size;
998 }
999
1000 /*
1001 * Get a handle for a mapping.
1002 *
1003 * To establish communication via dynamic shared memory between two backends,
1004 * one of them should first call dsm_create() to establish a new shared
1005 * memory mapping. That process should then call dsm_segment_handle() to
1006 * obtain a handle for the mapping, and pass that handle to the
1007 * coordinating backend via some means (e.g. bgw_main_arg, or via the
1008 * main shared memory segment). The recipient, once in possession of the
1009 * handle, should call dsm_attach().
1010 */
1011 dsm_handle
dsm_segment_handle(dsm_segment * seg)1012 dsm_segment_handle(dsm_segment *seg)
1013 {
1014 return seg->handle;
1015 }
1016
1017 /*
1018 * Register an on-detach callback for a dynamic shared memory segment.
1019 */
1020 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1021 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
1022 {
1023 dsm_segment_detach_callback *cb;
1024
1025 cb = MemoryContextAlloc(TopMemoryContext,
1026 sizeof(dsm_segment_detach_callback));
1027 cb->function = function;
1028 cb->arg = arg;
1029 slist_push_head(&seg->on_detach, &cb->node);
1030 }
1031
1032 /*
1033 * Unregister an on-detach callback for a dynamic shared memory segment.
1034 */
1035 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)1036 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
1037 Datum arg)
1038 {
1039 slist_mutable_iter iter;
1040
1041 slist_foreach_modify(iter, &seg->on_detach)
1042 {
1043 dsm_segment_detach_callback *cb;
1044
1045 cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
1046 if (cb->function == function && cb->arg == arg)
1047 {
1048 slist_delete_current(&iter);
1049 pfree(cb);
1050 break;
1051 }
1052 }
1053 }
1054
1055 /*
1056 * Discard all registered on-detach callbacks without executing them.
1057 */
1058 void
reset_on_dsm_detach(void)1059 reset_on_dsm_detach(void)
1060 {
1061 dlist_iter iter;
1062
1063 dlist_foreach(iter, &dsm_segment_list)
1064 {
1065 dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
1066
1067 /* Throw away explicit on-detach actions one by one. */
1068 while (!slist_is_empty(&seg->on_detach))
1069 {
1070 slist_node *node;
1071 dsm_segment_detach_callback *cb;
1072
1073 node = slist_pop_head_node(&seg->on_detach);
1074 cb = slist_container(dsm_segment_detach_callback, node, node);
1075 pfree(cb);
1076 }
1077
1078 /*
1079 * Decrementing the reference count is a sort of implicit on-detach
1080 * action; make sure we don't do that, either.
1081 */
1082 seg->control_slot = INVALID_CONTROL_SLOT;
1083 }
1084 }
1085
1086 /*
1087 * Create a segment descriptor.
1088 */
1089 static dsm_segment *
dsm_create_descriptor(void)1090 dsm_create_descriptor(void)
1091 {
1092 dsm_segment *seg;
1093
1094 if (CurrentResourceOwner)
1095 ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
1096
1097 seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
1098 dlist_push_head(&dsm_segment_list, &seg->node);
1099
1100 /* seg->handle must be initialized by the caller */
1101 seg->control_slot = INVALID_CONTROL_SLOT;
1102 seg->impl_private = NULL;
1103 seg->mapped_address = NULL;
1104 seg->mapped_size = 0;
1105
1106 seg->resowner = CurrentResourceOwner;
1107 if (CurrentResourceOwner)
1108 ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
1109
1110 slist_init(&seg->on_detach);
1111
1112 return seg;
1113 }
1114
1115 /*
1116 * Sanity check a control segment.
1117 *
1118 * The goal here isn't to detect everything that could possibly be wrong with
1119 * the control segment; there's not enough information for that. Rather, the
1120 * goal is to make sure that someone can iterate over the items in the segment
1121 * without overrunning the end of the mapping and crashing. We also check
1122 * the magic number since, if that's messed up, this may not even be one of
1123 * our segments at all.
1124 */
1125 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1126 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1127 {
1128 if (mapped_size < offsetof(dsm_control_header, item))
1129 return false; /* Mapped size too short to read header. */
1130 if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1131 return false; /* Magic number doesn't match. */
1132 if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1133 return false; /* Max item count won't fit in map. */
1134 if (control->nitems > control->maxitems)
1135 return false; /* Overfull. */
1136 return true;
1137 }
1138
1139 /*
1140 * Compute the number of control-segment bytes needed to store a given
1141 * number of items.
1142 */
1143 static uint64
dsm_control_bytes_needed(uint32 nitems)1144 dsm_control_bytes_needed(uint32 nitems)
1145 {
1146 return offsetof(dsm_control_header, item)
1147 + sizeof(dsm_control_item) * (uint64) nitems;
1148 }
1149