1 /*-------------------------------------------------------------------------
2 *
3 * dsm.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides a set of services to make programming with dynamic
7 * shared memory segments more convenient. Unlike the low-level
8 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
9 * created using this module will be cleaned up automatically. Mappings
10 * will be removed when the resource owner under which they were created
11 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
12 * have session lifespan. Segments will be removed when there are no
13 * remaining mappings, or at postmaster shutdown in any case. After a
14 * hard postmaster crash, remaining segments will be removed, if they
15 * still exist, at the next postmaster startup.
16 *
17 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 *
21 * IDENTIFICATION
22 * src/backend/storage/ipc/dsm.c
23 *
24 *-------------------------------------------------------------------------
25 */
26
27 #include "postgres.h"
28
29 #include <fcntl.h>
30 #include <unistd.h>
31 #ifndef WIN32
32 #include <sys/mman.h>
33 #endif
34 #include <sys/stat.h>
35
36 #include "lib/ilist.h"
37 #include "miscadmin.h"
38 #include "storage/dsm.h"
39 #include "storage/ipc.h"
40 #include "storage/lwlock.h"
41 #include "storage/pg_shmem.h"
42 #include "utils/guc.h"
43 #include "utils/memutils.h"
44 #include "utils/resowner_private.h"
45
46 #define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32
47
48 #define PG_DYNSHMEM_FIXED_SLOTS 64
49 #define PG_DYNSHMEM_SLOTS_PER_BACKEND 5
50
51 #define INVALID_CONTROL_SLOT ((uint32) -1)
52
/*
 * Backend-local tracking for on-detach callbacks.
 *
 * One of these exists for each callback registered against a segment; they
 * are chained from the owning dsm_segment's on_detach list and invoked (and
 * freed) in LIFO order by dsm_detach().
 */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;	/* callback to invoke at detach time */
	Datum		arg;			/* argument to pass to the callback */
	slist_node	node;			/* link in dsm_segment->on_detach */
} dsm_segment_detach_callback;
60
/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner; NULL if session lifespan. */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment, or
								 * INVALID_CONTROL_SLOT if not yet (or no
								 * longer) registered there. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks (LIFO). */
};
73
/*
 * Shared-memory state for a dynamic shared memory segment.
 *
 * One of these exists in the control segment for each DSM segment.  All
 * fields are protected by DynamicSharedMemoryControlLock.
 */
typedef struct dsm_control_item
{
	dsm_handle	handle;			/* identifier of the segment this slot tracks */
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
	void	   *impl_private_pm_handle; /* only needed on Windows */
	bool		pinned;			/* pinned via dsm_pin_segment? */
} dsm_control_item;
82
/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;			/* always PG_DYNSHMEM_CONTROL_MAGIC */
	uint32		nitems;			/* number of slots ever initialized; slots
								 * with refcnt == 0 within this range may be
								 * reused */
	uint32		maxitems;		/* allocated length of item[] */
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
} dsm_control_header;
91
/* Forward declarations for static functions defined below. */
static void dsm_cleanup_for_mmap(void);
static void dsm_postmaster_shutdown(int code, Datum arg);
static dsm_segment *dsm_create_descriptor(void);
static bool dsm_control_segment_sane(dsm_control_header *control,
									 Size mapped_size);
static uint64 dsm_control_bytes_needed(uint32 nitems);

/* Has this backend initialized the dynamic shared memory system yet? */
static bool dsm_init_done = false;

/*
 * List of dynamic shared memory segments used by this backend.
 *
 * At process exit time, we must decrement the reference count of each
 * segment we have attached; this list makes it possible to find all such
 * segments.
 *
 * This list should always be empty in the postmaster.  We could probably
 * allow the postmaster to map dynamic shared memory segments before it
 * begins to start child processes, provided that each process adjusted
 * the reference counts for those segments in the control segment at
 * startup time, but there's no obvious need for such a facility, which
 * would also be complex to handle in the EXEC_BACKEND case.  Once the
 * postmaster has begun spawning children, there's an additional problem:
 * each new mapping would require an update to the control segment,
 * which requires locking, in which the postmaster must not be involved.
 */
static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);

/*
 * Control segment information.
 *
 * Unlike ordinary shared memory segments, the control segment is not
 * reference counted; instead, it lasts for the postmaster's entire
 * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
 */
static dsm_handle dsm_control_handle;	/* handle of the control segment */
static dsm_control_header *dsm_control; /* our mapping of it, or NULL */
static Size dsm_control_mapped_size = 0;
static void *dsm_control_impl_private = NULL;
132
/*
 * Start up the dynamic shared memory system.
 *
 * This is called just once during each cluster lifetime, at postmaster
 * startup time.  It creates and initializes the control segment, and
 * publishes its handle in the main shared memory segment's header so that
 * child processes (in particular, under EXEC_BACKEND) can find it.
 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	Assert(!IsUnderPostmaster);

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use 0 as a sentinel value indicating that no control segment
	 * is known to exist, so avoid using that value for a real control
	 * segment.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		dsm_control_handle = random();
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		/* CREATE fails softly on handle collision; retry with a new one. */
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	/* Arrange to clean up all DSM segments when the postmaster exits. */
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}
194
/*
 * Determine whether the control segment from the previous postmaster
 * invocation still exists.  If so, remove the dynamic shared memory
 * segments to which it refers, and then the control segment itself.
 *
 * Errors during cleanup are logged (not raised); a leftover segment is a
 * resource leak, not a reason to prevent startup.
 */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/*
	 * Try to attach the segment.  If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID.  So just fall
	 * out quietly.
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = old_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/*
		 * Destroy the referenced segment.  The junk_* variables merely
		 * satisfy dsm_impl_op's out-parameter interface; we have no mapping
		 * of these segments ourselves.
		 */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}
267
268 /*
269 * When we're using the mmap shared memory implementation, "shared memory"
270 * segments might even manage to survive an operating system reboot.
271 * But there's no guarantee as to exactly what will survive: some segments
272 * may survive, and others may not, and the contents of some may be out
273 * of date. In particular, the control segment may be out of date, so we
274 * can't rely on it to figure out what to remove. However, since we know
275 * what directory contains the files we used as shared memory, we can simply
276 * scan the directory and blow everything away that shouldn't be there.
277 */
278 static void
dsm_cleanup_for_mmap(void)279 dsm_cleanup_for_mmap(void)
280 {
281 DIR *dir;
282 struct dirent *dent;
283
284 /* Scan the directory for something with a name of the correct format. */
285 dir = AllocateDir(PG_DYNSHMEM_DIR);
286
287 while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
288 {
289 if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
290 strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
291 {
292 char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
293
294 snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
295
296 elog(DEBUG2, "removing file \"%s\"", buf);
297
298 /* We found a matching file; so remove it. */
299 if (unlink(buf) != 0)
300 ereport(ERROR,
301 (errcode_for_file_access(),
302 errmsg("could not remove file \"%s\": %m", buf)));
303 }
304 }
305
306 /* Cleanup complete. */
307 FreeDir(dir);
308 }
309
/*
 * At shutdown time, we iterate over the control segment and remove all
 * remaining dynamic shared memory segments.  We avoid throwing errors here;
 * the postmaster is shutting down either way, and this is just non-critical
 * resource cleanup.
 */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying.  In that case, we warn and ignore
	 * the contents of the control segment.  This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		/* Log debugging information. */
		handle = dsm_control->item[i].handle;
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/* Destroy the segment.  (We hold no mapping; junk_* are dummies.) */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	/* Tell any onlookers that no control segment exists any more. */
	shim->dsm_control = 0;
}
372
/*
 * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
 * we must reread the state file and map the control segment; in other cases,
 * we'll have inherited the postmaster's mapping and global variables.
 *
 * Called lazily, on first use, from dsm_create() and dsm_attach().
 */
static void
dsm_backend_startup(void)
{
#ifdef EXEC_BACKEND
	{
		void	   *control_address = NULL;

		/* Attach control segment; its handle was set via dsm_set_control_handle. */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	dsm_init_done = true;
}
406
#ifdef EXEC_BACKEND
/*
 * When running under EXEC_BACKEND, we get a callback here when the main
 * shared memory segment is re-attached, so that we can record the control
 * handle retrieved from it.  dsm_backend_startup relies on this having
 * been done before the first DSM operation in the child process.
 */
void
dsm_set_control_handle(dsm_handle h)
{
	/* Should be set exactly once per process, and never to the 0 sentinel. */
	Assert(dsm_control_handle == 0 && h != 0);
	dsm_control_handle = h;
}
#endif
420
/*
 * Create a new dynamic shared memory segment.
 *
 * If there is a non-NULL CurrentResourceOwner, the new segment is associated
 * with it and must be detached before the resource owner releases, or a
 * warning will be logged.  If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
 * Creating with a NULL CurrentResourceOwner is equivalent to creating
 * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
 *
 * Returns NULL (rather than erroring) only when flags contains
 * DSM_CREATE_NULL_IF_MAXSEGMENTS and the control segment is full.
 */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/* Loop until we find an unused segment identifier. */
	for (;;)
	{
		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
		seg->handle = random();
		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
			continue;
		/* CREATE fails softly on handle collision; retry with a new one. */
		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, ERROR))
			break;
	}

	/* Lock the control segment so we can register the new segment. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		/*
		 * Out of slots: undo everything we've done so far -- destroy the
		 * segment, drop the resource-owner association, and free the
		 * descriptor -- before reporting the problem.
		 */
		LWLockRelease(DynamicSharedMemoryControlLock);
		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}
509
/*
 * Attach a dynamic shared memory segment.
 *
 * See comments for dsm_segment_handle() for an explanation of how this
 * is intended to be used.
 *
 * This function will return NULL if the segment isn't known to the system.
 * This can happen if we're asked to attach the segment, but then everyone
 * else detaches it (causing it to be destroyed) before we get around to
 * attaching it.
 *
 * If there is a non-NULL CurrentResourceOwner, the attached segment is
 * associated with it and must be detached before the resource owner releases,
 * or a warning will be logged.  Otherwise the segment remains attached until
 * explicitly detached or the session ends.  See the note atop dsm_create().
 */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point.  It's up to the
	 * caller to decide what to do about that.  (control_slot still has the
	 * INVALID_CONTROL_SLOT value set by dsm_create_descriptor.)
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
				&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}
605
606 /*
607 * At backend shutdown time, detach any segments that are still attached.
608 * (This is similar to dsm_detach_all, except that there's no reason to
609 * unmap the control segment before exiting, so we don't bother.)
610 */
611 void
dsm_backend_shutdown(void)612 dsm_backend_shutdown(void)
613 {
614 while (!dlist_is_empty(&dsm_segment_list))
615 {
616 dsm_segment *seg;
617
618 seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
619 dsm_detach(seg);
620 }
621 }
622
/*
 * Detach all shared memory segments, including the control segments.  This
 * should be called, along with PGSharedMemoryDetach, in processes that
 * might inherit mappings but are not intended to be connected to dynamic
 * shared memory.
 */
void
dsm_detach_all(void)
{
	/* Remember the control mapping before draining the segment list. */
	void	   *control_address = dsm_control;

	while (!dlist_is_empty(&dsm_segment_list))
	{
		dsm_segment *seg;

		/* dsm_detach unlinks the segment, so this loop terminates. */
		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
		dsm_detach(seg);
	}

	/* Unmap the control segment too, if we had it mapped. */
	if (control_address != NULL)
		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
}
647
/*
 * Detach from a shared memory segment, destroying the segment if we
 * remove the last reference.
 *
 * This function should never fail.  It will often be invoked when aborting
 * a transaction, and a further error won't serve any purpose.  It's not a
 * complete disaster if we fail to unmap or destroy the segment; it means a
 * resource leak, but that doesn't necessarily preclude further operations.
 */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		/* Save the callback details and free its tracking struct first. */
		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/* A pinned segment should never reach 1. */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed.  If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				/* Destruction succeeded; now mark the slot reusable. */
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}
755
756 /*
757 * Keep a dynamic shared memory mapping until end of session.
758 *
759 * By default, mappings are owned by the current resource owner, which
760 * typically means they stick around for the duration of the current query
761 * only.
762 */
763 void
dsm_pin_mapping(dsm_segment * seg)764 dsm_pin_mapping(dsm_segment *seg)
765 {
766 if (seg->resowner != NULL)
767 {
768 ResourceOwnerForgetDSM(seg->resowner, seg);
769 seg->resowner = NULL;
770 }
771 }
772
773 /*
774 * Arrange to remove a dynamic shared memory mapping at cleanup time.
775 *
776 * dsm_pin_mapping() can be used to preserve a mapping for the entire
777 * lifetime of a process; this function reverses that decision, making
778 * the segment owned by the current resource owner. This may be useful
779 * just before performing some operation that will invalidate the segment
780 * for future use by this backend.
781 */
782 void
dsm_unpin_mapping(dsm_segment * seg)783 dsm_unpin_mapping(dsm_segment *seg)
784 {
785 Assert(seg->resowner == NULL);
786 ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
787 seg->resowner = CurrentResourceOwner;
788 ResourceOwnerRememberDSM(seg->resowner, seg);
789 }
790
/*
 * Keep a dynamic shared memory segment until postmaster shutdown, or until
 * dsm_unpin_segment is called.
 *
 * This function should not be called more than once per segment, unless the
 * segment is explicitly unpinned with dsm_unpin_segment in between calls.
 *
 * Note that this function does not arrange for the current process to
 * keep the segment mapped indefinitely; if that behavior is desired,
 * dsm_pin_mapping() should be used from each process that needs to
 * retain the mapping.
 */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle;

	/*
	 * Bump reference count for this segment in shared memory.  This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.
	 *
	 * The implementation callback must run while we still hold the lock,
	 * since it produces the impl_private_pm_handle value we store in the
	 * (lock-protected) control slot.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}
823
/*
 * Unpin a dynamic shared memory segment that was previously pinned with
 * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
 * was previously called for this segment.
 *
 * The argument is a dsm_handle rather than a dsm_segment in case you want
 * to unpin a segment to which you haven't attached.  This turns out to be
 * useful if, for example, a reference to one shared memory segment is stored
 * within another shared memory segment.  You might want to unpin the
 * referenced segment before destroying the referencing segment.
 */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/* Skip unused slots and segments that are concurrently going away. */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run.  We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	dsm_impl_unpin_segment(handle,
						   &dsm_control->item[control_slot].impl_private_pm_handle);

	/* Note that 1 means no references (0 means unused slot). */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach.  Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin.  The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			/* Destruction succeeded; mark the slot reusable. */
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}
913
914 /*
915 * Find an existing mapping for a shared memory segment, if there is one.
916 */
917 dsm_segment *
dsm_find_mapping(dsm_handle h)918 dsm_find_mapping(dsm_handle h)
919 {
920 dlist_iter iter;
921 dsm_segment *seg;
922
923 dlist_foreach(iter, &dsm_segment_list)
924 {
925 seg = dlist_container(dsm_segment, node, iter.cur);
926 if (seg->handle == h)
927 return seg;
928 }
929
930 return NULL;
931 }
932
933 /*
934 * Get the address at which a dynamic shared memory segment is mapped.
935 */
936 void *
dsm_segment_address(dsm_segment * seg)937 dsm_segment_address(dsm_segment *seg)
938 {
939 Assert(seg->mapped_address != NULL);
940 return seg->mapped_address;
941 }
942
943 /*
944 * Get the size of a mapping.
945 */
946 Size
dsm_segment_map_length(dsm_segment * seg)947 dsm_segment_map_length(dsm_segment *seg)
948 {
949 Assert(seg->mapped_address != NULL);
950 return seg->mapped_size;
951 }
952
953 /*
954 * Get a handle for a mapping.
955 *
956 * To establish communication via dynamic shared memory between two backends,
957 * one of them should first call dsm_create() to establish a new shared
958 * memory mapping. That process should then call dsm_segment_handle() to
959 * obtain a handle for the mapping, and pass that handle to the
960 * coordinating backend via some means (e.g. bgw_main_arg, or via the
961 * main shared memory segment). The recipient, once in possession of the
962 * handle, should call dsm_attach().
963 */
964 dsm_handle
dsm_segment_handle(dsm_segment * seg)965 dsm_segment_handle(dsm_segment *seg)
966 {
967 return seg->handle;
968 }
969
970 /*
971 * Register an on-detach callback for a dynamic shared memory segment.
972 */
973 void
on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)974 on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
975 {
976 dsm_segment_detach_callback *cb;
977
978 cb = MemoryContextAlloc(TopMemoryContext,
979 sizeof(dsm_segment_detach_callback));
980 cb->function = function;
981 cb->arg = arg;
982 slist_push_head(&seg->on_detach, &cb->node);
983 }
984
985 /*
986 * Unregister an on-detach callback for a dynamic shared memory segment.
987 */
988 void
cancel_on_dsm_detach(dsm_segment * seg,on_dsm_detach_callback function,Datum arg)989 cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
990 Datum arg)
991 {
992 slist_mutable_iter iter;
993
994 slist_foreach_modify(iter, &seg->on_detach)
995 {
996 dsm_segment_detach_callback *cb;
997
998 cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
999 if (cb->function == function && cb->arg == arg)
1000 {
1001 slist_delete_current(&iter);
1002 pfree(cb);
1003 break;
1004 }
1005 }
1006 }
1007
1008 /*
1009 * Discard all registered on-detach callbacks without executing them.
1010 */
1011 void
reset_on_dsm_detach(void)1012 reset_on_dsm_detach(void)
1013 {
1014 dlist_iter iter;
1015
1016 dlist_foreach(iter, &dsm_segment_list)
1017 {
1018 dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
1019
1020 /* Throw away explicit on-detach actions one by one. */
1021 while (!slist_is_empty(&seg->on_detach))
1022 {
1023 slist_node *node;
1024 dsm_segment_detach_callback *cb;
1025
1026 node = slist_pop_head_node(&seg->on_detach);
1027 cb = slist_container(dsm_segment_detach_callback, node, node);
1028 pfree(cb);
1029 }
1030
1031 /*
1032 * Decrementing the reference count is a sort of implicit on-detach
1033 * action; make sure we don't do that, either.
1034 */
1035 seg->control_slot = INVALID_CONTROL_SLOT;
1036 }
1037 }
1038
1039 /*
1040 * Create a segment descriptor.
1041 */
1042 static dsm_segment *
dsm_create_descriptor(void)1043 dsm_create_descriptor(void)
1044 {
1045 dsm_segment *seg;
1046
1047 if (CurrentResourceOwner)
1048 ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
1049
1050 seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
1051 dlist_push_head(&dsm_segment_list, &seg->node);
1052
1053 /* seg->handle must be initialized by the caller */
1054 seg->control_slot = INVALID_CONTROL_SLOT;
1055 seg->impl_private = NULL;
1056 seg->mapped_address = NULL;
1057 seg->mapped_size = 0;
1058
1059 seg->resowner = CurrentResourceOwner;
1060 if (CurrentResourceOwner)
1061 ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
1062
1063 slist_init(&seg->on_detach);
1064
1065 return seg;
1066 }
1067
1068 /*
1069 * Sanity check a control segment.
1070 *
1071 * The goal here isn't to detect everything that could possibly be wrong with
1072 * the control segment; there's not enough information for that. Rather, the
1073 * goal is to make sure that someone can iterate over the items in the segment
1074 * without overrunning the end of the mapping and crashing. We also check
1075 * the magic number since, if that's messed up, this may not even be one of
1076 * our segments at all.
1077 */
1078 static bool
dsm_control_segment_sane(dsm_control_header * control,Size mapped_size)1079 dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
1080 {
1081 if (mapped_size < offsetof(dsm_control_header, item))
1082 return false; /* Mapped size too short to read header. */
1083 if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
1084 return false; /* Magic number doesn't match. */
1085 if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
1086 return false; /* Max item count won't fit in map. */
1087 if (control->nitems > control->maxitems)
1088 return false; /* Overfull. */
1089 return true;
1090 }
1091
1092 /*
1093 * Compute the number of control-segment bytes needed to store a given
1094 * number of items.
1095 */
1096 static uint64
dsm_control_bytes_needed(uint32 nitems)1097 dsm_control_bytes_needed(uint32 nitems)
1098 {
1099 return offsetof(dsm_control_header, item)
1100 + sizeof(dsm_control_item) * (uint64) nitems;
1101 }
1102