/*-------------------------------------------------------------------------
 *
 * dsm_impl.c
 *	  manage dynamic shared memory segments
 *
 * This file provides low-level APIs for creating and destroying shared
 * memory segments using several different possible techniques.  We refer
 * to these segments as dynamic because they can be created, altered, and
 * destroyed at any point during the server life cycle.  This is unlike
 * the main shared memory segment, of which there is always exactly one
 * and which is always mapped at a fixed address in every PostgreSQL
 * background process.
 *
 * Because not all systems provide the same primitives in this area, nor
 * do all primitives behave the same way on all systems, we provide
 * several implementations of this facility.  Many systems implement
 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
 * in this area, with the exception that shared memory identifiers live
 * in a flat system-wide namespace, raising the uncomfortable prospect of
 * name collisions with other processes (including other copies of
 * PostgreSQL) running on the same system.  Some systems only support
 * the older System V shared memory interface (shmget etc.) which is
 * also usable; however, the default allocation limits are often quite
 * small, and the namespace is even more restricted.
 *
 * We also provide an mmap-based shared memory implementation.  This may
 * be useful on systems that provide shared memory via a special-purpose
 * filesystem; by opting for this implementation, the user can even
 * control precisely where their shared memory segments are placed.  It
 * can also be used as a fallback for systems where shm_open and shmget
 * are not available or can't be used for some reason.  Of course,
 * mapping a file residing on an actual spinning disk is a fairly poor
 * approximation for shared memory because writeback may hurt performance
 * substantially, but there should be few systems where we must make do
 * with such poor tools.
 *
 * As ever, Windows requires its own implementation.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm_impl.c
 *
 *-------------------------------------------------------------------------
 */
48
49 #include "postgres.h"
50
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63
64 #include "common/file_perm.h"
65 #include "miscadmin.h"
66 #include "pgstat.h"
67 #include "portability/mem.h"
68 #include "postmaster/postmaster.h"
69 #include "storage/dsm_impl.h"
70 #include "storage/fd.h"
71 #include "utils/guc.h"
72 #include "utils/memutils.h"
73
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 void **impl_private, void **mapped_address,
77 Size *mapped_size, int elevel);
78 static int dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 void **impl_private, void **mapped_address,
83 Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 void **impl_private, void **mapped_address,
88 Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 void **impl_private, void **mapped_address,
93 Size *mapped_size, int elevel);
94 #endif
95 static int errcode_for_dynamic_shared_memory(void);
96
/*
 * GUC support: the set of names acceptable for dynamic_shared_memory_type,
 * restricted to the implementations compiled into this build.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}			/* end-of-list marker */
};

/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Amount of space reserved for DSM segments in the main area. */
int			min_dynamic_shared_memory;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE 8192

/* Prefix used to build Windows file-mapping object names (see dsm_impl_windows). */
#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
123
124 /*------
125 * Perform a low-level shared memory operation in a platform-specific way,
126 * as dictated by the selected implementation. Each implementation is
127 * required to implement the following primitives.
128 *
129 * DSM_OP_CREATE. Create a segment whose size is the request_size and
130 * map it.
131 *
132 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
133 *
134 * DSM_OP_DETACH. Unmap the segment.
135 *
136 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
137 * segment.
138 *
139 * Arguments:
140 * op: The operation to be performed.
141 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
142 * a new handle the caller wants created.
143 * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
144 * impl_private: Private, implementation-specific data. Will be a pointer
145 * to NULL for the first operation on a shared memory segment within this
146 * backend; thereafter, it will point to the value to which it was set
147 * on the previous call.
148 * mapped_address: Pointer to start of current mapping; pointer to NULL
149 * if none. Updated with new mapping address.
150 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
151 * Updated with new mapped size.
152 * elevel: Level at which to log errors.
153 *
154 * Return value: true on success, false on failure. When false is returned,
155 * a message should first be logged at the specified elevel, except in the
156 * case where DSM_OP_CREATE experiences a name collision, which should
157 * silently return false.
158 *-----
159 */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
			void **impl_private, void **mapped_address, Size *mapped_size,
			int elevel)
{
	/* Only DSM_OP_CREATE may pass a nonzero request_size. */
	Assert(op == DSM_OP_CREATE || request_size == 0);
	/* CREATE and ATTACH must start out with no pre-existing mapping. */
	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
		   (*mapped_address == NULL && *mapped_size == 0));

	/* Dispatch to whichever implementation the GUC selected. */
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_POSIX
		case DSM_IMPL_POSIX:
			return dsm_impl_posix(op, handle, request_size, impl_private,
								  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
		case DSM_IMPL_SYSV:
			return dsm_impl_sysv(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			return dsm_impl_windows(op, handle, request_size, impl_private,
									mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
		case DSM_IMPL_MMAP:
			return dsm_impl_mmap(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
		default:
			elog(ERROR, "unexpected dynamic shared memory type: %d",
				 dynamic_shared_memory_type);
			return false;		/* not reached: elog(ERROR) does not return */
	}
}
197
198 #ifdef USE_DSM_POSIX
199 /*
200 * Operating system primitives to support POSIX shared memory.
201 *
202 * POSIX shared memory segments are created and attached using shm_open()
203 * and shm_unlink(); other operations, such as sizing or mapping the
204 * segment, are performed as if the shared memory segments were files.
205 *
206 * Indeed, on some platforms, they may be implemented that way. While
207 * POSIX shared memory segments seem intended to exist in a flat namespace,
208 * some operating systems may implement them as files, even going so far
209 * to treat a request for /xyz as a request to create a file by that name
210 * in the root directory. Users of such broken platforms should select
211 * a different shared memory implementation.
212 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/* Build the flat-namespace segment name from the handle. */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, additionally remove the name from the namespace. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach.
	 *
	 * Even though we will close the FD before returning, it seems desirable
	 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
	 * failure.  The fact that we won't hold the FD open long justifies using
	 * ReserveExternalFD rather than AcquireExternalFD, though.
	 */
	ReserveExternalFD();

	/* O_CREAT | O_EXCL makes a create fail with EEXIST on a name collision. */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		ReleaseExternalFD();
		/* Name collisions on create are reported silently, per API contract. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			ReleaseExternalFD();
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping persists after close; the FD is no longer needed. */
	close(fd);
	ReleaseExternalFD();

	return true;
}
352
353 /*
354 * Set the size of a virtual memory region associated with a file descriptor.
355 * If necessary, also ensure that virtual memory is actually allocated by the
356 * operating system, to avoid nasty surprises later.
357 *
358 * Returns non-zero if either truncation or allocation fails, and sets errno.
359 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));
		pgstat_report_wait_end();

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
402
403 #endif /* USE_DSM_POSIX */
404
405 #ifdef USE_DSM_SYSV
406 /*
407 * Operating system primitives to support System V shared memory.
408 *
409 * System V shared memory segments are manipulated using shmget(), shmat(),
410 * shmdt(), and shmctl(). As the default allocation limits for System V
411 * shared memory are usually quite low, the POSIX facilities may be
412 * preferable; but those are not supported everywhere.
413 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Cached from a previous operation in this backend. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			/* IPC_CREAT | IPC_EXCL makes a create fail on name collision. */
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* Collisions on create (EEXIST) are reported silently. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/*
	 * Handle teardown cases.  Note that the identifier cache is released
	 * and *impl_private cleared before attempting shmdt/shmctl, so it is
	 * gone even if one of those calls fails below.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, additionally remove the segment itself. */
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
585 #endif
586
587 #ifdef USE_DSM_WINDOWS
588 /*
589 * Operating system primitives to support Windows shared memory.
590 *
591 * Windows shared memory implementation is done using file mapping
592 * which can be backed by either physical file or system paging file.
593 * Current implementation uses system paging file as other effects
594 * like performance are not clear for physical file and it is used in similar
595 * way for main shared memory in windows.
596 *
597 * A memory mapping object is a kernel object - they always get deleted when
598 * the last reference to them goes away, either explicitly via a CloseHandle or
599 * when the process containing the reference exits.
600 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* Closing the mapping handle drops this process's reference. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED.  We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it.  Zero size arguments mean "map the entire object". */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/* The mapping handle is retained in impl_private for later teardown. */
	*mapped_address = address;
	*mapped_size = info.RegionSize;
	*impl_private = hmap;

	return true;
}
770 #endif
771
772 #ifdef USE_DSM_MMAP
773 /*
774 * Operating system primitives to support mmap-based shared memory.
775 *
776 * Calling this "shared memory" is somewhat of a misnomer, because what
777 * we're really doing is creating a bunch of files and mapping them into
778 * our address space. The operating system may feel obliged to
779 * synchronize the contents to disk even if nothing is being paged out,
780 * which will not serve us well. The user can relocate the pg_dynshmem
781 * directory to a ramdisk to avoid this problem, if available.
782 */
783 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)784 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
785 void **impl_private, void **mapped_address, Size *mapped_size,
786 int elevel)
787 {
788 char name[64];
789 int flags;
790 int fd;
791 char *address;
792
793 snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
794 handle);
795
796 /* Handle teardown cases. */
797 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
798 {
799 if (*mapped_address != NULL
800 && munmap(*mapped_address, *mapped_size) != 0)
801 {
802 ereport(elevel,
803 (errcode_for_dynamic_shared_memory(),
804 errmsg("could not unmap shared memory segment \"%s\": %m",
805 name)));
806 return false;
807 }
808 *mapped_address = NULL;
809 *mapped_size = 0;
810 if (op == DSM_OP_DESTROY && unlink(name) != 0)
811 {
812 ereport(elevel,
813 (errcode_for_dynamic_shared_memory(),
814 errmsg("could not remove shared memory segment \"%s\": %m",
815 name)));
816 return false;
817 }
818 return true;
819 }
820
821 /* Create new segment or open an existing one for attach. */
822 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
823 if ((fd = OpenTransientFile(name, flags)) == -1)
824 {
825 if (errno != EEXIST)
826 ereport(elevel,
827 (errcode_for_dynamic_shared_memory(),
828 errmsg("could not open shared memory segment \"%s\": %m",
829 name)));
830 return false;
831 }
832
833 /*
834 * If we're attaching the segment, determine the current size; if we are
835 * creating the segment, set the size to the requested value.
836 */
837 if (op == DSM_OP_ATTACH)
838 {
839 struct stat st;
840
841 if (fstat(fd, &st) != 0)
842 {
843 int save_errno;
844
845 /* Back out what's already been done. */
846 save_errno = errno;
847 CloseTransientFile(fd);
848 errno = save_errno;
849
850 ereport(elevel,
851 (errcode_for_dynamic_shared_memory(),
852 errmsg("could not stat shared memory segment \"%s\": %m",
853 name)));
854 return false;
855 }
856 request_size = st.st_size;
857 }
858 else
859 {
860 /*
861 * Allocate a buffer full of zeros.
862 *
863 * Note: palloc zbuffer, instead of just using a local char array, to
864 * ensure it is reasonably well-aligned; this may save a few cycles
865 * transferring data to the kernel.
866 */
867 char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
868 uint32 remaining = request_size;
869 bool success = true;
870
871 /*
872 * Zero-fill the file. We have to do this the hard way to ensure that
873 * all the file space has really been allocated, so that we don't
874 * later seg fault when accessing the memory mapping. This is pretty
875 * pessimal.
876 */
877 while (success && remaining > 0)
878 {
879 Size goal = remaining;
880
881 if (goal > ZBUFFER_SIZE)
882 goal = ZBUFFER_SIZE;
883 pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
884 if (write(fd, zbuffer, goal) == goal)
885 remaining -= goal;
886 else
887 success = false;
888 pgstat_report_wait_end();
889 }
890
891 if (!success)
892 {
893 int save_errno;
894
895 /* Back out what's already been done. */
896 save_errno = errno;
897 CloseTransientFile(fd);
898 unlink(name);
899 errno = save_errno ? save_errno : ENOSPC;
900
901 ereport(elevel,
902 (errcode_for_dynamic_shared_memory(),
903 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
904 name, request_size)));
905 return false;
906 }
907 }
908
909 /* Map it. */
910 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
911 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
912 if (address == MAP_FAILED)
913 {
914 int save_errno;
915
916 /* Back out what's already been done. */
917 save_errno = errno;
918 CloseTransientFile(fd);
919 if (op == DSM_OP_CREATE)
920 unlink(name);
921 errno = save_errno;
922
923 ereport(elevel,
924 (errcode_for_dynamic_shared_memory(),
925 errmsg("could not map shared memory segment \"%s\": %m",
926 name)));
927 return false;
928 }
929 *mapped_address = address;
930 *mapped_size = request_size;
931
932 if (CloseTransientFile(fd) != 0)
933 {
934 ereport(elevel,
935 (errcode_for_file_access(),
936 errmsg("could not close shared memory segment \"%s\": %m",
937 name)));
938 return false;
939 }
940
941 return true;
942 }
943 #endif
944
945 /*
946 * Implementation-specific actions that must be performed when a segment is to
947 * be preserved even when no backend has it attached.
948 *
949 * Except on Windows, we don't need to do anything at all. But since Windows
950 * cleans up segments automatically when no references remain, we duplicate
951 * the segment handle into the postmaster process. The postmaster needn't
952 * do anything to receive the handle; Windows transfers it automatically.
953 */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
					 void **impl_private_pm_handle)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				HANDLE		hmap;

				/* Duplicate the mapping handle into the postmaster process. */
				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
									 PostmasterHandle, &hmap, 0, FALSE,
									 DUPLICATE_SAME_ACCESS))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				/*
				 * Here, we remember the handle that we created in the
				 * postmaster process.  This handle isn't actually usable in
				 * any process other than the postmaster, but that doesn't
				 * matter.  We're just holding onto it so that, if the segment
				 * is unpinned, dsm_impl_unpin_segment can close it.
				 */
				*impl_private_pm_handle = hmap;
				break;
			}
#endif
		default:
			/* All other implementations need no action to pin a segment. */
			break;
	}
}
994
995 /*
996 * Implementation-specific actions that must be performed when a segment is no
997 * longer to be preserved, so that it will be cleaned up when all backends
998 * have detached from it.
999 *
1000 * Except on Windows, we don't need to do anything at all. For Windows, we
1001 * close the extra handle that dsm_impl_pin_segment created in the
1002 * postmaster's process space.
1003 */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * DUPLICATE_CLOSE_SOURCE closes the handle in the postmaster
				 * process (the source), which is what we want; no new handle
				 * is created here (target arguments are NULL).
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* All other implementations need no action to unpin a segment. */
			break;
	}
}
1035
1036 static int
errcode_for_dynamic_shared_memory(void)1037 errcode_for_dynamic_shared_memory(void)
1038 {
1039 if (errno == EFBIG || errno == ENOMEM)
1040 return errcode(ERRCODE_OUT_OF_MEMORY);
1041 else
1042 return errcode_for_file_access();
1043 }
1044