1 /*-------------------------------------------------------------------------
2 *
3 * dsm_impl.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides low-level APIs for creating and destroying shared
7 * memory segments using several different possible techniques. We refer
8 * to these segments as dynamic because they can be created, altered, and
9 * destroyed at any point during the server life cycle. This is unlike
10 * the main shared memory segment, of which there is always exactly one
11 * and which is always mapped at a fixed address in every PostgreSQL
12 * background process.
13 *
14 * Because not all systems provide the same primitives in this area, nor
15 * do all primitives behave the same way on all systems, we provide
16 * several implementations of this facility. Many systems implement
17 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 * in this area, with the exception that shared memory identifiers live
19 * in a flat system-wide namespace, raising the uncomfortable prospect of
20 * name collisions with other processes (including other copies of
21 * PostgreSQL) running on the same system. Some systems only support
22 * the older System V shared memory interface (shmget etc.) which is
23 * also usable; however, the default allocation limits are often quite
24 * small, and the namespace is even more restricted.
25 *
26 * We also provide an mmap-based shared memory implementation. This may
27 * be useful on systems that provide shared memory via a special-purpose
28 * filesystem; by opting for this implementation, the user can even
29 * control precisely where their shared memory segments are placed. It
30 * can also be used as a fallback for systems where shm_open and shmget
31 * are not available or can't be used for some reason. Of course,
32 * mapping a file residing on an actual spinning disk is a fairly poor
33 * approximation for shared memory because writeback may hurt performance
34 * substantially, but there should be few systems where we must make do
35 * with such poor tools.
36 *
37 * As ever, Windows requires its own implementation.
38 *
39 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
40 * Portions Copyright (c) 1994, Regents of the University of California
41 *
42 *
43 * IDENTIFICATION
44 * src/backend/storage/ipc/dsm_impl.c
45 *
46 *-------------------------------------------------------------------------
47 */
48
49 #include "postgres.h"
50 #include "miscadmin.h"
51
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "pgstat.h"
65
66 #include "portability/mem.h"
67 #include "storage/dsm_impl.h"
68 #include "storage/fd.h"
69 #include "utils/guc.h"
70 #include "utils/memutils.h"
71 #include "postmaster/postmaster.h"
72
73 #ifdef USE_DSM_POSIX
74 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
75 void **impl_private, void **mapped_address,
76 Size *mapped_size, int elevel);
77 static int dsm_impl_posix_resize(int fd, off_t size);
78 #endif
79 #ifdef USE_DSM_SYSV
80 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
81 void **impl_private, void **mapped_address,
82 Size *mapped_size, int elevel);
83 #endif
84 #ifdef USE_DSM_WINDOWS
85 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
86 void **impl_private, void **mapped_address,
87 Size *mapped_size, int elevel);
88 #endif
89 #ifdef USE_DSM_MMAP
90 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
91 void **impl_private, void **mapped_address,
92 Size *mapped_size, int elevel);
93 #endif
94 static int errcode_for_dynamic_shared_memory(void);
95
/*
 * GUC support: the legal values for dynamic_shared_memory_type, filtered
 * down to the implementations compiled into this build.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{"none", DSM_IMPL_NONE, false},
	{NULL, 0, false}			/* list terminator */
};
112
/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE			8192

/* Name prefix for Windows file-mapping objects (session-global namespace). */
#define SEGMENT_NAME_PREFIX		"Global/PostgreSQL"
120
121 /*------
122 * Perform a low-level shared memory operation in a platform-specific way,
123 * as dictated by the selected implementation. Each implementation is
124 * required to implement the following primitives.
125 *
126 * DSM_OP_CREATE. Create a segment whose size is the request_size and
127 * map it.
128 *
129 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
130 * The segment may already be mapped; any existing mapping should be removed
131 * before creating a new one.
132 *
133 * DSM_OP_DETACH. Unmap the segment.
134 *
135 * DSM_OP_RESIZE. Resize the segment to the given request_size and
136 * remap the segment at that new size.
137 *
138 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
139 * segment.
140 *
141 * Arguments:
142 * op: The operation to be performed.
 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
 * new handle the caller wants created.
145 * request_size: For DSM_OP_CREATE, the requested size. For DSM_OP_RESIZE,
146 * the new size. Otherwise, 0.
147 * impl_private: Private, implementation-specific data. Will be a pointer
148 * to NULL for the first operation on a shared memory segment within this
149 * backend; thereafter, it will point to the value to which it was set
150 * on the previous call.
151 * mapped_address: Pointer to start of current mapping; pointer to NULL
152 * if none. Updated with new mapping address.
153 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
154 * Updated with new mapped size.
155 * elevel: Level at which to log errors.
156 *
157 * Return value: true on success, false on failure. When false is returned,
158 * a message should first be logged at the specified elevel, except in the
159 * case where DSM_OP_CREATE experiences a name collision, which should
160 * silently return false.
161 *-----
162 */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
			void **impl_private, void **mapped_address, Size *mapped_size,
			int elevel)
{
	/* A size may only be requested when creating or resizing. */
	Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
	/* A create or attach must start out with no existing mapping. */
	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
		   (*mapped_address == NULL && *mapped_size == 0));

	/* Dispatch to whichever implementation the GUC selected. */
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_POSIX
		case DSM_IMPL_POSIX:
			return dsm_impl_posix(op, handle, request_size, impl_private,
								  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
		case DSM_IMPL_SYSV:
			return dsm_impl_sysv(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			return dsm_impl_windows(op, handle, request_size, impl_private,
									mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
		case DSM_IMPL_MMAP:
			return dsm_impl_mmap(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
		default:
			elog(ERROR, "unexpected dynamic shared memory type: %d",
				 dynamic_shared_memory_type);
			return false;		/* not reached; silences compiler warning */
	}
}
200
201 /*
202 * Does the current dynamic shared memory implementation support resizing
203 * segments? (The answer here could be platform-dependent in the future,
204 * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
205 * can't resize segments to anything larger than 256MB that way. For now,
206 * we keep it simple.)
207 */
208 bool
dsm_impl_can_resize(void)209 dsm_impl_can_resize(void)
210 {
211 switch (dynamic_shared_memory_type)
212 {
213 case DSM_IMPL_NONE:
214 return false;
215 case DSM_IMPL_POSIX:
216 return true;
217 case DSM_IMPL_SYSV:
218 return false;
219 case DSM_IMPL_WINDOWS:
220 return false;
221 case DSM_IMPL_MMAP:
222 return true;
223 default:
224 return false; /* should not happen */
225 }
226 }
227
228 #ifdef USE_DSM_POSIX
229 /*
230 * Operating system primitives to support POSIX shared memory.
231 *
232 * POSIX shared memory segments are created and attached using shm_open()
233 * and shm_unlink(); other operations, such as sizing or mapping the
234 * segment, are performed as if the shared memory segments were files.
235 *
236 * Indeed, on some platforms, they may be implemented that way. While
237 * POSIX shared memory segments seem intended to exist in a flat namespace,
238 * some operating systems may implement them as files, even going so far
239 * to treat a request for /xyz as a request to create a file by that name
240 * in the root directory. Users of such broken platforms should select
241 * a different shared memory implementation.
242 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/*
	 * Derive the flat-namespace POSIX shm name from the handle; the prefix
	 * reduces the chance of colliding with other software's segments.
	 */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Remove our mapping, if any. */
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, also remove the name from the system namespace. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach or resize.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, 0600)) == -1)
	{
		/*
		 * EEXIST on create means a name collision; per the API contract, a
		 * collision returns false silently so the caller can retry with a
		 * different handle.
		 */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating or resizing the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (*mapped_size != request_size &&
			 dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/*
	 * If we're reattaching or resizing, we must remove any existing mapping,
	 * unless we've already got the right thing mapped.
	 */
	if (*mapped_address != NULL)
	{
		if (*mapped_size == request_size)
			return true;
		if (munmap(*mapped_address, *mapped_size) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			if (op == DSM_OP_CREATE)
				shm_unlink(name);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives close(); the descriptor is no longer needed. */
	close(fd);

	return true;
}
406
407 /*
408 * Set the size of a virtual memory region associated with a file descriptor.
409 * If necessary, also ensure that virtual memory is actually allocated by the
410 * operating system, to avoid nasty surprises later.
411 *
412 * Returns non-zero if either truncation or allocation fails, and sets errno.
413 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	/* Zero on success; non-zero with errno set on failure. */
	return rc;
}
454
455 #endif /* USE_DSM_POSIX */
456
457 #ifdef USE_DSM_SYSV
458 /*
459 * Operating system primitives to support System V shared memory.
460 *
461 * System V shared memory segments are manipulated using shmget(), shmat(),
462 * shmdt(), and shmctl(). There's no portable way to resize such
463 * segments. As the default allocation limits for System V shared memory
464 * are usually quite low, the POSIX facilities may be preferable; but
465 * those are not supported everywhere.
466 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/* Resize is not supported for System V shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "System V shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		/* Fake a name collision so the caller retries with a new handle. */
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Fast path: identifier previously looked up and cached. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* EEXIST is a silent name collision; see API contract. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		/* Cache the identifier for subsequent operations on this segment. */
		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* ident_cache is always valid here: set by one of the paths above. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
649 #endif
650
651 #ifdef USE_DSM_WINDOWS
652 /*
653 * Operating system primitives to support Windows shared memory.
654 *
655 * Windows shared memory implementation is done using file mapping
656 * which can be backed by either physical file or system paging file.
657 * Current implementation uses system paging file as other effects
658 * like performance are not clear for physical file and it is used in similar
659 * way for main shared memory in windows.
660 *
661 * A memory mapping object is a kernel object - they always get deleted when
662 * the last reference to them goes away, either explicitly via a CloseHandle or
663 * when the process containing the reference exits.
664 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/* Resize is not supported for Windows shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "Windows shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* impl_private holds the file-mapping handle; release our reference. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED.  We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			/* Silent failure on collision, per the API contract. */
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle; it is needed to release the segment later. */
	*impl_private = hmap;

	return true;
}
845 #endif
846
847 #ifdef USE_DSM_MMAP
848 /*
849 * Operating system primitives to support mmap-based shared memory.
850 *
851 * Calling this "shared memory" is somewhat of a misnomer, because what
852 * we're really doing is creating a bunch of files and mapping them into
853 * our address space. The operating system may feel obliged to
854 * synchronize the contents to disk even if nothing is being paged out,
855 * which will not serve us well. The user can relocate the pg_dynshmem
856 * directory to a ramdisk to avoid this problem, if available.
857 */
858 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)859 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
860 void **impl_private, void **mapped_address, Size *mapped_size,
861 int elevel)
862 {
863 char name[64];
864 int flags;
865 int fd;
866 char *address;
867
868 snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
869 handle);
870
871 /* Handle teardown cases. */
872 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
873 {
874 if (*mapped_address != NULL
875 && munmap(*mapped_address, *mapped_size) != 0)
876 {
877 ereport(elevel,
878 (errcode_for_dynamic_shared_memory(),
879 errmsg("could not unmap shared memory segment \"%s\": %m",
880 name)));
881 return false;
882 }
883 *mapped_address = NULL;
884 *mapped_size = 0;
885 if (op == DSM_OP_DESTROY && unlink(name) != 0)
886 {
887 ereport(elevel,
888 (errcode_for_dynamic_shared_memory(),
889 errmsg("could not remove shared memory segment \"%s\": %m",
890 name)));
891 return false;
892 }
893 return true;
894 }
895
896 /* Create new segment or open an existing one for attach or resize. */
897 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
898 if ((fd = OpenTransientFile(name, flags, 0600)) == -1)
899 {
900 if (errno != EEXIST)
901 ereport(elevel,
902 (errcode_for_dynamic_shared_memory(),
903 errmsg("could not open shared memory segment \"%s\": %m",
904 name)));
905 return false;
906 }
907
908 /*
909 * If we're attaching the segment, determine the current size; if we are
910 * creating or resizing the segment, set the size to the requested value.
911 */
912 if (op == DSM_OP_ATTACH)
913 {
914 struct stat st;
915
916 if (fstat(fd, &st) != 0)
917 {
918 int save_errno;
919
920 /* Back out what's already been done. */
921 save_errno = errno;
922 CloseTransientFile(fd);
923 errno = save_errno;
924
925 ereport(elevel,
926 (errcode_for_dynamic_shared_memory(),
927 errmsg("could not stat shared memory segment \"%s\": %m",
928 name)));
929 return false;
930 }
931 request_size = st.st_size;
932 }
933 else if (*mapped_size > request_size && ftruncate(fd, request_size))
934 {
935 int save_errno;
936
937 /* Back out what's already been done. */
938 save_errno = errno;
939 CloseTransientFile(fd);
940 if (op == DSM_OP_CREATE)
941 unlink(name);
942 errno = save_errno;
943
944 ereport(elevel,
945 (errcode_for_dynamic_shared_memory(),
946 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
947 name, request_size)));
948 return false;
949 }
950 else if (*mapped_size < request_size)
951 {
952 /*
953 * Allocate a buffer full of zeros.
954 *
955 * Note: palloc zbuffer, instead of just using a local char array, to
956 * ensure it is reasonably well-aligned; this may save a few cycles
957 * transferring data to the kernel.
958 */
959 char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
960 uint32 remaining = request_size;
961 bool success = true;
962
963 /*
964 * Zero-fill the file. We have to do this the hard way to ensure that
965 * all the file space has really been allocated, so that we don't
966 * later seg fault when accessing the memory mapping. This is pretty
967 * pessimal.
968 */
969 while (success && remaining > 0)
970 {
971 Size goal = remaining;
972
973 if (goal > ZBUFFER_SIZE)
974 goal = ZBUFFER_SIZE;
975 pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
976 if (write(fd, zbuffer, goal) == goal)
977 remaining -= goal;
978 else
979 success = false;
980 pgstat_report_wait_end();
981 }
982
983 if (!success)
984 {
985 int save_errno;
986
987 /* Back out what's already been done. */
988 save_errno = errno;
989 CloseTransientFile(fd);
990 if (op == DSM_OP_CREATE)
991 unlink(name);
992 errno = save_errno ? save_errno : ENOSPC;
993
994 ereport(elevel,
995 (errcode_for_dynamic_shared_memory(),
996 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
997 name, request_size)));
998 return false;
999 }
1000 }
1001
1002 /*
1003 * If we're reattaching or resizing, we must remove any existing mapping,
1004 * unless we've already got the right thing mapped.
1005 */
1006 if (*mapped_address != NULL)
1007 {
1008 if (*mapped_size == request_size)
1009 return true;
1010 if (munmap(*mapped_address, *mapped_size) != 0)
1011 {
1012 int save_errno;
1013
1014 /* Back out what's already been done. */
1015 save_errno = errno;
1016 CloseTransientFile(fd);
1017 if (op == DSM_OP_CREATE)
1018 unlink(name);
1019 errno = save_errno;
1020
1021 ereport(elevel,
1022 (errcode_for_dynamic_shared_memory(),
1023 errmsg("could not unmap shared memory segment \"%s\": %m",
1024 name)));
1025 return false;
1026 }
1027 *mapped_address = NULL;
1028 *mapped_size = 0;
1029 }
1030
1031 /* Map it. */
1032 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1033 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1034 if (address == MAP_FAILED)
1035 {
1036 int save_errno;
1037
1038 /* Back out what's already been done. */
1039 save_errno = errno;
1040 CloseTransientFile(fd);
1041 if (op == DSM_OP_CREATE)
1042 unlink(name);
1043 errno = save_errno;
1044
1045 ereport(elevel,
1046 (errcode_for_dynamic_shared_memory(),
1047 errmsg("could not map shared memory segment \"%s\": %m",
1048 name)));
1049 return false;
1050 }
1051 *mapped_address = address;
1052 *mapped_size = request_size;
1053 CloseTransientFile(fd);
1054
1055 return true;
1056 }
1057 #endif
1058
1059 /*
1060 * Implementation-specific actions that must be performed when a segment is to
1061 * be preserved even when no backend has it attached.
1062 *
1063 * Except on Windows, we don't need to do anything at all. But since Windows
1064 * cleans up segments automatically when no references remain, we duplicate
1065 * the segment handle into the postmaster process. The postmaster needn't
1066 * do anything to receive the handle; Windows transfers it automatically.
1067 */
1068 void
dsm_impl_pin_segment(dsm_handle handle,void * impl_private,void ** impl_private_pm_handle)1069 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1070 void **impl_private_pm_handle)
1071 {
1072 switch (dynamic_shared_memory_type)
1073 {
1074 #ifdef USE_DSM_WINDOWS
1075 case DSM_IMPL_WINDOWS:
1076 {
1077 HANDLE hmap;
1078
1079 if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1080 PostmasterHandle, &hmap, 0, FALSE,
1081 DUPLICATE_SAME_ACCESS))
1082 {
1083 char name[64];
1084
1085 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1086 _dosmaperr(GetLastError());
1087 ereport(ERROR,
1088 (errcode_for_dynamic_shared_memory(),
1089 errmsg("could not duplicate handle for \"%s\": %m",
1090 name)));
1091 }
1092
1093 /*
1094 * Here, we remember the handle that we created in the
1095 * postmaster process. This handle isn't actually usable in
1096 * any process other than the postmaster, but that doesn't
1097 * matter. We're just holding onto it so that, if the segment
1098 * is unpinned, dsm_impl_unpin_segment can close it.
1099 */
1100 *impl_private_pm_handle = hmap;
1101 break;
1102 }
1103 #endif
1104 default:
1105 break;
1106 }
1107 }
1108
1109 /*
1110 * Implementation-specific actions that must be performed when a segment is no
1111 * longer to be preserved, so that it will be cleaned up when all backends
1112 * have detached from it.
1113 *
1114 * Except on Windows, we don't need to do anything at all. For Windows, we
1115 * close the extra handle that dsm_impl_pin_segment created in the
1116 * postmaster's process space.
1117 */
1118 void
dsm_impl_unpin_segment(dsm_handle handle,void ** impl_private)1119 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1120 {
1121 switch (dynamic_shared_memory_type)
1122 {
1123 #ifdef USE_DSM_WINDOWS
1124 case DSM_IMPL_WINDOWS:
1125 {
1126 if (*impl_private &&
1127 !DuplicateHandle(PostmasterHandle, *impl_private,
1128 NULL, NULL, 0, FALSE,
1129 DUPLICATE_CLOSE_SOURCE))
1130 {
1131 char name[64];
1132
1133 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1134 _dosmaperr(GetLastError());
1135 ereport(ERROR,
1136 (errcode_for_dynamic_shared_memory(),
1137 errmsg("could not duplicate handle for \"%s\": %m",
1138 name)));
1139 }
1140
1141 *impl_private = NULL;
1142 break;
1143 }
1144 #endif
1145 default:
1146 break;
1147 }
1148 }
1149
1150 static int
errcode_for_dynamic_shared_memory(void)1151 errcode_for_dynamic_shared_memory(void)
1152 {
1153 if (errno == EFBIG || errno == ENOMEM)
1154 return errcode(ERRCODE_OUT_OF_MEMORY);
1155 else
1156 return errcode_for_file_access();
1157 }
1158