1 /*-------------------------------------------------------------------------
2 *
3 * dsm_impl.c
4 * manage dynamic shared memory segments
5 *
6 * This file provides low-level APIs for creating and destroying shared
7 * memory segments using several different possible techniques. We refer
8 * to these segments as dynamic because they can be created, altered, and
9 * destroyed at any point during the server life cycle. This is unlike
10 * the main shared memory segment, of which there is always exactly one
11 * and which is always mapped at a fixed address in every PostgreSQL
12 * background process.
13 *
14 * Because not all systems provide the same primitives in this area, nor
15 * do all primitives behave the same way on all systems, we provide
16 * several implementations of this facility. Many systems implement
17 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 * in this area, with the exception that shared memory identifiers live
19 * in a flat system-wide namespace, raising the uncomfortable prospect of
20 * name collisions with other processes (including other copies of
21 * PostgreSQL) running on the same system. Some systems only support
22 * the older System V shared memory interface (shmget etc.) which is
23 * also usable; however, the default allocation limits are often quite
24 * small, and the namespace is even more restricted.
25 *
26 * We also provide an mmap-based shared memory implementation. This may
27 * be useful on systems that provide shared memory via a special-purpose
28 * filesystem; by opting for this implementation, the user can even
29 * control precisely where their shared memory segments are placed. It
30 * can also be used as a fallback for systems where shm_open and shmget
31 * are not available or can't be used for some reason. Of course,
32 * mapping a file residing on an actual spinning disk is a fairly poor
33 * approximation for shared memory because writeback may hurt performance
34 * substantially, but there should be few systems where we must make do
35 * with such poor tools.
36 *
37 * As ever, Windows requires its own implementation.
38 *
39 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
40 * Portions Copyright (c) 1994, Regents of the University of California
41 *
42 *
43 * IDENTIFICATION
44 * src/backend/storage/ipc/dsm_impl.c
45 *
46 *-------------------------------------------------------------------------
47 */
48
49 #include "postgres.h"
50 #include "miscadmin.h"
51
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "common/file_perm.h"
65 #include "pgstat.h"
66
67 #include "portability/mem.h"
68 #include "storage/dsm_impl.h"
69 #include "storage/fd.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "postmaster/postmaster.h"
73
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 void **impl_private, void **mapped_address,
77 Size *mapped_size, int elevel);
78 static int dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 void **impl_private, void **mapped_address,
83 Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 void **impl_private, void **mapped_address,
88 Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 void **impl_private, void **mapped_address,
93 Size *mapped_size, int elevel);
94 #endif
95 static int errcode_for_dynamic_shared_memory(void);
96
/*
 * Table of valid values for the dynamic_shared_memory_type GUC.  Only the
 * implementations compiled into this build (per the USE_DSM_* symbols) are
 * offered as choices; "none" and the NULL terminator are always present.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{"none", DSM_IMPL_NONE, false},
	{NULL, 0, false}
};
113
/* Implementation selector; set from the dynamic_shared_memory_type GUC. */
int			dynamic_shared_memory_type;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE			8192

/*
 * Name prefix for Windows file-mapping objects.  The Global\ namespace makes
 * segments visible across sessions; see dsm_impl_windows for details.
 */
#define SEGMENT_NAME_PREFIX		"Global/PostgreSQL"
121
122 /*------
123 * Perform a low-level shared memory operation in a platform-specific way,
124 * as dictated by the selected implementation. Each implementation is
125 * required to implement the following primitives.
126 *
127 * DSM_OP_CREATE. Create a segment whose size is the request_size and
128 * map it.
129 *
130 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
131 * The segment may already be mapped; any existing mapping should be removed
132 * before creating a new one.
133 *
134 * DSM_OP_DETACH. Unmap the segment.
135 *
136 * DSM_OP_RESIZE. Resize the segment to the given request_size and
137 * remap the segment at that new size.
138 *
139 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
140 * segment.
141 *
142 * Arguments:
143 * op: The operation to be performed.
 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
 * new handle the caller wants created.
146 * request_size: For DSM_OP_CREATE, the requested size. For DSM_OP_RESIZE,
147 * the new size. Otherwise, 0.
148 * impl_private: Private, implementation-specific data. Will be a pointer
149 * to NULL for the first operation on a shared memory segment within this
150 * backend; thereafter, it will point to the value to which it was set
151 * on the previous call.
152 * mapped_address: Pointer to start of current mapping; pointer to NULL
153 * if none. Updated with new mapping address.
154 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
155 * Updated with new mapped size.
156 * elevel: Level at which to log errors.
157 *
158 * Return value: true on success, false on failure. When false is returned,
159 * a message should first be logged at the specified elevel, except in the
160 * case where DSM_OP_CREATE experiences a name collision, which should
161 * silently return false.
162 *-----
163 */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
			void **impl_private, void **mapped_address, Size *mapped_size,
			int elevel)
{
	/* request_size is meaningful only for create and resize operations. */
	Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
	/* Create and attach must begin with no mapping in place. */
	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
		   (*mapped_address == NULL && *mapped_size == 0));

	/* Dispatch to whichever implementation the GUC selected at startup. */
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_POSIX
		case DSM_IMPL_POSIX:
			return dsm_impl_posix(op, handle, request_size, impl_private,
								  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
		case DSM_IMPL_SYSV:
			return dsm_impl_sysv(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			return dsm_impl_windows(op, handle, request_size, impl_private,
									mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
		case DSM_IMPL_MMAP:
			return dsm_impl_mmap(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
		default:
			/* elog(ERROR) does not return; the return quiets compilers. */
			elog(ERROR, "unexpected dynamic shared memory type: %d",
				 dynamic_shared_memory_type);
			return false;
	}
}
201
202 /*
203 * Does the current dynamic shared memory implementation support resizing
204 * segments? (The answer here could be platform-dependent in the future,
205 * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
206 * can't resize segments to anything larger than 256MB that way. For now,
207 * we keep it simple.)
208 */
209 bool
dsm_impl_can_resize(void)210 dsm_impl_can_resize(void)
211 {
212 switch (dynamic_shared_memory_type)
213 {
214 case DSM_IMPL_NONE:
215 return false;
216 case DSM_IMPL_POSIX:
217 return true;
218 case DSM_IMPL_SYSV:
219 return false;
220 case DSM_IMPL_WINDOWS:
221 return false;
222 case DSM_IMPL_MMAP:
223 return true;
224 default:
225 return false; /* should not happen */
226 }
227 }
228
229 #ifdef USE_DSM_POSIX
230 /*
231 * Operating system primitives to support POSIX shared memory.
232 *
233 * POSIX shared memory segments are created and attached using shm_open()
234 * and shm_unlink(); other operations, such as sizing or mapping the
235 * segment, are performed as if the shared memory segments were files.
236 *
237 * Indeed, on some platforms, they may be implemented that way. While
238 * POSIX shared memory segments seem intended to exist in a flat namespace,
239 * some operating systems may implement them as files, even going so far
240 * to treat a request for /xyz as a request to create a file by that name
241 * in the root directory. Users of such broken platforms should select
242 * a different shared memory implementation.
243 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/* Derive the flat-namespace segment name from the handle. */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For destroy, also remove the name from the system namespace. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach or resize.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		/*
		 * A name collision during DSM_OP_CREATE (EEXIST) is reported
		 * silently, per this file's API contract, so the caller can retry
		 * with a different handle.
		 */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating or resizing the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (*mapped_size != request_size &&
			 dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		/* A segment we just created should not be left behind on failure. */
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/*
	 * If we're reattaching or resizing, we must remove any existing mapping,
	 * unless we've already got the right thing mapped.
	 */
	if (*mapped_address != NULL)
	{
		if (*mapped_size == request_size)
			return true;
		if (munmap(*mapped_address, *mapped_size) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			if (op == DSM_OP_CREATE)
				shm_unlink(name);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives the close; the fd is no longer needed. */
	close(fd);

	return true;
}
407
408 /*
409 * Set the size of a virtual memory region associated with a file descriptor.
410 * If necessary, also ensure that virtual memory is actually allocated by the
411 * operating system, to avoid nasty surprises later.
412 *
413 * Returns non-zero if either truncation or allocation fails, and sets errno.
414 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
	 * with ftruncate, the file may contain a hole.  Accessing memory backed
	 * by a hole causes tmpfs to allocate pages, which fails with SIGBUS if
	 * there is no more tmpfs space available.  So we ask tmpfs to allocate
	 * pages here, so we can fail gracefully with ENOSPC now rather than
	 * risking SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
455
456 #endif /* USE_DSM_POSIX */
457
458 #ifdef USE_DSM_SYSV
459 /*
460 * Operating system primitives to support System V shared memory.
461 *
462 * System V shared memory segments are manipulated using shmget(), shmat(),
463 * shmdt(), and shmctl(). There's no portable way to resize such
464 * segments. As the default allocation limits for System V shared memory
465 * are usually quite low, the POSIX facilities may be preferable; but
466 * those are not supported everywhere.
467 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/* Resize is not supported for System V shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "System V shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		/* EEXIST makes a create caller retry with a different handle. */
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Identifier was looked up on a previous call; reuse it. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* EEXIST during create is reported silently so caller retries. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* ident was already copied out of the cache, so freeing it is safe. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
650 #endif
651
652 #ifdef USE_DSM_WINDOWS
653 /*
654 * Operating system primitives to support Windows shared memory.
655 *
656 * Windows shared memory implementation is done using file mapping
657 * which can be backed by either physical file or system paging file.
658 * Current implementation uses system paging file as other effects
659 * like performance are not clear for physical file and it is used in similar
660 * way for main shared memory in windows.
661 *
662 * A memory mapping object is a kernel object - they always get deleted when
663 * the last reference to them goes away, either explicitly via a CloseHandle or
664 * when the process containing the reference exits.
665 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/* Resize is not supported for Windows shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "Windows shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			/* Translate the Win32 error so %m reports something useful. */
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* impl_private holds the file-mapping handle; drop our reference. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED.  We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			/* Silent failure: caller retries creation with a new handle. */
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle; teardown must CloseHandle it. */
	*impl_private = hmap;

	return true;
}
846 #endif
847
848 #ifdef USE_DSM_MMAP
849 /*
850 * Operating system primitives to support mmap-based shared memory.
851 *
852 * Calling this "shared memory" is somewhat of a misnomer, because what
853 * we're really doing is creating a bunch of files and mapping them into
854 * our address space. The operating system may feel obliged to
855 * synchronize the contents to disk even if nothing is being paged out,
856 * which will not serve us well. The user can relocate the pg_dynshmem
857 * directory to a ramdisk to avoid this problem, if available.
858 */
859 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)860 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
861 void **impl_private, void **mapped_address, Size *mapped_size,
862 int elevel)
863 {
864 char name[64];
865 int flags;
866 int fd;
867 char *address;
868
869 snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
870 handle);
871
872 /* Handle teardown cases. */
873 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
874 {
875 if (*mapped_address != NULL
876 && munmap(*mapped_address, *mapped_size) != 0)
877 {
878 ereport(elevel,
879 (errcode_for_dynamic_shared_memory(),
880 errmsg("could not unmap shared memory segment \"%s\": %m",
881 name)));
882 return false;
883 }
884 *mapped_address = NULL;
885 *mapped_size = 0;
886 if (op == DSM_OP_DESTROY && unlink(name) != 0)
887 {
888 ereport(elevel,
889 (errcode_for_dynamic_shared_memory(),
890 errmsg("could not remove shared memory segment \"%s\": %m",
891 name)));
892 return false;
893 }
894 return true;
895 }
896
897 /* Create new segment or open an existing one for attach or resize. */
898 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
899 if ((fd = OpenTransientFile(name, flags)) == -1)
900 {
901 if (errno != EEXIST)
902 ereport(elevel,
903 (errcode_for_dynamic_shared_memory(),
904 errmsg("could not open shared memory segment \"%s\": %m",
905 name)));
906 return false;
907 }
908
909 /*
910 * If we're attaching the segment, determine the current size; if we are
911 * creating or resizing the segment, set the size to the requested value.
912 */
913 if (op == DSM_OP_ATTACH)
914 {
915 struct stat st;
916
917 if (fstat(fd, &st) != 0)
918 {
919 int save_errno;
920
921 /* Back out what's already been done. */
922 save_errno = errno;
923 CloseTransientFile(fd);
924 errno = save_errno;
925
926 ereport(elevel,
927 (errcode_for_dynamic_shared_memory(),
928 errmsg("could not stat shared memory segment \"%s\": %m",
929 name)));
930 return false;
931 }
932 request_size = st.st_size;
933 }
934 else if (*mapped_size > request_size && ftruncate(fd, request_size))
935 {
936 int save_errno;
937
938 /* Back out what's already been done. */
939 save_errno = errno;
940 CloseTransientFile(fd);
941 if (op == DSM_OP_CREATE)
942 unlink(name);
943 errno = save_errno;
944
945 ereport(elevel,
946 (errcode_for_dynamic_shared_memory(),
947 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
948 name, request_size)));
949 return false;
950 }
951 else if (*mapped_size < request_size)
952 {
953 /*
954 * Allocate a buffer full of zeros.
955 *
956 * Note: palloc zbuffer, instead of just using a local char array, to
957 * ensure it is reasonably well-aligned; this may save a few cycles
958 * transferring data to the kernel.
959 */
960 char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
961 uint32 remaining = request_size;
962 bool success = true;
963
964 /*
965 * Zero-fill the file. We have to do this the hard way to ensure that
966 * all the file space has really been allocated, so that we don't
967 * later seg fault when accessing the memory mapping. This is pretty
968 * pessimal.
969 */
970 while (success && remaining > 0)
971 {
972 Size goal = remaining;
973
974 if (goal > ZBUFFER_SIZE)
975 goal = ZBUFFER_SIZE;
976 pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
977 if (write(fd, zbuffer, goal) == goal)
978 remaining -= goal;
979 else
980 success = false;
981 pgstat_report_wait_end();
982 }
983
984 if (!success)
985 {
986 int save_errno;
987
988 /* Back out what's already been done. */
989 save_errno = errno;
990 CloseTransientFile(fd);
991 if (op == DSM_OP_CREATE)
992 unlink(name);
993 errno = save_errno ? save_errno : ENOSPC;
994
995 ereport(elevel,
996 (errcode_for_dynamic_shared_memory(),
997 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
998 name, request_size)));
999 return false;
1000 }
1001 }
1002
1003 /*
1004 * If we're reattaching or resizing, we must remove any existing mapping,
1005 * unless we've already got the right thing mapped.
1006 */
1007 if (*mapped_address != NULL)
1008 {
1009 if (*mapped_size == request_size)
1010 return true;
1011 if (munmap(*mapped_address, *mapped_size) != 0)
1012 {
1013 int save_errno;
1014
1015 /* Back out what's already been done. */
1016 save_errno = errno;
1017 CloseTransientFile(fd);
1018 if (op == DSM_OP_CREATE)
1019 unlink(name);
1020 errno = save_errno;
1021
1022 ereport(elevel,
1023 (errcode_for_dynamic_shared_memory(),
1024 errmsg("could not unmap shared memory segment \"%s\": %m",
1025 name)));
1026 return false;
1027 }
1028 *mapped_address = NULL;
1029 *mapped_size = 0;
1030 }
1031
1032 /* Map it. */
1033 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1034 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1035 if (address == MAP_FAILED)
1036 {
1037 int save_errno;
1038
1039 /* Back out what's already been done. */
1040 save_errno = errno;
1041 CloseTransientFile(fd);
1042 if (op == DSM_OP_CREATE)
1043 unlink(name);
1044 errno = save_errno;
1045
1046 ereport(elevel,
1047 (errcode_for_dynamic_shared_memory(),
1048 errmsg("could not map shared memory segment \"%s\": %m",
1049 name)));
1050 return false;
1051 }
1052 *mapped_address = address;
1053 *mapped_size = request_size;
1054 CloseTransientFile(fd);
1055
1056 return true;
1057 }
1058 #endif
1059
1060 /*
1061 * Implementation-specific actions that must be performed when a segment is to
1062 * be preserved even when no backend has it attached.
1063 *
1064 * Except on Windows, we don't need to do anything at all. But since Windows
1065 * cleans up segments automatically when no references remain, we duplicate
1066 * the segment handle into the postmaster process. The postmaster needn't
1067 * do anything to receive the handle; Windows transfers it automatically.
1068 */
1069 void
dsm_impl_pin_segment(dsm_handle handle,void * impl_private,void ** impl_private_pm_handle)1070 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1071 void **impl_private_pm_handle)
1072 {
1073 switch (dynamic_shared_memory_type)
1074 {
1075 #ifdef USE_DSM_WINDOWS
1076 case DSM_IMPL_WINDOWS:
1077 {
1078 HANDLE hmap;
1079
1080 if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1081 PostmasterHandle, &hmap, 0, FALSE,
1082 DUPLICATE_SAME_ACCESS))
1083 {
1084 char name[64];
1085
1086 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1087 _dosmaperr(GetLastError());
1088 ereport(ERROR,
1089 (errcode_for_dynamic_shared_memory(),
1090 errmsg("could not duplicate handle for \"%s\": %m",
1091 name)));
1092 }
1093
1094 /*
1095 * Here, we remember the handle that we created in the
1096 * postmaster process. This handle isn't actually usable in
1097 * any process other than the postmaster, but that doesn't
1098 * matter. We're just holding onto it so that, if the segment
1099 * is unpinned, dsm_impl_unpin_segment can close it.
1100 */
1101 *impl_private_pm_handle = hmap;
1102 break;
1103 }
1104 #endif
1105 default:
1106 break;
1107 }
1108 }
1109
1110 /*
1111 * Implementation-specific actions that must be performed when a segment is no
1112 * longer to be preserved, so that it will be cleaned up when all backends
1113 * have detached from it.
1114 *
1115 * Except on Windows, we don't need to do anything at all. For Windows, we
1116 * close the extra handle that dsm_impl_pin_segment created in the
1117 * postmaster's process space.
1118 */
1119 void
dsm_impl_unpin_segment(dsm_handle handle,void ** impl_private)1120 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1121 {
1122 switch (dynamic_shared_memory_type)
1123 {
1124 #ifdef USE_DSM_WINDOWS
1125 case DSM_IMPL_WINDOWS:
1126 {
1127 if (*impl_private &&
1128 !DuplicateHandle(PostmasterHandle, *impl_private,
1129 NULL, NULL, 0, FALSE,
1130 DUPLICATE_CLOSE_SOURCE))
1131 {
1132 char name[64];
1133
1134 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1135 _dosmaperr(GetLastError());
1136 ereport(ERROR,
1137 (errcode_for_dynamic_shared_memory(),
1138 errmsg("could not duplicate handle for \"%s\": %m",
1139 name)));
1140 }
1141
1142 *impl_private = NULL;
1143 break;
1144 }
1145 #endif
1146 default:
1147 break;
1148 }
1149 }
1150
1151 static int
errcode_for_dynamic_shared_memory(void)1152 errcode_for_dynamic_shared_memory(void)
1153 {
1154 if (errno == EFBIG || errno == ENOMEM)
1155 return errcode(ERRCODE_OUT_OF_MEMORY);
1156 else
1157 return errcode_for_file_access();
1158 }
1159