1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques.  We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle.  This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility.  Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system.  Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation.  This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed.  It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason.  Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  *	  src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 #include "miscadmin.h"
51 
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "common/file_perm.h"
65 #include "pgstat.h"
66 
67 #include "portability/mem.h"
68 #include "storage/dsm_impl.h"
69 #include "storage/fd.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "postmaster/postmaster.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 			   void **impl_private, void **mapped_address,
77 			   Size *mapped_size, int elevel);
78 static int	dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 			  void **impl_private, void **mapped_address,
83 			  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 				 void **impl_private, void **mapped_address,
88 				 Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 			  void **impl_private, void **mapped_address,
93 			  Size *mapped_size, int elevel);
94 #endif
95 static int	errcode_for_dynamic_shared_memory(void);
96 
/*
 * Valid values for the dynamic_shared_memory_type GUC.  Only the
 * implementations actually compiled into this build are offered; "none"
 * disables dynamic shared memory entirely.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{"none", DSM_IMPL_NONE, false},
	{NULL, 0, false}			/* list terminator */
};
113 
114 /* Implementation selector. */
115 int			dynamic_shared_memory_type;
116 
117 /* Size of buffer to be used for zero-filling. */
118 #define ZBUFFER_SIZE				8192
119 
120 #define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
121 
122 /*------
123  * Perform a low-level shared memory operation in a platform-specific way,
124  * as dictated by the selected implementation.  Each implementation is
125  * required to implement the following primitives.
126  *
127  * DSM_OP_CREATE.  Create a segment whose size is the request_size and
128  * map it.
129  *
130  * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
131  * The segment may already be mapped; any existing mapping should be removed
132  * before creating a new one.
133  *
134  * DSM_OP_DETACH.  Unmap the segment.
135  *
136  * DSM_OP_RESIZE.  Resize the segment to the given request_size and
137  * remap the segment at that new size.
138  *
139  * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
140  * segment.
141  *
142  * Arguments:
143  *	 op: The operation to be performed.
144  *	 handle: The handle of an existing object, or for DSM_OP_CREATE, the
145  *	   a new handle the caller wants created.
146  *	 request_size: For DSM_OP_CREATE, the requested size.  For DSM_OP_RESIZE,
147  *	   the new size.  Otherwise, 0.
148  *	 impl_private: Private, implementation-specific data.  Will be a pointer
149  *	   to NULL for the first operation on a shared memory segment within this
150  *	   backend; thereafter, it will point to the value to which it was set
151  *	   on the previous call.
152  *	 mapped_address: Pointer to start of current mapping; pointer to NULL
153  *	   if none.  Updated with new mapping address.
154  *	 mapped_size: Pointer to size of current mapping; pointer to 0 if none.
155  *	   Updated with new mapped size.
156  *	 elevel: Level at which to log errors.
157  *
158  * Return value: true on success, false on failure.  When false is returned,
159  * a message should first be logged at the specified elevel, except in the
160  * case where DSM_OP_CREATE experiences a name collision, which should
161  * silently return false.
162  *-----
163  */
164 bool
165 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
166 			void **impl_private, void **mapped_address, Size *mapped_size,
167 			int elevel)
168 {
169 	Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
170 	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
171 		   (*mapped_address == NULL && *mapped_size == 0));
172 
173 	switch (dynamic_shared_memory_type)
174 	{
175 #ifdef USE_DSM_POSIX
176 		case DSM_IMPL_POSIX:
177 			return dsm_impl_posix(op, handle, request_size, impl_private,
178 								  mapped_address, mapped_size, elevel);
179 #endif
180 #ifdef USE_DSM_SYSV
181 		case DSM_IMPL_SYSV:
182 			return dsm_impl_sysv(op, handle, request_size, impl_private,
183 								 mapped_address, mapped_size, elevel);
184 #endif
185 #ifdef USE_DSM_WINDOWS
186 		case DSM_IMPL_WINDOWS:
187 			return dsm_impl_windows(op, handle, request_size, impl_private,
188 									mapped_address, mapped_size, elevel);
189 #endif
190 #ifdef USE_DSM_MMAP
191 		case DSM_IMPL_MMAP:
192 			return dsm_impl_mmap(op, handle, request_size, impl_private,
193 								 mapped_address, mapped_size, elevel);
194 #endif
195 		default:
196 			elog(ERROR, "unexpected dynamic shared memory type: %d",
197 				 dynamic_shared_memory_type);
198 			return false;
199 	}
200 }
201 
202 /*
203  * Does the current dynamic shared memory implementation support resizing
204  * segments?  (The answer here could be platform-dependent in the future,
205  * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
206  * can't resize segments to anything larger than 256MB that way.  For now,
207  * we keep it simple.)
208  */
209 bool
210 dsm_impl_can_resize(void)
211 {
212 	switch (dynamic_shared_memory_type)
213 	{
214 		case DSM_IMPL_NONE:
215 			return false;
216 		case DSM_IMPL_POSIX:
217 			return true;
218 		case DSM_IMPL_SYSV:
219 			return false;
220 		case DSM_IMPL_WINDOWS:
221 			return false;
222 		case DSM_IMPL_MMAP:
223 			return true;
224 		default:
225 			return false;		/* should not happen */
226 	}
227 }
228 
229 #ifdef USE_DSM_POSIX
230 /*
231  * Operating system primitives to support POSIX shared memory.
232  *
233  * POSIX shared memory segments are created and attached using shm_open()
234  * and shm_unlink(); other operations, such as sizing or mapping the
235  * segment, are performed as if the shared memory segments were files.
236  *
237  * Indeed, on some platforms, they may be implemented that way.  While
238  * POSIX shared memory segments seem intended to exist in a flat namespace,
239  * some operating systems may implement them as files, even going so far
240  * to treat a request for /xyz as a request to create a file by that name
241  * in the root directory.  Users of such broken platforms should select
242  * a different shared memory implementation.
243  */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/*
	 * Build the segment name from the handle.  shm_open requires the name
	 * to begin with a slash.
	 */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Remove any existing mapping before (possibly) unlinking the name. */
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For destroy, additionally remove the name itself. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach or resize.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		/*
		 * Per this module's contract, a name collision during create
		 * (EEXIST) fails silently; the caller is expected to retry with a
		 * different handle.
		 */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating or resizing the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (*mapped_size != request_size &&
			 dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		/* Don't leave behind a segment we just created but couldn't size. */
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/*
	 * If we're reattaching or resizing, we must remove any existing mapping,
	 * unless we've already got the right thing mapped.
	 */
	if (*mapped_address != NULL)
	{
		if (*mapped_size == request_size)
			return true;
		if (munmap(*mapped_address, *mapped_size) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			if (op == DSM_OP_CREATE)
				shm_unlink(name);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The descriptor is no longer needed once the mapping is in place. */
	close(fd);

	return true;
}
407 
408 /*
409  * Set the size of a virtual memory region associated with a file descriptor.
410  * If necessary, also ensure that virtual memory is actually allocated by the
411  * operating system, to avoid nasty surprises later.
412  *
413  * Returns non-zero if either truncation or allocation fails, and sets errno.
414  */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	/* First, truncate (or extend) the file to the requested size. */
	int			rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
	 * with ftruncate, the file may contain a hole.  Accessing memory backed
	 * by a hole causes tmpfs to allocate pages, which fails with SIGBUS if
	 * there is no more tmpfs space available.  So we ask tmpfs to allocate
	 * pages here, so we can fail gracefully with ENOSPC now rather than
	 * risking SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * posix_fallocate() may be interrupted.  Keep retrying on EINTR,
		 * but give up once an interrupt is actually pending, so another
		 * backend repeatedly signaling us can't trap us here forever.
		 */
		for (;;)
		{
			rc = posix_fallocate(fd, 0, size);
			if (rc != EINTR)
				break;
			if (ProcDiePending || QueryCancelPending)
				break;
		}

		/*
		 * posix_fallocate() reports errors via its return value rather than
		 * errno, but our caller expects errno to be set; copy it over while
		 * still returning rc as the success/failure indicator.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
455 
456 #endif							/* USE_DSM_POSIX */
457 
458 #ifdef USE_DSM_SYSV
459 /*
460  * Operating system primitives to support System V shared memory.
461  *
462  * System V shared memory segments are manipulated using shmget(), shmat(),
463  * shmdt(), and shmctl().  There's no portable way to resize such
464  * segments.  As the default allocation limits for System V shared memory
465  * are usually quite low, the POSIX facilities may be preferable; but
466  * those are not supported everywhere.
467  */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/* Resize is not supported for System V shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "System V shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		/* EEXIST makes a create caller retry with another handle. */
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget(). To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* We looked this segment up before; reuse the cached identifier. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* A create collision (EEXIST) fails silently, per contract. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		/* Cache the identifier for later calls on this segment. */
		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/*
		 * ident was already copied out of the cache above, so it's safe to
		 * release the cache before we're done using the identifier.
		 */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
650 #endif
651 
652 #ifdef USE_DSM_WINDOWS
653 /*
654  * Operating system primitives to support Windows shared memory.
655  *
656  * Windows shared memory implementation is done using file mapping
657  * which can be backed by either physical file or system paging file.
658  * Current implementation uses system paging file as other effects
659  * like performance are not clear for physical file and it is used in similar
660  * way for main shared memory in windows.
661  *
662  * A memory mapping object is a kernel object - they always get deleted when
663  * the last reference to them goes away, either explicitly via a CloseHandle or
664  * when the process containing the reference exits.
665  */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/* Resize is not supported for Windows shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "Windows shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights. But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory. We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* *impl_private holds the mapping HANDLE saved at create/attach. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		/* Capture the error code before any other call can clobber it. */
		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Stash the mapping handle so teardown can CloseHandle it later. */
	*impl_private = hmap;

	return true;
}
846 #endif
847 
848 #ifdef USE_DSM_MMAP
849 /*
850  * Operating system primitives to support mmap-based shared memory.
851  *
852  * Calling this "shared memory" is somewhat of a misnomer, because what
853  * we're really doing is creating a bunch of files and mapping them into
854  * our address space.  The operating system may feel obliged to
855  * synchronize the contents to disk even if nothing is being paged out,
856  * which will not serve us well.  The user can relocate the pg_dynshmem
857  * directory to a ramdisk to avoid this problem, if available.
858  */
859 static bool
860 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
861 			  void **impl_private, void **mapped_address, Size *mapped_size,
862 			  int elevel)
863 {
864 	char		name[64];
865 	int			flags;
866 	int			fd;
867 	char	   *address;
868 
869 	snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
870 			 handle);
871 
872 	/* Handle teardown cases. */
873 	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
874 	{
875 		if (*mapped_address != NULL
876 			&& munmap(*mapped_address, *mapped_size) != 0)
877 		{
878 			ereport(elevel,
879 					(errcode_for_dynamic_shared_memory(),
880 					 errmsg("could not unmap shared memory segment \"%s\": %m",
881 							name)));
882 			return false;
883 		}
884 		*mapped_address = NULL;
885 		*mapped_size = 0;
886 		if (op == DSM_OP_DESTROY && unlink(name) != 0)
887 		{
888 			ereport(elevel,
889 					(errcode_for_dynamic_shared_memory(),
890 					 errmsg("could not remove shared memory segment \"%s\": %m",
891 							name)));
892 			return false;
893 		}
894 		return true;
895 	}
896 
897 	/* Create new segment or open an existing one for attach or resize. */
898 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
899 	if ((fd = OpenTransientFile(name, flags)) == -1)
900 	{
901 		if (errno != EEXIST)
902 			ereport(elevel,
903 					(errcode_for_dynamic_shared_memory(),
904 					 errmsg("could not open shared memory segment \"%s\": %m",
905 							name)));
906 		return false;
907 	}
908 
909 	/*
910 	 * If we're attaching the segment, determine the current size; if we are
911 	 * creating or resizing the segment, set the size to the requested value.
912 	 */
913 	if (op == DSM_OP_ATTACH)
914 	{
915 		struct stat st;
916 
917 		if (fstat(fd, &st) != 0)
918 		{
919 			int			save_errno;
920 
921 			/* Back out what's already been done. */
922 			save_errno = errno;
923 			CloseTransientFile(fd);
924 			errno = save_errno;
925 
926 			ereport(elevel,
927 					(errcode_for_dynamic_shared_memory(),
928 					 errmsg("could not stat shared memory segment \"%s\": %m",
929 							name)));
930 			return false;
931 		}
932 		request_size = st.st_size;
933 	}
934 	else if (*mapped_size > request_size && ftruncate(fd, request_size))
935 	{
936 		int			save_errno;
937 
938 		/* Back out what's already been done. */
939 		save_errno = errno;
940 		CloseTransientFile(fd);
941 		if (op == DSM_OP_CREATE)
942 			unlink(name);
943 		errno = save_errno;
944 
945 		ereport(elevel,
946 				(errcode_for_dynamic_shared_memory(),
947 				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
948 						name, request_size)));
949 		return false;
950 	}
951 	else if (*mapped_size < request_size)
952 	{
953 		/*
954 		 * Allocate a buffer full of zeros.
955 		 *
956 		 * Note: palloc zbuffer, instead of just using a local char array, to
957 		 * ensure it is reasonably well-aligned; this may save a few cycles
958 		 * transferring data to the kernel.
959 		 */
960 		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
961 		uint32		remaining = request_size;
962 		bool		success = true;
963 
964 		/*
965 		 * Zero-fill the file. We have to do this the hard way to ensure that
966 		 * all the file space has really been allocated, so that we don't
967 		 * later seg fault when accessing the memory mapping.  This is pretty
968 		 * pessimal.
969 		 */
970 		while (success && remaining > 0)
971 		{
972 			Size		goal = remaining;
973 
974 			if (goal > ZBUFFER_SIZE)
975 				goal = ZBUFFER_SIZE;
976 			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
977 			if (write(fd, zbuffer, goal) == goal)
978 				remaining -= goal;
979 			else
980 				success = false;
981 			pgstat_report_wait_end();
982 		}
983 
984 		if (!success)
985 		{
986 			int			save_errno;
987 
988 			/* Back out what's already been done. */
989 			save_errno = errno;
990 			CloseTransientFile(fd);
991 			if (op == DSM_OP_CREATE)
992 				unlink(name);
993 			errno = save_errno ? save_errno : ENOSPC;
994 
995 			ereport(elevel,
996 					(errcode_for_dynamic_shared_memory(),
997 					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
998 							name, request_size)));
999 			return false;
1000 		}
1001 	}
1002 
1003 	/*
1004 	 * If we're reattaching or resizing, we must remove any existing mapping,
1005 	 * unless we've already got the right thing mapped.
1006 	 */
1007 	if (*mapped_address != NULL)
1008 	{
1009 		if (*mapped_size == request_size)
1010 			return true;
1011 		if (munmap(*mapped_address, *mapped_size) != 0)
1012 		{
1013 			int			save_errno;
1014 
1015 			/* Back out what's already been done. */
1016 			save_errno = errno;
1017 			CloseTransientFile(fd);
1018 			if (op == DSM_OP_CREATE)
1019 				unlink(name);
1020 			errno = save_errno;
1021 
1022 			ereport(elevel,
1023 					(errcode_for_dynamic_shared_memory(),
1024 					 errmsg("could not unmap shared memory segment \"%s\": %m",
1025 							name)));
1026 			return false;
1027 		}
1028 		*mapped_address = NULL;
1029 		*mapped_size = 0;
1030 	}
1031 
1032 	/* Map it. */
1033 	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1034 				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1035 	if (address == MAP_FAILED)
1036 	{
1037 		int			save_errno;
1038 
1039 		/* Back out what's already been done. */
1040 		save_errno = errno;
1041 		CloseTransientFile(fd);
1042 		if (op == DSM_OP_CREATE)
1043 			unlink(name);
1044 		errno = save_errno;
1045 
1046 		ereport(elevel,
1047 				(errcode_for_dynamic_shared_memory(),
1048 				 errmsg("could not map shared memory segment \"%s\": %m",
1049 						name)));
1050 		return false;
1051 	}
1052 	*mapped_address = address;
1053 	*mapped_size = request_size;
1054 	CloseTransientFile(fd);
1055 
1056 	return true;
1057 }
1058 #endif
1059 
1060 /*
1061  * Implementation-specific actions that must be performed when a segment is to
1062  * be preserved even when no backend has it attached.
1063  *
1064  * Except on Windows, we don't need to do anything at all.  But since Windows
1065  * cleans up segments automatically when no references remain, we duplicate
1066  * the segment handle into the postmaster process.  The postmaster needn't
1067  * do anything to receive the handle; Windows transfers it automatically.
1068  */
1069 void
1070 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1071 					 void **impl_private_pm_handle)
1072 {
1073 	switch (dynamic_shared_memory_type)
1074 	{
1075 #ifdef USE_DSM_WINDOWS
1076 		case DSM_IMPL_WINDOWS:
1077 			{
1078 				HANDLE		hmap;
1079 
1080 				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1081 									 PostmasterHandle, &hmap, 0, FALSE,
1082 									 DUPLICATE_SAME_ACCESS))
1083 				{
1084 					char		name[64];
1085 
1086 					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1087 					_dosmaperr(GetLastError());
1088 					ereport(ERROR,
1089 							(errcode_for_dynamic_shared_memory(),
1090 							 errmsg("could not duplicate handle for \"%s\": %m",
1091 									name)));
1092 				}
1093 
1094 				/*
1095 				 * Here, we remember the handle that we created in the
1096 				 * postmaster process.  This handle isn't actually usable in
1097 				 * any process other than the postmaster, but that doesn't
1098 				 * matter.  We're just holding onto it so that, if the segment
1099 				 * is unpinned, dsm_impl_unpin_segment can close it.
1100 				 */
1101 				*impl_private_pm_handle = hmap;
1102 				break;
1103 			}
1104 #endif
1105 		default:
1106 			break;
1107 	}
1108 }
1109 
1110 /*
1111  * Implementation-specific actions that must be performed when a segment is no
1112  * longer to be preserved, so that it will be cleaned up when all backends
1113  * have detached from it.
1114  *
1115  * Except on Windows, we don't need to do anything at all.  For Windows, we
1116  * close the extra handle that dsm_impl_pin_segment created in the
1117  * postmaster's process space.
1118  */
1119 void
1120 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1121 {
1122 	switch (dynamic_shared_memory_type)
1123 	{
1124 #ifdef USE_DSM_WINDOWS
1125 		case DSM_IMPL_WINDOWS:
1126 			{
1127 				if (*impl_private &&
1128 					!DuplicateHandle(PostmasterHandle, *impl_private,
1129 									 NULL, NULL, 0, FALSE,
1130 									 DUPLICATE_CLOSE_SOURCE))
1131 				{
1132 					char		name[64];
1133 
1134 					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1135 					_dosmaperr(GetLastError());
1136 					ereport(ERROR,
1137 							(errcode_for_dynamic_shared_memory(),
1138 							 errmsg("could not duplicate handle for \"%s\": %m",
1139 									name)));
1140 				}
1141 
1142 				*impl_private = NULL;
1143 				break;
1144 			}
1145 #endif
1146 		default:
1147 			break;
1148 	}
1149 }
1150 
1151 static int
1152 errcode_for_dynamic_shared_memory(void)
1153 {
1154 	if (errno == EFBIG || errno == ENOMEM)
1155 		return errcode(ERRCODE_OUT_OF_MEMORY);
1156 	else
1157 		return errcode_for_file_access();
1158 }
1159