1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques.  We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle.  This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility.  Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system.  Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation.  This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed.  It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason.  Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  *	  src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 #include "miscadmin.h"
51 
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "pgstat.h"
65 
66 #include "portability/mem.h"
67 #include "storage/dsm_impl.h"
68 #include "storage/fd.h"
69 #include "utils/guc.h"
70 #include "utils/memutils.h"
71 #include "postmaster/postmaster.h"
72 
73 #ifdef USE_DSM_POSIX
74 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
75 			   void **impl_private, void **mapped_address,
76 			   Size *mapped_size, int elevel);
77 static int	dsm_impl_posix_resize(int fd, off_t size);
78 #endif
79 #ifdef USE_DSM_SYSV
80 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
81 			  void **impl_private, void **mapped_address,
82 			  Size *mapped_size, int elevel);
83 #endif
84 #ifdef USE_DSM_WINDOWS
85 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
86 				 void **impl_private, void **mapped_address,
87 				 Size *mapped_size, int elevel);
88 #endif
89 #ifdef USE_DSM_MMAP
90 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
91 			  void **impl_private, void **mapped_address,
92 			  Size *mapped_size, int elevel);
93 #endif
94 static int	errcode_for_dynamic_shared_memory(void);
95 
/*
 * Allowed values for the dynamic_shared_memory_type GUC; only the
 * implementations compiled into this server build are offered.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{"none", DSM_IMPL_NONE, false},
	{NULL, 0, false}
};

/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE				8192

/* Name prefix for Windows file-mapping objects; see dsm_impl_windows(). */
#define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
120 
121 /*------
122  * Perform a low-level shared memory operation in a platform-specific way,
123  * as dictated by the selected implementation.  Each implementation is
124  * required to implement the following primitives.
125  *
126  * DSM_OP_CREATE.  Create a segment whose size is the request_size and
127  * map it.
128  *
129  * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
130  * The segment may already be mapped; any existing mapping should be removed
131  * before creating a new one.
132  *
133  * DSM_OP_DETACH.  Unmap the segment.
134  *
135  * DSM_OP_RESIZE.  Resize the segment to the given request_size and
136  * remap the segment at that new size.
137  *
138  * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
139  * segment.
140  *
141  * Arguments:
142  *	 op: The operation to be performed.
 *	 handle: The handle of an existing object, or for DSM_OP_CREATE, the
 *	   new handle the caller wants created.
145  *	 request_size: For DSM_OP_CREATE, the requested size.  For DSM_OP_RESIZE,
146  *	   the new size.  Otherwise, 0.
147  *	 impl_private: Private, implementation-specific data.  Will be a pointer
148  *	   to NULL for the first operation on a shared memory segment within this
149  *	   backend; thereafter, it will point to the value to which it was set
150  *	   on the previous call.
151  *	 mapped_address: Pointer to start of current mapping; pointer to NULL
152  *	   if none.  Updated with new mapping address.
153  *	 mapped_size: Pointer to size of current mapping; pointer to 0 if none.
154  *	   Updated with new mapped size.
155  *	 elevel: Level at which to log errors.
156  *
157  * Return value: true on success, false on failure.  When false is returned,
158  * a message should first be logged at the specified elevel, except in the
159  * case where DSM_OP_CREATE experiences a name collision, which should
160  * silently return false.
161  *-----
162  */
163 bool
dsm_impl_op(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)164 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
165 			void **impl_private, void **mapped_address, Size *mapped_size,
166 			int elevel)
167 {
168 	Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
169 	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
170 		   (*mapped_address == NULL && *mapped_size == 0));
171 
172 	switch (dynamic_shared_memory_type)
173 	{
174 #ifdef USE_DSM_POSIX
175 		case DSM_IMPL_POSIX:
176 			return dsm_impl_posix(op, handle, request_size, impl_private,
177 								  mapped_address, mapped_size, elevel);
178 #endif
179 #ifdef USE_DSM_SYSV
180 		case DSM_IMPL_SYSV:
181 			return dsm_impl_sysv(op, handle, request_size, impl_private,
182 								 mapped_address, mapped_size, elevel);
183 #endif
184 #ifdef USE_DSM_WINDOWS
185 		case DSM_IMPL_WINDOWS:
186 			return dsm_impl_windows(op, handle, request_size, impl_private,
187 									mapped_address, mapped_size, elevel);
188 #endif
189 #ifdef USE_DSM_MMAP
190 		case DSM_IMPL_MMAP:
191 			return dsm_impl_mmap(op, handle, request_size, impl_private,
192 								 mapped_address, mapped_size, elevel);
193 #endif
194 		default:
195 			elog(ERROR, "unexpected dynamic shared memory type: %d",
196 				 dynamic_shared_memory_type);
197 			return false;
198 	}
199 }
200 
201 /*
202  * Does the current dynamic shared memory implementation support resizing
203  * segments?  (The answer here could be platform-dependent in the future,
204  * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
205  * can't resize segments to anything larger than 256MB that way.  For now,
206  * we keep it simple.)
207  */
208 bool
dsm_impl_can_resize(void)209 dsm_impl_can_resize(void)
210 {
211 	switch (dynamic_shared_memory_type)
212 	{
213 		case DSM_IMPL_NONE:
214 			return false;
215 		case DSM_IMPL_POSIX:
216 			return true;
217 		case DSM_IMPL_SYSV:
218 			return false;
219 		case DSM_IMPL_WINDOWS:
220 			return false;
221 		case DSM_IMPL_MMAP:
222 			return true;
223 		default:
224 			return false;		/* should not happen */
225 	}
226 }
227 
228 #ifdef USE_DSM_POSIX
229 /*
230  * Operating system primitives to support POSIX shared memory.
231  *
232  * POSIX shared memory segments are created and attached using shm_open()
233  * and shm_unlink(); other operations, such as sizing or mapping the
234  * segment, are performed as if the shared memory segments were files.
235  *
236  * Indeed, on some platforms, they may be implemented that way.  While
237  * POSIX shared memory segments seem intended to exist in a flat namespace,
238  * some operating systems may implement them as files, even going so far
239  * to treat a request for /xyz as a request to create a file by that name
240  * in the root directory.  Users of such broken platforms should select
241  * a different shared memory implementation.
242  */
243 static bool
dsm_impl_posix(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)244 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
245 			   void **impl_private, void **mapped_address, Size *mapped_size,
246 			   int elevel)
247 {
248 	char		name[64];
249 	int			flags;
250 	int			fd;
251 	char	   *address;
252 
253 	snprintf(name, 64, "/PostgreSQL.%u", handle);
254 
255 	/* Handle teardown cases. */
256 	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
257 	{
258 		if (*mapped_address != NULL
259 			&& munmap(*mapped_address, *mapped_size) != 0)
260 		{
261 			ereport(elevel,
262 					(errcode_for_dynamic_shared_memory(),
263 					 errmsg("could not unmap shared memory segment \"%s\": %m",
264 							name)));
265 			return false;
266 		}
267 		*mapped_address = NULL;
268 		*mapped_size = 0;
269 		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
270 		{
271 			ereport(elevel,
272 					(errcode_for_dynamic_shared_memory(),
273 					 errmsg("could not remove shared memory segment \"%s\": %m",
274 							name)));
275 			return false;
276 		}
277 		return true;
278 	}
279 
280 	/*
281 	 * Create new segment or open an existing one for attach or resize.
282 	 *
283 	 * Even though we're not going through fd.c, we should be safe against
284 	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
285 	 * only opening one extra descriptor here, and we'll close it before
286 	 * returning.
287 	 */
288 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
289 	if ((fd = shm_open(name, flags, 0600)) == -1)
290 	{
291 		if (errno != EEXIST)
292 			ereport(elevel,
293 					(errcode_for_dynamic_shared_memory(),
294 					 errmsg("could not open shared memory segment \"%s\": %m",
295 							name)));
296 		return false;
297 	}
298 
299 	/*
300 	 * If we're attaching the segment, determine the current size; if we are
301 	 * creating or resizing the segment, set the size to the requested value.
302 	 */
303 	if (op == DSM_OP_ATTACH)
304 	{
305 		struct stat st;
306 
307 		if (fstat(fd, &st) != 0)
308 		{
309 			int			save_errno;
310 
311 			/* Back out what's already been done. */
312 			save_errno = errno;
313 			close(fd);
314 			errno = save_errno;
315 
316 			ereport(elevel,
317 					(errcode_for_dynamic_shared_memory(),
318 					 errmsg("could not stat shared memory segment \"%s\": %m",
319 							name)));
320 			return false;
321 		}
322 		request_size = st.st_size;
323 	}
324 	else if (*mapped_size != request_size &&
325 			 dsm_impl_posix_resize(fd, request_size) != 0)
326 	{
327 		int			save_errno;
328 
329 		/* Back out what's already been done. */
330 		save_errno = errno;
331 		close(fd);
332 		if (op == DSM_OP_CREATE)
333 			shm_unlink(name);
334 		errno = save_errno;
335 
336 		/*
337 		 * If we received a query cancel or termination signal, we will have
338 		 * EINTR set here.  If the caller said that errors are OK here, check
339 		 * for interrupts immediately.
340 		 */
341 		if (errno == EINTR && elevel >= ERROR)
342 			CHECK_FOR_INTERRUPTS();
343 
344 		ereport(elevel,
345 				(errcode_for_dynamic_shared_memory(),
346 				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
347 						name, request_size)));
348 		return false;
349 	}
350 
351 	/*
352 	 * If we're reattaching or resizing, we must remove any existing mapping,
353 	 * unless we've already got the right thing mapped.
354 	 */
355 	if (*mapped_address != NULL)
356 	{
357 		if (*mapped_size == request_size)
358 			return true;
359 		if (munmap(*mapped_address, *mapped_size) != 0)
360 		{
361 			int			save_errno;
362 
363 			/* Back out what's already been done. */
364 			save_errno = errno;
365 			close(fd);
366 			if (op == DSM_OP_CREATE)
367 				shm_unlink(name);
368 			errno = save_errno;
369 
370 			ereport(elevel,
371 					(errcode_for_dynamic_shared_memory(),
372 					 errmsg("could not unmap shared memory segment \"%s\": %m",
373 							name)));
374 			return false;
375 		}
376 		*mapped_address = NULL;
377 		*mapped_size = 0;
378 	}
379 
380 	/* Map it. */
381 	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
382 				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
383 	if (address == MAP_FAILED)
384 	{
385 		int			save_errno;
386 
387 		/* Back out what's already been done. */
388 		save_errno = errno;
389 		close(fd);
390 		if (op == DSM_OP_CREATE)
391 			shm_unlink(name);
392 		errno = save_errno;
393 
394 		ereport(elevel,
395 				(errcode_for_dynamic_shared_memory(),
396 				 errmsg("could not map shared memory segment \"%s\": %m",
397 						name)));
398 		return false;
399 	}
400 	*mapped_address = address;
401 	*mapped_size = request_size;
402 	close(fd);
403 
404 	return true;
405 }
406 
407 /*
408  * Set the size of a virtual memory region associated with a file descriptor.
409  * If necessary, also ensure that virtual memory is actually allocated by the
410  * operating system, to avoid nasty surprises later.
411  *
412  * Returns non-zero if either truncation or allocation fails, and sets errno.
413  */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* First, set the file length to exactly the requested size. */
	rc = ftruncate(fd, size);
	if (rc != 0)
		return rc;

	/*
	 * On Linux, the fd returned by shm_open refers to a tmpfs file, and
	 * ftruncate can leave a hole in it.  Touching pages inside a hole makes
	 * tmpfs allocate memory on the fly, and if tmpfs is full at that moment
	 * the process dies with SIGBUS.  Pre-allocating the pages here lets us
	 * report ENOSPC cleanly now instead of crashing later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	/*
	 * posix_fallocate() can be interrupted.  Retry on EINTR, but give up if
	 * an interrupt is pending, so that a backend repeatedly signalling us
	 * cannot trap us in this loop forever.
	 */
	do
	{
		rc = posix_fallocate(fd, 0, size);
	} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

	/*
	 * posix_fallocate() reports failure through its return value rather
	 * than errno; copy the result into errno for our caller's benefit,
	 * even though we also return rc directly.
	 */
	errno = rc;
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
454 
455 #endif							/* USE_DSM_POSIX */
456 
457 #ifdef USE_DSM_SYSV
458 /*
459  * Operating system primitives to support System V shared memory.
460  *
461  * System V shared memory segments are manipulated using shmget(), shmat(),
462  * shmdt(), and shmctl().  There's no portable way to resize such
463  * segments.  As the default allocation limits for System V shared memory
464  * are usually quite low, the POSIX facilities may be preferable; but
465  * those are not supported everywhere.
466  */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;			/* shared memory id returned by shmget() */
	char	   *address;
	char		name[64];
	int		   *ident_cache;	/* backend-local cache of ident, kept in *impl_private */

	/* Resize is not supported for System V shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "System V shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget(). To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Identifier was looked up by a previous call; reuse it. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/*
			 * Return silently on EEXIST so the caller can retry creation
			 * with a different handle, per this function's API contract.
			 */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Release the cached identifier; 'ident' still holds its value. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
649 #endif
650 
651 #ifdef USE_DSM_WINDOWS
652 /*
653  * Operating system primitives to support Windows shared memory.
654  *
655  * Windows shared memory implementation is done using file mapping
656  * which can be backed by either physical file or system paging file.
657  * Current implementation uses system paging file as other effects
658  * like performance are not clear for physical file and it is used in similar
659  * way for main shared memory in windows.
660  *
661  * A memory mapping object is a kernel object - they always get deleted when
662  * the last reference to them goes away, either explicitly via a CloseHandle or
663  * when the process containing the reference exits.
664  */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/* Resize is not supported for Windows shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "Windows shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights. But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory. We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* *impl_private holds the file-mapping HANDLE saved at map time. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle so teardown can CloseHandle it later. */
	*impl_private = hmap;

	return true;
}
845 #endif
846 
847 #ifdef USE_DSM_MMAP
848 /*
849  * Operating system primitives to support mmap-based shared memory.
850  *
851  * Calling this "shared memory" is somewhat of a misnomer, because what
852  * we're really doing is creating a bunch of files and mapping them into
853  * our address space.  The operating system may feel obliged to
854  * synchronize the contents to disk even if nothing is being paged out,
855  * which will not serve us well.  The user can relocate the pg_dynshmem
856  * directory to a ramdisk to avoid this problem, if available.
857  */
858 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)859 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
860 			  void **impl_private, void **mapped_address, Size *mapped_size,
861 			  int elevel)
862 {
863 	char		name[64];
864 	int			flags;
865 	int			fd;
866 	char	   *address;
867 
868 	snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
869 			 handle);
870 
871 	/* Handle teardown cases. */
872 	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
873 	{
874 		if (*mapped_address != NULL
875 			&& munmap(*mapped_address, *mapped_size) != 0)
876 		{
877 			ereport(elevel,
878 					(errcode_for_dynamic_shared_memory(),
879 					 errmsg("could not unmap shared memory segment \"%s\": %m",
880 							name)));
881 			return false;
882 		}
883 		*mapped_address = NULL;
884 		*mapped_size = 0;
885 		if (op == DSM_OP_DESTROY && unlink(name) != 0)
886 		{
887 			ereport(elevel,
888 					(errcode_for_dynamic_shared_memory(),
889 					 errmsg("could not remove shared memory segment \"%s\": %m",
890 							name)));
891 			return false;
892 		}
893 		return true;
894 	}
895 
896 	/* Create new segment or open an existing one for attach or resize. */
897 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
898 	if ((fd = OpenTransientFile(name, flags, 0600)) == -1)
899 	{
900 		if (errno != EEXIST)
901 			ereport(elevel,
902 					(errcode_for_dynamic_shared_memory(),
903 					 errmsg("could not open shared memory segment \"%s\": %m",
904 							name)));
905 		return false;
906 	}
907 
908 	/*
909 	 * If we're attaching the segment, determine the current size; if we are
910 	 * creating or resizing the segment, set the size to the requested value.
911 	 */
912 	if (op == DSM_OP_ATTACH)
913 	{
914 		struct stat st;
915 
916 		if (fstat(fd, &st) != 0)
917 		{
918 			int			save_errno;
919 
920 			/* Back out what's already been done. */
921 			save_errno = errno;
922 			CloseTransientFile(fd);
923 			errno = save_errno;
924 
925 			ereport(elevel,
926 					(errcode_for_dynamic_shared_memory(),
927 					 errmsg("could not stat shared memory segment \"%s\": %m",
928 							name)));
929 			return false;
930 		}
931 		request_size = st.st_size;
932 	}
933 	else if (*mapped_size > request_size && ftruncate(fd, request_size))
934 	{
935 		int			save_errno;
936 
937 		/* Back out what's already been done. */
938 		save_errno = errno;
939 		CloseTransientFile(fd);
940 		if (op == DSM_OP_CREATE)
941 			unlink(name);
942 		errno = save_errno;
943 
944 		ereport(elevel,
945 				(errcode_for_dynamic_shared_memory(),
946 				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
947 						name, request_size)));
948 		return false;
949 	}
950 	else if (*mapped_size < request_size)
951 	{
952 		/*
953 		 * Allocate a buffer full of zeros.
954 		 *
955 		 * Note: palloc zbuffer, instead of just using a local char array, to
956 		 * ensure it is reasonably well-aligned; this may save a few cycles
957 		 * transferring data to the kernel.
958 		 */
959 		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
960 		uint32		remaining = request_size;
961 		bool		success = true;
962 
963 		/*
964 		 * Zero-fill the file. We have to do this the hard way to ensure that
965 		 * all the file space has really been allocated, so that we don't
966 		 * later seg fault when accessing the memory mapping.  This is pretty
967 		 * pessimal.
968 		 */
969 		while (success && remaining > 0)
970 		{
971 			Size		goal = remaining;
972 
973 			if (goal > ZBUFFER_SIZE)
974 				goal = ZBUFFER_SIZE;
975 			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
976 			if (write(fd, zbuffer, goal) == goal)
977 				remaining -= goal;
978 			else
979 				success = false;
980 			pgstat_report_wait_end();
981 		}
982 
983 		if (!success)
984 		{
985 			int			save_errno;
986 
987 			/* Back out what's already been done. */
988 			save_errno = errno;
989 			CloseTransientFile(fd);
990 			if (op == DSM_OP_CREATE)
991 				unlink(name);
992 			errno = save_errno ? save_errno : ENOSPC;
993 
994 			ereport(elevel,
995 					(errcode_for_dynamic_shared_memory(),
996 					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
997 							name, request_size)));
998 			return false;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * If we're reattaching or resizing, we must remove any existing mapping,
1004 	 * unless we've already got the right thing mapped.
1005 	 */
1006 	if (*mapped_address != NULL)
1007 	{
1008 		if (*mapped_size == request_size)
1009 			return true;
1010 		if (munmap(*mapped_address, *mapped_size) != 0)
1011 		{
1012 			int			save_errno;
1013 
1014 			/* Back out what's already been done. */
1015 			save_errno = errno;
1016 			CloseTransientFile(fd);
1017 			if (op == DSM_OP_CREATE)
1018 				unlink(name);
1019 			errno = save_errno;
1020 
1021 			ereport(elevel,
1022 					(errcode_for_dynamic_shared_memory(),
1023 					 errmsg("could not unmap shared memory segment \"%s\": %m",
1024 							name)));
1025 			return false;
1026 		}
1027 		*mapped_address = NULL;
1028 		*mapped_size = 0;
1029 	}
1030 
1031 	/* Map it. */
1032 	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1033 				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1034 	if (address == MAP_FAILED)
1035 	{
1036 		int			save_errno;
1037 
1038 		/* Back out what's already been done. */
1039 		save_errno = errno;
1040 		CloseTransientFile(fd);
1041 		if (op == DSM_OP_CREATE)
1042 			unlink(name);
1043 		errno = save_errno;
1044 
1045 		ereport(elevel,
1046 				(errcode_for_dynamic_shared_memory(),
1047 				 errmsg("could not map shared memory segment \"%s\": %m",
1048 						name)));
1049 		return false;
1050 	}
1051 	*mapped_address = address;
1052 	*mapped_size = request_size;
1053 	CloseTransientFile(fd);
1054 
1055 	return true;
1056 }
1057 #endif
1058 
1059 /*
1060  * Implementation-specific actions that must be performed when a segment is to
1061  * be preserved even when no backend has it attached.
1062  *
1063  * Except on Windows, we don't need to do anything at all.  But since Windows
1064  * cleans up segments automatically when no references remain, we duplicate
1065  * the segment handle into the postmaster process.  The postmaster needn't
1066  * do anything to receive the handle; Windows transfers it automatically.
1067  */
1068 void
dsm_impl_pin_segment(dsm_handle handle,void * impl_private,void ** impl_private_pm_handle)1069 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1070 					 void **impl_private_pm_handle)
1071 {
1072 	switch (dynamic_shared_memory_type)
1073 	{
1074 #ifdef USE_DSM_WINDOWS
1075 		case DSM_IMPL_WINDOWS:
1076 			{
1077 				HANDLE		hmap;
1078 
1079 				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1080 									 PostmasterHandle, &hmap, 0, FALSE,
1081 									 DUPLICATE_SAME_ACCESS))
1082 				{
1083 					char		name[64];
1084 
1085 					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1086 					_dosmaperr(GetLastError());
1087 					ereport(ERROR,
1088 							(errcode_for_dynamic_shared_memory(),
1089 							 errmsg("could not duplicate handle for \"%s\": %m",
1090 									name)));
1091 				}
1092 
1093 				/*
1094 				 * Here, we remember the handle that we created in the
1095 				 * postmaster process.  This handle isn't actually usable in
1096 				 * any process other than the postmaster, but that doesn't
1097 				 * matter.  We're just holding onto it so that, if the segment
1098 				 * is unpinned, dsm_impl_unpin_segment can close it.
1099 				 */
1100 				*impl_private_pm_handle = hmap;
1101 				break;
1102 			}
1103 #endif
1104 		default:
1105 			break;
1106 	}
1107 }
1108 
1109 /*
1110  * Implementation-specific actions that must be performed when a segment is no
1111  * longer to be preserved, so that it will be cleaned up when all backends
1112  * have detached from it.
1113  *
1114  * Except on Windows, we don't need to do anything at all.  For Windows, we
1115  * close the extra handle that dsm_impl_pin_segment created in the
1116  * postmaster's process space.
1117  */
1118 void
dsm_impl_unpin_segment(dsm_handle handle,void ** impl_private)1119 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1120 {
1121 	switch (dynamic_shared_memory_type)
1122 	{
1123 #ifdef USE_DSM_WINDOWS
1124 		case DSM_IMPL_WINDOWS:
1125 			{
1126 				if (*impl_private &&
1127 					!DuplicateHandle(PostmasterHandle, *impl_private,
1128 									 NULL, NULL, 0, FALSE,
1129 									 DUPLICATE_CLOSE_SOURCE))
1130 				{
1131 					char		name[64];
1132 
1133 					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1134 					_dosmaperr(GetLastError());
1135 					ereport(ERROR,
1136 							(errcode_for_dynamic_shared_memory(),
1137 							 errmsg("could not duplicate handle for \"%s\": %m",
1138 									name)));
1139 				}
1140 
1141 				*impl_private = NULL;
1142 				break;
1143 			}
1144 #endif
1145 		default:
1146 			break;
1147 	}
1148 }
1149 
1150 static int
errcode_for_dynamic_shared_memory(void)1151 errcode_for_dynamic_shared_memory(void)
1152 {
1153 	if (errno == EFBIG || errno == ENOMEM)
1154 		return errcode(ERRCODE_OUT_OF_MEMORY);
1155 	else
1156 		return errcode_for_file_access();
1157 }
1158