1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques.  We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle.  This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility.  Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system.  Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation.  This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed.  It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason.  Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  *	  src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 #include "miscadmin.h"
51 
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "common/file_perm.h"
65 #include "pgstat.h"
66 
67 #include "portability/mem.h"
68 #include "storage/dsm_impl.h"
69 #include "storage/fd.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "postmaster/postmaster.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 						   void **impl_private, void **mapped_address,
77 						   Size *mapped_size, int elevel);
78 static int	dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 						  void **impl_private, void **mapped_address,
83 						  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 							 void **impl_private, void **mapped_address,
88 							 Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 						  void **impl_private, void **mapped_address,
93 						  Size *mapped_size, int elevel);
94 #endif
95 static int	errcode_for_dynamic_shared_memory(void);
96 
/*
 * GUC enum table for dynamic_shared_memory_type; only the implementations
 * compiled into this build are offered as choices.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}			/* list terminator */
};
112 
113 /* Implementation selector. */
114 int			dynamic_shared_memory_type;
115 
116 /* Size of buffer to be used for zero-filling. */
117 #define ZBUFFER_SIZE				8192
118 
119 #define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
120 
121 /*------
122  * Perform a low-level shared memory operation in a platform-specific way,
123  * as dictated by the selected implementation.  Each implementation is
124  * required to implement the following primitives.
125  *
126  * DSM_OP_CREATE.  Create a segment whose size is the request_size and
127  * map it.
128  *
129  * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
130  *
131  * DSM_OP_DETACH.  Unmap the segment.
132  *
133  * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
134  * segment.
135  *
136  * Arguments:
137  *	 op: The operation to be performed.
138  *	 handle: The handle of an existing object, or for DSM_OP_CREATE, the
139  *	   a new handle the caller wants created.
140  *	 request_size: For DSM_OP_CREATE, the requested size.  Otherwise, 0.
141  *	 impl_private: Private, implementation-specific data.  Will be a pointer
142  *	   to NULL for the first operation on a shared memory segment within this
143  *	   backend; thereafter, it will point to the value to which it was set
144  *	   on the previous call.
145  *	 mapped_address: Pointer to start of current mapping; pointer to NULL
146  *	   if none.  Updated with new mapping address.
147  *	 mapped_size: Pointer to size of current mapping; pointer to 0 if none.
148  *	   Updated with new mapped size.
149  *	 elevel: Level at which to log errors.
150  *
151  * Return value: true on success, false on failure.  When false is returned,
152  * a message should first be logged at the specified elevel, except in the
153  * case where DSM_OP_CREATE experiences a name collision, which should
154  * silently return false.
155  *-----
156  */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
			void **impl_private, void **mapped_address, Size *mapped_size,
			int elevel)
{
	/* Only DSM_OP_CREATE carries a size; all other ops must pass 0. */
	Assert(op == DSM_OP_CREATE || request_size == 0);
	/* Create and attach must start from an unmapped state. */
	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
		   (*mapped_address == NULL && *mapped_size == 0));

	/* Dispatch to the implementation selected by dynamic_shared_memory_type. */
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_POSIX
		case DSM_IMPL_POSIX:
			return dsm_impl_posix(op, handle, request_size, impl_private,
								  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
		case DSM_IMPL_SYSV:
			return dsm_impl_sysv(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			return dsm_impl_windows(op, handle, request_size, impl_private,
									mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
		case DSM_IMPL_MMAP:
			return dsm_impl_mmap(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
		default:
			elog(ERROR, "unexpected dynamic shared memory type: %d",
				 dynamic_shared_memory_type);
			return false;		/* not reached; keeps compilers quiet */
	}
}
194 
195 #ifdef USE_DSM_POSIX
196 /*
197  * Operating system primitives to support POSIX shared memory.
198  *
199  * POSIX shared memory segments are created and attached using shm_open()
200  * and shm_unlink(); other operations, such as sizing or mapping the
201  * segment, are performed as if the shared memory segments were files.
202  *
203  * Indeed, on some platforms, they may be implemented that way.  While
204  * POSIX shared memory segments seem intended to exist in a flat namespace,
205  * some operating systems may implement them as files, even going so far
206  * to treat a request for /xyz as a request to create a file by that name
207  * in the root directory.  Users of such broken platforms should select
208  * a different shared memory implementation.
209  */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/* Build the flat-namespace segment name from the handle. */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, also remove the name from the system namespace. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		/* EEXIST on create is a name collision: fail silently, per API. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping remains valid after the descriptor is closed. */
	close(fd);

	return true;
}
342 
343 /*
344  * Set the size of a virtual memory region associated with a file descriptor.
345  * If necessary, also ensure that virtual memory is actually allocated by the
346  * operating system, to avoid nasty surprises later.
347  *
348  * Returns non-zero if either truncation or allocation fails, and sets errno.
349  */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
390 
391 #endif							/* USE_DSM_POSIX */
392 
393 #ifdef USE_DSM_SYSV
394 /*
395  * Operating system primitives to support System V shared memory.
396  *
397  * System V shared memory segments are manipulated using shmget(), shmat(),
398  * shmdt(), and shmctl().  As the default allocation limits for System V
399  * shared memory are usually quite low, the POSIX facilities may be
400  * preferable; but those are not supported everywhere.
401  */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;		/* per-backend cache of shmget() result */

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget(). To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* EEXIST on create is a key collision: fail silently, per API. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Drop the cached identifier; ident itself remains valid below. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
573 #endif
574 
575 #ifdef USE_DSM_WINDOWS
576 /*
577  * Operating system primitives to support Windows shared memory.
578  *
579  * Windows shared memory implementation is done using file mapping
580  * which can be backed by either physical file or system paging file.
581  * Current implementation uses system paging file as other effects
582  * like performance are not clear for physical file and it is used in similar
583  * way for main shared memory in windows.
584  *
585  * A memory mapping object is a kernel object - they always get deleted when
586  * the last reference to them goes away, either explicitly via a CloseHandle or
587  * when the process containing the reference exits.
588  */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights. But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory. We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* impl_private holds the mapping handle saved at create/attach. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle so detach/destroy can close it later. */
	*impl_private = hmap;

	return true;
}
758 #endif
759 
760 #ifdef USE_DSM_MMAP
761 /*
762  * Operating system primitives to support mmap-based shared memory.
763  *
764  * Calling this "shared memory" is somewhat of a misnomer, because what
765  * we're really doing is creating a bunch of files and mapping them into
766  * our address space.  The operating system may feel obliged to
767  * synchronize the contents to disk even if nothing is being paged out,
768  * which will not serve us well.  The user can relocate the pg_dynshmem
769  * directory to a ramdisk to avoid this problem, if available.
770  */
771 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)772 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
773 			  void **impl_private, void **mapped_address, Size *mapped_size,
774 			  int elevel)
775 {
776 	char		name[64];
777 	int			flags;
778 	int			fd;
779 	char	   *address;
780 
781 	snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
782 			 handle);
783 
784 	/* Handle teardown cases. */
785 	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
786 	{
787 		if (*mapped_address != NULL
788 			&& munmap(*mapped_address, *mapped_size) != 0)
789 		{
790 			ereport(elevel,
791 					(errcode_for_dynamic_shared_memory(),
792 					 errmsg("could not unmap shared memory segment \"%s\": %m",
793 							name)));
794 			return false;
795 		}
796 		*mapped_address = NULL;
797 		*mapped_size = 0;
798 		if (op == DSM_OP_DESTROY && unlink(name) != 0)
799 		{
800 			ereport(elevel,
801 					(errcode_for_dynamic_shared_memory(),
802 					 errmsg("could not remove shared memory segment \"%s\": %m",
803 							name)));
804 			return false;
805 		}
806 		return true;
807 	}
808 
809 	/* Create new segment or open an existing one for attach. */
810 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
811 	if ((fd = OpenTransientFile(name, flags)) == -1)
812 	{
813 		if (errno != EEXIST)
814 			ereport(elevel,
815 					(errcode_for_dynamic_shared_memory(),
816 					 errmsg("could not open shared memory segment \"%s\": %m",
817 							name)));
818 		return false;
819 	}
820 
821 	/*
822 	 * If we're attaching the segment, determine the current size; if we are
823 	 * creating the segment, set the size to the requested value.
824 	 */
825 	if (op == DSM_OP_ATTACH)
826 	{
827 		struct stat st;
828 
829 		if (fstat(fd, &st) != 0)
830 		{
831 			int			save_errno;
832 
833 			/* Back out what's already been done. */
834 			save_errno = errno;
835 			CloseTransientFile(fd);
836 			errno = save_errno;
837 
838 			ereport(elevel,
839 					(errcode_for_dynamic_shared_memory(),
840 					 errmsg("could not stat shared memory segment \"%s\": %m",
841 							name)));
842 			return false;
843 		}
844 		request_size = st.st_size;
845 	}
846 	else
847 	{
848 		/*
849 		 * Allocate a buffer full of zeros.
850 		 *
851 		 * Note: palloc zbuffer, instead of just using a local char array, to
852 		 * ensure it is reasonably well-aligned; this may save a few cycles
853 		 * transferring data to the kernel.
854 		 */
855 		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
856 		uint32		remaining = request_size;
857 		bool		success = true;
858 
859 		/*
860 		 * Zero-fill the file. We have to do this the hard way to ensure that
861 		 * all the file space has really been allocated, so that we don't
862 		 * later seg fault when accessing the memory mapping.  This is pretty
863 		 * pessimal.
864 		 */
865 		while (success && remaining > 0)
866 		{
867 			Size		goal = remaining;
868 
869 			if (goal > ZBUFFER_SIZE)
870 				goal = ZBUFFER_SIZE;
871 			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
872 			if (write(fd, zbuffer, goal) == goal)
873 				remaining -= goal;
874 			else
875 				success = false;
876 			pgstat_report_wait_end();
877 		}
878 
879 		if (!success)
880 		{
881 			int			save_errno;
882 
883 			/* Back out what's already been done. */
884 			save_errno = errno;
885 			CloseTransientFile(fd);
886 			unlink(name);
887 			errno = save_errno ? save_errno : ENOSPC;
888 
889 			ereport(elevel,
890 					(errcode_for_dynamic_shared_memory(),
891 					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
892 							name, request_size)));
893 			return false;
894 		}
895 	}
896 
897 	/* Map it. */
898 	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
899 				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
900 	if (address == MAP_FAILED)
901 	{
902 		int			save_errno;
903 
904 		/* Back out what's already been done. */
905 		save_errno = errno;
906 		CloseTransientFile(fd);
907 		if (op == DSM_OP_CREATE)
908 			unlink(name);
909 		errno = save_errno;
910 
911 		ereport(elevel,
912 				(errcode_for_dynamic_shared_memory(),
913 				 errmsg("could not map shared memory segment \"%s\": %m",
914 						name)));
915 		return false;
916 	}
917 	*mapped_address = address;
918 	*mapped_size = request_size;
919 
920 	if (CloseTransientFile(fd))
921 	{
922 		ereport(elevel,
923 				(errcode_for_file_access(),
924 				 errmsg("could not close shared memory segment \"%s\": %m",
925 						name)));
926 		return false;
927 	}
928 
929 	return true;
930 }
931 #endif
932 
933 /*
934  * Implementation-specific actions that must be performed when a segment is to
935  * be preserved even when no backend has it attached.
936  *
937  * Except on Windows, we don't need to do anything at all.  But since Windows
938  * cleans up segments automatically when no references remain, we duplicate
939  * the segment handle into the postmaster process.  The postmaster needn't
940  * do anything to receive the handle; Windows transfers it automatically.
941  */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
					 void **impl_private_pm_handle)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				HANDLE		hmap;

				/*
				 * Duplicate our mapping handle into the postmaster, so the
				 * kernel object stays alive after every backend detaches.
				 */
				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
									 PostmasterHandle, &hmap, 0, FALSE,
									 DUPLICATE_SAME_ACCESS))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				/*
				 * Here, we remember the handle that we created in the
				 * postmaster process.  This handle isn't actually usable in
				 * any process other than the postmaster, but that doesn't
				 * matter.  We're just holding onto it so that, if the segment
				 * is unpinned, dsm_impl_unpin_segment can close it.
				 */
				*impl_private_pm_handle = hmap;
				break;
			}
#endif
		default:
			/* Other implementations need no action to pin a segment. */
			break;
	}
}
982 
983 /*
984  * Implementation-specific actions that must be performed when a segment is no
985  * longer to be preserved, so that it will be cleaned up when all backends
986  * have detached from it.
987  *
988  * Except on Windows, we don't need to do anything at all.  For Windows, we
989  * close the extra handle that dsm_impl_pin_segment created in the
990  * postmaster's process space.
991  */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * DUPLICATE_CLOSE_SOURCE closes the postmaster's copy of the
				 * handle (created by dsm_impl_pin_segment) from this process.
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* Other implementations need no action to unpin a segment. */
			break;
	}
}
1023 
1024 static int
errcode_for_dynamic_shared_memory(void)1025 errcode_for_dynamic_shared_memory(void)
1026 {
1027 	if (errno == EFBIG || errno == ENOMEM)
1028 		return errcode(ERRCODE_OUT_OF_MEMORY);
1029 	else
1030 		return errcode_for_file_access();
1031 }
1032