1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  *	  manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques.  We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle.  This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility.  Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system.  Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation.  This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed.  It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason.  Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  *	  src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63 
64 #include "common/file_perm.h"
65 #include "miscadmin.h"
66 #include "pgstat.h"
67 #include "portability/mem.h"
68 #include "postmaster/postmaster.h"
69 #include "storage/dsm_impl.h"
70 #include "storage/fd.h"
71 #include "utils/guc.h"
72 #include "utils/memutils.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 						   void **impl_private, void **mapped_address,
77 						   Size *mapped_size, int elevel);
78 static int	dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 						  void **impl_private, void **mapped_address,
83 						  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 							 void **impl_private, void **mapped_address,
88 							 Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 						  void **impl_private, void **mapped_address,
93 						  Size *mapped_size, int elevel);
94 #endif
95 static int	errcode_for_dynamic_shared_memory(void);
96 
/*
 * Allowed values for the dynamic_shared_memory_type GUC.  Only the
 * implementations compiled into this build (per the USE_DSM_* symbols)
 * are offered.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}
};
112 
/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Amount of space reserved for DSM segments in the main area. */
int			min_dynamic_shared_memory;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE				8192

/* Name prefix for Windows file-mapping objects (Global\ namespace). */
#define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
123 
124 /*------
125  * Perform a low-level shared memory operation in a platform-specific way,
126  * as dictated by the selected implementation.  Each implementation is
127  * required to implement the following primitives.
128  *
129  * DSM_OP_CREATE.  Create a segment whose size is the request_size and
130  * map it.
131  *
132  * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
133  *
134  * DSM_OP_DETACH.  Unmap the segment.
135  *
136  * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
137  * segment.
138  *
139  * Arguments:
140  *	 op: The operation to be performed.
 *	 handle: The handle of an existing object, or, for DSM_OP_CREATE,
 *	   a new handle the caller wants created.
143  *	 request_size: For DSM_OP_CREATE, the requested size.  Otherwise, 0.
144  *	 impl_private: Private, implementation-specific data.  Will be a pointer
145  *	   to NULL for the first operation on a shared memory segment within this
146  *	   backend; thereafter, it will point to the value to which it was set
147  *	   on the previous call.
148  *	 mapped_address: Pointer to start of current mapping; pointer to NULL
149  *	   if none.  Updated with new mapping address.
150  *	 mapped_size: Pointer to size of current mapping; pointer to 0 if none.
151  *	   Updated with new mapped size.
152  *	 elevel: Level at which to log errors.
153  *
154  * Return value: true on success, false on failure.  When false is returned,
155  * a message should first be logged at the specified elevel, except in the
156  * case where DSM_OP_CREATE experiences a name collision, which should
157  * silently return false.
158  *-----
159  */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
			void **impl_private, void **mapped_address, Size *mapped_size,
			int elevel)
{
	/* Only DSM_OP_CREATE carries a size; see the contract comment above. */
	Assert(op == DSM_OP_CREATE || request_size == 0);
	/* Create/attach must begin with no mapping in this backend. */
	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
		   (*mapped_address == NULL && *mapped_size == 0));

	/* Dispatch to the implementation selected by dynamic_shared_memory_type. */
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_POSIX
		case DSM_IMPL_POSIX:
			return dsm_impl_posix(op, handle, request_size, impl_private,
								  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
		case DSM_IMPL_SYSV:
			return dsm_impl_sysv(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			return dsm_impl_windows(op, handle, request_size, impl_private,
									mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
		case DSM_IMPL_MMAP:
			return dsm_impl_mmap(op, handle, request_size, impl_private,
								 mapped_address, mapped_size, elevel);
#endif
		default:
			elog(ERROR, "unexpected dynamic shared memory type: %d",
				 dynamic_shared_memory_type);
			return false;		/* not reached; keeps compiler quiet */
	}
}
197 
198 #ifdef USE_DSM_POSIX
199 /*
200  * Operating system primitives to support POSIX shared memory.
201  *
202  * POSIX shared memory segments are created and attached using shm_open()
203  * and shm_unlink(); other operations, such as sizing or mapping the
204  * segment, are performed as if the shared memory segments were files.
205  *
206  * Indeed, on some platforms, they may be implemented that way.  While
207  * POSIX shared memory segments seem intended to exist in a flat namespace,
208  * some operating systems may implement them as files, even going so far
209  * to treat a request for /xyz as a request to create a file by that name
210  * in the root directory.  Users of such broken platforms should select
211  * a different shared memory implementation.
212  */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/* Segment names live in the flat POSIX namespace: "/PostgreSQL.<handle>". */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, also remove the name so nothing further can attach. */
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach.
	 *
	 * Even though we will close the FD before returning, it seems desirable
	 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
	 * failure.  The fact that we won't hold the FD open long justifies using
	 * ReserveExternalFD rather than AcquireExternalFD, though.
	 */
	ReserveExternalFD();

	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		ReleaseExternalFD();
		/* Per the API contract, a name collision on create fails silently. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			ReleaseExternalFD();
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		/* Only unlink on create; an attach failure must not destroy it. */
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping persists after the descriptor is closed. */
	close(fd);
	ReleaseExternalFD();

	return true;
}
352 
353 /*
354  * Set the size of a virtual memory region associated with a file descriptor.
355  * If necessary, also ensure that virtual memory is actually allocated by the
356  * operating system, to avoid nasty surprises later.
357  *
358  * Returns non-zero if either truncation or allocation fails, and sets errno.
359  */
/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	/* Truncate (or extend) the file to the requested size. */
	int			rc = ftruncate(fd, size);

#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
	if (rc == 0)
	{
		pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
		for (;;)
		{
			rc = posix_fallocate(fd, 0, size);

			/*
			 * We may get interrupted.  If so, just retry unless there is an
			 * interrupt pending.  This avoids the possibility of looping
			 * forever if another backend is repeatedly trying to interrupt
			 * us.
			 */
			if (rc != EINTR || ProcDiePending || QueryCancelPending)
				break;
		}
		pgstat_report_wait_end();

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
402 
403 #endif							/* USE_DSM_POSIX */
404 
405 #ifdef USE_DSM_SYSV
406 /*
407  * Operating system primitives to support System V shared memory.
408  *
409  * System V shared memory segments are manipulated using shmget(), shmat(),
410  * shmdt(), and shmctl().  As the default allocation limits for System V
411  * shared memory are usually quite low, the POSIX facilities may be
412  * preferable; but those are not supported everywhere.
413  */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		/* EEXIST makes a create look like a name collision, forcing retry. */
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget(). To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		/* Fast path: reuse the identifier cached on a previous call. */
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* As elsewhere, a name collision on create fails silently. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* Discard the cached identifier; it won't be needed again. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		/* For DESTROY, also remove the segment itself. */
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		/* Only remove the segment if this call created it. */
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
585 #endif
586 
587 #ifdef USE_DSM_WINDOWS
588 /*
589  * Operating system primitives to support Windows shared memory.
590  *
591  * Windows shared memory implementation is done using file mapping
592  * which can be backed by either physical file or system paging file.
593  * Current implementation uses system paging file as other effects
594  * like performance are not clear for physical file and it is used in similar
595  * way for main shared memory in windows.
596  *
597  * A memory mapping object is a kernel object - they always get deleted when
598  * the last reference to them goes away, either explicitly via a CloseHandle or
599  * when the process containing the reference exits.
600  */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights. But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory. We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		/* Drop our handle reference; the kernel object dies with the last one. */
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			/* Silent failure here lets the caller retry with a new handle. */
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle; teardown and pinning need it later. */
	*impl_private = hmap;

	return true;
}
770 #endif
771 
772 #ifdef USE_DSM_MMAP
773 /*
774  * Operating system primitives to support mmap-based shared memory.
775  *
776  * Calling this "shared memory" is somewhat of a misnomer, because what
777  * we're really doing is creating a bunch of files and mapping them into
778  * our address space.  The operating system may feel obliged to
779  * synchronize the contents to disk even if nothing is being paged out,
780  * which will not serve us well.  The user can relocate the pg_dynshmem
781  * directory to a ramdisk to avoid this problem, if available.
782  */
783 static bool
dsm_impl_mmap(dsm_op op,dsm_handle handle,Size request_size,void ** impl_private,void ** mapped_address,Size * mapped_size,int elevel)784 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
785 			  void **impl_private, void **mapped_address, Size *mapped_size,
786 			  int elevel)
787 {
788 	char		name[64];
789 	int			flags;
790 	int			fd;
791 	char	   *address;
792 
793 	snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
794 			 handle);
795 
796 	/* Handle teardown cases. */
797 	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
798 	{
799 		if (*mapped_address != NULL
800 			&& munmap(*mapped_address, *mapped_size) != 0)
801 		{
802 			ereport(elevel,
803 					(errcode_for_dynamic_shared_memory(),
804 					 errmsg("could not unmap shared memory segment \"%s\": %m",
805 							name)));
806 			return false;
807 		}
808 		*mapped_address = NULL;
809 		*mapped_size = 0;
810 		if (op == DSM_OP_DESTROY && unlink(name) != 0)
811 		{
812 			ereport(elevel,
813 					(errcode_for_dynamic_shared_memory(),
814 					 errmsg("could not remove shared memory segment \"%s\": %m",
815 							name)));
816 			return false;
817 		}
818 		return true;
819 	}
820 
821 	/* Create new segment or open an existing one for attach. */
822 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
823 	if ((fd = OpenTransientFile(name, flags)) == -1)
824 	{
825 		if (errno != EEXIST)
826 			ereport(elevel,
827 					(errcode_for_dynamic_shared_memory(),
828 					 errmsg("could not open shared memory segment \"%s\": %m",
829 							name)));
830 		return false;
831 	}
832 
833 	/*
834 	 * If we're attaching the segment, determine the current size; if we are
835 	 * creating the segment, set the size to the requested value.
836 	 */
837 	if (op == DSM_OP_ATTACH)
838 	{
839 		struct stat st;
840 
841 		if (fstat(fd, &st) != 0)
842 		{
843 			int			save_errno;
844 
845 			/* Back out what's already been done. */
846 			save_errno = errno;
847 			CloseTransientFile(fd);
848 			errno = save_errno;
849 
850 			ereport(elevel,
851 					(errcode_for_dynamic_shared_memory(),
852 					 errmsg("could not stat shared memory segment \"%s\": %m",
853 							name)));
854 			return false;
855 		}
856 		request_size = st.st_size;
857 	}
858 	else
859 	{
860 		/*
861 		 * Allocate a buffer full of zeros.
862 		 *
863 		 * Note: palloc zbuffer, instead of just using a local char array, to
864 		 * ensure it is reasonably well-aligned; this may save a few cycles
865 		 * transferring data to the kernel.
866 		 */
867 		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
868 		uint32		remaining = request_size;
869 		bool		success = true;
870 
871 		/*
872 		 * Zero-fill the file. We have to do this the hard way to ensure that
873 		 * all the file space has really been allocated, so that we don't
874 		 * later seg fault when accessing the memory mapping.  This is pretty
875 		 * pessimal.
876 		 */
877 		while (success && remaining > 0)
878 		{
879 			Size		goal = remaining;
880 
881 			if (goal > ZBUFFER_SIZE)
882 				goal = ZBUFFER_SIZE;
883 			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
884 			if (write(fd, zbuffer, goal) == goal)
885 				remaining -= goal;
886 			else
887 				success = false;
888 			pgstat_report_wait_end();
889 		}
890 
891 		if (!success)
892 		{
893 			int			save_errno;
894 
895 			/* Back out what's already been done. */
896 			save_errno = errno;
897 			CloseTransientFile(fd);
898 			unlink(name);
899 			errno = save_errno ? save_errno : ENOSPC;
900 
901 			ereport(elevel,
902 					(errcode_for_dynamic_shared_memory(),
903 					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
904 							name, request_size)));
905 			return false;
906 		}
907 	}
908 
909 	/* Map it. */
910 	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
911 				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
912 	if (address == MAP_FAILED)
913 	{
914 		int			save_errno;
915 
916 		/* Back out what's already been done. */
917 		save_errno = errno;
918 		CloseTransientFile(fd);
919 		if (op == DSM_OP_CREATE)
920 			unlink(name);
921 		errno = save_errno;
922 
923 		ereport(elevel,
924 				(errcode_for_dynamic_shared_memory(),
925 				 errmsg("could not map shared memory segment \"%s\": %m",
926 						name)));
927 		return false;
928 	}
929 	*mapped_address = address;
930 	*mapped_size = request_size;
931 
932 	if (CloseTransientFile(fd) != 0)
933 	{
934 		ereport(elevel,
935 				(errcode_for_file_access(),
936 				 errmsg("could not close shared memory segment \"%s\": %m",
937 						name)));
938 		return false;
939 	}
940 
941 	return true;
942 }
943 #endif
944 
945 /*
946  * Implementation-specific actions that must be performed when a segment is to
947  * be preserved even when no backend has it attached.
948  *
949  * Except on Windows, we don't need to do anything at all.  But since Windows
950  * cleans up segments automatically when no references remain, we duplicate
951  * the segment handle into the postmaster process.  The postmaster needn't
952  * do anything to receive the handle; Windows transfers it automatically.
953  */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
					 void **impl_private_pm_handle)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				HANDLE		hmap;

				/*
				 * Duplicate our mapping handle into the postmaster process,
				 * so the kernel object stays alive even if every regular
				 * backend detaches.
				 */
				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
									 PostmasterHandle, &hmap, 0, FALSE,
									 DUPLICATE_SAME_ACCESS))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				/*
				 * Here, we remember the handle that we created in the
				 * postmaster process.  This handle isn't actually usable in
				 * any process other than the postmaster, but that doesn't
				 * matter.  We're just holding onto it so that, if the segment
				 * is unpinned, dsm_impl_unpin_segment can close it.
				 */
				*impl_private_pm_handle = hmap;
				break;
			}
#endif
		default:
			/* All other implementations need no action; see comment above. */
			break;
	}
}
994 
995 /*
996  * Implementation-specific actions that must be performed when a segment is no
997  * longer to be preserved, so that it will be cleaned up when all backends
998  * have detached from it.
999  *
1000  * Except on Windows, we don't need to do anything at all.  For Windows, we
1001  * close the extra handle that dsm_impl_pin_segment created in the
1002  * postmaster's process space.
1003  */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * Close the handle previously duplicated into the postmaster
				 * by dsm_impl_pin_segment.  DUPLICATE_CLOSE_SOURCE closes it
				 * in the postmaster's process space without needing to run
				 * code there.
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* All other implementations need no action; see comment above. */
			break;
	}
}
1035 
1036 static int
errcode_for_dynamic_shared_memory(void)1037 errcode_for_dynamic_shared_memory(void)
1038 {
1039 	if (errno == EFBIG || errno == ENOMEM)
1040 		return errcode(ERRCODE_OUT_OF_MEMORY);
1041 	else
1042 		return errcode_for_file_access();
1043 }
1044