/*-------------------------------------------------------------------------
 *
 * dsm_impl.c
 *	  manage dynamic shared memory segments
 *
 * This file provides low-level APIs for creating and destroying shared
 * memory segments using several different possible techniques.  We refer
 * to these segments as dynamic because they can be created, altered, and
 * destroyed at any point during the server life cycle.  This is unlike
 * the main shared memory segment, of which there is always exactly one
 * and which is always mapped at a fixed address in every PostgreSQL
 * background process.
 *
 * Because not all systems provide the same primitives in this area, nor
 * do all primitives behave the same way on all systems, we provide
 * several implementations of this facility.  Many systems implement
 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
 * in this area, with the exception that shared memory identifiers live
 * in a flat system-wide namespace, raising the uncomfortable prospect of
 * name collisions with other processes (including other copies of
 * PostgreSQL) running on the same system.  Some systems only support
 * the older System V shared memory interface (shmget etc.) which is
 * also usable; however, the default allocation limits are often quite
 * small, and the namespace is even more restricted.
 *
 * We also provide an mmap-based shared memory implementation.  This may
 * be useful on systems that provide shared memory via a special-purpose
 * filesystem; by opting for this implementation, the user can even
 * control precisely where their shared memory segments are placed.  It
 * can also be used as a fallback for systems where shm_open and shmget
 * are not available or can't be used for some reason.  Of course,
 * mapping a file residing on an actual spinning disk is a fairly poor
 * approximation for shared memory because writeback may hurt performance
 * substantially, but there should be few systems where we must make do
 * with such poor tools.
 *
 * As ever, Windows requires its own implementation.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm_impl.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
#include "miscadmin.h"

#include <fcntl.h>
#include <unistd.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
#include <sys/stat.h>
#ifdef HAVE_SYS_IPC_H
#include <sys/ipc.h>
#endif
#ifdef HAVE_SYS_SHM_H
#include <sys/shm.h>
#endif
#include "common/file_perm.h"
#include "pgstat.h"

#include "portability/mem.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "postmaster/postmaster.h"

/* Per-implementation workhorse routines; see dsm_impl_op() for the contract. */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address,
			   Size *mapped_size, int elevel);
static int	dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address,
			  Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address,
			  Size *mapped_size, int elevel);
#endif
static int	errcode_for_dynamic_shared_memory(void);

/*
 * Valid settings for the dynamic_shared_memory GUC; only implementations
 * compiled into this build are offered.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{"none", DSM_IMPL_NONE, false},
	{NULL, 0, false}
};

/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE			8192

#define SEGMENT_NAME_PREFIX		"Global/PostgreSQL"

/*------
 * Perform a low-level shared memory operation in a platform-specific way,
 * as dictated by the selected implementation.  Each implementation is
 * required to implement the following primitives.
 *
 * DSM_OP_CREATE.  Create a segment whose size is the request_size and
 * map it.
 *
 * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
 * The segment may already be mapped; any existing mapping should be removed
 * before creating a new one.
 *
 * DSM_OP_DETACH.  Unmap the segment.
 *
 * DSM_OP_RESIZE.  Resize the segment to the given request_size and
 * remap the segment at that new size.
 *
 * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
 * segment.
 *
 * Arguments:
 *	 op: The operation to be performed.
 *	 handle: The handle of an existing object, or for DSM_OP_CREATE, the
 *	   new handle the caller wants created.
 *	 request_size: For DSM_OP_CREATE, the requested size.  For DSM_OP_RESIZE,
 *	   the new size.  Otherwise, 0.
 *	 impl_private: Private, implementation-specific data.
Will be a pointer 149 * to NULL for the first operation on a shared memory segment within this 150 * backend; thereafter, it will point to the value to which it was set 151 * on the previous call. 152 * mapped_address: Pointer to start of current mapping; pointer to NULL 153 * if none. Updated with new mapping address. 154 * mapped_size: Pointer to size of current mapping; pointer to 0 if none. 155 * Updated with new mapped size. 156 * elevel: Level at which to log errors. 157 * 158 * Return value: true on success, false on failure. When false is returned, 159 * a message should first be logged at the specified elevel, except in the 160 * case where DSM_OP_CREATE experiences a name collision, which should 161 * silently return false. 162 *----- 163 */ 164 bool 165 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, 166 void **impl_private, void **mapped_address, Size *mapped_size, 167 int elevel) 168 { 169 Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0); 170 Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || 171 (*mapped_address == NULL && *mapped_size == 0)); 172 173 switch (dynamic_shared_memory_type) 174 { 175 #ifdef USE_DSM_POSIX 176 case DSM_IMPL_POSIX: 177 return dsm_impl_posix(op, handle, request_size, impl_private, 178 mapped_address, mapped_size, elevel); 179 #endif 180 #ifdef USE_DSM_SYSV 181 case DSM_IMPL_SYSV: 182 return dsm_impl_sysv(op, handle, request_size, impl_private, 183 mapped_address, mapped_size, elevel); 184 #endif 185 #ifdef USE_DSM_WINDOWS 186 case DSM_IMPL_WINDOWS: 187 return dsm_impl_windows(op, handle, request_size, impl_private, 188 mapped_address, mapped_size, elevel); 189 #endif 190 #ifdef USE_DSM_MMAP 191 case DSM_IMPL_MMAP: 192 return dsm_impl_mmap(op, handle, request_size, impl_private, 193 mapped_address, mapped_size, elevel); 194 #endif 195 default: 196 elog(ERROR, "unexpected dynamic shared memory type: %d", 197 dynamic_shared_memory_type); 198 return false; 199 } 200 } 201 202 /* 203 * Does 
 * the current dynamic shared memory implementation support resizing
 * segments?  (The answer here could be platform-dependent in the future,
 * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
 * can't resize segments to anything larger than 256MB that way.  For now,
 * we keep it simple.)
 */
bool
dsm_impl_can_resize(void)
{
	switch (dynamic_shared_memory_type)
	{
		case DSM_IMPL_NONE:
			return false;
		case DSM_IMPL_POSIX:
			return true;
		case DSM_IMPL_SYSV:
			return false;
		case DSM_IMPL_WINDOWS:
			return false;
		case DSM_IMPL_MMAP:
			return true;
		default:
			return false;		/* should not happen */
	}
}

#ifdef USE_DSM_POSIX
/*
 * Operating system primitives to support POSIX shared memory.
 *
 * POSIX shared memory segments are created and attached using shm_open()
 * and shm_unlink(); other operations, such as sizing or mapping the
 * segment, are performed as if the shared memory segments were files.
 *
 * Indeed, on some platforms, they may be implemented that way.  While
 * POSIX shared memory segments seem intended to exist in a flat namespace,
 * some operating systems may implement them as files, even going so far
 * to treat a request for /xyz as a request to create a file by that name
 * in the root directory.  Users of such broken platforms should select
 * a different shared memory implementation.
 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases: unmap, and for DESTROY also unlink the name. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach or resize.
	 *
	 * Even though we're not going through fd.c, we should be safe against
	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
	 * only opening one extra descriptor here, and we'll close it before
	 * returning.
	 */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		/* A name collision on create (EEXIST) must fail silently. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating or resizing the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done; preserve fstat's errno. */
			save_errno = errno;
			close(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (*mapped_size != request_size &&
			 dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/*
	 * If we're reattaching or resizing, we must remove any existing mapping,
	 * unless we've already got the right thing mapped.
	 */
	if (*mapped_address != NULL)
	{
		if (*mapped_size == request_size)
			return true;
		if (munmap(*mapped_address, *mapped_size) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			if (op == DSM_OP_CREATE)
				shm_unlink(name);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives the fd; close it to honor NUM_RESERVED_FDS. */
	close(fd);

	return true;
}

/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}

#endif							/* USE_DSM_POSIX */

#ifdef USE_DSM_SYSV
/*
 * Operating system primitives to support System V shared memory.
 *
 * System V shared memory segments are manipulated using shmget(), shmat(),
 * shmdt(), and shmctl().  There's no portable way to resize such
 * segments.  As the default allocation limits for System V shared memory
 * are usually quite low, the POSIX facilities may be preferable; but
 * those are not supported everywhere.
 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/* Resize is not supported for System V shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "System V shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* On a create collision (EEXIST) we fail silently; see contract. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases: drop the cached ident, detach, maybe remove. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
#endif

#ifdef USE_DSM_WINDOWS
/*
 * Operating system primitives to support Windows shared memory.
 *
 * Windows shared memory implementation is done using file mapping
 * which can be backed by either physical file or system paging file.
 * Current implementation uses system paging file as other effects
 * like performance are not clear for physical file and it is used in similar
 * way for main shared memory in windows.
 *
 * A memory mapping object is a kernel object - they always get deleted when
 * the last reference to them goes away, either explicitly via a CloseHandle or
 * when the process containing the reference exits.
 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/* Resize is not supported for Windows shared memory. */
	if (op == DSM_OP_RESIZE)
	{
		elog(elevel, "Windows shared memory segments cannot be resized");
		return false;
	}

	/* Since resize isn't supported, reattach is a no-op. */
	if (op == DSM_OP_ATTACH && *mapped_address != NULL)
		return true;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	*impl_private = hmap;

	return true;
}
#endif

#ifdef USE_DSM_MMAP
/*
 * Operating system primitives to support mmap-based shared memory.
 *
 * Calling this "shared memory" is somewhat of a misnomer, because what
 * we're really doing is creating a bunch of files and mapping them into
 * our address space.  The operating system may feel obliged to
 * synchronize the contents to disk even if nothing is being paged out,
 * which will not serve us well.  The user can relocate the pg_dynshmem
 * directory to a ramdisk to avoid this problem, if available.
 */
static bool
dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
			 handle);

	/* Handle teardown cases: unmap, and for DESTROY also remove the file. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* Create new segment or open an existing one for attach or resize. */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = OpenTransientFile(name, flags)) == -1)
	{
		/* A name collision on create (EEXIST) must fail silently. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating or resizing the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			CloseTransientFile(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (*mapped_size > request_size && ftruncate(fd, request_size))
	{
		/* Shrinking: truncate the file down to the requested size. */
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		CloseTransientFile(fd);
		if (op == DSM_OP_CREATE)
			unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}
	else if (*mapped_size < request_size)
	{
		/*
		 * Allocate a buffer full of zeros.
		 *
		 * Note: palloc zbuffer, instead of just using a local char array, to
		 * ensure it is reasonably well-aligned; this may save a few cycles
		 * transferring data to the kernel.
		 */
		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
		/*
		 * NOTE(review): remaining is uint32 while request_size is Size, so a
		 * request larger than 4GB would be truncated here -- confirm callers
		 * never pass such a size to the mmap implementation.
		 */
		uint32		remaining = request_size;
		bool		success = true;

		/*
		 * Zero-fill the file.  We have to do this the hard way to ensure that
		 * all the file space has really been allocated, so that we don't
		 * later seg fault when accessing the memory mapping.  This is pretty
		 * pessimal.
		 */
		while (success && remaining > 0)
		{
			Size		goal = remaining;

			if (goal > ZBUFFER_SIZE)
				goal = ZBUFFER_SIZE;
			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
			if (write(fd, zbuffer, goal) == goal)
				remaining -= goal;
			else
				success = false;
			pgstat_report_wait_end();
		}

		if (!success)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			CloseTransientFile(fd);
			if (op == DSM_OP_CREATE)
				unlink(name);
			/* A short write may leave errno at 0; report ENOSPC then. */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
							name, request_size)));
			return false;
		}
	}

	/*
	 * If we're reattaching or resizing, we must remove any existing mapping,
	 * unless we've already got the right thing mapped.
	 */
	if (*mapped_address != NULL)
	{
		if (*mapped_size == request_size)
			return true;
		if (munmap(*mapped_address, *mapped_size) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			CloseTransientFile(fd);
			if (op == DSM_OP_CREATE)
				unlink(name);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		CloseTransientFile(fd);
		if (op == DSM_OP_CREATE)
			unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives the fd; release the transient descriptor. */
	CloseTransientFile(fd);

	return true;
}
#endif

/*
 * Implementation-specific actions that must be performed when a segment is to
 * be preserved even when no backend has it attached.
 *
 * Except on Windows, we don't need to do anything at all.  But since Windows
 * cleans up segments automatically when no references remain, we duplicate
 * the segment handle into the postmaster process.  The postmaster needn't
 * do anything to receive the handle; Windows transfers it automatically.
1068 */ 1069 void 1070 dsm_impl_pin_segment(dsm_handle handle, void *impl_private, 1071 void **impl_private_pm_handle) 1072 { 1073 switch (dynamic_shared_memory_type) 1074 { 1075 #ifdef USE_DSM_WINDOWS 1076 case DSM_IMPL_WINDOWS: 1077 { 1078 HANDLE hmap; 1079 1080 if (!DuplicateHandle(GetCurrentProcess(), impl_private, 1081 PostmasterHandle, &hmap, 0, FALSE, 1082 DUPLICATE_SAME_ACCESS)) 1083 { 1084 char name[64]; 1085 1086 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); 1087 _dosmaperr(GetLastError()); 1088 ereport(ERROR, 1089 (errcode_for_dynamic_shared_memory(), 1090 errmsg("could not duplicate handle for \"%s\": %m", 1091 name))); 1092 } 1093 1094 /* 1095 * Here, we remember the handle that we created in the 1096 * postmaster process. This handle isn't actually usable in 1097 * any process other than the postmaster, but that doesn't 1098 * matter. We're just holding onto it so that, if the segment 1099 * is unpinned, dsm_impl_unpin_segment can close it. 1100 */ 1101 *impl_private_pm_handle = hmap; 1102 break; 1103 } 1104 #endif 1105 default: 1106 break; 1107 } 1108 } 1109 1110 /* 1111 * Implementation-specific actions that must be performed when a segment is no 1112 * longer to be preserved, so that it will be cleaned up when all backends 1113 * have detached from it. 1114 * 1115 * Except on Windows, we don't need to do anything at all. For Windows, we 1116 * close the extra handle that dsm_impl_pin_segment created in the 1117 * postmaster's process space. 
 */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * Close the handle previously duplicated into the postmaster
				 * by dsm_impl_pin_segment; DUPLICATE_CLOSE_SOURCE closes it
				 * in the postmaster's handle table from here.
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* Other implementations need no action to unpin a segment. */
			break;
	}
}

/*
 * Choose an errcode for a failed dynamic shared memory operation, based on
 * the current errno: resource exhaustion maps to out-of-memory, anything
 * else to the generic file-access error classification.
 */
static int
errcode_for_dynamic_shared_memory(void)
{
	if (errno == EFBIG || errno == ENOMEM)
		return errcode(ERRCODE_OUT_OF_MEMORY);
	else
		return errcode_for_file_access();
}