xref: /freebsd/contrib/jemalloc/src/pages.c (revision 9768746b)
1 #define JEMALLOC_PAGES_C_
2 #include "jemalloc/internal/jemalloc_preamble.h"
3 
4 #include "jemalloc/internal/pages.h"
5 
6 #include "jemalloc/internal/jemalloc_internal_includes.h"
7 
8 #include "jemalloc/internal/assert.h"
9 #include "jemalloc/internal/malloc_io.h"
10 
11 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
12 #include <sys/sysctl.h>
13 #ifdef __FreeBSD__
14 #include <sys/auxv.h>
15 #include <vm/vm_param.h>
16 #include <vm/vm.h>
17 #endif
18 #endif
19 
20 /******************************************************************************/
21 /* Data. */
22 
23 /* Actual operating system page size, detected during bootstrap, <= PAGE. */
24 static size_t	os_page;
25 
26 #ifndef _WIN32
27 #  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
28 #  define PAGES_PROT_DECOMMIT (PROT_NONE)
29 static int	mmap_flags;
30 #endif
31 static bool	os_overcommits;
32 
33 const char *thp_mode_names[] = {
34 	"default",
35 	"always",
36 	"never",
37 	"not supported"
38 };
39 thp_mode_t opt_thp = THP_MODE_DEFAULT;
40 thp_mode_t init_system_thp_mode;
41 
42 /* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
43 static bool pages_can_purge_lazy_runtime = true;
44 
45 /******************************************************************************/
46 /*
47  * Function prototypes for static functions that are referenced prior to
48  * definition.
49  */
50 
51 static void os_pages_unmap(void *addr, size_t size);
52 
53 /******************************************************************************/
54 
55 static void *
56 os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
57 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
58 	assert(ALIGNMENT_CEILING(size, os_page) == size);
59 	assert(size != 0);
60 
61 	if (os_overcommits) {
62 		*commit = true;
63 	}
64 
65 	void *ret;
66 #ifdef _WIN32
67 	/*
68 	 * If VirtualAlloc can't allocate at the given address when one is
69 	 * given, it fails and returns NULL.
70 	 */
71 	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
72 	    PAGE_READWRITE);
73 #else
74 	/*
75 	 * We don't use MAP_FIXED here, because it can cause the *replacement*
76 	 * of existing mappings, and we only want to create new mappings.
77 	 */
78 	{
79 		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
80 
81 		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
82 	}
83 	assert(ret != NULL);
84 
85 	if (ret == MAP_FAILED) {
86 		ret = NULL;
87 	} else if (addr != NULL && ret != addr) {
88 		/*
89 		 * We succeeded in mapping memory, but not in the right place.
90 		 */
91 		os_pages_unmap(ret, size);
92 		ret = NULL;
93 	}
94 #endif
95 	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
96 	    ret == addr));
97 	return ret;
98 }
99 
/*
 * Reduce the mapping [addr, addr + alloc_size) to the size bytes that start
 * leadsize bytes past addr.
 *
 * On Windows the whole region is released and the wanted sub-range is mapped
 * again, which can fail; NULL is returned in that case.  Elsewhere the
 * leading and trailing excess are unmapped in place.
 *
 * Returns the address of the retained range, or NULL (Windows only).
 */
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	uintptr_t base = (uintptr_t)addr;
	void *kept = (void *)(base + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *remapped = os_pages_map(kept, size, PAGE, commit);
	if (remapped != kept) {
		/* Landed somewhere else (or not at all): undo and give up. */
		if (remapped != NULL) {
			os_pages_unmap(remapped, size);
		}
		return NULL;
	}
	return kept;
#else
	size_t trailsize = alloc_size - (leadsize + size);

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		void *trail = (void *)(base + leadsize + size);
		os_pages_unmap(trail, trailsize);
	}
	return kept;
#endif
}
128 
129 static void
130 os_pages_unmap(void *addr, size_t size) {
131 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
132 	assert(ALIGNMENT_CEILING(size, os_page) == size);
133 
134 #ifdef _WIN32
135 	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
136 #else
137 	if (munmap(addr, size) == -1)
138 #endif
139 	{
140 		char buf[BUFERROR_BUF];
141 
142 		buferror(get_errno(), buf, sizeof(buf));
143 		malloc_printf("<jemalloc>: Error in "
144 #ifdef _WIN32
145 		    "VirtualFree"
146 #else
147 		    "munmap"
148 #endif
149 		    "(): %s\n", buf);
150 		if (opt_abort) {
151 			abort();
152 		}
153 	}
154 }
155 
156 static void *
157 pages_map_slow(size_t size, size_t alignment, bool *commit) {
158 	size_t alloc_size = size + alignment - os_page;
159 	/* Beware size_t wrap-around. */
160 	if (alloc_size < size) {
161 		return NULL;
162 	}
163 
164 	void *ret;
165 	do {
166 		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
167 		if (pages == NULL) {
168 			return NULL;
169 		}
170 		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
171 		    - (uintptr_t)pages;
172 		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
173 	} while (ret == NULL);
174 
175 	assert(ret != NULL);
176 	assert(PAGE_ADDR2BASE(ret) == ret);
177 	return ret;
178 }
179 
180 void *
181 pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
182 	assert(alignment >= PAGE);
183 	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);
184 
185 #if defined(__FreeBSD__) && defined(MAP_EXCL)
186 	/*
187 	 * FreeBSD has mechanisms both to mmap at specific address without
188 	 * touching existing mappings, and to mmap with specific alignment.
189 	 */
190 	{
191 		if (os_overcommits) {
192 			*commit = true;
193 		}
194 
195 		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
196 		int flags = mmap_flags;
197 
198 		if (addr != NULL) {
199 			flags |= MAP_FIXED | MAP_EXCL;
200 		} else {
201 			unsigned alignment_bits = ffs_zu(alignment);
202 			assert(alignment_bits > 1);
203 			flags |= MAP_ALIGNED(alignment_bits - 1);
204 		}
205 
206 		void *ret = mmap(addr, size, prot, flags, -1, 0);
207 		if (ret == MAP_FAILED) {
208 			ret = NULL;
209 		}
210 
211 		return ret;
212 	}
213 #endif
214 	/*
215 	 * Ideally, there would be a way to specify alignment to mmap() (like
216 	 * NetBSD has), but in the absence of such a feature, we have to work
217 	 * hard to efficiently create aligned mappings.  The reliable, but
218 	 * slow method is to create a mapping that is over-sized, then trim the
219 	 * excess.  However, that always results in one or two calls to
220 	 * os_pages_unmap(), and it can leave holes in the process's virtual
221 	 * memory map if memory grows downward.
222 	 *
223 	 * Optimistically try mapping precisely the right amount before falling
224 	 * back to the slow method, with the expectation that the optimistic
225 	 * approach works most of the time.
226 	 */
227 
228 	void *ret = os_pages_map(addr, size, os_page, commit);
229 	if (ret == NULL || ret == addr) {
230 		return ret;
231 	}
232 	assert(addr == NULL);
233 	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
234 		os_pages_unmap(ret, size);
235 		return pages_map_slow(size, alignment, commit);
236 	}
237 
238 	assert(PAGE_ADDR2BASE(ret) == ret);
239 	return ret;
240 }
241 
/*
 * Unmap [addr, addr + size).  addr must be PAGE-aligned and size must be a
 * multiple of PAGE; the actual work is delegated to os_pages_unmap().
 */
void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}
249 
250 static bool
251 pages_commit_impl(void *addr, size_t size, bool commit) {
252 	assert(PAGE_ADDR2BASE(addr) == addr);
253 	assert(PAGE_CEILING(size) == size);
254 
255 	if (os_overcommits) {
256 		return true;
257 	}
258 
259 #ifdef _WIN32
260 	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
261 	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
262 #else
263 	{
264 		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
265 		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
266 		    -1, 0);
267 		if (result == MAP_FAILED) {
268 			return true;
269 		}
270 		if (result != addr) {
271 			/*
272 			 * We succeeded in mapping memory, but not in the right
273 			 * place.
274 			 */
275 			os_pages_unmap(result, size);
276 			return true;
277 		}
278 		return false;
279 	}
280 #endif
281 }
282 
/*
 * Commit the PAGE-aligned range [addr, addr + size).  Returns the result of
 * pages_commit_impl(): false on success, true otherwise.
 */
bool
pages_commit(void *addr, size_t size) {
	const bool commit = true;

	return pages_commit_impl(addr, size, commit);
}
287 
/*
 * Decommit the PAGE-aligned range [addr, addr + size).  Returns the result of
 * pages_commit_impl(): false on success, true otherwise.
 */
bool
pages_decommit(void *addr, size_t size) {
	const bool commit = false;

	return pages_commit_impl(addr, size, commit);
}
292 
293 bool
294 pages_purge_lazy(void *addr, size_t size) {
295 	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
296 	assert(PAGE_CEILING(size) == size);
297 
298 	if (!pages_can_purge_lazy) {
299 		return true;
300 	}
301 	if (!pages_can_purge_lazy_runtime) {
302 		/*
303 		 * Built with lazy purge enabled, but detected it was not
304 		 * supported on the current system.
305 		 */
306 		return true;
307 	}
308 
309 #ifdef _WIN32
310 	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
311 	return false;
312 #elif defined(JEMALLOC_PURGE_MADVISE_FREE)
313 	return (madvise(addr, size,
314 #  ifdef MADV_FREE
315 	    MADV_FREE
316 #  else
317 	    JEMALLOC_MADV_FREE
318 #  endif
319 	    ) != 0);
320 #elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
321     !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
322 	return (madvise(addr, size, MADV_DONTNEED) != 0);
323 #else
324 	not_reached();
325 #endif
326 }
327 
328 bool
329 pages_purge_forced(void *addr, size_t size) {
330 	assert(PAGE_ADDR2BASE(addr) == addr);
331 	assert(PAGE_CEILING(size) == size);
332 
333 	if (!pages_can_purge_forced) {
334 		return true;
335 	}
336 
337 #if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
338     defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
339 	return (madvise(addr, size, MADV_DONTNEED) != 0);
340 #elif defined(JEMALLOC_MAPS_COALESCE)
341 	/* Try to overlay a new demand-zeroed mapping. */
342 	return pages_commit(addr, size);
343 #else
344 	not_reached();
345 #endif
346 }
347 
/*
 * Advise the OS to back [addr, addr + size) with huge pages.
 *
 * When aligned is true, addr and size are asserted to be huge-page aligned.
 * Returns true if madvise() fails, and always true when built without
 * JEMALLOC_HAVE_MADVISE_HUGE.
 */
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	int rc = madvise(addr, size, MADV_HUGEPAGE);

	return (rc != 0);
#else
	return true;
#endif
}
360 
/*
 * Request huge pages for a range that must be huge-page aligned.  Returns
 * the result of pages_huge_impl().
 */
bool
pages_huge(void *addr, size_t size) {
	const bool aligned = true;

	return pages_huge_impl(addr, size, aligned);
}
365 
/*
 * Request huge pages for a range without the huge-page alignment assertions.
 * Returns the result of pages_huge_impl().
 */
static bool
pages_huge_unaligned(void *addr, size_t size) {
	const bool aligned = false;

	return pages_huge_impl(addr, size, aligned);
}
370 
/*
 * Advise the OS not to back [addr, addr + size) with huge pages.
 *
 * When aligned is true, addr and size are asserted to be huge-page aligned.
 * Returns true if madvise() fails, and always false when built without
 * JEMALLOC_HAVE_MADVISE_HUGE.
 */
static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	int rc = madvise(addr, size, MADV_NOHUGEPAGE);

	return (rc != 0);
#else
	return false;
#endif
}
384 
/*
 * Opt a huge-page aligned range out of huge pages.  Returns the result of
 * pages_nohuge_impl().
 */
bool
pages_nohuge(void *addr, size_t size) {
	const bool aligned = true;

	return pages_nohuge_impl(addr, size, aligned);
}
389 
/*
 * Opt a range out of huge pages without the huge-page alignment assertions.
 * Returns the result of pages_nohuge_impl().
 */
static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	const bool aligned = false;

	return pages_nohuge_impl(addr, size, aligned);
}
394 
/*
 * Apply MADV_DONTDUMP to the PAGE-aligned range [addr, addr + size).
 *
 * Returns true if madvise() fails, and always false when built without
 * JEMALLOC_MADVISE_DONTDUMP.
 */
bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

#ifdef JEMALLOC_MADVISE_DONTDUMP
	int rc = madvise(addr, size, MADV_DONTDUMP);

	return rc != 0;
#else
	return false;
#endif
}
405 
/*
 * Apply MADV_DODUMP to the PAGE-aligned range [addr, addr + size).
 *
 * Returns true if madvise() fails, and always false when built without
 * JEMALLOC_MADVISE_DONTDUMP.
 */
bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

#ifdef JEMALLOC_MADVISE_DONTDUMP
	int rc = madvise(addr, size, MADV_DODUMP);

	return rc != 0;
#else
	return false;
#endif
}
416 
417 
418 static size_t
419 os_page_detect(void) {
420 #ifdef _WIN32
421 	SYSTEM_INFO si;
422 	GetSystemInfo(&si);
423 	return si.dwPageSize;
424 #elif defined(__FreeBSD__)
425 	/*
426 	 * This returns the value obtained from
427 	 * the auxv vector, avoiding a syscall.
428 	 */
429 	return getpagesize();
430 #else
431 	long result = sysconf(_SC_PAGESIZE);
432 	if (result == -1) {
433 		return LG_PAGE;
434 	}
435 	return (size_t)result;
436 #endif
437 }
438 
#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
/* Fallback definitions for the vm.overcommit bits tested below. */
#ifndef SWAP_RESERVE_FORCE_ON
#define	SWAP_RESERVE_FORCE_ON		(1 << 0)
#define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
#endif

/*
 * Report whether the OS overcommits memory, based on the ELF aux vector
 * (when ELF_BSDF_VMNOOVERCOMMIT is available) or the vm.overcommit sysctl.
 *
 * Returns true if neither swap-reservation bit is set in vm.overcommit (or
 * the per-process no-overcommit flag is clear), and false otherwise or if
 * the sysctl query fails.
 */
static bool
os_overcommits_sysctl(void) {
#ifdef ELF_BSDF_VMNOOVERCOMMIT
	int bsdflags;

	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0) {
		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);
	}
#endif

	int vm_overcommit;
	size_t sz = sizeof(vm_overcommit);

#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2] = {CTL_VM, VM_OVERCOMMIT};

	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

	const int reserve_mask = SWAP_RESERVE_FORCE_ON |
	    SWAP_RESERVE_RLIMIT_ON;

	return ((vm_overcommit & reserve_mask) == 0);
}
#endif
475 
#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Report whether the OS overcommits memory by reading the first byte of
 * /proc/sys/vm/overcommit_memory.
 *
 * syscall(2) is preferred over {open,read,close}(2) when possible, to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 *
 * Returns true for modes '0' and '1', and false for any other value or if the
 * file cannot be opened or read.
 */
static bool
os_overcommits_proc(void) {
	const char *path = "/proc/sys/vm/overcommit_memory";
#if defined(O_CLOEXEC)
	const int oflags = O_RDONLY | O_CLOEXEC;
#else
	const int oflags = O_RDONLY;
#endif

	int fd;
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	fd = (int)syscall(SYS_open, path, oflags);
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	fd = (int)syscall(SYS_openat, AT_FDCWD, path, oflags);
#else
	fd = open(path, oflags);
#endif
#if !defined(O_CLOEXEC)
	/* No O_CLOEXEC at open time: set close-on-exec after the fact. */
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	char mode;
	ssize_t nread = malloc_read_fd(fd, &mode, sizeof(mode));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}

	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	switch (mode) {
	case '0':
	case '1':
		return true;
	default:
		return false;
	}
}
#endif
542 
543 void
544 pages_set_thp_state (void *ptr, size_t size) {
545 	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
546 		return;
547 	}
548 	assert(opt_thp != thp_mode_not_supported &&
549 	    init_system_thp_mode != thp_mode_not_supported);
550 
551 	if (opt_thp == thp_mode_always
552 	    && init_system_thp_mode != thp_mode_never) {
553 		assert(init_system_thp_mode == thp_mode_default);
554 		pages_huge_unaligned(ptr, size);
555 	} else if (opt_thp == thp_mode_never) {
556 		assert(init_system_thp_mode == thp_mode_default ||
557 		    init_system_thp_mode == thp_mode_always);
558 		pages_nohuge_unaligned(ptr, size);
559 	}
560 }
561 
562 static void
563 init_thp_state(void) {
564 	if (!have_madvise_huge) {
565 		if (metadata_thp_enabled() && opt_abort) {
566 			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
567 			abort();
568 		}
569 		goto label_error;
570 	}
571 
572 	static const char sys_state_madvise[] = "always [madvise] never\n";
573 	static const char sys_state_always[] = "[always] madvise never\n";
574 	static const char sys_state_never[] = "always madvise [never]\n";
575 	char buf[sizeof(sys_state_madvise)];
576 
577 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
578 	int fd = (int)syscall(SYS_open,
579 	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
580 #else
581 	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
582 #endif
583 	if (fd == -1) {
584 		goto label_error;
585 	}
586 
587 	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
588 #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
589 	syscall(SYS_close, fd);
590 #else
591 	close(fd);
592 #endif
593 
594         if (nread < 0) {
595 		goto label_error;
596         }
597 
598 	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
599 		init_system_thp_mode = thp_mode_default;
600 	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
601 		init_system_thp_mode = thp_mode_always;
602 	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
603 		init_system_thp_mode = thp_mode_never;
604 	} else {
605 		goto label_error;
606 	}
607 	return;
608 label_error:
609 	opt_thp = init_system_thp_mode = thp_mode_not_supported;
610 }
611 
612 bool
613 pages_boot(void) {
614 	os_page = os_page_detect();
615 	if (os_page > PAGE) {
616 		malloc_write("<jemalloc>: Unsupported system page size\n");
617 		if (opt_abort) {
618 			abort();
619 		}
620 		return true;
621 	}
622 
623 #ifndef _WIN32
624 	mmap_flags = MAP_PRIVATE | MAP_ANON;
625 #endif
626 
627 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
628 	os_overcommits = os_overcommits_sysctl();
629 #elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
630 	os_overcommits = os_overcommits_proc();
631 #  ifdef MAP_NORESERVE
632 	if (os_overcommits) {
633 		mmap_flags |= MAP_NORESERVE;
634 	}
635 #  endif
636 #else
637 	os_overcommits = false;
638 #endif
639 
640 	init_thp_state();
641 
642 #ifdef __FreeBSD__
643 	/*
644 	 * FreeBSD doesn't need the check; madvise(2) is known to work.
645 	 */
646 #else
647 	/* Detect lazy purge runtime support. */
648 	if (pages_can_purge_lazy) {
649 		bool committed = false;
650 		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
651 		if (madv_free_page == NULL) {
652 			return true;
653 		}
654 		assert(pages_can_purge_lazy_runtime);
655 		if (pages_purge_lazy(madv_free_page, PAGE)) {
656 			pages_can_purge_lazy_runtime = false;
657 		}
658 		os_pages_unmap(madv_free_page, PAGE);
659 	}
660 #endif
661 
662 	return false;
663 }
664