// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

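// Linux errno values.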
const (
	_EACCES = 13
	_EINVAL = 22
)

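// sysAlloc obtains an n-byte region of zeroed, read/write memory from
// the kernel via an anonymous private mmap and credits it to sysStat.
// It returns nil if the mapping fails, except that EACCES and EAGAIN
// are treated as fatal errors.
//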
// Don't split the stack as this method may be invoked without a valid G, which
// prevents us from allocating more stack.
//go:nosplit
func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
	p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
	if err != 0 {
		if err == _EACCES {
			print("runtime: mmap: access denied\n")
			exit(2)
		}
		if err == _EAGAIN {
			print("runtime: mmap: too much locked memory (check 'ulimit -l').\n")
			exit(2)
		}
		return nil
	}
	mSysStatInc(sysStat, n)
	return p
}

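// adviseUnused is the madvise advice sysUnused passes to the kernel
// when releasing pages. It starts out as MADV_FREE and is downgraded
// to MADV_DONTNEED at run time if the kernel rejects MADV_FREE
// (which was only added in Linux 4.5).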
var adviseUnused = uint32(_MADV_FREE)

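// sysUnused advises the kernel that the physical pages backing the
// region [v, v+n) are no longer needed and may be reclaimed, while
// leaving the mapping itself in place. Both v and n must be multiples
// of the physical page size.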
func sysUnused(v unsafe.Pointer, n uintptr) {
	// By default, Linux's "transparent huge page" support will
	// merge pages into a huge page if there's even a single
	// present regular page, undoing the effects of madvise(adviseUnused)
	// below. On amd64, that means khugepaged can turn a single
	// 4KB page into a 2MB huge page, bloating the process's RSS
	// by as much as 512X. (See issue #8832 and Linux kernel bug
	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
	//
	// To work around this, we explicitly disable transparent huge
	// pages when we release pages of the heap. However, we have
	// to do this carefully because changing this flag tends to
	// split the VMA (memory mapping) containing v into three
	// VMAs in order to track the different values of the
	// MADV_NOHUGEPAGE flag in the different regions. There's a
	// default limit of 65530 VMAs per address space (sysctl
	// vm.max_map_count), so we must be careful not to create too
	// many VMAs (see issue #12233).
	//
	// Since huge pages are huge, there's little use in adjusting
	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
	// exploding the number of VMAs by only adjusting the
	// MADV_NOHUGEPAGE flag on a large granularity. This still
	// gets most of the benefit of huge pages while keeping the
	// number of VMAs under control. With hugePageSize = 2MB, even
	// a pessimal heap can reach 128GB before running out of VMAs.
	if physHugePageSize != 0 {
		// If it's a large allocation, we want to leave huge
		// pages enabled. Hence, we only adjust the huge page
		// flag on the huge pages containing v and v+n-1, and
		// only if those aren't aligned.
		var head, tail uintptr
		if uintptr(v)&(physHugePageSize-1) != 0 {
			// Compute huge page containing v.
			head = alignDown(uintptr(v), physHugePageSize)
		}
		if (uintptr(v)+n)&(physHugePageSize-1) != 0 {
			// Compute huge page containing v+n-1.
			tail = alignDown(uintptr(v)+n-1, physHugePageSize)
		}

		// Note that madvise will return EINVAL if the flag is
		// already set, which is quite likely. We ignore
		// errors.
		if head != 0 && head+physHugePageSize == tail {
			// head and tail are different but adjacent,
			// so do this in one call.
			madvise(unsafe.Pointer(head), 2*physHugePageSize, _MADV_NOHUGEPAGE)
		} else {
			// Advise the huge pages containing v and v+n-1.
			if head != 0 {
				madvise(unsafe.Pointer(head), physHugePageSize, _MADV_NOHUGEPAGE)
			}
			if tail != 0 && tail != head {
				madvise(unsafe.Pointer(tail), physHugePageSize, _MADV_NOHUGEPAGE)
			}
		}
	}

	if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 {
		// madvise will round this to any physical page
		// *covered* by this range, so an unaligned madvise
		// will release more memory than intended.
		throw("unaligned sysUnused")
	}

	var advise uint32
	if debug.madvdontneed != 0 {
		advise = _MADV_DONTNEED
	} else {
		advise = atomic.Load(&adviseUnused)
	}
	if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 {
		// MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is
		// not supported.
		atomic.Store(&adviseUnused, _MADV_DONTNEED)
		madvise(v, n, _MADV_DONTNEED)
	}
}

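// sysUsed marks the region [v, v+n) as in use again after sysUnused.
// On Linux the only work required is re-enabling transparent huge
// pages for the whole huge pages inside the region; released pages
// fault back in on first touch.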
func sysUsed(v unsafe.Pointer, n uintptr) {
	// Partially undo the NOHUGEPAGE marks from sysUnused
	// for whole huge pages between v and v+n. This may
	// leave huge pages off at the end points v and v+n
	// even though allocations may cover these entire huge
	// pages. We could detect this and undo NOHUGEPAGE on
	// the end points as well, but it's probably not worth
	// the cost because when neighboring allocations are
	// freed sysUnused will just set NOHUGEPAGE again.
	sysHugePage(v, n)
}

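// sysHugePage advises the kernel that the huge-page-aligned portion
// of [v, v+n) may be backed by transparent huge pages.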
func sysHugePage(v unsafe.Pointer, n uintptr) {
	if physHugePageSize != 0 {
		// Round v up to a huge page boundary.
		beg := alignUp(uintptr(v), physHugePageSize)
		// Round v+n down to a huge page boundary.
		end := alignDown(uintptr(v)+n, physHugePageSize)

		if beg < end {
			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
		}
	}
}

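// sysFree unmaps the region [v, v+n) and removes it from sysStat's
// accounting.
//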
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
	mSysStatDec(sysStat, n)
	munmap(v, n)
}

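// sysFault remaps the region [v, v+n) with PROT_NONE so that any
// access to it faults.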
func sysFault(v unsafe.Pointer, n uintptr) {
	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0)
}

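// sysReserve reserves the address range [v, v+n) for later use: the
// pages are mapped PROT_NONE, so they consume no physical memory and
// cannot be accessed until sysMap remaps them readable and writable.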
func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
	p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
	if err != 0 {
		return nil
	}
	return p
}

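// sysMap remaps the previously reserved region [v, v+n) as readable
// and writable using MAP_FIXED and credits it to sysStat. It throws
// if the kernel is out of memory or cannot place the mapping at v.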
func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
	mSysStatInc(sysStat, n)

	p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
	if err == _ENOMEM {
		throw("runtime: out of memory")
	}
	if p != v || err != 0 {
		throw("runtime: cannot map pages in arena address space")
	}
}