// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Scavenging free pages.
//
// This file implements scavenging (the release of physical pages backing mapped
// memory) of free and unused pages in the heap as a way to deal with page-level
// fragmentation and reduce the RSS of Go applications.
//
// Scavenging in Go happens on two fronts: there's the background
// (asynchronous) scavenger and the heap-growth (synchronous) scavenger.
//
// The former happens on a goroutine much like the background sweeper which is
// soft-capped at using scavengePercent of the mutator's time, based on
// order-of-magnitude estimates of the costs of scavenging. The background
// scavenger's primary goal is to bring the estimated heap RSS of the
// application down to a goal.
//
// That goal is defined as:
//   (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to account for fragmentation internal to
// spans. next_gc / last_next_gc defines the ratio between the current heap goal
// and the last heap goal, which tells us by how much the heap is growing and
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heap_inuse at the end of the last GC, which
// allows us to account for this additional fragmentation. Note that this
// procedure makes the assumption that the degree of fragmentation won't change
// dramatically over the next GC cycle. Overestimating the amount of
// fragmentation simply results in higher memory use, which will be accounted
// for by the next pacing update. Underestimating the fragmentation, however,
// could lead to performance degradation. Handling this case is not within the
// scope of the scavenger. Situations where the amount of fragmentation balloons
// over the course of a single GC cycle should be considered pathologies,
// flagged as bugs, and fixed appropriately.
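//
// As a concrete, illustrative example (numbers made up, not measured): with
// last_heap_inuse = 100 MiB, last_next_gc = 120 MiB, and next_gc = 180 MiB,
// the goal works out to
//
//	(10+100)/100 * (180/120) * 100 MiB = 165 MiB.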
//
// An additional factor of retainExtraPercent is added as a buffer to help ensure
// that there's more unscavenged memory to allocate out of, since each allocation
// out of scavenged memory incurs a potentially expensive page fault.
//
// The goal is updated after each GC and the scavenger's pacing parameters
// (which live in mheap_) are updated to match. The pacing parameters work much
// like the background sweeping parameters. The parameters define a line whose
// horizontal axis is time and vertical axis is estimated heap RSS, and the
// scavenger attempts to stay below that line at all times.
//
// The synchronous heap-growth scavenging happens whenever the heap grows in
// size, for some definition of heap-growth. The intuition behind this is that
// the application had to grow the heap because existing fragments were
// not sufficiently large to satisfy a page-level memory allocation, so we
// scavenge those fragments eagerly to offset the growth in RSS that results.

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	// The background scavenger is paced according to these parameters.
	//
	// scavengePercent represents the portion of mutator time we're willing
	// to spend on scavenging in percent.
	scavengePercent = 1 // 1%

	// retainExtraPercent represents the amount of memory over the heap goal
	// that the scavenger should keep as a buffer space for the allocator.
	//
	// The purpose of maintaining this overhead is to have a greater pool of
	// unscavenged memory available for allocation (since using scavenged memory
	// incurs an additional cost), to account for heap fragmentation and
	// the ever-changing layout of the heap.
	retainExtraPercent = 10

	// maxPagesPerPhysPage is the maximum number of supported runtime pages per
	// physical page, based on maxPhysPageSize.
	maxPagesPerPhysPage = maxPhysPageSize / pageSize

	// scavengeCostRatio is the approximate ratio between the costs of using previously
	// scavenged memory and scavenging memory.
	//
	// For most systems the cost of scavenging greatly outweighs the costs
	// associated with using scavenged memory, making this constant 0. On other systems
	// (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial.
	//
	// This ratio is used as part of a multiplicative factor to help the scavenger account
	// for the additional costs of using scavenged memory in its pacing.
	scavengeCostRatio = 0.7 * (sys.GoosDarwin + sys.GoosIos)

	// scavengeReservationShards determines the amount of memory the scavenger
	// should reserve for scavenging at a time. Specifically, the amount of
	// memory reserved is (heap size in bytes) / scavengeReservationShards.
	scavengeReservationShards = 64
)

// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
	return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released)
}

// gcPaceScavenger updates the scavenger's pacing, particularly
// its rate and RSS goal.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
//
// The pacing is based on scavengePageRate, which applies to both regular and
// huge pages. See that constant for more information.
//
// mheap_.lock must be held or the world must be stopped.
func gcPaceScavenger() {
	// If we're called before the first GC completed, disable scavenging.
	// We never scavenge before the 2nd GC cycle anyway (we don't have enough
	// information about the heap yet) so this is fine, and avoids a fault
	// or garbage data later.
	if memstats.last_next_gc == 0 {
		mheap_.scavengeGoal = ^uint64(0)
		return
	}
	// Compute our scavenging goal.
	goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc)
	retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
	// Add retainExtraPercent overhead to retainedGoal. This calculation
	// looks strange but the purpose is to arrive at an integer division
	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
	// that also avoids the overflow from a multiplication.
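	// With retainExtraPercent = 10, the divisor below works out to 10,
	// i.e. retainedGoal += retainedGoal/10.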
	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))
	// Align it to a physical page boundary to make the following calculations
	// a bit more exact.
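	// (For example, with 4 KiB physical pages a goal of 10000 bytes rounds
	// up to 12288 bytes, i.e. 3 physical pages.)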
	retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)

	// Represents where we are now in the heap's contribution to RSS in bytes.
	//
	// Guaranteed to always be a multiple of physPageSize on systems where
	// physPageSize <= pageSize since we map heap_sys at a rate larger than
	// any physPageSize and release memory in multiples of the physPageSize.
	//
	// However, certain functions recategorize heap_sys as other stats (e.g.
	// stack_sys) and this happens in multiples of pageSize, so on systems
	// where physPageSize > pageSize the calculations below will not be exact.
	// Generally this is OK since we'll be off by at most one regular
	// physical page.
	retainedNow := heapRetained()

	// If we're already below our goal, or within one page of our goal, then disable
	// the background scavenger. We disable the background scavenger if there's
	// less than one physical page of work to do because it's not worth it.
	if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) {
		mheap_.scavengeGoal = ^uint64(0)
		return
	}
	mheap_.scavengeGoal = retainedGoal
}

// Sleep/wait state of the background scavenger.
var scavenge struct {
	lock       mutex
	g          *g
	parked     bool
	timer      *timer
	sysmonWake uint32 // Set atomically.
}

// readyForScavenger signals sysmon to wake the scavenger because
// there may be new work to do.
//
// There may be a significant delay between when this function runs
// and when the scavenger is kicked awake, but it may be safely invoked
// in contexts where wakeScavenger is unsafe to call directly.
func readyForScavenger() {
	atomic.Store(&scavenge.sysmonWake, 1)
}

// wakeScavenger immediately unparks the scavenger if necessary.
//
// May run without a P, but it may allocate, so it must not be called
// on any allocation path.
//
// mheap_.lock, scavenge.lock, and sched.lock must not be held.
func wakeScavenger() {
	lock(&scavenge.lock)
	if scavenge.parked {
		// Notify sysmon that it shouldn't bother waking up the scavenger.
		atomic.Store(&scavenge.sysmonWake, 0)

		// Try to stop the timer but we don't really care if we succeed.
		// It's possible that either a timer was never started, or that
		// we're racing with it.
		// In the case that we're racing with it, there's a low chance that
		// we experience a spurious wake-up of the scavenger, but that's
		// totally safe.
		stopTimer(scavenge.timer)

		// Unpark the goroutine and tell it that there may have been a pacing
		// change. Note that we skip the scheduler's runnext slot because we
		// want to avoid having the scavenger interfere with the fair
		// scheduling of user goroutines. In effect, this schedules the
		// scavenger at a "lower priority" but that's OK because it'll
		// catch up on the work it missed when it does get scheduled.
		scavenge.parked = false

		// Ready the goroutine by injecting it. We use injectglist instead
		// of ready or goready in order to allow us to run this function
		// without a P. injectglist also avoids placing the goroutine in
		// the current P's runnext slot, which is desirable to prevent
		// the scavenger from interfering with user goroutine scheduling
		// too much.
		var list gList
		list.push(scavenge.g)
		injectglist(&list)
	}
	unlock(&scavenge.lock)
}

// scavengeSleep attempts to put the scavenger to sleep for ns.
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
//
// Returns the amount of time actually slept.
func scavengeSleep(ns int64) int64 {
	lock(&scavenge.lock)

	// Set the timer.
	//
	// This must happen here instead of inside gopark
	// because we can't close over any variables without
	// failing escape analysis.
	start := nanotime()
	resetTimer(scavenge.timer, start+ns)

	// Mark ourselves as asleep and go to sleep.
	scavenge.parked = true
	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)

	// Return how long we actually slept for.
	return nanotime() - start
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	setSystemGoroutine()

	scavenge.g = getg()

	lockInit(&scavenge.lock, lockRankScavenge)
	lock(&scavenge.lock)
	scavenge.parked = true

	scavenge.timer = new(timer)
	scavenge.timer.f = func(_ interface{}, _ uintptr) {
		wakeScavenger()
	}

	c <- 1
	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

	// Exponentially-weighted moving average of the fraction of time this
	// goroutine spends scavenging (that is, percent of a single CPU).
	// It represents a measure of scheduling overheads which might extend
	// the sleep or the critical time beyond what's expected. Assume no
	// overhead to begin with.
	//
	// TODO(mknyszek): Consider making this based on total CPU time of the
	// application (i.e. scavengePercent * GOMAXPROCS). This isn't really
	// feasible now because the scavenger acquires the heap lock over the
	// scavenging operation, which means scavenging effectively blocks
	// allocators and isn't scalable. However, given a scalable allocator,
	// it makes sense to also make the scavenger scale with it; if you're
	// allocating more frequently, then presumably you're also generating
	// more work for the scavenger.
	const idealFraction = scavengePercent / 100.0
	scavengeEWMA := float64(idealFraction)

	for {
		released := uintptr(0)

		// Time in scavenging critical section.
		crit := float64(0)

		// Run on the system stack since we grab the heap lock,
		// and a stack growth with the heap lock means a deadlock.
		systemstack(func() {
			lock(&mheap_.lock)

			// If background scavenging is disabled or if there's no work to do just park.
			retained, goal := heapRetained(), mheap_.scavengeGoal
			if retained <= goal {
				unlock(&mheap_.lock)
				return
			}

			// Scavenge one page, and measure the amount of time spent scavenging.
			start := nanotime()
			released = mheap_.pages.scavenge(physPageSize, true)
			mheap_.pages.scav.released += released
			crit = float64(nanotime() - start)

			unlock(&mheap_.lock)
		})

		if released == 0 {
			lock(&scavenge.lock)
			scavenge.parked = true
			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
			continue
		}

		if released < physPageSize {
			// If this happens, it means that we may have attempted to release part
			// of a physical page, but the likely effect of that is that it released
			// the whole physical page, some of which may have still been in-use.
			// This could lead to memory corruption. Throw.
			throw("released less than one physical page of memory")
		}

		// On some platforms we may see crit as zero if the time it takes to scavenge
		// memory is less than the minimum granularity of its clock (e.g. Windows).
		// In this case, just assume scavenging takes 10 µs per regular physical page
		// (determined empirically), and conservatively ignore the impact of huge pages
		// on timing.
		//
		// We shouldn't ever see a crit value less than zero unless there's a bug of
		// some kind, either on our side or in the platform we're running on, but be
		// defensive in that case as well.
		const approxCritNSPerPhysicalPage = 10e3
		if crit <= 0 {
			crit = approxCritNSPerPhysicalPage * float64(released/physPageSize)
		}

		// Multiply the critical time by 1 + the ratio of the costs of using
		// scavenged memory vs. scavenging memory. This forces us to pay down
		// the cost of reusing this memory eagerly by sleeping for a longer period
		// of time and scavenging less frequently. More concretely, we avoid situations
		// where we end up scavenging so often that we hurt allocation performance
		// because of the additional overheads of using scavenged memory.
		crit *= 1 + scavengeCostRatio

		// If we spent more than 10 ms (for example, if the OS scheduled us away, or someone
		// put their machine to sleep) in the critical section, bound the time we use to
		// calculate at 10 ms to avoid letting the sleep time get arbitrarily high.
		const maxCrit = 10e6
		if crit > maxCrit {
			crit = maxCrit
		}

		// Compute the amount of time to sleep, assuming we want to use at most
		// scavengePercent of CPU time. Take into account scheduling overheads
		// that may extend the length of our sleep by multiplying by how far
		// off we are from the ideal ratio. For example, if we're sleeping too
		// much, then scavengeEWMA < idealFraction, so we'll adjust the sleep time
		// down.
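		// As a rough illustration (numbers made up): with scavengePercent = 1
		// and adjust == 1, a 50 µs critical section yields a sleep of about
		// 5 ms, i.e. roughly a 1% duty cycle.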
		adjust := scavengeEWMA / idealFraction
		sleepTime := int64(adjust * crit / (scavengePercent / 100.0))

		// Go to sleep.
		slept := scavengeSleep(sleepTime)

		// Compute the new ratio.
		fraction := crit / (crit + float64(slept))

		// Set a lower bound on the fraction.
		// Due to OS-related anomalies we may "sleep" for an inordinate amount
		// of time. Let's avoid letting the ratio get out of hand by bounding
		// the sleep time we use in our EWMA.
		const minFraction = 1.0 / 1000
		if fraction < minFraction {
			fraction = minFraction
		}

		// Update scavengeEWMA by merging in the new crit/slept ratio.
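		// For example (illustrative values), a previous EWMA of 0.01 and a
		// measured fraction of 0.03 merge to a new EWMA of 0.02.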
		const alpha = 0.5
		scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
	}
}

// scavenge scavenges nbytes worth of free pages, starting with the
// highest address first. Successive calls continue from where it left
// off until the heap is exhausted. Call scavengeStartGen to bring it
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
//
// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
	assertLockHeld(p.mheapLock)

	var (
		addrs addrRange
		gen   uint32
	)
	released := uintptr(0)
	for released < nbytes {
		if addrs.size() == 0 {
			if addrs, gen = p.scavengeReserve(); addrs.size() == 0 {
				break
			}
		}
		r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock)
		released += r
		addrs = a
	}
	// Only unreserve the space which hasn't been scavenged or searched
	// to ensure we always make progress.
	p.scavengeUnreserve(addrs, gen)
	return released
}

// printScavTrace prints a scavenge trace line to standard error.
//
// released should be the amount of memory released since the last time this
// was called, and forced indicates whether the scavenge was forced by the
// application.
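//
// With debug.scavtrace enabled the output looks like the following
// (values illustrative), with " (forced)" appended when forced is true:
//
//	scav 7 16384 KiB work, 262144 KiB total, 97% util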
func printScavTrace(gen uint32, released uintptr, forced bool) {
	printlock()
	print("scav ", gen, " ",
		released>>10, " KiB work, ",
		atomic.Load64(&memstats.heap_released)>>10, " KiB total, ",
		(atomic.Load64(&memstats.heap_inuse)*100)/heapRetained(), "% util",
	)
	if forced {
		print(" (forced)")
	}
	println()
	printunlock()
}

// scavengeStartGen starts a new scavenge generation, resetting
// the scavenger's search space to the full in-use address space.
//
// p.mheapLock must be held.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavengeStartGen() {
	assertLockHeld(p.mheapLock)

	if debug.scavtrace > 0 {
		printScavTrace(p.scav.gen, p.scav.released, false)
	}
	p.inUse.cloneInto(&p.scav.inUse)

	// Pick the new starting address for the scavenger cycle.
	var startAddr offAddr
	if p.scav.scavLWM.lessThan(p.scav.freeHWM) {
		// The "free" high watermark exceeds the "scavenged" low watermark,
		// so there are free scavengable pages in parts of the address space
		// that the scavenger already searched, the high watermark being the
		// highest one. Pick that as our new starting point to ensure we
		// see those pages.
		startAddr = p.scav.freeHWM
	} else {
		// The "free" high watermark does not exceed the "scavenged" low
		// watermark. This means the allocator didn't free any memory in
		// the range we scavenged last cycle, so we might as well continue
		// scavenging from where we were.
		startAddr = p.scav.scavLWM
	}
	p.scav.inUse.removeGreaterEqual(startAddr.addr())

	// reservationBytes may be zero if p.inUse.totalBytes is small, or if
	// scavengeReservationShards is large. This case is fine as the scavenger
	// will simply be turned off, but it does mean that scavengeReservationShards,
	// in concert with pallocChunkBytes, dictates the minimum heap size at which
	// the scavenger triggers. In practice this minimum is generally less than an
	// arena in size, so virtually every heap has the scavenger on.
	p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
	p.scav.gen++
	p.scav.released = 0
	p.scav.freeHWM = minOffAddr
	p.scav.scavLWM = maxOffAddr
}

// scavengeReserve reserves a contiguous range of the address space
// for scavenging. The maximum amount of space it reserves is proportional
// to the size of the heap. The ranges are reserved from the high addresses
// first.
//
// Returns the reserved range and the scavenge generation number for it.
//
// p.mheapLock must be held.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
	assertLockHeld(p.mheapLock)

	// Start by reserving the minimum.
	r := p.scav.inUse.removeLast(p.scav.reservationBytes)

	// Return early if the size is zero; we don't want to use
	// the bogus address below.
	if r.size() == 0 {
		return r, p.scav.gen
	}

	// The scavenger requires that base be aligned to a
	// palloc chunk because that's the unit of operation for
	// the scavenger, so align down, potentially extending
	// the range.
	newBase := alignDown(r.base.addr(), pallocChunkBytes)

	// Remove from inUse however much extra we just pulled out.
	p.scav.inUse.removeGreaterEqual(newBase)
	r.base = offAddr{newBase}
	return r, p.scav.gen
}

// scavengeUnreserve returns an unscavenged portion of a range that was
// previously reserved with scavengeReserve.
//
// p.mheapLock must be held.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
	assertLockHeld(p.mheapLock)

	if r.size() == 0 || gen != p.scav.gen {
		return
	}
	if r.base.addr()%pallocChunkBytes != 0 {
		throw("unreserving unaligned region")
	}
	p.scav.inUse.add(r)
}

// scavengeOne walks over address range work until it finds
// a contiguous run of pages to scavenge. It will try to scavenge
// at most max bytes at once, but may scavenge more to avoid
// breaking huge pages. Once it scavenges some memory it returns
// how much it scavenged in bytes.
//
// Returns the number of bytes scavenged and the part of work
// which was not yet searched.
//
// work's base address must be aligned to pallocChunkBytes.
//
// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
	assertLockHeld(p.mheapLock)

	// Defensively check if we've received an empty address range.
	// If so, just return.
	if work.size() == 0 {
		// Nothing to do.
		return 0, work
	}
	// Check the prerequisites of work.
	if work.base.addr()%pallocChunkBytes != 0 {
		throw("scavengeOne called with unaligned work region")
	}
	// Calculate the maximum number of pages to scavenge.
	//
	// This should be alignUp(max, pageSize) / pageSize but max can and will
	// be ^uintptr(0), so we need to be very careful not to overflow here.
	// Rather than use alignUp, calculate the number of pages rounded down
	// first, then add back one if necessary.
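	// (For example, max = 5*pageSize + 1 yields maxPages = 6.)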
	maxPages := max / pageSize
	if max%pageSize != 0 {
		maxPages++
	}

	// Calculate the minimum number of pages we can scavenge.
	//
	// Because we can only scavenge whole physical pages, we must
	// ensure that we scavenge at least minPages each time, aligned
	// to minPages*pageSize.
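	// (For example, with 8 KiB runtime pages and 16 KiB physical pages,
	// minPages is 2.)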
	minPages := physPageSize / pageSize
	if minPages < 1 {
		minPages = 1
	}

	// Helpers for locking and unlocking only if mayUnlock == true.
	lockHeap := func() {
		if mayUnlock {
			lock(p.mheapLock)
		}
	}
	unlockHeap := func() {
		if mayUnlock {
			unlock(p.mheapLock)
		}
	}

	// Fast path: check the chunk containing the top-most address in work,
	// starting at that address's page index in the chunk.
	//
	// Note that work.end() is exclusive, so get the chunk we care about
	// by subtracting 1.
	maxAddr := work.limit.addr() - 1
	maxChunk := chunkIndex(maxAddr)
	if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
		// We only bother looking for a candidate if there are at least
		// minPages free pages at all.
		base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)

		// If we found something, scavenge it and return!
		if npages != 0 {
			work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}

			assertLockHeld(p.mheapLock) // Must be locked on return.
			return uintptr(npages) * pageSize, work
		}
	}
	// Update the limit to reflect the fact that we checked maxChunk already.
	work.limit = offAddr{chunkBase(maxChunk)}

	// findCandidate finds the next scavenge candidate in work optimistically.
	//
	// Returns the candidate chunk index and true on success, and false on failure.
	//
	// The heap need not be locked.
	findCandidate := func(work addrRange) (chunkIdx, bool) {
		// Iterate over this work's chunks.
		for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- {
			// If this chunk is totally in-use or has no unscavenged pages, don't bother
			// doing a more sophisticated check.
			//
			// Note we're accessing the summary and the chunks without a lock, but
			// that's fine. We're being optimistic anyway.

			// Check quickly if there are enough free pages at all.
			if p.summary[len(p.summary)-1][i].max() < uint(minPages) {
				continue
			}

			// Run over the chunk looking harder for a candidate. Again, we could
			// race with a lot of different pieces of code, but we're just being
			// optimistic. Make sure we load the l2 pointer atomically though, to
			// avoid races with heap growth. It may or may not be possible to also
			// see a nil pointer in this case if we do race with heap growth, but
			// just defensively ignore the nils. This operation is optimistic anyway.
			l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()])))
			if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) {
				return i, true
			}
		}
		return 0, false
	}

	// Slow path: iterate optimistically over the in-use address space
	// looking for any free and unscavenged page. If we think we see something,
	// lock and verify it!
	for work.size() != 0 {
		unlockHeap()

		// Search for the candidate.
		candidateChunkIdx, ok := findCandidate(work)

		// Lock the heap. We need to do this now whether or not we found a candidate.
		// If we did, we'll verify it. If not, we need to lock before returning
		// anyway.
		lockHeap()

		if !ok {
			// We didn't find a candidate, so we're done.
			work.limit = work.base
			break
		}

		// Find, verify, and scavenge if we can.
		chunk := p.chunkOf(candidateChunkIdx)
		base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
		if npages > 0 {
			work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}

			assertLockHeld(p.mheapLock) // Must be locked on return.
			return uintptr(npages) * pageSize, work
		}

		// We were fooled, so let's continue from where we left off.
		work.limit = offAddr{chunkBase(candidateChunkIdx)}
	}

	assertLockHeld(p.mheapLock) // Must be locked on return.
	return 0, work
}

// scavengeRangeLocked scavenges the given region of memory.
// The region of memory is described by its chunk index (ci),
// the starting page index of the region relative to that
// chunk (base), and the length of the region in pages (npages).
//
// Returns the base address of the scavenged region.
//
// p.mheapLock must be held.
func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
	assertLockHeld(p.mheapLock)

	p.chunkOf(ci).scavenged.setRange(base, npages)

	// Compute the full address for the start of the range.
	addr := chunkBase(ci) + uintptr(base)*pageSize

	// Update the scavenge low watermark.
	if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
		p.scav.scavLWM = oAddr
	}

	// Only perform the actual scavenging if we're not in a test.
	// It's dangerous to do so otherwise.
	if p.test {
		return addr
	}
	sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

	// Update global accounting only when not in test, otherwise
	// the runtime's accounting will be wrong.
	nbytes := int64(npages) * pageSize
	atomic.Xadd64(&memstats.heap_released, nbytes)

	// Update consistent accounting too.
	stats := memstats.heapStats.acquire()
	atomic.Xaddint64(&stats.committed, -nbytes)
	atomic.Xaddint64(&stats.released, nbytes)
	memstats.heapStats.release()

	return addr
}

// fillAligned returns x but with all zeroes in m-aligned
// groups of m bits set to 1 if any bit in the group is non-zero.
//
// For example, fillAligned(0x0100a3, 8) == 0xff00ff.
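// Another worked example: fillAligned(0x8000, 16) == 0xffff, since only the
// lowest 16-bit group contains a set bit.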
//
// Note that if m == 1, this is a no-op.
//
// m must be a power of 2 <= maxPagesPerPhysPage.
func fillAligned(x uint64, m uint) uint64 {
	apply := func(x uint64, c uint64) uint64 {
		// The technique used here is derived from
		// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
		// and extended for more than just bytes (like nibbles
		// and uint16s) by using an appropriate constant.
		//
		// To summarize the technique, quoting from that page:
		// "[It] works by first zeroing the high bits of the [8]
		// bytes in the word. Subsequently, it adds a number that
		// will result in an overflow to the high bit of a byte if
		// any of the low bits were initially set. Next the high
		// bits of the original word are ORed with these values;
		// thus, the high bit of a byte is set iff any bit in the
		// byte was set. Finally, we determine if any of these high
		// bits are zero by ORing with ones everywhere except the
		// high bits and inverting the result."
		return ^((((x & c) + c) | x) | c)
	}
	// Transform x to contain a 1 bit at the top of each m-aligned
	// group of m zero bits.
	switch m {
	case 1:
		return x
	case 2:
		x = apply(x, 0x5555555555555555)
	case 4:
		x = apply(x, 0x7777777777777777)
	case 8:
		x = apply(x, 0x7f7f7f7f7f7f7f7f)
	case 16:
		x = apply(x, 0x7fff7fff7fff7fff)
	case 32:
		x = apply(x, 0x7fffffff7fffffff)
	case 64: // == maxPagesPerPhysPage
		x = apply(x, 0x7fffffffffffffff)
	default:
		throw("bad m value")
	}
	// Now, the top bit of each m-aligned group in x is set iff
	// that group was all zero in the original x.

	// From each group of m bits subtract 1.
	// Because we know only the top bits of each
	// m-aligned group are set, we know this will
	// set each group to have all the bits set except
	// the top bit, so just OR with the original
	// result to set all the bits.
	return ^((x - (x >> (m - 1))) | x)
}

// hasScavengeCandidate returns true if there's any min-page-aligned groups of
// min pages of free-and-unscavenged memory in the region represented by this
// pallocData.
//
// min must be a non-zero power of 2 <= maxPagesPerPhysPage.
func (m *pallocData) hasScavengeCandidate(min uintptr) bool {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}

	// The goal of this search is to see if the chunk contains any free and unscavenged memory.
	for i := len(m.scavenged) - 1; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		//
		// TODO(mknyszek): Consider splitting up fillAligned into two
		// functions, since here we technically could get by with just
		// the first half of its computation. It'll save a few instructions
		// but adds some additional code complexity.
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))

		// Quickly skip over chunks of non-free or scavenged pages.
		if x != ^uint64(0) {
			return true
		}
	}
	return false
}

// findScavengeCandidate returns a start index and a size for this pallocData
// segment which represents a contiguous region of free and unscavenged memory.
//
// searchIdx indicates the page index within this chunk to start the search, but
// note that findScavengeCandidate searches backwards through the pallocData. As a
// result, it will return the highest scavenge candidate in address order.
//
// min indicates a hard minimum size and alignment for runs of pages. That is,
// findScavengeCandidate will not return a region smaller than min pages in size,
// or that is min pages or greater in size but not aligned to min. min must be
// a non-zero power of 2 <= maxPagesPerPhysPage.
//
// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then
// findScavengeCandidate effectively returns entire free and unscavenged regions.
// If max < pallocChunkPages, it may truncate the returned region such that size is
// max. However, findScavengeCandidate may still return a larger region if, for
// example, it chooses to preserve huge pages, or if max is not aligned to min (it
// will round up). That is, even if max is small, the returned size is not guaranteed
// to be equal to max. max is allowed to be less than min, in which case it is as if
// max == min.
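//
// As a worked example (not derived from real heap state): if pages 10 through 17
// of a chunk are free and unscavenged and every page above them is allocated,
// then findScavengeCandidate(pallocChunkPages-1, 1, 4) returns (14, 4), the
// highest such run of at most max pages, assuming no huge-page adjustment applies.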
func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}
	// max may not be min-aligned, so we might accidentally truncate to
	// a max value which causes us to return a non-min-aligned value.
	// To prevent this, align max up to a multiple of min (which is always
	// a power of 2). This also prevents max from ever being less than
	// min, unless it's zero, so handle that explicitly.
	if max == 0 {
		max = min
	} else {
		max = alignUp(max, min)
	}

	i := int(searchIdx / 64)
	// Start by quickly skipping over blocks of non-free or scavenged pages.
	for ; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
		if x != ^uint64(0) {
			break
		}
	}
	if i < 0 {
		// Failed to find any free/unscavenged pages.
		return 0, 0
	}
	// We have something in the 64-bit chunk at i, but it could
	// extend further. Loop until we find the extent of it.

	// 1s are scavenged OR non-free => 0s are unscavenged AND free
	x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
	z1 := uint(sys.LeadingZeros64(^x))
	run, end := uint(0), uint(i)*64+(64-z1)
	if x<<z1 != 0 {
		// After shifting out z1 bits, we still have 1s,
		// so the run ends inside this word.
		run = uint(sys.LeadingZeros64(x << z1))
	} else {
		// After shifting out z1 bits, we have no more 1s.
		// This means the run extends to the bottom of the
		// word so it may extend into further words.
		run = 64 - z1
		for j := i - 1; j >= 0; j-- {
			x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min))
			run += uint(sys.LeadingZeros64(x))
			if x != 0 {
				// The run stopped in this word.
				break
			}
		}
	}

	// Split the run we found if it's larger than max but hold on to
	// our original length, since we may need it later.
	size := run
	if size > uint(max) {
		size = uint(max)
	}
	start := end - size

	// Each huge page is guaranteed to fit in a single palloc chunk.
	//
	// TODO(mknyszek): Support larger huge page sizes.
	// TODO(mknyszek): Consider taking pages-per-huge-page as a parameter
	// so we can write tests for this.
	if physHugePageSize > pageSize && physHugePageSize > physPageSize {
		// We have huge pages, so let's ensure we don't break one by scavenging
		// over a huge page boundary. If the range [start, start+size) overlaps with
		// a free-and-unscavenged huge page, we want to grow the region we scavenge
		// to include that huge page.

		// Compute the huge page boundary above our candidate.
		pagesPerHugePage := uintptr(physHugePageSize / pageSize)
		hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage))

		// If that boundary is within our current candidate, then we may be breaking
		// a huge page.
		if hugePageAbove <= end {
			// Compute the huge page boundary below our candidate.
			hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage))

			if hugePageBelow >= end-run {
				// We're in danger of breaking apart a huge page since start+size crosses
				// a huge page boundary and rounding down start to the nearest huge
				// page boundary is included in the full run we found. Include the entire
				// huge page in the bound by rounding down to the huge page size.
				size = size + (start - hugePageBelow)
				start = hugePageBelow
			}
		}
	}
	return start, size
}