1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Garbage collector (GC).
6//
7// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
8// GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
9// non-generational and non-compacting. Allocation is done using size segregated per P allocation
10// areas to minimize fragmentation while eliminating locks in the common case.
11//
12// The algorithm decomposes into several steps.
13// This is a high level description of the algorithm being used. For an overview of GC a good
14// place to start is Richard Jones' gchandbook.org.
15//
16// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
17// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
18// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
19// 966-975.
20// For journal quality proofs that these steps are complete, correct, and terminate see
21// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
22// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
23//
24// 1. GC performs sweep termination.
25//
26//    a. Stop the world. This causes all Ps to reach a GC safe-point.
27//
28//    b. Sweep any unswept spans. There will only be unswept spans if
29//    this GC cycle was forced before the expected time.
30//
31// 2. GC performs the mark phase.
32//
33//    a. Prepare for the mark phase by setting gcphase to _GCmark
34//    (from _GCoff), enabling the write barrier, enabling mutator
35//    assists, and enqueueing root mark jobs. No objects may be
36//    scanned until all Ps have enabled the write barrier, which is
37//    accomplished using STW.
38//
39//    b. Start the world. From this point, GC work is done by mark
40//    workers started by the scheduler and by assists performed as
41//    part of allocation. The write barrier shades both the
42//    overwritten pointer and the new pointer value for any pointer
43//    writes (see mbarrier.go for details). Newly allocated objects
44//    are immediately marked black.
45//
46//    c. GC performs root marking jobs. This includes scanning all
47//    stacks, shading all globals, and shading any heap pointers in
48//    off-heap runtime data structures. Scanning a stack stops a
49//    goroutine, shades any pointers found on its stack, and then
50//    resumes the goroutine.
51//
52//    d. GC drains the work queue of grey objects, scanning each grey
53//    object to black and shading all pointers found in the object
54//    (which in turn may add those pointers to the work queue).
55//
56//    e. Because GC work is spread across local caches, GC uses a
57//    distributed termination algorithm to detect when there are no
58//    more root marking jobs or grey objects (see gcMarkDone). At this
59//    point, GC transitions to mark termination.
60//
61// 3. GC performs mark termination.
62//
63//    a. Stop the world.
64//
65//    b. Set gcphase to _GCmarktermination, and disable workers and
66//    assists.
67//
68//    c. Perform housekeeping like flushing mcaches.
69//
70// 4. GC performs the sweep phase.
71//
72//    a. Prepare for the sweep phase by setting gcphase to _GCoff,
73//    setting up sweep state and disabling the write barrier.
74//
75//    b. Start the world. From this point on, newly allocated objects
76//    are white, and allocating sweeps spans before use if necessary.
77//
78//    c. GC does concurrent sweeping in the background and in response
79//    to allocation. See description below.
80//
81// 5. When sufficient allocation has taken place, replay the sequence
82// starting with 1 above. See discussion of GC rate below.
83
84// Concurrent sweep.
85//
86// The sweep phase proceeds concurrently with normal program execution.
87// The heap is swept span-by-span both lazily (when a goroutine needs another span)
88// and concurrently in a background goroutine (this helps programs that are not CPU bound).
89// At the end of STW mark termination all spans are marked as "needs sweeping".
90//
91// The background sweeper goroutine simply sweeps spans one-by-one.
92//
93// To avoid requesting more OS memory while there are unswept spans, when a
94// goroutine needs another span, it first attempts to reclaim that much memory
95// by sweeping. When a goroutine needs to allocate a new small-object span, it
96// sweeps small-object spans for the same object size until it frees at least
97// one object. When a goroutine needs to allocate large-object span from heap,
98// it sweeps spans until it frees at least that many pages into heap. There is
99// one case where this may not suffice: if a goroutine sweeps and frees two
100// nonadjacent one-page spans to the heap, it will allocate a new two-page
101// span, but there can still be other one-page unswept spans which could be
102// combined into a two-page span.
103//
104// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
105// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
106// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
107// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
108// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
109// The finalizer goroutine is kicked off only when all spans are swept.
110// When the next GC starts, it sweeps all not-yet-swept spans (if any).
111
112// GC rate.
113// Next GC is after we've allocated an extra amount of memory proportional to
114// the amount already in use. The proportion is controlled by GOGC environment variable
115// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
116// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
117// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
118// (and also the amount of extra memory used).
119
120// Oblets
121//
122// In order to prevent long pauses while scanning large objects and to
123// improve parallelism, the garbage collector breaks up scan jobs for
124// objects larger than maxObletBytes into "oblets" of at most
125// maxObletBytes. When scanning encounters the beginning of a large
126// object, it scans only the first oblet and enqueues the remaining
127// oblets as new scan jobs.
128
129package runtime
130
131import (
132	"internal/cpu"
133	"runtime/internal/atomic"
134	"unsafe"
135)
136
137const (
138	_DebugGC         = 0
139	_ConcurrentSweep = true
140	_FinBlockSize    = 4 * 1024
141
142	// debugScanConservative enables debug logging for stack
143	// frames that are scanned conservatively.
144	debugScanConservative = false
145
146	// sweepMinHeapDistance is a lower bound on the heap distance
147	// (in bytes) reserved for concurrent sweeping between GC
148	// cycles.
149	sweepMinHeapDistance = 1024 * 1024
150)
151
152// heapminimum is the minimum heap size at which to trigger GC.
153// For small heaps, this overrides the usual GOGC*live set rule.
154//
155// When there is a very small live set but a lot of allocation, simply
156// collecting when the heap reaches GOGC*live results in many GC
157// cycles and high total per-GC overhead. This minimum amortizes this
158// per-GC overhead while keeping the heap reasonably small.
159//
160// During initialization this is set to 4MB*GOGC/100. In the case of
161// GOGC==0, this will set heapminimum to 0, resulting in constant
162// collection even when the heap size is small, which is useful for
163// debugging.
164var heapminimum uint64 = defaultHeapMinimum
165
166// defaultHeapMinimum is the value of heapminimum for GOGC==100.
167const defaultHeapMinimum = 4 << 20
168
169// Initialized from $GOGC.  GOGC=off means no GC.
170var gcpercent int32
171
172func gcinit() {
173	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
174		throw("size of Workbuf is suboptimal")
175	}
176
177	// No sweep on the first cycle.
178	mheap_.sweepdone = 1
179
180	// Set a reasonable initial GC trigger.
181	memstats.triggerRatio = 7 / 8.0
182
183	// Fake a heap_marked value so it looks like a trigger at
184	// heapminimum is the appropriate growth from heap_marked.
185	// This will go into computing the initial GC goal.
186	memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
187
188	// Set gcpercent from the environment. This will also compute
189	// and set the GC trigger and goal.
190	_ = setGCPercent(readgogc())
191
192	work.startSema = 1
193	work.markDoneSema = 1
194	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
195	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
196	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
197}
198
199func readgogc() int32 {
200	p := gogetenv("GOGC")
201	if p == "off" {
202		return -1
203	}
204	if n, ok := atoi32(p); ok {
205		return n
206	}
207	return 100
208}
209
210// gcenable is called after the bulk of the runtime initialization,
211// just before we're about to start letting user code run.
212// It kicks off the background sweeper goroutine, the background
213// scavenger goroutine, and enables GC.
214func gcenable() {
215	// Kick off sweeping and scavenging.
216	c := make(chan int, 2)
217	expectSystemGoroutine()
218	go bgsweep(c)
219	expectSystemGoroutine()
220	go bgscavenge(c)
221	<-c
222	<-c
223	memstats.enablegc = true // now that runtime is initialized, GC is okay
224}
225
226//go:linkname setGCPercent runtime_1debug.setGCPercent
227func setGCPercent(in int32) (out int32) {
228	// Run on the system stack since we grab the heap lock.
229	systemstack(func() {
230		lock(&mheap_.lock)
231		out = gcpercent
232		if in < 0 {
233			in = -1
234		}
235		gcpercent = in
236		heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
237		// Update pacing in response to gcpercent change.
238		gcSetTriggerRatio(memstats.triggerRatio)
239		unlock(&mheap_.lock)
240	})
241
242	// If we just disabled GC, wait for any concurrent GC mark to
243	// finish so we always return with no GC running.
244	if in < 0 {
245		gcWaitOnMark(atomic.Load(&work.cycles))
246	}
247
248	return out
249}
250
251// Garbage collector phase.
252// Indicates to write barrier and synchronization task to perform.
253var gcphase uint32
254
255// The compiler knows about this variable.
256// If you change it, you must change gofrontend/wb.cc, too.
257// If you change the first four bytes, you must also change the write
258// barrier insertion code.
259var writeBarrier struct {
260	enabled bool    // compiler emits a check of this before calling write barrier
261	pad     [3]byte // compiler uses 32-bit load for "enabled" field
262	needed  bool    // whether we need a write barrier for current GC phase
263	cgo     bool    // whether we need a write barrier for a cgo check
264	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
265}
266
267// gcBlackenEnabled is 1 if mutator assists and background mark
268// workers are allowed to blacken objects. This must only be set when
269// gcphase == _GCmark.
270var gcBlackenEnabled uint32
271
272const (
273	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
274	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
275	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
276)
277
278//go:nosplit
279func setGCPhase(x uint32) {
280	atomic.Store(&gcphase, x)
281	writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
282	writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
283}
284
285// gcMarkWorkerMode represents the mode that a concurrent mark worker
286// should operate in.
287//
288// Concurrent marking happens through four different mechanisms. One
289// is mutator assists, which happen in response to allocations and are
290// not scheduled. The other three are variations in the per-P mark
291// workers and are distinguished by gcMarkWorkerMode.
292type gcMarkWorkerMode int
293
294const (
295	// gcMarkWorkerNotWorker indicates that the next scheduled G is not
296	// starting work and the mode should be ignored.
297	gcMarkWorkerNotWorker gcMarkWorkerMode = iota
298
299	// gcMarkWorkerDedicatedMode indicates that the P of a mark
300	// worker is dedicated to running that mark worker. The mark
301	// worker should run without preemption.
302	gcMarkWorkerDedicatedMode
303
304	// gcMarkWorkerFractionalMode indicates that a P is currently
305	// running the "fractional" mark worker. The fractional worker
306	// is necessary when GOMAXPROCS*gcBackgroundUtilization is not
307	// an integer. The fractional worker should run until it is
308	// preempted and will be scheduled to pick up the fractional
309	// part of GOMAXPROCS*gcBackgroundUtilization.
310	gcMarkWorkerFractionalMode
311
312	// gcMarkWorkerIdleMode indicates that a P is running the mark
313	// worker because it has nothing else to do. The idle worker
314	// should run until it is preempted and account its time
315	// against gcController.idleMarkTime.
316	gcMarkWorkerIdleMode
317)
318
319// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
320// to use in execution traces.
321var gcMarkWorkerModeStrings = [...]string{
322	"Not worker",
323	"GC (dedicated)",
324	"GC (fractional)",
325	"GC (idle)",
326}
327
328// gcController implements the GC pacing controller that determines
329// when to trigger concurrent garbage collection and how much marking
330// work to do in mutator assists and background marking.
331//
332// It uses a feedback control algorithm to adjust the memstats.gc_trigger
333// trigger based on the heap growth and GC CPU utilization each cycle.
334// This algorithm optimizes for heap growth to match GOGC and for CPU
335// utilization between assist and background marking to be 25% of
336// GOMAXPROCS. The high-level design of this algorithm is documented
337// at https://golang.org/s/go15gcpacing.
338//
339// All fields of gcController are used only during a single mark
340// cycle.
341var gcController gcControllerState
342
343type gcControllerState struct {
344	// scanWork is the total scan work performed this cycle. This
345	// is updated atomically during the cycle. Updates occur in
346	// bounded batches, since it is both written and read
347	// throughout the cycle. At the end of the cycle, this is how
348	// much of the retained heap is scannable.
349	//
350	// Currently this is the bytes of heap scanned. For most uses,
351	// this is an opaque unit of work, but for estimation the
352	// definition is important.
353	scanWork int64
354
355	// bgScanCredit is the scan work credit accumulated by the
356	// concurrent background scan. This credit is accumulated by
357	// the background scan and stolen by mutator assists. This is
358	// updated atomically. Updates occur in bounded batches, since
359	// it is both written and read throughout the cycle.
360	bgScanCredit int64
361
362	// assistTime is the nanoseconds spent in mutator assists
363	// during this cycle. This is updated atomically. Updates
364	// occur in bounded batches, since it is both written and read
365	// throughout the cycle.
366	assistTime int64
367
368	// dedicatedMarkTime is the nanoseconds spent in dedicated
369	// mark workers during this cycle. This is updated atomically
370	// at the end of the concurrent mark phase.
371	dedicatedMarkTime int64
372
373	// fractionalMarkTime is the nanoseconds spent in the
374	// fractional mark worker during this cycle. This is updated
375	// atomically throughout the cycle and will be up-to-date if
376	// the fractional mark worker is not currently running.
377	fractionalMarkTime int64
378
379	// idleMarkTime is the nanoseconds spent in idle marking
380	// during this cycle. This is updated atomically throughout
381	// the cycle.
382	idleMarkTime int64
383
384	// markStartTime is the absolute start time in nanoseconds
385	// that assists and background mark workers started.
386	markStartTime int64
387
388	// dedicatedMarkWorkersNeeded is the number of dedicated mark
389	// workers that need to be started. This is computed at the
390	// beginning of each cycle and decremented atomically as
391	// dedicated mark workers get started.
392	dedicatedMarkWorkersNeeded int64
393
394	// assistWorkPerByte is the ratio of scan work to allocated
395	// bytes that should be performed by mutator assists. This is
396	// computed at the beginning of each cycle and updated every
397	// time heap_scan is updated.
398	//
399	// Stored as a uint64, but it's actually a float64. Use
400	// float64frombits to get the value.
401	//
402	// Read and written atomically.
403	assistWorkPerByte uint64
404
405	// assistBytesPerWork is 1/assistWorkPerByte.
406	//
407	// Stored as a uint64, but it's actually a float64. Use
408	// float64frombits to get the value.
409	//
410	// Read and written atomically.
411	//
412	// Note that because this is read and written independently
413	// from assistWorkPerByte users may notice a skew between
414	// the two values, and such a state should be safe.
415	assistBytesPerWork uint64
416
417	// fractionalUtilizationGoal is the fraction of wall clock
418	// time that should be spent in the fractional mark worker on
419	// each P that isn't running a dedicated worker.
420	//
421	// For example, if the utilization goal is 25% and there are
422	// no dedicated workers, this will be 0.25. If the goal is
423	// 25%, there is one dedicated worker, and GOMAXPROCS is 5,
424	// this will be 0.05 to make up the missing 5%.
425	//
426	// If this is zero, no fractional workers are needed.
427	fractionalUtilizationGoal float64
428
429	_ cpu.CacheLinePad
430}
431
432// startCycle resets the GC controller's state and computes estimates
433// for a new GC cycle. The caller must hold worldsema and the world
434// must be stopped.
435func (c *gcControllerState) startCycle() {
436	c.scanWork = 0
437	c.bgScanCredit = 0
438	c.assistTime = 0
439	c.dedicatedMarkTime = 0
440	c.fractionalMarkTime = 0
441	c.idleMarkTime = 0
442
443	// Ensure that the heap goal is at least a little larger than
444	// the current live heap size. This may not be the case if GC
445	// start is delayed or if the allocation that pushed heap_live
446	// over gc_trigger is large or if the trigger is really close to
447	// GOGC. Assist is proportional to this distance, so enforce a
448	// minimum distance, even if it means going over the GOGC goal
449	// by a tiny bit.
450	if memstats.next_gc < memstats.heap_live+1024*1024 {
451		memstats.next_gc = memstats.heap_live + 1024*1024
452	}
453
454	// Compute the background mark utilization goal. In general,
455	// this may not come out exactly. We round the number of
456	// dedicated workers so that the utilization is closest to
457	// 25%. For small GOMAXPROCS, this would introduce too much
458	// error, so we add fractional workers in that case.
459	totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
460	c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
461	utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
462	const maxUtilError = 0.3
463	if utilError < -maxUtilError || utilError > maxUtilError {
464		// Rounding put us more than 30% off our goal. With
465		// gcBackgroundUtilization of 25%, this happens for
466		// GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
467		// workers to compensate.
468		if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
469			// Too many dedicated workers.
470			c.dedicatedMarkWorkersNeeded--
471		}
472		c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
473	} else {
474		c.fractionalUtilizationGoal = 0
475	}
476
477	// In STW mode, we just want dedicated workers.
478	if debug.gcstoptheworld > 0 {
479		c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
480		c.fractionalUtilizationGoal = 0
481	}
482
483	// Clear per-P state
484	for _, p := range allp {
485		p.gcAssistTime = 0
486		p.gcFractionalMarkTime = 0
487	}
488
489	// Compute initial values for controls that are updated
490	// throughout the cycle.
491	c.revise()
492
493	if debug.gcpacertrace > 0 {
494		assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
495		print("pacer: assist ratio=", assistRatio,
496			" (scan ", memstats.heap_scan>>20, " MB in ",
497			work.initialHeapLive>>20, "->",
498			memstats.next_gc>>20, " MB)",
499			" workers=", c.dedicatedMarkWorkersNeeded,
500			"+", c.fractionalUtilizationGoal, "\n")
501	}
502}
503
504// revise updates the assist ratio during the GC cycle to account for
505// improved estimates. This should be called whenever memstats.heap_scan,
506// memstats.heap_live, or memstats.next_gc is updated. It is safe to
507// call concurrently, but it may race with other calls to revise.
508//
509// The result of this race is that the two assist ratio values may not line
510// up or may be stale. In practice this is OK because the assist ratio
511// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
512// heuristic anyway. Furthermore, no part of the heuristic depends on
513// the two assist ratio values being exact reciprocals of one another, since
514// the two values are used to convert values from different sources.
515//
516// The worst case result of this raciness is that we may miss a larger shift
517// in the ratio (say, if we decide to pace more aggressively against the
518// hard heap goal) but even this "hard goal" is best-effort (see #40460).
519// The dedicated GC should ensure we don't exceed the hard goal by too much
520// in the rare case we do exceed it.
521//
522// It should only be called when gcBlackenEnabled != 0 (because this
523// is when assists are enabled and the necessary statistics are
524// available).
525func (c *gcControllerState) revise() {
526	gcpercent := gcpercent
527	if gcpercent < 0 {
528		// If GC is disabled but we're running a forced GC,
529		// act like GOGC is huge for the below calculations.
530		gcpercent = 100000
531	}
532	live := atomic.Load64(&memstats.heap_live)
533	scan := atomic.Load64(&memstats.heap_scan)
534	work := atomic.Loadint64(&c.scanWork)
535
536	// Assume we're under the soft goal. Pace GC to complete at
537	// next_gc assuming the heap is in steady-state.
538	heapGoal := int64(atomic.Load64(&memstats.next_gc))
539
540	// Compute the expected scan work remaining.
541	//
542	// This is estimated based on the expected
543	// steady-state scannable heap. For example, with
544	// GOGC=100, only half of the scannable heap is
545	// expected to be live, so that's what we target.
546	//
547	// (This is a float calculation to avoid overflowing on
548	// 100*heap_scan.)
549	scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent))
550
551	if int64(live) > heapGoal || work > scanWorkExpected {
552		// We're past the soft goal, or we've already done more scan
553		// work than we expected. Pace GC so that in the worst case it
554		// will complete by the hard goal.
555		const maxOvershoot = 1.1
556		heapGoal = int64(float64(heapGoal) * maxOvershoot)
557
558		// Compute the upper bound on the scan work remaining.
559		scanWorkExpected = int64(scan)
560	}
561
562	// Compute the remaining scan work estimate.
563	//
564	// Note that we currently count allocations during GC as both
565	// scannable heap (heap_scan) and scan work completed
566	// (scanWork), so allocation will change this difference
567	// slowly in the soft regime and not at all in the hard
568	// regime.
569	scanWorkRemaining := scanWorkExpected - work
570	if scanWorkRemaining < 1000 {
571		// We set a somewhat arbitrary lower bound on
572		// remaining scan work since if we aim a little high,
573		// we can miss by a little.
574		//
575		// We *do* need to enforce that this is at least 1,
576		// since marking is racy and double-scanning objects
577		// may legitimately make the remaining scan work
578		// negative, even in the hard goal regime.
579		scanWorkRemaining = 1000
580	}
581
582	// Compute the heap distance remaining.
583	heapRemaining := heapGoal - int64(live)
584	if heapRemaining <= 0 {
585		// This shouldn't happen, but if it does, avoid
586		// dividing by zero or setting the assist negative.
587		heapRemaining = 1
588	}
589
590	// Compute the mutator assist ratio so by the time the mutator
591	// allocates the remaining heap bytes up to next_gc, it will
592	// have done (or stolen) the remaining amount of scan work.
593	// Note that the assist ratio values are updated atomically
594	// but not together. This means there may be some degree of
595	// skew between the two values. This is generally OK as the
596	// values shift relatively slowly over the course of a GC
597	// cycle.
598	assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
599	assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
600	atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
601	atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
602}
603
604// endCycle computes the trigger ratio for the next cycle.
605func (c *gcControllerState) endCycle() float64 {
606	if work.userForced {
607		// Forced GC means this cycle didn't start at the
608		// trigger, so where it finished isn't good
609		// information about how to adjust the trigger.
610		// Just leave it where it is.
611		return memstats.triggerRatio
612	}
613
614	// Proportional response gain for the trigger controller. Must
615	// be in [0, 1]. Lower values smooth out transient effects but
616	// take longer to respond to phase changes. Higher values
617	// react to phase changes quickly, but are more affected by
618	// transient changes. Values near 1 may be unstable.
619	const triggerGain = 0.5
620
621	// Compute next cycle trigger ratio. First, this computes the
622	// "error" for this cycle; that is, how far off the trigger
623	// was from what it should have been, accounting for both heap
624	// growth and GC CPU utilization. We compute the actual heap
625	// growth during this cycle and scale that by how far off from
626	// the goal CPU utilization we were (to estimate the heap
627	// growth if we had the desired CPU utilization). The
628	// difference between this estimate and the GOGC-based goal
629	// heap growth is the error.
630	goalGrowthRatio := gcEffectiveGrowthRatio()
631	actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
632	assistDuration := nanotime() - c.markStartTime
633
634	// Assume background mark hit its utilization goal.
635	utilization := gcBackgroundUtilization
636	// Add assist utilization; avoid divide by zero.
637	if assistDuration > 0 {
638		utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
639	}
640
641	triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
642
643	// Finally, we adjust the trigger for next time by this error,
644	// damped by the proportional gain.
645	triggerRatio := memstats.triggerRatio + triggerGain*triggerError
646
647	if debug.gcpacertrace > 0 {
648		// Print controller state in terms of the design
649		// document.
650		H_m_prev := memstats.heap_marked
651		h_t := memstats.triggerRatio
652		H_T := memstats.gc_trigger
653		h_a := actualGrowthRatio
654		H_a := memstats.heap_live
655		h_g := goalGrowthRatio
656		H_g := int64(float64(H_m_prev) * (1 + h_g))
657		u_a := utilization
658		u_g := gcGoalUtilization
659		W_a := c.scanWork
660		print("pacer: H_m_prev=", H_m_prev,
661			" h_t=", h_t, " H_T=", H_T,
662			" h_a=", h_a, " H_a=", H_a,
663			" h_g=", h_g, " H_g=", H_g,
664			" u_a=", u_a, " u_g=", u_g,
665			" W_a=", W_a,
666			" goalΔ=", goalGrowthRatio-h_t,
667			" actualΔ=", h_a-h_t,
668			" u_a/u_g=", u_a/u_g,
669			"\n")
670	}
671
672	return triggerRatio
673}
674
675// enlistWorker encourages another dedicated mark worker to start on
676// another P if there are spare worker slots. It is used by putfull
677// when more work is made available.
678//
679//go:nowritebarrier
680func (c *gcControllerState) enlistWorker() {
681	// If there are idle Ps, wake one so it will run an idle worker.
682	// NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
683	//
684	//	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
685	//		wakep()
686	//		return
687	//	}
688
689	// There are no idle Ps. If we need more dedicated workers,
690	// try to preempt a running P so it will switch to a worker.
691	if c.dedicatedMarkWorkersNeeded <= 0 {
692		return
693	}
694	// Pick a random other P to preempt.
695	if gomaxprocs <= 1 {
696		return
697	}
698	gp := getg()
699	if gp == nil || gp.m == nil || gp.m.p == 0 {
700		return
701	}
702	myID := gp.m.p.ptr().id
703	for tries := 0; tries < 5; tries++ {
704		id := int32(fastrandn(uint32(gomaxprocs - 1)))
705		if id >= myID {
706			id++
707		}
708		p := allp[id]
709		if p.status != _Prunning {
710			continue
711		}
712		if preemptone(p) {
713			return
714		}
715	}
716}
717
718// findRunnableGCWorker returns a background mark worker for _p_ if it
719// should be run. This must only be called when gcBlackenEnabled != 0.
720func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
721	if gcBlackenEnabled == 0 {
722		throw("gcControllerState.findRunnable: blackening not enabled")
723	}
724
725	if !gcMarkWorkAvailable(_p_) {
726		// No work to be done right now. This can happen at
727		// the end of the mark phase when there are still
728		// assists tapering off. Don't bother running a worker
729		// now because it'll just return immediately.
730		return nil
731	}
732
733	// Grab a worker before we commit to running below.
734	node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
735	if node == nil {
736		// There is at least one worker per P, so normally there are
737		// enough workers to run on all Ps, if necessary. However, once
738		// a worker enters gcMarkDone it may park without rejoining the
739		// pool, thus freeing a P with no corresponding worker.
740		// gcMarkDone never depends on another worker doing work, so it
741		// is safe to simply do nothing here.
742		//
743		// If gcMarkDone bails out without completing the mark phase,
744		// it will always do so with queued global work. Thus, that P
745		// will be immediately eligible to re-run the worker G it was
746		// just using, ensuring work can complete.
747		return nil
748	}
749
750	decIfPositive := func(ptr *int64) bool {
751		for {
752			v := atomic.Loadint64(ptr)
753			if v <= 0 {
754				return false
755			}
756
757			// TODO: having atomic.Casint64 would be more pleasant.
758			if atomic.Cas64((*uint64)(unsafe.Pointer(ptr)), uint64(v), uint64(v-1)) {
759				return true
760			}
761		}
762	}
763
764	if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
765		// This P is now dedicated to marking until the end of
766		// the concurrent mark phase.
767		_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
768	} else if c.fractionalUtilizationGoal == 0 {
769		// No need for fractional workers.
770		gcBgMarkWorkerPool.push(&node.node)
771		return nil
772	} else {
773		// Is this P behind on the fractional utilization
774		// goal?
775		//
776		// This should be kept in sync with pollFractionalWorkerExit.
777		delta := nanotime() - gcController.markStartTime
778		if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
779			// Nope. No need to run a fractional worker.
780			gcBgMarkWorkerPool.push(&node.node)
781			return nil
782		}
783		// Run a fractional worker.
784		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
785	}
786
787	// Run the background mark worker.
788	gp := node.gp.ptr()
789	casgstatus(gp, _Gwaiting, _Grunnable)
790	if trace.enabled {
791		traceGoUnpark(gp, 0)
792	}
793	return gp
794}
795
796// pollFractionalWorkerExit reports whether a fractional mark worker
797// should self-preempt. It assumes it is called from the fractional
798// worker.
799func pollFractionalWorkerExit() bool {
800	// This should be kept in sync with the fractional worker
801	// scheduler logic in findRunnableGCWorker.
802	now := nanotime()
803	delta := now - gcController.markStartTime
804	if delta <= 0 {
805		return true
806	}
807	p := getg().m.p.ptr()
808	selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
809	// Add some slack to the utilization goal so that the
810	// fractional worker isn't behind again the instant it exits.
811	return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
812}
813
814// gcSetTriggerRatio sets the trigger ratio and updates everything
815// derived from it: the absolute trigger, the heap goal, mark pacing,
816// and sweep pacing.
817//
818// This can be called any time. If GC is the in the middle of a
819// concurrent phase, it will adjust the pacing of that phase.
820//
821// This depends on gcpercent, memstats.heap_marked, and
822// memstats.heap_live. These must be up to date.
823//
824// mheap_.lock must be held or the world must be stopped.
825func gcSetTriggerRatio(triggerRatio float64) {
826	assertWorldStoppedOrLockHeld(&mheap_.lock)
827
828	// Compute the next GC goal, which is when the allocated heap
829	// has grown by GOGC/100 over the heap marked by the last
830	// cycle.
831	goal := ^uint64(0)
832	if gcpercent >= 0 {
833		goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
834	}
835
836	// Set the trigger ratio, capped to reasonable bounds.
837	if gcpercent >= 0 {
838		scalingFactor := float64(gcpercent) / 100
839		// Ensure there's always a little margin so that the
840		// mutator assist ratio isn't infinity.
841		maxTriggerRatio := 0.95 * scalingFactor
842		if triggerRatio > maxTriggerRatio {
843			triggerRatio = maxTriggerRatio
844		}
845
846		// If we let triggerRatio go too low, then if the application
847		// is allocating very rapidly we might end up in a situation
848		// where we're allocating black during a nearly always-on GC.
849		// The result of this is a growing heap and ultimately an
850		// increase in RSS. By capping us at a point >0, we're essentially
851		// saying that we're OK using more CPU during the GC to prevent
852		// this growth in RSS.
853		//
854		// The current constant was chosen empirically: given a sufficiently
855		// fast/scalable allocator with 48 Ps that could drive the trigger ratio
856		// to <0.05, this constant causes applications to retain the same peak
857		// RSS compared to not having this allocator.
858		minTriggerRatio := 0.6 * scalingFactor
859		if triggerRatio < minTriggerRatio {
860			triggerRatio = minTriggerRatio
861		}
862	} else if triggerRatio < 0 {
863		// gcpercent < 0, so just make sure we're not getting a negative
864		// triggerRatio. This case isn't expected to happen in practice,
865		// and doesn't really matter because if gcpercent < 0 then we won't
866		// ever consume triggerRatio further on in this function, but let's
867		// just be defensive here; the triggerRatio being negative is almost
868		// certainly undesirable.
869		triggerRatio = 0
870	}
871	memstats.triggerRatio = triggerRatio
872
873	// Compute the absolute GC trigger from the trigger ratio.
874	//
875	// We trigger the next GC cycle when the allocated heap has
876	// grown by the trigger ratio over the marked heap size.
877	trigger := ^uint64(0)
878	if gcpercent >= 0 {
879		trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
880		// Don't trigger below the minimum heap size.
881		minTrigger := heapminimum
882		if !isSweepDone() {
883			// Concurrent sweep happens in the heap growth
884			// from heap_live to gc_trigger, so ensure
885			// that concurrent sweep has some heap growth
886			// in which to perform sweeping before we
887			// start the next GC cycle.
888			sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance
889			if sweepMin > minTrigger {
890				minTrigger = sweepMin
891			}
892		}
893		if trigger < minTrigger {
894			trigger = minTrigger
895		}
896		if int64(trigger) < 0 {
897			print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
898			throw("gc_trigger underflow")
899		}
900		if trigger > goal {
901			// The trigger ratio is always less than GOGC/100, but
902			// other bounds on the trigger may have raised it.
903			// Push up the goal, too.
904			goal = trigger
905		}
906	}
907
908	// Commit to the trigger and goal.
909	memstats.gc_trigger = trigger
910	atomic.Store64(&memstats.next_gc, goal)
911	if trace.enabled {
912		traceNextGC()
913	}
914
915	// Update mark pacing.
916	if gcphase != _GCoff {
917		gcController.revise()
918	}
919
920	// Update sweep pacing.
921	if isSweepDone() {
922		mheap_.sweepPagesPerByte = 0
923	} else {
924		// Concurrent sweep needs to sweep all of the in-use
925		// pages by the time the allocated heap reaches the GC
926		// trigger. Compute the ratio of in-use pages to sweep
927		// per byte allocated, accounting for the fact that
928		// some might already be swept.
929		heapLiveBasis := atomic.Load64(&memstats.heap_live)
930		heapDistance := int64(trigger) - int64(heapLiveBasis)
931		// Add a little margin so rounding errors and
932		// concurrent sweep are less likely to leave pages
933		// unswept when GC starts.
934		heapDistance -= 1024 * 1024
935		if heapDistance < _PageSize {
936			// Avoid setting the sweep ratio extremely high
937			heapDistance = _PageSize
938		}
939		pagesSwept := atomic.Load64(&mheap_.pagesSwept)
940		pagesInUse := atomic.Load64(&mheap_.pagesInUse)
941		sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
942		if sweepDistancePages <= 0 {
943			mheap_.sweepPagesPerByte = 0
944		} else {
945			mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
946			mheap_.sweepHeapLiveBasis = heapLiveBasis
947			// Write pagesSweptBasis last, since this
948			// signals concurrent sweeps to recompute
949			// their debt.
950			atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
951		}
952	}
953
954	gcPaceScavenger()
955}
956
957// gcEffectiveGrowthRatio returns the current effective heap growth
958// ratio (GOGC/100) based on heap_marked from the previous GC and
959// next_gc for the current GC.
960//
961// This may differ from gcpercent/100 because of various upper and
962// lower bounds on gcpercent. For example, if the heap is smaller than
963// heapminimum, this can be higher than gcpercent/100.
964//
965// mheap_.lock must be held or the world must be stopped.
966func gcEffectiveGrowthRatio() float64 {
967	assertWorldStoppedOrLockHeld(&mheap_.lock)
968
969	egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
970	if egogc < 0 {
971		// Shouldn't happen, but just in case.
972		egogc = 0
973	}
974	return egogc
975}
976
977// gcGoalUtilization is the goal CPU utilization for
978// marking as a fraction of GOMAXPROCS.
979const gcGoalUtilization = 0.30
980
981// gcBackgroundUtilization is the fixed CPU utilization for background
982// marking. It must be <= gcGoalUtilization. The difference between
983// gcGoalUtilization and gcBackgroundUtilization will be made up by
984// mark assists. The scheduler will aim to use within 50% of this
985// goal.
986//
987// Setting this to < gcGoalUtilization avoids saturating the trigger
988// feedback controller when there are no assists, which allows it to
989// better control CPU and heap growth. However, the larger the gap,
990// the more mutator assists are expected to happen, which impact
991// mutator latency.
992const gcBackgroundUtilization = 0.25
993
994// gcCreditSlack is the amount of scan work credit that can
995// accumulate locally before updating gcController.scanWork and,
996// optionally, gcController.bgScanCredit. Lower values give a more
997// accurate assist ratio and make it more likely that assists will
998// successfully steal background credit. Higher values reduce memory
999// contention.
1000const gcCreditSlack = 2000
1001
1002// gcAssistTimeSlack is the nanoseconds of mutator assist time that
1003// can accumulate on a P before updating gcController.assistTime.
1004const gcAssistTimeSlack = 5000
1005
1006// gcOverAssistWork determines how many extra units of scan work a GC
1007// assist does when an assist happens. This amortizes the cost of an
1008// assist by pre-paying for this many bytes of future allocations.
1009const gcOverAssistWork = 64 << 10
1010
1011var work struct {
1012	full  lfstack          // lock-free list of full blocks workbuf
1013	empty lfstack          // lock-free list of empty blocks workbuf
1014	pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
1015
1016	wbufSpans struct {
1017		lock mutex
1018		// free is a list of spans dedicated to workbufs, but
1019		// that don't currently contain any workbufs.
1020		free mSpanList
1021		// busy is a list of all spans containing workbufs on
1022		// one of the workbuf lists.
1023		busy mSpanList
1024	}
1025
1026	// Restore 64-bit alignment on 32-bit.
1027	_ uint32
1028
1029	// bytesMarked is the number of bytes marked this cycle. This
1030	// includes bytes blackened in scanned objects, noscan objects
1031	// that go straight to black, and permagrey objects scanned by
1032	// markroot during the concurrent scan phase. This is updated
1033	// atomically during the cycle. Updates may be batched
1034	// arbitrarily, since the value is only read at the end of the
1035	// cycle.
1036	//
1037	// Because of benign races during marking, this number may not
1038	// be the exact number of marked bytes, but it should be very
1039	// close.
1040	//
1041	// Put this field here because it needs 64-bit atomic access
1042	// (and thus 8-byte alignment even on 32-bit architectures).
1043	bytesMarked uint64
1044
1045	markrootNext uint32 // next markroot job
1046	markrootJobs uint32 // number of markroot jobs
1047
1048	nproc  uint32
1049	tstart int64
1050	nwait  uint32
1051
1052	// Number of roots of various root types. Set by gcMarkRootPrepare.
1053	nFlushCacheRoots                    int
1054	nDataRoots, nSpanRoots, nStackRoots int
1055
1056	// Each type of GC state transition is protected by a lock.
1057	// Since multiple threads can simultaneously detect the state
1058	// transition condition, any thread that detects a transition
1059	// condition must acquire the appropriate transition lock,
1060	// re-check the transition condition and return if it no
1061	// longer holds or perform the transition if it does.
1062	// Likewise, any transition must invalidate the transition
1063	// condition before releasing the lock. This ensures that each
1064	// transition is performed by exactly one thread and threads
1065	// that need the transition to happen block until it has
1066	// happened.
1067	//
1068	// startSema protects the transition from "off" to mark or
1069	// mark termination.
1070	startSema uint32
1071	// markDoneSema protects transitions from mark to mark termination.
1072	markDoneSema uint32
1073
1074	bgMarkReady note   // signal background mark worker has started
1075	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
1076	// Background mark completion signaling
1077
1078	// mode is the concurrency mode of the current GC cycle.
1079	mode gcMode
1080
1081	// userForced indicates the current GC cycle was forced by an
1082	// explicit user call.
1083	userForced bool
1084
1085	// totaltime is the CPU nanoseconds spent in GC since the
1086	// program started if debug.gctrace > 0.
1087	totaltime int64
1088
1089	// initialHeapLive is the value of memstats.heap_live at the
1090	// beginning of this GC cycle.
1091	initialHeapLive uint64
1092
1093	// assistQueue is a queue of assists that are blocked because
1094	// there was neither enough credit to steal or enough work to
1095	// do.
1096	assistQueue struct {
1097		lock mutex
1098		q    gQueue
1099	}
1100
1101	// sweepWaiters is a list of blocked goroutines to wake when
1102	// we transition from mark termination to sweep.
1103	sweepWaiters struct {
1104		lock mutex
1105		list gList
1106	}
1107
1108	// cycles is the number of completed GC cycles, where a GC
1109	// cycle is sweep termination, mark, mark termination, and
1110	// sweep. This differs from memstats.numgc, which is
1111	// incremented at mark termination.
1112	cycles uint32
1113
1114	// Timing/utilization stats for this cycle.
1115	stwprocs, maxprocs                 int32
1116	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
1117
1118	pauseNS    int64 // total STW time this cycle
1119	pauseStart int64 // nanotime() of last STW
1120
1121	// debug.gctrace heap sizes for this cycle.
1122	heap0, heap1, heap2, heapGoal uint64
1123}
1124
1125// GC runs a garbage collection and blocks the caller until the
1126// garbage collection is complete. It may also block the entire
1127// program.
1128func GC() {
1129	// We consider a cycle to be: sweep termination, mark, mark
1130	// termination, and sweep. This function shouldn't return
1131	// until a full cycle has been completed, from beginning to
1132	// end. Hence, we always want to finish up the current cycle
1133	// and start a new one. That means:
1134	//
1135	// 1. In sweep termination, mark, or mark termination of cycle
1136	// N, wait until mark termination N completes and transitions
1137	// to sweep N.
1138	//
1139	// 2. In sweep N, help with sweep N.
1140	//
1141	// At this point we can begin a full cycle N+1.
1142	//
1143	// 3. Trigger cycle N+1 by starting sweep termination N+1.
1144	//
1145	// 4. Wait for mark termination N+1 to complete.
1146	//
1147	// 5. Help with sweep N+1 until it's done.
1148	//
1149	// This all has to be written to deal with the fact that the
1150	// GC may move ahead on its own. For example, when we block
1151	// until mark termination N, we may wake up in cycle N+2.
1152
1153	// Wait until the current sweep termination, mark, and mark
1154	// termination complete.
1155	n := atomic.Load(&work.cycles)
1156	gcWaitOnMark(n)
1157
1158	// We're now in sweep N or later. Trigger GC cycle N+1, which
1159	// will first finish sweep N if necessary and then enter sweep
1160	// termination N+1.
1161	gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
1162
1163	// Wait for mark termination N+1 to complete.
1164	gcWaitOnMark(n + 1)
1165
1166	// Finish sweep N+1 before returning. We do this both to
1167	// complete the cycle and because runtime.GC() is often used
1168	// as part of tests and benchmarks to get the system into a
1169	// relatively stable and isolated state.
1170	for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
1171		sweep.nbgsweep++
1172		Gosched()
1173	}
1174
1175	// Callers may assume that the heap profile reflects the
1176	// just-completed cycle when this returns (historically this
1177	// happened because this was a STW GC), but right now the
1178	// profile still reflects mark termination N, not N+1.
1179	//
1180	// As soon as all of the sweep frees from cycle N+1 are done,
1181	// we can go ahead and publish the heap profile.
1182	//
1183	// First, wait for sweeping to finish. (We know there are no
1184	// more spans on the sweep queue, but we may be concurrently
1185	// sweeping spans, so we have to wait.)
1186	for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
1187		Gosched()
1188	}
1189
1190	// Now we're really done with sweeping, so we can publish the
1191	// stable heap profile. Only do this if we haven't already hit
1192	// another mark termination.
1193	mp := acquirem()
1194	cycle := atomic.Load(&work.cycles)
1195	if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
1196		mProf_PostSweep()
1197	}
1198	releasem(mp)
1199}
1200
1201// gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
1202// already completed this mark phase, it returns immediately.
1203func gcWaitOnMark(n uint32) {
1204	for {
1205		// Disable phase transitions.
1206		lock(&work.sweepWaiters.lock)
1207		nMarks := atomic.Load(&work.cycles)
1208		if gcphase != _GCmark {
1209			// We've already completed this cycle's mark.
1210			nMarks++
1211		}
1212		if nMarks > n {
1213			// We're done.
1214			unlock(&work.sweepWaiters.lock)
1215			return
1216		}
1217
1218		// Wait until sweep termination, mark, and mark
1219		// termination of cycle N complete.
1220		work.sweepWaiters.list.push(getg())
1221		goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
1222	}
1223}
1224
1225// gcMode indicates how concurrent a GC cycle should be.
1226type gcMode int
1227
1228const (
1229	gcBackgroundMode gcMode = iota // concurrent GC and sweep
1230	gcForceMode                    // stop-the-world GC now, concurrent sweep
1231	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
1232)
1233
1234// A gcTrigger is a predicate for starting a GC cycle. Specifically,
1235// it is an exit condition for the _GCoff phase.
1236type gcTrigger struct {
1237	kind gcTriggerKind
1238	now  int64  // gcTriggerTime: current time
1239	n    uint32 // gcTriggerCycle: cycle number to start
1240}
1241
1242type gcTriggerKind int
1243
1244const (
1245	// gcTriggerHeap indicates that a cycle should be started when
1246	// the heap size reaches the trigger heap size computed by the
1247	// controller.
1248	gcTriggerHeap gcTriggerKind = iota
1249
1250	// gcTriggerTime indicates that a cycle should be started when
1251	// it's been more than forcegcperiod nanoseconds since the
1252	// previous GC cycle.
1253	gcTriggerTime
1254
1255	// gcTriggerCycle indicates that a cycle should be started if
1256	// we have not yet started cycle number gcTrigger.n (relative
1257	// to work.cycles).
1258	gcTriggerCycle
1259)
1260
1261// test reports whether the trigger condition is satisfied, meaning
1262// that the exit condition for the _GCoff phase has been met. The exit
1263// condition should be tested when allocating.
1264func (t gcTrigger) test() bool {
1265	if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
1266		return false
1267	}
1268	switch t.kind {
1269	case gcTriggerHeap:
1270		// Non-atomic access to heap_live for performance. If
1271		// we are going to trigger on this, this thread just
1272		// atomically wrote heap_live anyway and we'll see our
1273		// own write.
1274		return memstats.heap_live >= memstats.gc_trigger
1275	case gcTriggerTime:
1276		if gcpercent < 0 {
1277			return false
1278		}
1279		lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
1280		return lastgc != 0 && t.now-lastgc > forcegcperiod
1281	case gcTriggerCycle:
1282		// t.n > work.cycles, but accounting for wraparound.
1283		return int32(t.n-work.cycles) > 0
1284	}
1285	return true
1286}
1287
1288// gcStart starts the GC. It transitions from _GCoff to _GCmark (if
1289// debug.gcstoptheworld == 0) or performs all of GC (if
1290// debug.gcstoptheworld != 0).
1291//
1292// This may return without performing this transition in some cases,
1293// such as when called on a system stack or with locks held.
1294func gcStart(trigger gcTrigger) {
1295	// Since this is called from malloc and malloc is called in
1296	// the guts of a number of libraries that might be holding
1297	// locks, don't attempt to start GC in non-preemptible or
1298	// potentially unstable situations.
1299	mp := acquirem()
1300	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
1301		releasem(mp)
1302		return
1303	}
1304	releasem(mp)
1305	mp = nil
1306
1307	// Pick up the remaining unswept/not being swept spans concurrently
1308	//
1309	// This shouldn't happen if we're being invoked in background
1310	// mode since proportional sweep should have just finished
1311	// sweeping everything, but rounding errors, etc, may leave a
1312	// few spans unswept. In forced mode, this is necessary since
1313	// GC can be forced at any point in the sweeping cycle.
1314	//
1315	// We check the transition condition continuously here in case
1316	// this G gets delayed in to the next GC cycle.
1317	for trigger.test() && sweepone() != ^uintptr(0) {
1318		sweep.nbgsweep++
1319	}
1320
1321	// Perform GC initialization and the sweep termination
1322	// transition.
1323	semacquire(&work.startSema)
1324	// Re-check transition condition under transition lock.
1325	if !trigger.test() {
1326		semrelease(&work.startSema)
1327		return
1328	}
1329
1330	// For stats, check if this GC was forced by the user.
1331	work.userForced = trigger.kind == gcTriggerCycle
1332
1333	// In gcstoptheworld debug mode, upgrade the mode accordingly.
1334	// We do this after re-checking the transition condition so
1335	// that multiple goroutines that detect the heap trigger don't
1336	// start multiple STW GCs.
1337	mode := gcBackgroundMode
1338	if debug.gcstoptheworld == 1 {
1339		mode = gcForceMode
1340	} else if debug.gcstoptheworld == 2 {
1341		mode = gcForceBlockMode
1342	}
1343
1344	// Ok, we're doing it! Stop everybody else
1345	semacquire(&gcsema)
1346	semacquire(&worldsema)
1347
1348	if trace.enabled {
1349		traceGCStart()
1350	}
1351
1352	// Check that all Ps have finished deferred mcache flushes.
1353	for _, p := range allp {
1354		if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
1355			println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
1356			throw("p mcache not flushed")
1357		}
1358	}
1359
1360	gcBgMarkStartWorkers()
1361
1362	systemstack(gcResetMarkState)
1363
1364	work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
1365	if work.stwprocs > ncpu {
1366		// This is used to compute CPU time of the STW phases,
1367		// so it can't be more than ncpu, even if GOMAXPROCS is.
1368		work.stwprocs = ncpu
1369	}
1370	work.heap0 = atomic.Load64(&memstats.heap_live)
1371	work.pauseNS = 0
1372	work.mode = mode
1373
1374	now := nanotime()
1375	work.tSweepTerm = now
1376	work.pauseStart = now
1377	if trace.enabled {
1378		traceGCSTWStart(1)
1379	}
1380	systemstack(stopTheWorldWithSema)
1381	// Finish sweep before we start concurrent scan.
1382	systemstack(func() {
1383		finishsweep_m()
1384	})
1385
1386	// clearpools before we start the GC. If we wait they memory will not be
1387	// reclaimed until the next GC cycle.
1388	clearpools()
1389
1390	work.cycles++
1391
1392	gcController.startCycle()
1393	work.heapGoal = memstats.next_gc
1394
1395	// In STW mode, disable scheduling of user Gs. This may also
1396	// disable scheduling of this goroutine, so it may block as
1397	// soon as we start the world again.
1398	if mode != gcBackgroundMode {
1399		schedEnableUser(false)
1400	}
1401
1402	// Enter concurrent mark phase and enable
1403	// write barriers.
1404	//
1405	// Because the world is stopped, all Ps will
1406	// observe that write barriers are enabled by
1407	// the time we start the world and begin
1408	// scanning.
1409	//
1410	// Write barriers must be enabled before assists are
1411	// enabled because they must be enabled before
1412	// any non-leaf heap objects are marked. Since
1413	// allocations are blocked until assists can
1414	// happen, we want enable assists as early as
1415	// possible.
1416	setGCPhase(_GCmark)
1417
1418	gcBgMarkPrepare() // Must happen before assist enable.
1419	gcMarkRootPrepare()
1420
1421	// Mark all active tinyalloc blocks. Since we're
1422	// allocating from these, they need to be black like
1423	// other allocations. The alternative is to blacken
1424	// the tiny block on every allocation from it, which
1425	// would slow down the tiny allocator.
1426	gcMarkTinyAllocs()
1427
1428	// At this point all Ps have enabled the write
1429	// barrier, thus maintaining the no white to
1430	// black invariant. Enable mutator assists to
1431	// put back-pressure on fast allocating
1432	// mutators.
1433	atomic.Store(&gcBlackenEnabled, 1)
1434
1435	// Assists and workers can start the moment we start
1436	// the world.
1437	gcController.markStartTime = now
1438
1439	// In STW mode, we could block the instant systemstack
1440	// returns, so make sure we're not preemptible.
1441	mp = acquirem()
1442
1443	// Concurrent mark.
1444	systemstack(func() {
1445		now = startTheWorldWithSema(trace.enabled)
1446		work.pauseNS += now - work.pauseStart
1447		work.tMark = now
1448		memstats.gcPauseDist.record(now - work.pauseStart)
1449	})
1450
1451	// Release the world sema before Gosched() in STW mode
1452	// because we will need to reacquire it later but before
1453	// this goroutine becomes runnable again, and we could
1454	// self-deadlock otherwise.
1455	semrelease(&worldsema)
1456	releasem(mp)
1457
1458	// Make sure we block instead of returning to user code
1459	// in STW mode.
1460	if mode != gcBackgroundMode {
1461		Gosched()
1462	}
1463
1464	semrelease(&work.startSema)
1465}
1466
1467// gcMarkDoneFlushed counts the number of P's with flushed work.
1468//
1469// Ideally this would be a captured local in gcMarkDone, but forEachP
1470// escapes its callback closure, so it can't capture anything.
1471//
1472// This is protected by markDoneSema.
1473var gcMarkDoneFlushed uint32
1474
1475// gcMarkDone transitions the GC from mark to mark termination if all
1476// reachable objects have been marked (that is, there are no grey
// objects and there can be no more in the future). Otherwise, it flushes
1478// all local work to the global queues where it can be discovered by
1479// other workers.
1480//
1481// This should be called when all local mark work has been drained and
1482// there are no remaining workers. Specifically, when
1483//
1484//   work.nwait == work.nproc && !gcMarkWorkAvailable(p)
1485//
1486// The calling context must be preemptible.
1487//
1488// Flushing local work is important because idle Ps may have local
1489// work queued. This is the only way to make that work visible and
1490// drive GC to completion.
1491//
1492// It is explicitly okay to have write barriers in this function. If
1493// it does transition to mark termination, then all reachable objects
1494// have been marked, so the write barrier cannot shade any more
1495// objects.
1496func gcMarkDone() {
1497	// Ensure only one thread is running the ragged barrier at a
1498	// time.
1499	semacquire(&work.markDoneSema)
1500
1501top:
1502	// Re-check transition condition under transition lock.
1503	//
1504	// It's critical that this checks the global work queues are
1505	// empty before performing the ragged barrier. Otherwise,
1506	// there could be global work that a P could take after the P
1507	// has passed the ragged barrier.
1508	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
1509		semrelease(&work.markDoneSema)
1510		return
1511	}
1512
1513	// forEachP needs worldsema to execute, and we'll need it to
1514	// stop the world later, so acquire worldsema now.
1515	semacquire(&worldsema)
1516
1517	// Flush all local buffers and collect flushedWork flags.
1518	gcMarkDoneFlushed = 0
1519	systemstack(func() {
1520		gp := getg().m.curg
1521		// Mark the user stack as preemptible so that it may be scanned.
1522		// Otherwise, our attempt to force all P's to a safepoint could
1523		// result in a deadlock as we attempt to preempt a worker that's
1524		// trying to preempt us (e.g. for a stack scan).
1525		casgstatus(gp, _Grunning, _Gwaiting)
1526		forEachP(func(_p_ *p) {
1527			// Flush the write barrier buffer, since this may add
1528			// work to the gcWork.
1529			wbBufFlush1(_p_)
1530
1531			// Flush the gcWork, since this may create global work
1532			// and set the flushedWork flag.
1533			//
1534			// TODO(austin): Break up these workbufs to
1535			// better distribute work.
1536			_p_.gcw.dispose()
1537			// Collect the flushedWork flag.
1538			if _p_.gcw.flushedWork {
1539				atomic.Xadd(&gcMarkDoneFlushed, 1)
1540				_p_.gcw.flushedWork = false
1541			}
1542		})
1543		casgstatus(gp, _Gwaiting, _Grunning)
1544	})
1545
1546	if gcMarkDoneFlushed != 0 {
1547		// More grey objects were discovered since the
1548		// previous termination check, so there may be more
1549		// work to do. Keep going. It's possible the
1550		// transition condition became true again during the
1551		// ragged barrier, so re-check it.
1552		semrelease(&worldsema)
1553		goto top
1554	}
1555
1556	// There was no global work, no local work, and no Ps
1557	// communicated work since we took markDoneSema. Therefore
1558	// there are no grey objects and no more objects can be
1559	// shaded. Transition to mark termination.
1560	now := nanotime()
1561	work.tMarkTerm = now
1562	work.pauseStart = now
1563	getg().m.preemptoff = "gcing"
1564	if trace.enabled {
1565		traceGCSTWStart(0)
1566	}
1567	systemstack(stopTheWorldWithSema)
	// The gcphase is _GCmark; it will transition to _GCmarktermination
	// below. The important thing is that the write barrier remains active
	// until all marking is complete. This includes writes made by the GC.
1571
1572	// There is sometimes work left over when we enter mark termination due
1573	// to write barriers performed after the completion barrier above.
1574	// Detect this and resume concurrent mark. This is obviously
1575	// unfortunate.
1576	//
1577	// See issue #27993 for details.
1578	//
1579	// Switch to the system stack to call wbBufFlush1, though in this case
1580	// it doesn't matter because we're non-preemptible anyway.
1581	restart := false
1582	systemstack(func() {
1583		for _, p := range allp {
1584			wbBufFlush1(p)
1585			if !p.gcw.empty() {
1586				restart = true
1587				break
1588			}
1589		}
1590	})
1591	if restart {
1592		getg().m.preemptoff = ""
1593		systemstack(func() {
1594			now := startTheWorldWithSema(true)
1595			work.pauseNS += now - work.pauseStart
1596			memstats.gcPauseDist.record(now - work.pauseStart)
1597		})
1598		semrelease(&worldsema)
1599		goto top
1600	}
1601
1602	// Disable assists and background workers. We must do
1603	// this before waking blocked assists.
1604	atomic.Store(&gcBlackenEnabled, 0)
1605
1606	// Wake all blocked assists. These will run when we
1607	// start the world again.
1608	gcWakeAllAssists()
1609
1610	// Likewise, release the transition lock. Blocked
1611	// workers and assists will run when we start the
1612	// world again.
1613	semrelease(&work.markDoneSema)
1614
1615	// In STW mode, re-enable user goroutines. These will be
1616	// queued to run after we start the world.
1617	schedEnableUser(true)
1618
	// endCycle depends on all gcWork cache stats being flushed.
	// The termination algorithm above ensured this, up to
	// allocations made since the ragged barrier.
1622	nextTriggerRatio := gcController.endCycle()
1623
1624	// Perform mark termination. This will restart the world.
1625	gcMarkTermination(nextTriggerRatio)
1626}
1627
1628// World must be stopped and mark assists and background workers must be
1629// disabled.
1630func gcMarkTermination(nextTriggerRatio float64) {
1631	// Start marktermination (write barrier remains enabled for now).
1632	setGCPhase(_GCmarktermination)
1633
1634	work.heap1 = memstats.heap_live
1635	startTime := nanotime()
1636
1637	mp := acquirem()
1638	mp.preemptoff = "gcing"
1639	_g_ := getg()
1640	_g_.m.traceback = 2
1641	gp := _g_.m.curg
1642	casgstatus(gp, _Grunning, _Gwaiting)
1643	gp.waitreason = waitReasonGarbageCollection
1644
	// Run gc on the g0 stack. We do this so that the g stack
	// we're currently running on will no longer change. This cuts
	// the root set down a bit (g0 stacks are not scanned, and
	// we don't need to scan gc's internal state). We also
	// need to switch to g0 so we can shrink the stack.
1650	systemstack(func() {
1651		gcMark(startTime)
1652		// Must return immediately.
1653		// The outer function's stack may have moved
1654		// during gcMark (it shrinks stacks, including the
1655		// outer function's stack), so we must not refer
1656		// to any of its variables. Return back to the
1657		// non-system stack to pick up the new addresses
1658		// before continuing.
1659	})
1660
1661	systemstack(func() {
1662		work.heap2 = work.bytesMarked
1663		if debug.gccheckmark > 0 {
1664			// Run a full non-parallel, stop-the-world
1665			// mark using checkmark bits, to check that we
1666			// didn't forget to mark anything during the
1667			// concurrent mark process.
1668			startCheckmarks()
1669			gcResetMarkState()
1670			gcw := &getg().m.p.ptr().gcw
1671			gcDrain(gcw, 0)
1672			wbBufFlush1(getg().m.p.ptr())
1673			gcw.dispose()
1674			endCheckmarks()
1675		}
1676
1677		// marking is complete so we can turn the write barrier off
1678		setGCPhase(_GCoff)
1679		gcSweep(work.mode)
1680	})
1681
1682	_g_.m.traceback = 0
1683	casgstatus(gp, _Gwaiting, _Grunning)
1684
1685	if trace.enabled {
1686		traceGCDone()
1687	}
1688
1689	// all done
1690	mp.preemptoff = ""
1691
1692	if gcphase != _GCoff {
1693		throw("gc done but gcphase != _GCoff")
1694	}
1695
1696	// Record next_gc and heap_inuse for scavenger.
1697	memstats.last_next_gc = memstats.next_gc
1698	memstats.last_heap_inuse = memstats.heap_inuse
1699
1700	// Update GC trigger and pacing for the next cycle.
1701	gcSetTriggerRatio(nextTriggerRatio)
1702
1703	// Update timing memstats
1704	now := nanotime()
1705	sec, nsec, _ := time_now()
1706	unixNow := sec*1e9 + int64(nsec)
1707	work.pauseNS += now - work.pauseStart
1708	work.tEnd = now
1709	memstats.gcPauseDist.record(now - work.pauseStart)
1710	atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1711	atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
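	// pause_ns and pause_end are fixed-size circular buffers of recent
	// pause durations and end times, indexed by numgc modulo their
	// length (these back MemStats.PauseNs and MemStats.PauseEnd).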
1712	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1713	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1714	memstats.pause_total_ns += uint64(work.pauseNS)
1715
1716	// Update work.totaltime.
1717	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
1718	// We report idle marking time below, but omit it from the
1719	// overall utilization here since it's "free".
1720	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
1721	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1722	cycleCpu := sweepTermCpu + markCpu + markTermCpu
1723	work.totaltime += cycleCpu
1724
1725	// Compute overall GC CPU utilization.
1726	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1727	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
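	// For example, if GC work has consumed 50ms of CPU time
	// (work.totaltime) out of 2000ms of total available CPU time,
	// gc_cpu_fraction is 0.025; the gctrace line below reports this as
	// "2%", since it truncates when converting the percentage to an int.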
1728
1729	// Reset sweep state.
1730	sweep.nbgsweep = 0
1731	sweep.npausesweep = 0
1732
1733	if work.userForced {
1734		memstats.numforcedgc++
1735	}
1736
1737	// Bump GC cycle count and wake goroutines waiting on sweep.
1738	lock(&work.sweepWaiters.lock)
1739	memstats.numgc++
1740	injectglist(&work.sweepWaiters.list)
1741	unlock(&work.sweepWaiters.lock)
1742
1743	// Finish the current heap profiling cycle and start a new
1744	// heap profiling cycle. We do this before starting the world
1745	// so events don't leak into the wrong cycle.
1746	mProf_NextCycle()
1747
1748	systemstack(func() { startTheWorldWithSema(true) })
1749
1750	// Flush the heap profile so we can start a new cycle next GC.
1751	// This is relatively expensive, so we don't do it with the
1752	// world stopped.
1753	mProf_Flush()
1754
1755	// Prepare workbufs for freeing by the sweeper. We do this
1756	// asynchronously because it can take non-trivial time.
1757	prepareFreeWorkbufs()
1758
1759	// Ensure all mcaches are flushed. Each P will flush its own
1760	// mcache before allocating, but idle Ps may not. Since this
1761	// is necessary to sweep all spans, we need to ensure all
1762	// mcaches are flushed before we start the next GC cycle.
1763	systemstack(func() {
1764		forEachP(func(_p_ *p) {
1765			_p_.mcache.prepareForSweep()
1766		})
1767	})
1768
1769	// Print gctrace before dropping worldsema. As soon as we drop
1770	// worldsema another cycle could start and smash the stats
1771	// we're trying to print.
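	//
	// The line printed here follows the gctrace format described in the
	// runtime package documentation for GODEBUG=gctrace=1:
	//
	//	gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # P
	//
	// with "(forced)" appended when the cycle was user-forced.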
1772	if debug.gctrace > 0 {
1773		util := int(memstats.gc_cpu_fraction * 100)
1774
1775		var sbuf [24]byte
1776		printlock()
1777		print("gc ", memstats.numgc,
1778			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1779			util, "%: ")
1780		prev := work.tSweepTerm
1781		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1782			if i != 0 {
1783				print("+")
1784			}
1785			print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1786			prev = ns
1787		}
1788		print(" ms clock, ")
1789		for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1790			if i == 2 || i == 3 {
1791				// Separate mark time components with /.
1792				print("/")
1793			} else if i != 0 {
1794				print("+")
1795			}
1796			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1797		}
1798		print(" ms cpu, ",
1799			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1800			work.heapGoal>>20, " MB goal, ",
1801			work.maxprocs, " P")
1802		if work.userForced {
1803			print(" (forced)")
1804		}
1805		print("\n")
1806		printunlock()
1807	}
1808
1809	semrelease(&worldsema)
1810	semrelease(&gcsema)
1811	// Careful: another GC cycle may start now.
1812
1813	releasem(mp)
1814	mp = nil
1815
1816	// now that gc is done, kick off finalizer thread if needed
1817	if !concurrentSweep {
1818		// give the queued finalizers, if any, a chance to run
1819		Gosched()
1820	}
1821}
1822
1823// gcBgMarkStartWorkers prepares background mark worker goroutines. These
1824// goroutines will not run until the mark phase, but they must be started while
// the world is not stopped and from a regular G stack. The caller must hold
1826// worldsema.
1827func gcBgMarkStartWorkers() {
1828	// Background marking is performed by per-P G's. Ensure that each P has
1829	// a background GC G.
1830	//
1831	// Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1832	// again, we can reuse the old workers; no need to create new workers.
1833	for gcBgMarkWorkerCount < gomaxprocs {
1834		expectSystemGoroutine()
1835		go gcBgMarkWorker()
1836
1837		notetsleepg(&work.bgMarkReady, -1)
1838		noteclear(&work.bgMarkReady)
1839		// The worker is now guaranteed to be added to the pool before
1840		// its P's next findRunnableGCWorker.
1841
1842		gcBgMarkWorkerCount++
1843	}
1844}
1845
1846// gcBgMarkPrepare sets up state for background marking.
1847// Mutator assists must not yet be enabled.
1848func gcBgMarkPrepare() {
1849	// Background marking will stop when the work queues are empty
1850	// and there are no more workers (note that, since this is
1851	// concurrent, this may be a transient state, but mark
1852	// termination will clean it up). Between background workers
1853	// and assists, we don't really know how many workers there
1854	// will be, so we pretend to have an arbitrarily large number
1855	// of workers, almost all of which are "waiting". While a
1856	// worker is working it decrements nwait. If nproc == nwait,
1857	// there are no workers.
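	//
	// Concretely, gcBgMarkWorker (below) decrements nwait just before it
	// starts draining and increments it again when it stops, and
	// gcMarkDone checks work.nwait == work.nproc as part of its
	// termination condition.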
1858	work.nproc = ^uint32(0)
1859	work.nwait = ^uint32(0)
1860}
1861
1862// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
1863// gcBgMarkWorker goroutine.
1864type gcBgMarkWorkerNode struct {
1865	// Unused workers are managed in a lock-free stack. This field must be first.
1866	node lfnode
1867
1868	// The g of this worker.
1869	gp guintptr
1870
1871	// Release this m on park. This is used to communicate with the unlock
1872	// function, which cannot access the G's stack. It is unused outside of
1873	// gcBgMarkWorker().
1874	m muintptr
1875}
1876
1877func gcBgMarkWorker() {
1878	setSystemGoroutine()
1879
1880	gp := getg()
1881
1882	// We pass node to a gopark unlock function, so it can't be on
1883	// the stack (see gopark). Prevent deadlock from recursively
1884	// starting GC by disabling preemption.
1885	gp.m.preemptoff = "GC worker init"
1886	node := new(gcBgMarkWorkerNode)
1887	gp.m.preemptoff = ""
1888
1889	node.gp.set(gp)
1890
1891	node.m.set(acquirem())
1892	notewakeup(&work.bgMarkReady)
1893	// After this point, the background mark worker is generally scheduled
1894	// cooperatively by gcController.findRunnableGCWorker. While performing
1895	// work on the P, preemption is disabled because we are working on
1896	// P-local work buffers. When the preempt flag is set, this puts itself
1897	// into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1898	// at the appropriate time.
1899	//
1900	// When preemption is enabled (e.g., while in gcMarkDone), this worker
	// may be preempted and scheduled as a _Grunnable G from a runq. That is
1902	// fine; it will eventually gopark again for further scheduling via
1903	// findRunnableGCWorker.
1904	//
1905	// Since we disable preemption before notifying bgMarkReady, we
1906	// guarantee that this G will be in the worker pool for the next
1907	// findRunnableGCWorker. This isn't strictly necessary, but it reduces
1908	// latency between _GCmark starting and the workers starting.
1909
1910	for {
1911		// Go to sleep until woken by
1912		// gcController.findRunnableGCWorker.
1913		gopark(func(g *g, nodep unsafe.Pointer) bool {
1914			node := (*gcBgMarkWorkerNode)(nodep)
1915
1916			if mp := node.m.ptr(); mp != nil {
1917				// The worker G is no longer running; release
1918				// the M.
1919				//
1920				// N.B. it is _safe_ to release the M as soon
1921				// as we are no longer performing P-local mark
1922				// work.
1923				//
1924				// However, since we cooperatively stop work
1925				// when gp.preempt is set, if we releasem in
1926				// the loop then the following call to gopark
1927				// would immediately preempt the G. This is
1928				// also safe, but inefficient: the G must
1929				// schedule again only to enter gopark and park
1930				// again. Thus, we defer the release until
1931				// after parking the G.
1932				releasem(mp)
1933			}
1934
1935			// Release this G to the pool.
1936			gcBgMarkWorkerPool.push(&node.node)
1937			// Note that at this point, the G may immediately be
1938			// rescheduled and may be running.
1939			return true
1940		}, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
1941
1942		// Preemption must not occur here, or another G might see
1943		// p.gcMarkWorkerMode.
1944
1945		// Disable preemption so we can use the gcw. If the
1946		// scheduler wants to preempt us, we'll stop draining,
1947		// dispose the gcw, and then preempt.
1948		node.m.set(acquirem())
1949		pp := gp.m.p.ptr() // P can't change with preemption disabled.
1950
1951		if gcBlackenEnabled == 0 {
1952			println("worker mode", pp.gcMarkWorkerMode)
1953			throw("gcBgMarkWorker: blackening not enabled")
1954		}
1955
1956		if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1957			throw("gcBgMarkWorker: mode not set")
1958		}
1959
1960		startTime := nanotime()
1961		pp.gcMarkWorkerStartTime = startTime
1962
1963		decnwait := atomic.Xadd(&work.nwait, -1)
1964		if decnwait == work.nproc {
1965			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1966			throw("work.nwait was > work.nproc")
1967		}
1968
1969		systemstack(func() {
1970			// Mark our goroutine preemptible so its stack
1971			// can be scanned. This lets two mark workers
1972			// scan each other (otherwise, they would
1973			// deadlock). We must not modify anything on
1974			// the G stack. However, stack shrinking is
1975			// disabled for mark workers, so it is safe to
1976			// read from the G stack.
1977			casgstatus(gp, _Grunning, _Gwaiting)
1978			switch pp.gcMarkWorkerMode {
1979			default:
1980				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1981			case gcMarkWorkerDedicatedMode:
1982				gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1983				if gp.preempt {
1984					// We were preempted. This is
1985					// a useful signal to kick
1986					// everything out of the run
1987					// queue so it can run
1988					// somewhere else.
1989					lock(&sched.lock)
1990					for {
1991						gp, _ := runqget(pp)
1992						if gp == nil {
1993							break
1994						}
1995						globrunqput(gp)
1996					}
1997					unlock(&sched.lock)
1998				}
1999				// Go back to draining, this time
2000				// without preemption.
2001				gcDrain(&pp.gcw, gcDrainFlushBgCredit)
2002			case gcMarkWorkerFractionalMode:
2003				gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
2004			case gcMarkWorkerIdleMode:
2005				gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
2006			}
2007			casgstatus(gp, _Gwaiting, _Grunning)
2008		})
2009
2010		// Account for time.
2011		duration := nanotime() - startTime
2012		switch pp.gcMarkWorkerMode {
2013		case gcMarkWorkerDedicatedMode:
2014			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
2015			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
2016		case gcMarkWorkerFractionalMode:
2017			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
2018			atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
2019		case gcMarkWorkerIdleMode:
2020			atomic.Xaddint64(&gcController.idleMarkTime, duration)
2021		}
2022
2023		// Was this the last worker and did we run out
2024		// of work?
2025		incnwait := atomic.Xadd(&work.nwait, +1)
2026		if incnwait > work.nproc {
2027			println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
2028				"work.nwait=", incnwait, "work.nproc=", work.nproc)
2029			throw("work.nwait > work.nproc")
2030		}
2031
2032		// We'll releasem after this point and thus this P may run
2033		// something else. We must clear the worker mode to avoid
2034		// attributing the mode to a different (non-worker) G in
2035		// traceGoStart.
2036		pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
2037
2038		// If this worker reached a background mark completion
2039		// point, signal the main GC goroutine.
2040		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
2041			// We don't need the P-local buffers here, allow
			// preemption because we may schedule like a regular
2043			// goroutine in gcMarkDone (block on locks, etc).
2044			releasem(node.m.ptr())
2045			node.m.set(nil)
2046
2047			gcMarkDone()
2048		}
2049	}
2050}
2051
2052// gcMarkWorkAvailable reports whether executing a mark worker
2053// on p is potentially useful. p may be nil, in which case it only
2054// checks the global sources of work.
2055func gcMarkWorkAvailable(p *p) bool {
2056	if p != nil && !p.gcw.empty() {
2057		return true
2058	}
2059	if !work.full.empty() {
2060		return true // global work available
2061	}
2062	if work.markrootNext < work.markrootJobs {
2063		return true // root scan work available
2064	}
2065	return false
2066}
2067
// gcMark runs the mark (or, for concurrent GC, mark termination) phase.
2069// All gcWork caches must be empty.
2070// STW is in effect at this point.
2071func gcMark(start_time int64) {
2072	if debug.allocfreetrace > 0 {
2073		tracegc()
2074	}
2075
2076	if gcphase != _GCmarktermination {
2077		throw("in gcMark expecting to see gcphase as _GCmarktermination")
2078	}
2079	work.tstart = start_time
2080
2081	// Check that there's no marking work remaining.
2082	if work.full != 0 || work.markrootNext < work.markrootJobs {
2083		print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
2084		panic("non-empty mark queue after concurrent mark")
2085	}
2086
2087	if debug.gccheckmark > 0 {
2088		// This is expensive when there's a large number of
2089		// Gs, so only do it if checkmark is also enabled.
2090		gcMarkRootCheck()
2091	}
2092	if work.full != 0 {
2093		throw("work.full != 0")
2094	}
2095
2096	// Clear out buffers and double-check that all gcWork caches
2097	// are empty. This should be ensured by gcMarkDone before we
2098	// enter mark termination.
2099	//
2100	// TODO: We could clear out buffers just before mark if this
2101	// has a non-negligible impact on STW time.
2102	for _, p := range allp {
2103		// The write barrier may have buffered pointers since
2104		// the gcMarkDone barrier. However, since the barrier
2105		// ensured all reachable objects were marked, all of
2106		// these must be pointers to black objects. Hence we
2107		// can just discard the write barrier buffer.
2108		if debug.gccheckmark > 0 {
2109			// For debugging, flush the buffer and make
2110			// sure it really was all marked.
2111			wbBufFlush1(p)
2112		} else {
2113			p.wbBuf.reset()
2114		}
2115
2116		gcw := &p.gcw
2117		if !gcw.empty() {
2118			printlock()
2119			print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
2120			if gcw.wbuf1 == nil {
2121				print(" wbuf1=<nil>")
2122			} else {
2123				print(" wbuf1.n=", gcw.wbuf1.nobj)
2124			}
2125			if gcw.wbuf2 == nil {
2126				print(" wbuf2=<nil>")
2127			} else {
2128				print(" wbuf2.n=", gcw.wbuf2.nobj)
2129			}
2130			print("\n")
2131			throw("P has cached GC work at end of mark termination")
2132		}
2133		// There may still be cached empty buffers, which we
2134		// need to flush since we're going to free them. Also,
2135		// there may be non-zero stats because we allocated
2136		// black after the gcMarkDone barrier.
2137		gcw.dispose()
2138	}
2139
2140	// Update the marked heap stat.
2141	memstats.heap_marked = work.bytesMarked
2142
2143	// Flush scanAlloc from each mcache since we're about to modify
2144	// heap_scan directly. If we were to flush this later, then scanAlloc
2145	// might have incorrect information.
2146	for _, p := range allp {
2147		c := p.mcache
2148		if c == nil {
2149			continue
2150		}
2151		memstats.heap_scan += uint64(c.scanAlloc)
2152		c.scanAlloc = 0
2153	}
2154
2155	// Update other GC heap size stats. This must happen after
2156	// cachestats (which flushes local statistics to these) and
2157	// flushallmcaches (which modifies heap_live).
2158	memstats.heap_live = work.bytesMarked
2159	memstats.heap_scan = uint64(gcController.scanWork)
2160
2161	if trace.enabled {
2162		traceHeapAlloc()
2163	}
2164}
2165
2166// gcSweep must be called on the system stack because it acquires the heap
2167// lock. See mheap for details.
2168//
2169// The world must be stopped.
2170//
2171//go:systemstack
2172func gcSweep(mode gcMode) {
2173	assertWorldStopped()
2174
2175	if gcphase != _GCoff {
2176		throw("gcSweep being done but phase is not GCoff")
2177	}
2178
2179	lock(&mheap_.lock)
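	// Advancing sweepgen by 2 retires the previous sweep generation: a
	// span whose sweepgen trails mheap_.sweepgen by 2 needs sweeping, by
	// 1 is being swept, and a matching sweepgen means the span is swept
	// (see the mspan.sweepgen comment in mheap.go).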
2180	mheap_.sweepgen += 2
2181	mheap_.sweepdone = 0
2182	mheap_.pagesSwept = 0
2183	mheap_.sweepArenas = mheap_.allArenas
2184	mheap_.reclaimIndex = 0
2185	mheap_.reclaimCredit = 0
2186	unlock(&mheap_.lock)
2187
2188	sweep.centralIndex.clear()
2189
2190	if !_ConcurrentSweep || mode == gcForceBlockMode {
2191		// Special case synchronous sweep.
2192		// Record that no proportional sweeping has to happen.
2193		lock(&mheap_.lock)
2194		mheap_.sweepPagesPerByte = 0
2195		unlock(&mheap_.lock)
2196		// Sweep all spans eagerly.
2197		for sweepone() != ^uintptr(0) {
2198			sweep.npausesweep++
2199		}
2200		// Free workbufs eagerly.
2201		prepareFreeWorkbufs()
2202		for freeSomeWbufs(false) {
2203		}
2204		// All "free" events for this mark/sweep cycle have
2205		// now happened, so we can make this profile cycle
2206		// available immediately.
2207		mProf_NextCycle()
2208		mProf_Flush()
2209		return
2210	}
2211
2212	// Background sweep.
2213	lock(&sweep.lock)
2214	if sweep.parked {
2215		sweep.parked = false
2216		ready(sweep.g, 0, true)
2217	}
2218	unlock(&sweep.lock)
2219}
2220
2221// gcResetMarkState resets global state prior to marking (concurrent
2222// or STW) and resets the stack scan state of all Gs.
2223//
2224// This is safe to do without the world stopped because any Gs created
2225// during or after this will start out in the reset state.
2226//
2227// gcResetMarkState must be called on the system stack because it acquires
2228// the heap lock. See mheap for details.
2229//
2230//go:systemstack
2231func gcResetMarkState() {
2232	// This may be called during a concurrent phase, so make sure
2233	// allgs doesn't change.
2234	lock(&allglock)
2235	for _, gp := range allgs {
2236		gp.gcscandone = false // set to true in gcphasework
2237		gp.gcAssistBytes = 0
2238	}
2239	unlock(&allglock)
2240
2241	// Clear page marks. This is just 1MB per 64GB of heap, so the
2242	// time here is pretty trivial.
2243	lock(&mheap_.lock)
2244	arenas := mheap_.allArenas
2245	unlock(&mheap_.lock)
2246	for _, ai := range arenas {
2247		ha := mheap_.arenas[ai.l1()][ai.l2()]
2248		for i := range ha.pageMarks {
2249			ha.pageMarks[i] = 0
2250		}
2251	}
2252
2253	work.bytesMarked = 0
2254	work.initialHeapLive = atomic.Load64(&memstats.heap_live)
2255}
2256
2257// Hooks for other packages
2258
2259var poolcleanup func()
2260
//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
2262func sync_runtime_registerPoolCleanup(f func()) {
2263	poolcleanup = f
2264}
2265
2266func clearpools() {
2267	// clear sync.Pools
2268	if poolcleanup != nil {
2269		poolcleanup()
2270	}
2271
2272	// Clear central sudog cache.
2273	// Leave per-P caches alone, they have strictly bounded size.
2274	// Disconnect cached list before dropping it on the floor,
2275	// so that a dangling ref to one entry does not pin all of them.
2276	lock(&sched.sudoglock)
2277	var sg, sgnext *sudog
2278	for sg = sched.sudogcache; sg != nil; sg = sgnext {
2279		sgnext = sg.next
2280		sg.next = nil
2281	}
2282	sched.sudogcache = nil
2283	unlock(&sched.sudoglock)
2284
2285	// Clear central defer pools.
2286	// Leave per-P pools alone, they have strictly bounded size.
2287	lock(&sched.deferlock)
2288	// disconnect cached list before dropping it on the floor,
2289	// so that a dangling ref to one entry does not pin all of them.
2290	var d, dlink *_defer
2291	for d = sched.deferpool; d != nil; d = dlink {
2292		dlink = d.link
2293		d.link = nil
2294	}
2295	sched.deferpool = nil
2296	unlock(&sched.deferlock)
2297}
2298
2299// Timing
2300
2301// itoaDiv formats val/(10**dec) into buf.
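// For example, with a sufficiently large buf,
//
//	itoaDiv(buf, 12345, 3) // "12.345"
//	itoaDiv(buf, 12, 0)    // "12"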
2302func itoaDiv(buf []byte, val uint64, dec int) []byte {
2303	i := len(buf) - 1
2304	idec := i - dec
2305	for val >= 10 || i >= idec {
2306		buf[i] = byte(val%10 + '0')
2307		i--
2308		if i == idec {
2309			buf[i] = '.'
2310			i--
2311		}
2312		val /= 10
2313	}
2314	buf[i] = byte(val + '0')
2315	return buf[i:]
2316}
2317
2318// fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
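// For example,
//
//	fmtNSAsMS(buf, 1234567)  // "1.2"
//	fmtNSAsMS(buf, 25000000) // "25"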
2319func fmtNSAsMS(buf []byte, ns uint64) []byte {
2320	if ns >= 10e6 {
2321		// Format as whole milliseconds.
2322		return itoaDiv(buf, ns/1e6, 0)
2323	}
	// Format with two digits of precision, using at most three decimal places.
2325	x := ns / 1e3
2326	if x == 0 {
2327		buf[0] = '0'
2328		return buf[:1]
2329	}
2330	dec := 3
2331	for x >= 100 {
2332		x /= 10
2333		dec--
2334	}
2335	return itoaDiv(buf, x, dec)
2336}
2337