// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Garbage collector: type and heap bitmaps.
//
// Stack, data, and bss bitmaps
//
// Stack frames and global variables in the data and bss sections are described
// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
// to be visited during GC. The bits in each byte are consumed starting with
// the low bit: 1<<0, 1<<1, and so on.
//
// Heap bitmap
//
// The heap bitmap comprises 2 bits for each pointer-sized word in the heap,
// stored in the heapArena metadata backing each heap arena.
// That is, if ha is the heapArena for the arena starting at start,
// then ha.bitmap[0] holds the 2-bit entries for the four words start
// through start+3*ptrSize, ha.bitmap[1] holds the entries for
// start+4*ptrSize through start+7*ptrSize, and so on.
//
// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
// The meaning of the high bit depends on the position of the word being described
// in its allocated object. In all words *except* the second word, the
// high bit indicates that the object is still being described. In
// these words, if a bit pair with a high bit 0 is encountered, the
// low bit can also be assumed to be 0, and the object description is
// over. This 00 is called the ``dead'' encoding: it signals that the
// rest of the words in the object are uninteresting to the garbage
// collector.
//
// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
//
// The 2-bit entries are split when written into the byte, so that the top half
// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
// bits.
// This form allows a copy from the 1-bit to the 4-bit form to keep the
// pointer bits contiguous, instead of having to space them out.
//
// The code makes use of the fact that the zero value for a heap bitmap
// has no live pointer bit set and is (depending on position) not used,
// not checkmarked, and is the dead encoding.
// These properties must be preserved when modifying the encoding.
//
// The bitmap for noscan spans is not maintained. Code must ensure
// that an object is scannable before consulting its bitmap by
// checking either the noscan bit in the span or by consulting its
// type's information.
//
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
// a live object due to mutations without write barriers or bugs in the
// collector implementation. As a sanity check, the GC has a 'checkmark'
// mode that retraverses the object graph with the world stopped, to make
// sure that everything that should be marked is marked.
// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
// for the second word of the object holds the checkmark bit.
// When not in checkmark mode, this bit is set to 1.
//
// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
// means every allocated object has two words, so there is room for the
// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
// just one word, so the second bit pair is not available for encoding the
// checkmark. However, because non-pointer allocations are combined
// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
// must be a pointer, so the type bit in the first word is not actually needed.
// It is still used in general, except that in checkmark mode the type bit is
// repurposed as the checkmark bit and then reinitialized (to 1) as the type
// bit when checkmark mode finishes.
//
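// For example: within one bitmap byte, the entry for the word at shift s
// (s = 0..3) uses bit s as its pointer bit and bit 4+s as its high bit.
// A byte value of 0x55 therefore records pointer bits and high bits for
// words 0 and 2 only. Readers extract them as (b>>s)&bitPointer and
// (b>>s)&bitScan, which is what heapBits.isPointer and heapBits.morePointers
// do below.
//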

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	bitPointer = 1 << 0
	bitScan    = 1 << 4

	heapBitsShift      = 1     // shift offset between successive bitPointer or bitScan entries
	wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte

	// all scan/pointer bits in a byte
	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
)

// addb returns the byte pointer p+n.
//go:nowritebarrier
//go:nosplit
func addb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
}

// subtractb returns the byte pointer p-n.
//go:nowritebarrier
//go:nosplit
func subtractb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, -n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
}

// add1 returns the byte pointer p+1.
//go:nowritebarrier
//go:nosplit
func add1(p *byte) *byte {
	// Note: wrote out full expression instead of calling addb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
}

// subtract1 returns the byte pointer p-1.
//go:nowritebarrier
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func subtract1(p *byte) *byte {
	// Note: wrote out full expression instead of calling subtractb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}

// heapBits provides access to the bitmap bits for a single heap word.
// The methods on heapBits take value receivers so that the compiler
// can more easily inline calls to those methods and registerize the
// struct fields independently.
type heapBits struct {
	bitp  *uint8
	shift uint32
	arena uint32 // Index of heap arena containing bitp
	last  *uint8 // Last byte of the arena's bitmap
}

// Make the compiler check that heapBits.arena is large enough to hold
// the maximum arena frame number.
var _ = heapBits{arena: (1<<heapAddrBits)/heapArenaBytes - 1}

// markBits provides access to the mark bit for an object in the heap.
// bytep points to the byte holding the mark bit.
// mask is a byte with a single bit set that can be &ed with *bytep
// to see if the bit has been set.
// *m.bytep&m.mask != 0 indicates the mark bit is set.
// index can be used along with span information to generate
// the address of the object in the heap.
// We maintain one set of mark bits for allocation and one for
// marking purposes.
type markBits struct {
	bytep *uint8
	mask  uint8
	index uintptr
}

//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	bytep, mask := s.allocBits.bitp(allocBitIndex)
	return markBits{bytep, mask, allocBitIndex}
}

// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
// and negates them so that ctz (count trailing zeros) instructions
// can be used. It then places these 8 bytes into the cached 64 bit
// s.allocCache.
func (s *mspan) refillAllocCache(whichByte uintptr) {
	bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
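	// Assemble the 8 bytes into a uint64 in little-endian order: bytes[0]
	// describes the lowest-numbered objects and ends up in the low-order
	// bits, which Ctz64 examines first.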
	aCache := uint64(0)
	aCache |= uint64(bytes[0])
	aCache |= uint64(bytes[1]) << (1 * 8)
	aCache |= uint64(bytes[2]) << (2 * 8)
	aCache |= uint64(bytes[3]) << (3 * 8)
	aCache |= uint64(bytes[4]) << (4 * 8)
	aCache |= uint64(bytes[5]) << (5 * 8)
	aCache |= uint64(bytes[6]) << (6 * 8)
	aCache |= uint64(bytes[7]) << (7 * 8)
	s.allocCache = ^aCache
}

// nextFreeIndex returns the index of the next free object in s at
// or after s.freeindex.
// There are hardware instructions that can be used to make this
// faster if profiling warrants it.
func (s *mspan) nextFreeIndex() uintptr {
	sfreeindex := s.freeindex
	snelems := s.nelems
	if sfreeindex == snelems {
		return sfreeindex
	}
	if sfreeindex > snelems {
		throw("s.freeindex > s.nelems")
	}

	aCache := s.allocCache

	bitIndex := sys.Ctz64(aCache)
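	// s.allocCache holds the complement of the alloc bits, so a 1 bit marks
	// a free slot; Ctz64 returns 64 when aCache is zero, meaning no free
	// slot remains in the cached window.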
	for bitIndex == 64 {
		// Move index to start of next cached bits.
		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
		if sfreeindex >= snelems {
			s.freeindex = snelems
			return snelems
		}
		whichByte := sfreeindex / 8
		// Refill s.allocCache with the next 64 alloc bits.
		s.refillAllocCache(whichByte)
		aCache = s.allocCache
		bitIndex = sys.Ctz64(aCache)
		// nothing available in cached bits
		// grab the next 8 bytes and try again.
	}
	result := sfreeindex + uintptr(bitIndex)
	if result >= snelems {
		s.freeindex = snelems
		return snelems
	}

	s.allocCache >>= uint(bitIndex + 1)
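	// The shift above removes the bit just consumed (and everything below
	// it) from the cache, so the next search starts after this object.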
	sfreeindex = result + 1

	if sfreeindex%64 == 0 && sfreeindex != snelems {
		// We just incremented s.freeindex so it isn't 0.
		// As each 1 in s.allocCache was encountered and used for allocation
		// it was shifted away. At this point s.allocCache contains all 0s.
		// Refill s.allocCache so that it corresponds
		// to the bits at s.allocBits starting at s.freeindex.
		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
	}
	s.freeindex = sfreeindex
	return result
}

// isFree reports whether the index'th object in s is unallocated.
//
// The caller must ensure s.state is mSpanInUse, and there must have
// been no preemption points since ensuring this (which could allow a
// GC transition, which would allow the state to change).
func (s *mspan) isFree(index uintptr) bool {
	if index < s.freeindex {
		return false
	}
	bytep, mask := s.allocBits.bitp(index)
	return *bytep&mask == 0
}

func (s *mspan) objIndex(p uintptr) uintptr {
	byteOffset := p - s.base()
	if byteOffset == 0 {
		return 0
	}
	if s.baseMask != 0 {
		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
		return byteOffset >> s.divShift
	}
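	// elemsize is not a power of two, so divide using the precomputed
	// reciprocal: (byteOffset >> divShift) * divMul >> divShift2 computes
	// byteOffset / elemsize without a hardware divide.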
	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
}

func markBitsForAddr(p uintptr) markBits {
	s := spanOf(p)
	objIndex := s.objIndex(p)
	return s.markBitsForIndex(objIndex)
}

func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
	bytep, mask := s.gcmarkBits.bitp(objIndex)
	return markBits{bytep, mask, objIndex}
}

func (s *mspan) markBitsForBase() markBits {
	return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
}

// isMarked reports whether mark bit m is set.
func (m markBits) isMarked() bool {
	return *m.bytep&m.mask != 0
}

// setMarked sets the marked bit in the markbits, atomically.
func (m markBits) setMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.Or8(m.bytep, m.mask)
}

// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
func (m markBits) setMarkedNonAtomic() {
	*m.bytep |= m.mask
}

// clearMarked clears the marked bit in the markbits, atomically.
func (m markBits) clearMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.And8(m.bytep, ^m.mask)
}

// markBitsForSpan returns the markBits for the span base address base.
func markBitsForSpan(base uintptr) (mbits markBits) {
	mbits = markBitsForAddr(base)
	if mbits.mask != 1 {
		throw("markBitsForSpan: unaligned start")
	}
	return mbits
}

// advance advances the markBits to the next object in the span.
func (m *markBits) advance() {
	if m.mask == 1<<7 {
		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
		m.mask = 1
	} else {
		m.mask = m.mask << 1
	}
	m.index++
}

// heapBitsForAddr returns the heapBits for the address addr.
// The caller must ensure addr is in an allocated span.
// In particular, be careful not to point past the end of an object.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func heapBitsForAddr(addr uintptr) (h heapBits) {
	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
	arena := arenaIndex(addr)
	ha := mheap_.arenas[arena.l1()][arena.l2()]
	// The compiler uses a load for nil checking ha, but in this
	// case we'll almost never hit that cache line again, so it
	// makes more sense to do a value check.
	if ha == nil {
		// addr is not in the heap. Return nil heapBits, which
		// we expect to crash in the caller.
		return
	}
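	// Each bitmap byte covers 4 words, so addr/(PtrSize*4) selects the byte
	// (taken modulo the arena bitmap length) and (addr/PtrSize)&3 selects
	// which of that byte's four 2-bit entries describes addr.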
	h.bitp = &ha.bitmap[(addr/(sys.PtrSize*4))%heapArenaBitmapBytes]
	h.shift = uint32((addr / sys.PtrSize) & 3)
	h.arena = uint32(arena)
	h.last = &ha.bitmap[len(ha.bitmap)-1]
	return
}

// badPointer throws a "found bad pointer in Go heap" error, with details
// about the span the pointer falls in.
func badPointer(s *mspan, p, refBase, refOff uintptr) {
	// Typically this indicates an incorrect use
	// of unsafe or cgo to store a bad pointer in
	// the Go heap. It may also indicate a runtime
	// bug.
	//
	// TODO(austin): We could be more aggressive
	// and detect pointers to unallocated objects
	// in allocated spans.
	printlock()
	print("runtime: pointer ", hex(p))
	state := s.state.get()
	if state != mSpanInUse {
		print(" to unallocated span")
	} else {
		print(" to unused region of span")
	}
	print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state, "\n")
	if refBase != 0 {
		print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
		gcDumpObject("object", refBase, refOff)
	}
	getg().m.traceback = 2
	throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
}

// findObject returns the base address for the heap object containing
// the address p, the object's span, and the index of the object in s.
// If p does not point into a heap object, it returns base == 0.
//
// If p is an invalid heap pointer and debug.invalidptr != 0,
// findObject panics.
//
// For gccgo, the forStack parameter is true if the value came from the stack.
// The stack is collected conservatively and may contain invalid pointers.
//
// refBase and refOff optionally give the base address of the object
// in which the pointer p was found and the byte offset at which it
// was found. These are used for error reporting.
//
// It is nosplit so it is safe for p to be a pointer to the current goroutine's stack.
// Since p is a uintptr, it would not be adjusted if the stack were to move.
//go:nosplit
func findObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, s *mspan, objIndex uintptr) {
	s = spanOf(p)
	// If s is nil, the virtual address has never been part of the heap.
	// This pointer may be to some mmap'd region, so we allow it.
	if s == nil {
		return
	}
	// If p is a bad pointer, it may not be in s's bounds.
	//
	// Check s.state to synchronize with span initialization
	// before checking other fields. See also spanOfHeap.
	if state := s.state.get(); state != mSpanInUse || p < s.base() || p >= s.limit {
		// Pointers into stacks are also ok, the runtime manages these explicitly.
		if state == mSpanManual || forStack {
			return
		}
		// The following ensures that we are rigorous about what data
		// structures hold valid pointers.
		if debug.invalidptr != 0 {
			badPointer(s, p, refBase, refOff)
		}
		return
	}

	if forStack {
		// A span can be entered in mheap_.spans, and be set
		// to mSpanInUse, before it is fully initialized.
		// All we need in practice is allocBits and gcmarkBits,
		// so make sure they are set.
		if s.allocBits == nil || s.gcmarkBits == nil {
			return
		}
	}

	// If this span holds objects of a power-of-2 size, just mask off the bits to
	// the interior of the object. Otherwise use the size to get the base.
	if s.baseMask != 0 {
		// optimize for power of 2 sized objects.
		base = s.base()
		base = base + (p-base)&uintptr(s.baseMask)
		objIndex = (base - s.base()) >> s.divShift
		// base = p & s.baseMask is faster for small spans,
		// but doesn't work for large spans.
		// Overall, it's faster to use the more general computation above.
	} else {
		base = s.base()
		if p-base >= s.elemsize {
			// n := (p - base) / s.elemsize, using division by multiplication
			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
			base += objIndex * s.elemsize
		}
	}
	return
}

// next returns the heapBits describing the next pointer-sized word in memory.
// That is, if h describes address p, h.next() describes p+ptrSize.
// Note that next does not modify h. The caller must record the result.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) next() heapBits {
	if h.shift < 3*heapBitsShift {
		h.shift += heapBitsShift
	} else if h.bitp != h.last {
		h.bitp, h.shift = add1(h.bitp), 0
	} else {
		// Move to the next arena.
		return h.nextArena()
	}
	return h
}

// nextArena advances h to the beginning of the next heap arena.
//
// This is a slow-path helper to next. gc's inliner knows that
// heapBits.next can be inlined even though it calls this. This is
// marked noinline so it doesn't get inlined into next and cause next
// to be too big to inline.
//
//go:nosplit
//go:noinline
func (h heapBits) nextArena() heapBits {
	h.arena++
	ai := arenaIdx(h.arena)
	l2 := mheap_.arenas[ai.l1()]
	if l2 == nil {
		// We just passed the end of the object, which
		// was also the end of the heap. Poison h. It
		// should never be dereferenced at this point.
		return heapBits{}
	}
	ha := l2[ai.l2()]
	if ha == nil {
		return heapBits{}
	}
	h.bitp, h.shift = &ha.bitmap[0], 0
	h.last = &ha.bitmap[len(ha.bitmap)-1]
	return h
}

// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
// h.forward(1) is equivalent to h.next(), just slower.
// Note that forward does not modify h. The caller must record the result.
//go:nosplit
func (h heapBits) forward(n uintptr) heapBits {
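	// Fold the current shift into n so that n counts words from the start of
	// the current bitmap byte; n/4 is then the byte offset to advance and
	// n%4 the new shift within that byte.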
	n += uintptr(h.shift) / heapBitsShift
	nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4
	h.shift = uint32(n%4) * heapBitsShift
	if nbitp <= uintptr(unsafe.Pointer(h.last)) {
		h.bitp = (*uint8)(unsafe.Pointer(nbitp))
		return h
	}

	// We're in a new heap arena.
	past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
	h.arena += 1 + uint32(past/heapArenaBitmapBytes)
	ai := arenaIdx(h.arena)
	if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil {
		a := l2[ai.l2()]
		h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
		h.last = &a.bitmap[len(a.bitmap)-1]
	} else {
		h.bitp, h.last = nil, nil
	}
	return h
}

// forwardOrBoundary is like forward, but stops at boundaries between
// contiguous sections of the bitmap. It returns the number of words
// advanced over, which will be <= n.
func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) {
	maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp)))
	if n > maxn {
		n = maxn
	}
	return h.forward(n), n
}

// bits returns the heap bits for the current word.
// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
// The result includes in its higher bits the bits for subsequent words
// described by the same bitmap byte.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) bits() uint32 {
	// The (shift & 31) eliminates a test and conditional branch
	// from the generated code.
	return uint32(*h.bitp) >> (h.shift & 31)
}

// morePointers reports whether the word described by h or any later word in
// the same object may contain a pointer. If it reports false, the bitmap has
// reached the dead encoding: this word and all remaining words in the object
// are scalars.
// h must not describe the second word of the object.
func (h heapBits) morePointers() bool {
	return h.bits()&bitScan != 0
}

// isPointer reports whether the heap bits describe a pointer word.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) isPointer() bool {
	return h.bits()&bitPointer != 0
}

// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) isCheckmarked(size uintptr) bool {
	if size == sys.PtrSize {
		return (*h.bitp>>h.shift)&bitPointer != 0
	}
	// All multiword objects are 2-word aligned,
	// so we know that the initial word's 2-bit pair
	// and the second word's 2-bit pair are in the
	// same heap bitmap byte, *h.bitp.
	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
}

// setCheckmarked sets the checkmarked bit.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) setCheckmarked(size uintptr) {
	if size == sys.PtrSize {
		atomic.Or8(h.bitp, bitPointer<<h.shift)
		return
	}
	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
}

// bulkBarrierPreWrite executes a write barrier
// for every pointer slot in the memory range [src, src+size),
// using pointer/scalar information from [dst, dst+size).
// This executes the write barriers necessary before a memmove.
// src, dst, and size must be pointer-aligned.
// The range [dst, dst+size) must lie within a single object.
// It does not perform the actual writes.
//
// As a special case, src == 0 indicates that this is being used for a
// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
// barrier.
//
// Callers should call bulkBarrierPreWrite immediately before
// calling memmove(dst, src, size). This function is marked nosplit
// to avoid being preempted; the GC must not stop the goroutine
// between the memmove and the execution of the barriers.
// The caller is also responsible for cgo pointer checks if this
// may be writing Go pointers into non-Go memory.
//
// The pointer bitmap is not maintained for allocations containing
// no pointers at all; any caller of bulkBarrierPreWrite must first
// make sure the underlying allocation contains pointers, usually
// by checking typ.ptrdata.
//
// Callers must perform cgo checks if writeBarrier.cgo.
//
//go:nosplit
func bulkBarrierPreWrite(dst, src, size uintptr) {
	if (dst|src|size)&(sys.PtrSize-1) != 0 {
		throw("bulkBarrierPreWrite: unaligned arguments")
	}
	if !writeBarrier.needed {
		return
	}
	if s := spanOf(dst); s == nil {
		// If dst is a global, use the data or BSS bitmaps to
		// execute write barriers.
		lo := 0
		hi := len(gcRootsIndex)
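		// Binary search for the root whose [decl, decl+size) range contains
		// dst; this relies on gcRootsIndex being sorted by root address.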
		for lo < hi {
			m := lo + (hi-lo)/2
			pr := gcRootsIndex[m]
			addr := uintptr(pr.decl)
			if addr <= dst && dst < addr+pr.size {
				if dst < addr+pr.ptrdata {
					bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
				}
				return
			}
			if dst < addr {
				hi = m
			} else {
				lo = m + 1
			}
		}
		return
	} else if s.state.get() != mSpanInUse || dst < s.base() || s.limit <= dst {
		// dst was heap memory at some point, but isn't now.
		// It can't be a global. It must be either our stack,
		// or in the case of direct channel sends, it could be
		// another stack. Either way, no need for barriers.
		// This will also catch if dst is in a freed span,
		// though that should never happen.
		return
	}

	buf := &getg().m.p.ptr().wbBuf
	h := heapBitsForAddr(dst)
	if src == 0 {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				if !buf.putFast(*dstx, 0) {
					wbBufFlush(nil, 0)
				}
			}
			h = h.next()
		}
	} else {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				if !buf.putFast(*dstx, *srcx) {
					wbBufFlush(nil, 0)
				}
			}
			h = h.next()
		}
	}
}

// bulkBarrierPreWriteSrcOnly is like bulkBarrierPreWrite but
// does not execute write barriers for [dst, dst+size).
//
// In addition to the requirements of bulkBarrierPreWrite,
// callers need to ensure [dst, dst+size) is zeroed.
//
// This is used for special cases where e.g. dst was just
// created and zeroed with malloc.
//go:nosplit
func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) {
	if (dst|src|size)&(sys.PtrSize-1) != 0 {
		throw("bulkBarrierPreWrite: unaligned arguments")
	}
	if !writeBarrier.needed {
		return
	}
	buf := &getg().m.p.ptr().wbBuf
	h := heapBitsForAddr(dst)
	for i := uintptr(0); i < size; i += sys.PtrSize {
		if h.isPointer() {
			srcx := (*uintptr)(unsafe.Pointer(src + i))
			if !buf.putFast(0, *srcx) {
				wbBufFlush(nil, 0)
			}
		}
		h = h.next()
	}
}

// bulkBarrierBitmap executes write barriers for copying from [src,
// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
// assumed to start maskOffset bytes into the data covered by the
// bitmap in bits (which may not be a multiple of 8).
//
// This is used by bulkBarrierPreWrite for writes to data and BSS.
//
//go:nosplit
func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
	word := maskOffset / sys.PtrSize
	bits = addb(bits, word/8)
	mask := uint8(1) << (word % 8)

	buf := &getg().m.p.ptr().wbBuf
	for i := uintptr(0); i < size; i += sys.PtrSize {
		if mask == 0 {
			bits = addb(bits, 1)
			if *bits == 0 {
				// Skip 8 words.
				i += 7 * sys.PtrSize
				continue
			}
			mask = 1
		}
		if *bits&mask != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			if src == 0 {
				if !buf.putFast(*dstx, 0) {
					wbBufFlush(nil, 0)
				}
			} else {
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				if !buf.putFast(*dstx, *srcx) {
					wbBufFlush(nil, 0)
				}
			}
		}
		mask <<= 1
	}
}

// typeBitsBulkBarrier executes a write barrier for every
// pointer that would be copied from [src, src+size) to [dst,
// dst+size) by a memmove using the type bitmap to locate those
// pointer slots.
//
// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
// dst, src, and size must be pointer-aligned.
// The type typ must have a plain bitmap, not a GC program.
// The only use of this function is in channel sends, and the
// 64 kB channel element limit takes care of this for us.
//
// Must not be preempted because it typically runs right before memmove,
// and the GC must observe the barriers and the memmove as a single
// atomic action.
//
// Callers must perform cgo checks if writeBarrier.cgo.
//
//go:nosplit
func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
	if typ == nil {
		throw("runtime: typeBitsBulkBarrier without type")
	}
	if typ.size != size {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if typ.kind&kindGCProg != 0 {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if !writeBarrier.needed {
		return
	}
	ptrmask := typ.gcdata
	buf := &getg().m.p.ptr().wbBuf
	var bits uint32
	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
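		// i is a byte offset advancing one pointer-sized word per iteration;
		// each ptrmask byte covers 8 such words, so a new mask byte is loaded
		// every 8 words and otherwise the previous byte is shifted along.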
		if i&(sys.PtrSize*8-1) == 0 {
			bits = uint32(*ptrmask)
			ptrmask = addb(ptrmask, 1)
		} else {
			bits = bits >> 1
		}
		if bits&1 != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			srcx := (*uintptr)(unsafe.Pointer(src + i))
			if !buf.putFast(*dstx, *srcx) {
				wbBufFlush(nil, 0)
			}
		}
	}
}

// The methods operating on spans all require that h has been returned
// by heapBitsForSpan and that size, n, total are the span layout description
// returned by the mspan's layout method.
// If total > size*n, it means that there is extra leftover memory in the span,
// usually due to rounding.
//
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.

// initSpan initializes the heap bitmap for a span.
// It clears all checkmark bits.
// If this is a span of pointer-sized objects, it initializes all
// words to pointer/scan.
// Otherwise, it initializes all words to scalar/dead.
func (h heapBits) initSpan(s *mspan) {
	// Clear bits corresponding to objects.
	nw := (s.npages << _PageShift) / sys.PtrSize
	if nw%wordsPerBitmapByte != 0 {
		throw("initSpan: unaligned length")
	}
	if h.shift != 0 {
		throw("initSpan: unaligned base")
	}
	isPtrs := sys.PtrSize == 8 && s.elemsize == sys.PtrSize
	for nw > 0 {
		hNext, anw := h.forwardOrBoundary(nw)
		nbyte := anw / wordsPerBitmapByte
		if isPtrs {
			bitp := h.bitp
			for i := uintptr(0); i < nbyte; i++ {
				*bitp = bitPointerAll | bitScanAll
				bitp = add1(bitp)
			}
		} else {
			memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte)
		}
		h = hNext
		nw -= anw
	}
}

// initCheckmarkSpan initializes a span for being checkmarked.
// It clears the checkmark bits, which are set to 1 in normal operation.
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must clear type bit (checkmark bit) of every word.
		// The type bit is the lower of every two-bit pair.
		for i := uintptr(0); i < n; i += wordsPerBitmapByte {
			*h.bitp &^= bitPointerAll
			h = h.forward(wordsPerBitmapByte)
		}
		return
	}
	for i := uintptr(0); i < n; i++ {
		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
		h = h.forward(size / sys.PtrSize)
	}
}

// clearCheckmarkSpan undoes all the checkmarking in a span.
// The actual checkmark bits are ignored, so the only work to do
// is to fix the pointer bits. (Pointer bits are ignored by scanobject
// but consulted by typedmemmove.)
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must set the type bit (used as the checkmark bit) of every word back to 1.
		// The type bit is the lower of every two-bit pair.
		for i := uintptr(0); i < n; i += wordsPerBitmapByte {
			*h.bitp |= bitPointerAll
			h = h.forward(wordsPerBitmapByte)
		}
	}
}

// oneBitCount is indexed by byte and produces the
// number of 1 bits in that byte. For example 128 has 1 bit set
// and oneBitCount[128] will hold 1.
var oneBitCount = [256]uint8{
	0, 1, 1, 2, 1, 2, 2, 3,
	1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7,
	5, 6, 6, 7, 6, 7, 7, 8}

// countAlloc returns the number of objects allocated in span s by
// scanning the allocation bitmap.
// TODO:(rlh) Use popcount intrinsic.
func (s *mspan) countAlloc() int {
	count := 0
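	// Note: this reads gcmarkBits rather than allocBits. countAlloc is
	// called during sweeping, at which point the mark bits describe which
	// objects survive the cycle and so count as allocated.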
	maxIndex := s.nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		mrkBits := *s.gcmarkBits.bytep(i)
		count += int(oneBitCount[mrkBits])
	}
	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
		mrkBits := *s.gcmarkBits.bytep(maxIndex)
		mask := uint8((1 << bitsInLastByte) - 1)
		bits := mrkBits & mask
		count += int(oneBitCount[bits])
	}
	return count
}

// heapBitsSetType records that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
// It is known that the type has pointers somewhere;
// malloc does not call heapBitsSetType when there are no pointers,
// because all free objects are marked as noscan during
// heapBitsSweepSpan.
//
// There can only be one allocation from a given span active at a time,
// and the bitmap for a span always falls on byte boundaries,
// so there are no write-write races for access to the heap bitmap.
// Hence, heapBitsSetType can access the bitmap without atomics.
//
// There can be read-write races between heapBitsSetType and things
// that read the heap bitmap like scanobject. However, since
// heapBitsSetType is only used for objects that have not yet been
// made reachable, readers will ignore bits being modified by this
// function. This does mean this function cannot transiently modify
// bits that belong to neighboring objects. Also, on weakly-ordered
// machines, callers must execute a store/store (publication) barrier
// between calling this function and making the object reachable.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
	const doubleCheck = false // slow but helpful; enable to test modifications to this code

	// dataSize is always size rounded up to the next malloc size class,
	// except in the case of allocating a defer block, in which case
	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
	// arbitrarily larger.
	//
	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
	// assume that dataSize == size without checking it explicitly.

	if sys.PtrSize == 8 && size == sys.PtrSize {
		// It's one word and it has pointers, so it must be a pointer.
		// Since all allocated one-word objects are pointers
		// (non-pointers are aggregated into tinySize allocations),
		// initSpan sets the pointer bits for us. Nothing to do here.
		if doubleCheck {
			h := heapBitsForAddr(x)
			if !h.isPointer() {
				throw("heapBitsSetType: pointer bit missing")
			}
			if !h.morePointers() {
				throw("heapBitsSetType: scan bit missing")
			}
		}
		return
	}

	h := heapBitsForAddr(x)
	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)

	// Heap bitmap bits for a 2-word object are only 4 bits (half a byte),
	// so the bitmap byte is also shared with neighboring objects.
	// This is called out as a special case primarily for 32-bit systems,
	// so that on 32-bit systems the code below can assume all objects
	// are 4-word aligned (because they're all 16-byte aligned).
	if size == 2*sys.PtrSize {
		if typ.size == sys.PtrSize {
			// We're allocating a block big enough to hold two pointers.
			// On 64-bit, that means the actual object must be two pointers,
			// or else we'd have used the one-pointer-sized block.
			// On 32-bit, however, this is the 8-byte block, the smallest one.
			// So it could be that we're allocating one pointer and this was
			// just the smallest block available. Distinguish by checking dataSize.
			// (In general the number of instances of typ being allocated is
			// dataSize/typ.size.)
			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
				// 1 pointer object. On 32-bit machines clear the bit for the
				// unused second word.
				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
				*h.bitp |= (bitPointer | bitScan) << h.shift
			} else {
				// 2-element slice of pointer.
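				// Note: no bitScan is set for the second word; that bit
				// position is the checkmark (see the encoding notes above).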
				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
			}
			return
		}
		// Otherwise typ.size must be 2*sys.PtrSize,
		// and typ.kind&kindGCProg == 0.
		if doubleCheck {
			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
				throw("heapBitsSetType")
			}
		}
		b := uint32(*ptrmask)
		hb := (b & 3) | bitScan
		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
		// 110011 is shifted h.shift and complemented.
		// This clears out the bits that are about to be
		// ored into *h.bitp in the next instructions.
		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
		*h.bitp |= uint8(hb << h.shift)
		return
	}

	// Copy from 1-bit ptrmask into 2-bit bitmap.
	// The basic approach is to use a single uintptr as a bit buffer,
	// alternating between reloading the buffer and writing bitmap bytes.
	// In general, one load can supply two bitmap byte writes.
	// This is a lot of lines of code, but it compiles into relatively few
	// machine instructions.

	outOfPlace := false
	if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrand()%2 == 0) {
		// This object spans heap arenas, so the bitmap may be
		// discontiguous. Unroll it into the object instead
		// and then copy it out.
		//
		// In doubleCheck mode, we randomly do this anyway to
		// stress test the bitmap copying path.
		outOfPlace = true
		h.bitp = (*uint8)(unsafe.Pointer(x))
		h.last = nil
	}

	var (
		// Ptrmask input.
		p     *byte   // last ptrmask byte read
		b     uintptr // ptrmask bits already loaded
		nb    uintptr // number of bits in b at next read
		endp  *byte   // final ptrmask byte to read (then repeat)
		endnb uintptr // number of valid bits in *endp
		pbits uintptr // alternate source of bits

		// Heap bitmap output.
		w     uintptr // words processed
		nw    uintptr // number of words to process
		hbitp *byte   // next heap bitmap byte to write
		hb    uintptr // bits being prepared for *hbitp
	)

	hbitp = h.bitp

	// Handle GC program. Delayed until this part of the code
	// so that we can use the same double-checking mechanism
	// as the 1-bit case. Nothing above could have encountered
	// GC programs: the cases were all too small.
	if typ.kind&kindGCProg != 0 {
		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
		if doubleCheck {
			// Double-check the heap bits written by GC program
			// by running the GC program to create a 1-bit pointer mask
			// and then jumping to the double-check code below.
			// This doesn't catch bugs shared between the 1-bit and 4-bit
			// GC program execution, but it does catch mistakes specific
			// to just one of those and bugs in heapBitsSetTypeGCProg's
			// implementation of arrays.
			lock(&debugPtrmask.lock)
			if debugPtrmask.data == nil {
				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
			}
			ptrmask = debugPtrmask.data
			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
		}
		goto Phase4
	}

	// Note about sizes:
	//
	// typ.size is the number of words in the object,
	// and typ.ptrdata is the number of words in the prefix
	// of the object that contains pointers. That is, the final
	// typ.size - typ.ptrdata words contain no pointers.
	// This allows optimization of a common pattern where
	// an object has a small header followed by a large scalar
	// buffer. If we know the pointers are over, we don't have
	// to scan the buffer's heap bitmap at all.
	// The 1-bit ptrmasks are sized to contain only bits for
	// the typ.ptrdata prefix, zero padded out to a full byte
	// of bitmap. This code sets nw (below) so that heap bitmap
	// bits are only written for the typ.ptrdata prefix; if there is
	// more room in the allocated object, the next heap bitmap
	// entry is a 00, indicating that there are no more pointers
	// to scan. So only the ptrmask for the ptrdata bytes is needed.
	//
	// Replicated copies are not as nice: if there is an array of
	// objects with scalar tails, all but the last tail does have to
	// be initialized, because there is no way to say "skip forward".
	// However, because of the possibility of a repeated type with
	// size not a multiple of 4 pointers (one heap bitmap byte),
	// the code already must handle the last ptrmask byte specially
	// by treating it as containing only the bits for endnb pointers,
	// where endnb <= 4. We represent large scalar tails that must
	// be expanded in the replication by setting endnb larger than 4.
	// This will have the effect of reading many bits out of b,
	// but once the real bits are shifted out, b will supply as many
	// zero bits as we try to read, which is exactly what we need.

	p = ptrmask
	if typ.size < dataSize {
		// Filling in bits for an array of typ.
		// Set up for repetition of ptrmask during main loop.
		// Note that ptrmask describes only a prefix of the type:
		// the typ.ptrdata prefix that can contain pointers.
		const maxBits = sys.PtrSize*8 - 7
		if typ.ptrdata/sys.PtrSize <= maxBits {
			// Entire ptrmask fits in uintptr with room for a byte fragment.
			// Load into pbits and never read from ptrmask again.
			// This is especially important when the ptrmask has
			// fewer than 8 bits in it; otherwise the reload in the middle
			// of the Phase 2 loop would itself need to loop to gather
			// at least 8 bits.

			// Accumulate ptrmask into b.
			// ptrmask is sized to describe only typ.ptrdata, but we record
			// it as describing typ.size bytes, since all the high bits are zero.
			nb = typ.ptrdata / sys.PtrSize
			for i := uintptr(0); i < nb; i += 8 {
				b |= uintptr(*p) << i
				p = add1(p)
			}
			nb = typ.size / sys.PtrSize

			// Replicate ptrmask to fill entire pbits uintptr.
			// Doubling and truncating is fewer steps than
			// iterating by nb each time. (nb could be 1.)
			// Since we loaded typ.ptrdata/sys.PtrSize bits
			// but are pretending to have typ.size/sys.PtrSize,
			// there might be no replication necessary/possible.
			pbits = b
			endnb = nb
			if nb+nb <= maxBits {
				for endnb <= sys.PtrSize*8 {
					pbits |= pbits << endnb
					endnb += endnb
				}
				// Truncate to a multiple of original ptrmask.
				// Because nb+nb <= maxBits, nb fits in a byte.
				// Byte division is cheaper than uintptr division.
				endnb = uintptr(maxBits/byte(nb)) * nb
				pbits &= 1<<endnb - 1
				b = pbits
				nb = endnb
			}

			// Clear p and endp as sentinel for using pbits.
			// Checked during Phase 2 loop.
			p = nil
			endp = nil
		} else {
			// Ptrmask is larger. Read it multiple times.
			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
			endp = addb(ptrmask, n)
			endnb = typ.size/sys.PtrSize - n*8
		}
	}
	if p != nil {
		b = uintptr(*p)
		p = add1(p)
		nb = 8
	}

	if typ.size == dataSize {
		// Single entry: can stop once we reach the non-pointer data.
		nw = typ.ptrdata / sys.PtrSize
	} else {
		// Repeated instances of typ in an array.
		// Have to process first N-1 entries in full, but can stop
		// once we reach the non-pointer data in the final entry.
		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
	}
	if nw == 0 {
		// No pointers! Caller was supposed to check.
		println("runtime: invalid type ", typ.string())
		throw("heapBitsSetType: called with non-pointer type")
		return
	}
	if nw < 2 {
		// Must write at least 2 words, because the "no scan"
		// encoding doesn't take effect until the third word.
		nw = 2
	}

	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
	// The leading byte is special because it contains the bits for word 1,
	// which does not have the scan bit set.
	// The leading half-byte is special because it's a half a byte,
	// so we have to be careful with the bits already there.
	switch {
	default:
		throw("heapBitsSetType: unexpected shift")

	case h.shift == 0:
		// Ptrmask and heap bitmap are aligned.
		// Handle first byte of bitmap specially.
		//
		// The first byte we write out covers the first four
		// words of the object. The scan/dead bit on the first
		// word must be set to scan since there are pointers
		// somewhere in the object. The scan/dead bit on the
		// second word is the checkmark, so we don't set it.
		// In all following words, we set the scan bit to
		// indicate that the object continues to the next 2-bit
		// entry in the bitmap.
		//
		// TODO: It doesn't matter if we set the checkmark, so
		// maybe this case isn't needed any more.
		hb = b & bitPointerAll
		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
		if w += 4; w >= nw {
			goto Phase3
		}
		*hbitp = uint8(hb)
		hbitp = add1(hbitp)
		b >>= 4
		nb -= 4

	case sys.PtrSize == 8 && h.shift == 2:
		// Ptrmask and heap bitmap are misaligned.
		// The bits for the first two words are in a byte shared
		// with another object, so we must be careful with the bits
		// already there.
		// We took care of 1-word and 2-word objects above,
		// so this is at least a 6-word object.
		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
		// This is not noscan, so set the scan bit in the
		// first word.
		hb |= bitScan << (2 * heapBitsShift)
		b >>= 2
		nb -= 2
		// Note: no bitScan for second word because that's
		// the checkmark.
		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
		*hbitp |= uint8(hb)
		hbitp = add1(hbitp)
		if w += 2; w >= nw {
			// We know that there is more data, because we handled 2-word objects above.
			// This must be at least a 6-word object. If we're out of pointer words,
			// mark no scan in next bitmap byte and finish.
			hb = 0
			w += 4
			goto Phase3
		}
	}

	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
	// The loop computes the bits for that last write but does not execute the write;
	// it leaves the bits in hb for processing by phase 3.
	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
	// use in the first half of the loop right now, and then we only adjust nb explicitly
	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
	nb -= 4
	for {
		// Emit bitmap byte.
		// b has at least nb+4 bits, with one exception:
		// if w+4 >= nw, then b has only nw-w bits,
		// but we'll stop at the break and then truncate
		// appropriately in Phase 3.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = add1(hbitp)
		b >>= 4

		// Load more bits. b has nb right now.
		if p != endp {
			// Fast path: keep reading from ptrmask.
			// nb unmodified: we just loaded 8 bits,
			// and the next iteration will consume 8 bits,
			// leaving us with the same nb the next time we're here.
			if nb < 8 {
				b |= uintptr(*p) << nb
				p = add1(p)
			} else {
				// Reduce the number of bits in b.
				// This is important if we skipped
				// over a scalar tail, since nb could
				// be larger than the bit width of b.
				nb -= 8
			}
		} else if p == nil {
			// Almost as fast path: track bit count and refill from pbits.
			// For short repetitions.
			if nb < 8 {
				b |= pbits << nb
				nb += endnb
			}
			nb -= 8 // for next iteration
		} else {
			// Slow path: reached end of ptrmask.
			// Process final partial byte and rewind to start.
			b |= uintptr(*p) << nb
			nb += endnb
			if nb < 8 {
				b |= uintptr(*ptrmask) << nb
				p = add1(ptrmask)
			} else {
				nb -= 8
				p = ptrmask
			}
		}

		// Emit bitmap byte.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = add1(hbitp)
		b >>= 4
	}

Phase3:
	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
	if w > nw {
		// Counting the 4 entries in hb not yet written to memory,
		// there are more entries than possible pointer slots.
		// Discard the excess entries (can't be more than 3).
		mask := uintptr(1)<<(4-(w-nw)) - 1
		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
	}

	// Change nw from counting possibly-pointer words to total words in allocation.
	nw = size / sys.PtrSize

	// Write whole bitmap bytes.
	// The first is hb, the rest are zero.
	if w <= nw {
		*hbitp = uint8(hb)
		hbitp = add1(hbitp)
		hb = 0 // for possible final half-byte below
		for w += 4; w <= nw; w += 4 {
			*hbitp = 0
			hbitp = add1(hbitp)
		}
	}

	// Write final partial bitmap byte if any.
	// We know w > nw, or else we'd still be in the loop above.
	// It can be bigger only due to the 4 entries in hb that it counts.
	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
	// and can discard the 4 sitting in hb.
	// But if w == nw+2, we need to write first two in hb.
	// The byte is shared with the next object, so be careful with
	// existing bits.
	if w == nw+2 {
		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
	}

Phase4:
	// Phase 4: Copy unrolled bitmap to per-arena bitmaps, if necessary.
	if outOfPlace {
		// TODO: We could probably make this faster by
		// handling [x+dataSize, x+size) specially.
		h := heapBitsForAddr(x)
		// cnw is the number of heap words, or bit pairs
		// remaining (like nw above).
		cnw := size / sys.PtrSize
		src := (*uint8)(unsafe.Pointer(x))
		// We know the first and last byte of the bitmap are
		// not the same, but it's still possible for a small
		// object that spans arenas to share bitmap bytes
		// with neighboring objects.
		//
		// Handle the first byte specially if it's shared. See
		// Phase 1 for why this is the only special case we need.
		if doubleCheck {
			if !(h.shift == 0 || (sys.PtrSize == 8 && h.shift == 2)) {
				print("x=", x, " size=", size, " h.shift=", h.shift, "\n")
				throw("bad start shift")
			}
		}
		if sys.PtrSize == 8 && h.shift == 2 {
			*h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src
			h = h.next().next()
			cnw -= 2
			src = addb(src, 1)
		}
		// We're now byte aligned. Copy out to per-arena
		// bitmaps until the last byte (which may again be
		// partial).
		for cnw >= 4 {
			// This loop processes four words at a time,
			// so round cnw down accordingly.
			hNext, words := h.forwardOrBoundary(cnw / 4 * 4)

			// n is the number of bitmap bytes to copy.
			n := words / 4
			memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n)
			cnw -= words
			h = hNext
			src = addb(src, n)
		}
		if doubleCheck && h.shift != 0 {
			print("cnw=", cnw, " h.shift=", h.shift, "\n")
			throw("bad shift after block copy")
		}
		// Handle the last byte if it's shared.
		if cnw == 2 {
			*h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | *src
			src = addb(src, 1)
			h = h.next().next()
		}
		if doubleCheck {
			if uintptr(unsafe.Pointer(src)) > x+size {
				throw("copy exceeded object size")
			}
			if !(cnw == 0 || cnw == 2) {
				print("x=", x, " size=", size, " cnw=", cnw, "\n")
				throw("bad number of remaining words")
			}
			// Set up hbitp so doubleCheck code below can check it.
			hbitp = h.bitp
		}
		// Zero the object where we wrote the bitmap.
		memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x)
	}

	// Double check the whole bitmap.
	if doubleCheck {
		// x+size may not point to the heap, so back up one
		// word and then call next().
		end := heapBitsForAddr(x + size - sys.PtrSize).next()
		endAI := arenaIdx(end.arena)
		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0])) {
			// The unrolling code above walks hbitp just
			// past the bitmap without moving to the next
			// arena. Synthesize this for end.bitp.
			end.arena--
			endAI = arenaIdx(end.arena)
			end.bitp = addb(&mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0], heapArenaBitmapBytes)
			end.last = nil
		}
		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
			h0 := heapBitsForAddr(x)
			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
			throw("bad heapBitsSetType")
		}

		// Double-check that bits to be written were written correctly.
		// Does not check that other bits were not written, unfortunately.
		h := heapBitsForAddr(x)
		nptr := typ.ptrdata / sys.PtrSize
		ndata := typ.size / sys.PtrSize
		count := dataSize / typ.size
		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
		for i := uintptr(0); i < size/sys.PtrSize; i++ {
			j := i % ndata
			var have, want uint8
			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
			if i >= totalptr {
				want = 0 // deadmarker
				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
					want = bitScan
				}
			} else {
				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
					want |= bitPointer
				}
				if i != 1 {
					want |= bitScan
				} else {
					have &^= bitScan
				}
			}
			if have != want {
				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
				print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n")
				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
				h0 := heapBitsForAddr(x)
				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
				println("at word", i, "offset", i*sys.PtrSize, "have", hex(have), "want", hex(want))
				if typ.kind&kindGCProg != 0 {
					println("GC program:")
					dumpGCProg(addb(typ.gcdata, 4))
				}
				throw("bad heapBitsSetType")
			}
			h = h.next()
		}
		if ptrmask == debugPtrmask.data {
			unlock(&debugPtrmask.lock)
		}
	}
}
1533
1534var debugPtrmask struct {
1535	lock mutex
1536	data *byte
1537}
1538
1539// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
1540// progSize is the size of the memory described by the program.
1541// elemSize is the size of the element that the GC program describes (a prefix of).
1542// dataSize is the total size of the intended data, a multiple of elemSize.
1543// allocSize is the total size of the allocated memory.
1544//
1545// GC programs are only used for large allocations.
1546// heapBitsSetType requires that allocSize is a multiple of 4 words,
1547// so that the relevant bitmap bytes are not shared with surrounding
1548// objects.
1549func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
1550	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
1551		// Alignment will be wrong.
1552		throw("heapBitsSetTypeGCProg: small allocation")
1553	}
1554	var totalBits uintptr
1555	if elemSize == dataSize {
1556		totalBits = runGCProg(prog, nil, h.bitp, 2)
1557		if totalBits*sys.PtrSize != progSize {
1558			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
1559			throw("heapBitsSetTypeGCProg: unexpected bit count")
1560		}
1561	} else {
1562		count := dataSize / elemSize
1563
1564		// Piece together program trailer to run after prog that does:
1565		//	literal(0)
1566		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
1567		//	repeat(elemSize, count-1) // repeat that element for count
1568		// This zero-pads the data remaining in the first element and then
1569		// repeats that first element to fill the array.
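		// For example, with sizes assumed purely for illustration
		// (8-byte words, progSize = 2 words, elemSize = 8 words),
		// the trailer built below would be:
		//	literal(0)         // one zero word after the program's 2 words
		//	repeat(1, 5)       // five more zero words, padding to 8
		//	repeat(8, count-1) // repeat the 8-word element count-1 times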
1570		var trailer [40]byte // 3 varints (max 10 each) + some bytes
1571		i := 0
1572		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
1573			// literal(0)
1574			trailer[i] = 0x01
1575			i++
1576			trailer[i] = 0
1577			i++
1578			if n > 1 {
1579				// repeat(1, n-1)
1580				trailer[i] = 0x81
1581				i++
1582				n--
1583				for ; n >= 0x80; n >>= 7 {
1584					trailer[i] = byte(n | 0x80)
1585					i++
1586				}
1587				trailer[i] = byte(n)
1588				i++
1589			}
1590		}
1591		// repeat(elemSize/ptrSize, count-1)
1592		trailer[i] = 0x80
1593		i++
1594		n := elemSize / sys.PtrSize
1595		for ; n >= 0x80; n >>= 7 {
1596			trailer[i] = byte(n | 0x80)
1597			i++
1598		}
1599		trailer[i] = byte(n)
1600		i++
1601		n = count - 1
1602		for ; n >= 0x80; n >>= 7 {
1603			trailer[i] = byte(n | 0x80)
1604			i++
1605		}
1606		trailer[i] = byte(n)
1607		i++
1608		trailer[i] = 0
1609		i++
1610
1611		runGCProg(prog, &trailer[0], h.bitp, 2)
1612
1613		// Even though we filled in the full array just now,
1614		// record that we only filled in up to the ptrdata of the
1615		// last element. This will cause the code below to
1616		// memclr the dead section of the final array element,
1617		// so that scanobject can stop early in the final element.
1618		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
1619	}
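	// Each bitmap byte describes 4 words (2 bits per word), so the
	// program's totalBits entries occupy (totalBits+3)/4 bitmap bytes.
	// Zeroing from there to the end of the allocation's bitmap leaves
	// the remaining words in the zero ("dead") encoding, so scanobject
	// can stop early, as noted above.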
1620	endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4))
1621	endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/sys.PtrSize/wordsPerBitmapByte))
1622	memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg))
1623}
1624
1625// progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
// size is the size of the region described by prog, in bytes.
1627// The resulting bitvector will have no more than size/sys.PtrSize bits.
1628func progToPointerMask(prog *byte, size uintptr) bitvector {
1629	n := (size/sys.PtrSize + 7) / 8
1630	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
1631	x[len(x)-1] = 0xa1 // overflow check sentinel
1632	n = runGCProg(prog, nil, &x[0], 1)
1633	if x[len(x)-1] != 0xa1 {
1634		throw("progToPointerMask: overflow")
1635	}
1636	return bitvector{int32(n), &x[0]}
1637}
1638
1639// Packed GC pointer bitmaps, aka GC programs.
1640//
1641// For large types containing arrays, the type information has a
1642// natural repetition that can be encoded to save space in the
1643// binary and in the memory representation of the type information.
1644//
1645// The encoding is a simple Lempel-Ziv style bytecode machine
1646// with the following instructions:
1647//
1648//	00000000: stop
1649//	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
1650//	10000000 n c: repeat the previous n bits c times; n, c are varints
1651//	1nnnnnnn c: repeat the previous n bits c times; c is a varint
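//
// For example (an illustrative encoding, not output captured from any
// compiler), the 1-bit pointer mask 1 1 0 0 0 0 0 0 repeated four times,
// say for an array of four 8-word structs whose first two words are
// pointers, could be encoded as:
//
//	0x08 0x03	emit 8 bits: 1 1 0 0 0 0 0 0 (data byte 0x03, low bit first)
//	0x88 0x03	repeat the previous 8 bits 3 more times
//	0x00		stop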
1652
1653// runGCProg executes the GC program prog, and then trailer if non-nil,
1654// writing to dst with entries of the given size.
1655// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
// If size == 2, dst is the 2-bit heap bitmap, and writes move forward from
// dst with the scan bits set in each written byte. In this case, the caller
// guarantees that only whole bytes in dst need to be written.
// (A simplified reference decoding, for illustration only, is sketched
// after this function.)
1659//
1660// runGCProg returns the number of 1- or 2-bit entries written to memory.
1661func runGCProg(prog, trailer, dst *byte, size int) uintptr {
1662	dstStart := dst
1663
1664	// Bits waiting to be written to memory.
1665	var bits uintptr
1666	var nbits uintptr
1667
1668	p := prog
1669Run:
1670	for {
1671		// Flush accumulated full bytes.
1672		// The rest of the loop assumes that nbits <= 7.
1673		for ; nbits >= 8; nbits -= 8 {
1674			if size == 1 {
1675				*dst = uint8(bits)
1676				dst = add1(dst)
1677				bits >>= 8
1678			} else {
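				// For the 2-bit bitmap, each output byte holds 4
				// pointer bits in its low half and 4 scan bits in
				// its high half, so 8 buffered bits fill two bytes.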
1679				v := bits&bitPointerAll | bitScanAll
1680				*dst = uint8(v)
1681				dst = add1(dst)
1682				bits >>= 4
1683				v = bits&bitPointerAll | bitScanAll
1684				*dst = uint8(v)
1685				dst = add1(dst)
1686				bits >>= 4
1687			}
1688		}
1689
1690		// Process one instruction.
1691		inst := uintptr(*p)
1692		p = add1(p)
1693		n := inst & 0x7F
1694		if inst&0x80 == 0 {
1695			// Literal bits; n == 0 means end of program.
1696			if n == 0 {
1697				// Program is over; continue in trailer if present.
1698				if trailer != nil {
1699					p = trailer
1700					trailer = nil
1701					continue
1702				}
1703				break Run
1704			}
1705			nbyte := n / 8
1706			for i := uintptr(0); i < nbyte; i++ {
1707				bits |= uintptr(*p) << nbits
1708				p = add1(p)
1709				if size == 1 {
1710					*dst = uint8(bits)
1711					dst = add1(dst)
1712					bits >>= 8
1713				} else {
1714					v := bits&0xf | bitScanAll
1715					*dst = uint8(v)
1716					dst = add1(dst)
1717					bits >>= 4
1718					v = bits&0xf | bitScanAll
1719					*dst = uint8(v)
1720					dst = add1(dst)
1721					bits >>= 4
1722				}
1723			}
1724			if n %= 8; n > 0 {
1725				bits |= uintptr(*p) << nbits
1726				p = add1(p)
1727				nbits += n
1728			}
1729			continue Run
1730		}
1731
1732		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
1733		if n == 0 {
1734			for off := uint(0); ; off += 7 {
1735				x := uintptr(*p)
1736				p = add1(p)
1737				n |= (x & 0x7F) << off
1738				if x&0x80 == 0 {
1739					break
1740				}
1741			}
1742		}
1743
1744		// Count is encoded in a varint in the next bytes.
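		// (Varints are little-endian base-128: for example, the bytes
		// 0x96 0x01 decode to 0x16 + 1<<7 = 150.)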
1745		c := uintptr(0)
1746		for off := uint(0); ; off += 7 {
1747			x := uintptr(*p)
1748			p = add1(p)
1749			c |= (x & 0x7F) << off
1750			if x&0x80 == 0 {
1751				break
1752			}
1753		}
1754		c *= n // now total number of bits to copy
1755
1756		// If the number of bits being repeated is small, load them
1757		// into a register and use that register for the entire loop
1758		// instead of repeatedly reading from memory.
1759		// Handling fewer than 8 bits here makes the general loop simpler.
1760		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
1761		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
1762		// it will not overflow.
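		// For example, a 3-bit pattern repeated many times is first
		// replicated to fill the register (to maxBits/3*3 = 57 bits
		// on 64-bit), then flushed nearly a whole register at a time.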
1763		src := dst
1764		const maxBits = sys.PtrSize*8 - 7
1765		if n <= maxBits {
1766			// Start with bits in output buffer.
1767			pattern := bits
1768			npattern := nbits
1769
1770			// If we need more bits, fetch them from memory.
1771			if size == 1 {
1772				src = subtract1(src)
1773				for npattern < n {
1774					pattern <<= 8
1775					pattern |= uintptr(*src)
1776					src = subtract1(src)
1777					npattern += 8
1778				}
1779			} else {
1780				src = subtract1(src)
1781				for npattern < n {
1782					pattern <<= 4
1783					pattern |= uintptr(*src) & 0xf
1784					src = subtract1(src)
1785					npattern += 4
1786				}
1787			}
1788
1789			// We started with the whole bit output buffer,
1790			// and then we loaded bits from whole bytes.
1791			// Either way, we might now have too many instead of too few.
1792			// Discard the extra.
1793			if npattern > n {
1794				pattern >>= npattern - n
1795				npattern = n
1796			}
1797
1798			// Replicate pattern to at most maxBits.
1799			if npattern == 1 {
1800				// One bit being repeated.
1801				// If the bit is 1, make the pattern all 1s.
1802				// If the bit is 0, the pattern is already all 0s,
1803				// but we can claim that the number of bits
1804				// in the word is equal to the number we need (c),
1805				// because right shift of bits will zero fill.
1806				if pattern == 1 {
1807					pattern = 1<<maxBits - 1
1808					npattern = maxBits
1809				} else {
1810					npattern = c
1811				}
1812			} else {
1813				b := pattern
1814				nb := npattern
1815				if nb+nb <= maxBits {
1816					// Double pattern until the whole uintptr is filled.
1817					for nb <= sys.PtrSize*8 {
1818						b |= b << nb
1819						nb += nb
1820					}
1821					// Trim away incomplete copy of original pattern in high bits.
1822					// TODO(rsc): Replace with table lookup or loop on systems without divide?
1823					nb = maxBits / npattern * npattern
1824					b &= 1<<nb - 1
1825					pattern = b
1826					npattern = nb
1827				}
1828			}
1829
1830			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
1831			// Since pattern contains >8 bits, there will be full bytes to flush
1832			// on each iteration.
1833			for ; c >= npattern; c -= npattern {
1834				bits |= pattern << nbits
1835				nbits += npattern
1836				if size == 1 {
1837					for nbits >= 8 {
1838						*dst = uint8(bits)
1839						dst = add1(dst)
1840						bits >>= 8
1841						nbits -= 8
1842					}
1843				} else {
1844					for nbits >= 4 {
1845						*dst = uint8(bits&0xf | bitScanAll)
1846						dst = add1(dst)
1847						bits >>= 4
1848						nbits -= 4
1849					}
1850				}
1851			}
1852
1853			// Add final fragment to bit buffer.
1854			if c > 0 {
1855				pattern &= 1<<c - 1
1856				bits |= pattern << nbits
1857				nbits += c
1858			}
1859			continue Run
1860		}
1861
1862		// Repeat; n too large to fit in a register.
1863		// Since nbits <= 7, we know the first few bytes of repeated data
1864		// are already written to memory.
1865		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
1866		if size == 1 {
1867			// Leading src fragment.
1868			src = subtractb(src, (off+7)/8)
1869			if frag := off & 7; frag != 0 {
1870				bits |= uintptr(*src) >> (8 - frag) << nbits
1871				src = add1(src)
1872				nbits += frag
1873				c -= frag
1874			}
1875			// Main loop: load one byte, write another.
1876			// The bits are rotating through the bit buffer.
1877			for i := c / 8; i > 0; i-- {
1878				bits |= uintptr(*src) << nbits
1879				src = add1(src)
1880				*dst = uint8(bits)
1881				dst = add1(dst)
1882				bits >>= 8
1883			}
1884			// Final src fragment.
1885			if c %= 8; c > 0 {
1886				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
1887				nbits += c
1888			}
1889		} else {
1890			// Leading src fragment.
1891			src = subtractb(src, (off+3)/4)
1892			if frag := off & 3; frag != 0 {
1893				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
1894				src = add1(src)
1895				nbits += frag
1896				c -= frag
1897			}
1898			// Main loop: load one byte, write another.
1899			// The bits are rotating through the bit buffer.
1900			for i := c / 4; i > 0; i-- {
1901				bits |= (uintptr(*src) & 0xf) << nbits
1902				src = add1(src)
1903				*dst = uint8(bits&0xf | bitScanAll)
1904				dst = add1(dst)
1905				bits >>= 4
1906			}
1907			// Final src fragment.
1908			if c %= 4; c > 0 {
1909				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
1910				nbits += c
1911			}
1912		}
1913	}
1914
1915	// Write any final bits out, using full-byte writes, even for the final byte.
1916	var totalBits uintptr
1917	if size == 1 {
1918		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
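		// Round nbits up to a whole number of bytes: -nbits & 7 is the
		// count needed to reach the next multiple of 8 (0 if already
		// aligned). The 2-bit case below rounds to a multiple of 4 the
		// same way.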
1919		nbits += -nbits & 7
1920		for ; nbits > 0; nbits -= 8 {
1921			*dst = uint8(bits)
1922			dst = add1(dst)
1923			bits >>= 8
1924		}
1925	} else {
1926		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits
1927		nbits += -nbits & 3
1928		for ; nbits > 0; nbits -= 4 {
1929			v := bits&0xf | bitScanAll
1930			*dst = uint8(v)
1931			dst = add1(dst)
1932			bits >>= 4
1933		}
1934	}
1935	return totalBits
1936}
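
// The sketch below is an illustrative, simplified version of the 1-bit
// decoding that runGCProg performs, written without the pattern-register
// and rotating-buffer optimizations. It is not used by the runtime: the
// name runGCProgExample is hypothetical, it works on byte slices instead
// of raw pointers, and it assumes a well-formed program with no trailer.
func runGCProgExample(prog []byte) (mask []bool) {
	readVarint := func() int {
		v := 0
		for shift := uint(0); ; shift += 7 {
			x := prog[0]
			prog = prog[1:]
			v |= int(x&0x7f) << shift
			if x&0x80 == 0 {
				return v
			}
		}
	}
	for {
		inst := prog[0]
		prog = prog[1:]
		n := int(inst & 0x7f)
		if inst&0x80 == 0 {
			if n == 0 {
				// 00000000: stop.
				return mask
			}
			// 0nnnnnnn: emit n literal bits from the next (n+7)/8
			// bytes, consuming each byte starting with its low bit.
			for i := 0; i < n; i++ {
				mask = append(mask, prog[i/8]>>(uint(i)%8)&1 != 0)
			}
			prog = prog[(n+7)/8:]
			continue
		}
		// 10000000 n c, or 1nnnnnnn c: repeat the previous n bits c
		// times. n comes from the instruction unless it is 0, in which
		// case it is a varint; c is always a varint.
		if n == 0 {
			n = readVarint()
		}
		c := readVarint()
		tail := mask[len(mask)-n:]
		for ; c > 0; c-- {
			mask = append(mask, tail...)
		}
	}
}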
1937
1938// materializeGCProg allocates space for the (1-bit) pointer bitmask
1939// for an object of size ptrdata.  Then it fills that space with the
1940// pointer bitmask specified by the program prog.
1941// The bitmask starts at s.startAddr.
1942// The result must be deallocated with dematerializeGCProg.
1943func materializeGCProg(ptrdata uintptr, prog *byte) *mspan {
1944	// Each word of ptrdata needs one bit in the bitmap.
1945	bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize)
1946	// Compute the number of pages needed for bitmapBytes.
1947	pages := divRoundUp(bitmapBytes, pageSize)
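	// For example (numbers for illustration only): on a 64-bit system,
	// ptrdata of 1MiB describes 128Ki words, needing 16KiB of bitmap,
	// which rounds up to two 8KiB pages.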
1948	s := mheap_.allocManual(pages, &memstats.gc_sys)
1949	runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
1950	return s
1951}
1952func dematerializeGCProg(s *mspan) {
1953	mheap_.freeManual(s, &memstats.gc_sys)
1954}
1955
1956func dumpGCProg(p *byte) {
1957	nptr := 0
1958	for {
1959		x := *p
1960		p = add1(p)
1961		if x == 0 {
1962			print("\t", nptr, " end\n")
1963			break
1964		}
1965		if x&0x80 == 0 {
1966			print("\t", nptr, " lit ", x, ":")
1967			n := int(x+7) / 8
1968			for i := 0; i < n; i++ {
1969				print(" ", hex(*p))
1970				p = add1(p)
1971			}
1972			print("\n")
1973			nptr += int(x)
1974		} else {
1975			nbit := int(x &^ 0x80)
1976			if nbit == 0 {
1977				for nb := uint(0); ; nb += 7 {
1978					x := *p
1979					p = add1(p)
1980					nbit |= int(x&0x7f) << nb
1981					if x&0x80 == 0 {
1982						break
1983					}
1984				}
1985			}
1986			count := 0
1987			for nb := uint(0); ; nb += 7 {
1988				x := *p
1989				p = add1(p)
1990				count |= int(x&0x7f) << nb
1991				if x&0x80 == 0 {
1992					break
1993				}
1994			}
1995			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
1996			nptr += nbit * count
1997		}
1998	}
1999}
2000
2001// Testing.
2002
2003// gcbits returns the GC type info for x, for testing.
2004// The result is the bitmap entries (0 or 1), one entry per byte.
2005//go:linkname reflect_gcbits reflect.gcbits
2006func reflect_gcbits(x interface{}) []byte {
2007	ret := getgcmask(x)
2008	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
2009	nptr := typ.ptrdata / sys.PtrSize
2010	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
2011		ret = ret[:len(ret)-1]
2012	}
2013	return ret
2014}
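
// For example (an assumption shown for illustration, not a test vector):
// for a heap-allocated value of type struct{ p *int; n uintptr; q *int },
// reflect_gcbits would be expected to return [1 0 1]: the first and third
// words hold pointers, and ptrdata covers three words.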
2015
2016// Returns GC type info for the pointer stored in ep for testing.
2017// If ep points to the stack, only static live information will be returned
2018// (i.e. not for objects which are only dynamically live stack objects).
2019func getgcmask(ep interface{}) (mask []byte) {
2020	e := *efaceOf(&ep)
2021	p := e.data
2022	t := e._type
2023	// data or bss
2024	roots := gcRoots
2025	for roots != nil {
2026		for i := 0; i < roots.count; i++ {
2027			pr := roots.roots[i]
2028			addr := uintptr(pr.decl)
			if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
				n := (*ptrtype)(unsafe.Pointer(t)).elem.size
				mask = make([]byte, n/sys.PtrSize)
				copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
				return
			}
2035		}
2036		roots = roots.next
2037	}
2038
2039	// heap
2040	if base, s, _ := findObject(uintptr(p), 0, 0, false); base != 0 {
2041		hbits := heapBitsForAddr(base)
2042		n := s.elemsize
2043		mask = make([]byte, n/sys.PtrSize)
2044		for i := uintptr(0); i < n; i += sys.PtrSize {
2045			if hbits.isPointer() {
2046				mask[i/sys.PtrSize] = 1
2047			}
2048			if i != 1*sys.PtrSize && !hbits.morePointers() {
2049				mask = mask[:i/sys.PtrSize]
2050				break
2051			}
2052			hbits = hbits.next()
2053		}
2054		return
2055	}
2056
	// Otherwise, this is not something the GC knows about.
	// It is possibly read-only data, like malloc(0).
	// It must not have pointers.
	// For gccgo, it may live on the stack, which is collected conservatively.
2061	return
2062}
2063