// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Green Tea mark algorithm
//
// The core idea behind Green Tea is simple: achieve better locality during
// mark/scan by delaying scanning so that we can accumulate objects to scan
// within the same span, then scan the objects that have accumulated on the
// span all together.
//
// By batching objects this way, we increase the chance that adjacent objects
// will be accessed, amortize the cost of accessing object metadata, and create
// better opportunities for prefetching. We can take this even further and
// optimize the scan loop by size class (not yet completed) all the way to the
// point of applying SIMD techniques to really tear through the heap.
//
// Naturally, this depends on being able to create opportunities to batch objects
// together. The basic idea here is to have two sets of mark bits. One set is the
// regular set of mark bits ("marks"), while the other essentially says that the
// objects have been scanned already ("scans"). When we see a pointer for the first
// time we set its mark and enqueue its span. We track these spans in work queues
// with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a
// FIFO policy appears to work best for accumulating objects to scan on a span.
// Later, when we dequeue the span, we find both the union and intersection of the
// mark and scan bitsets. The union is then written back into the scan bits, while
// the intersection is used to decide which objects need scanning, such that the GC
// is still precise.
//
// Below is the bulk of the implementation, focusing on the worst case
// for locality, small objects. Specifically, those that are smaller than
// a few cache lines in size and whose metadata is stored the same way (at the
// end of the span).

//go:build goexperiment.greenteagc

package runtime

import (
	"internal/goarch"
	"internal/runtime/atomic"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"internal/runtime/sys"
	"unsafe"
)

const doubleCheckGreenTea = false

// spanInlineMarkBits are mark bits that are inlined into the span
// itself. gcUsesSpanInlineMarkBits may be used to check if objects
// of a particular size use inline mark bits.
//
// Inline mark bits are a little bit more than just mark bits. They
// consist of two parts: scans and marks. Marks are like pre-mark
// bits. They're set once a pointer to an object is discovered for
// the first time. The marks allow us to scan many objects in bulk
// if we queue the whole span for scanning. Before we scan such objects
// in bulk, we copy the marks to the scans, computing a diff along the
// way. The resulting bitmap tells us which objects we should scan.
//
// The inlineMarkBits also hold state sufficient for scanning any
// object in the span, as well as state for acquiring ownership of
// the span for queuing. This avoids the need to look at the mspan when
// scanning.
type spanInlineMarkBits struct {
	scans [63]uint8         // scanned bits.
	owned spanScanOwnership // see the comment on spanScanOwnership.
	marks [63]uint8         // mark bits.
	class spanClass
}

// spanScanOwnership indicates whether some thread has acquired
// the span for scanning, and whether there has been one or more
// attempts to acquire the span. The latter information helps to
// fast-track span scans that only apply to a single mark, skipping
// the relatively costly merge-and-diff process for scans and marks
// by allowing one to just set the mark directly.
type spanScanOwnership uint8

const (
	spanScanUnowned spanScanOwnership = 0 // Indicates the span is not acquired for scanning.
	spanScanOneMark  = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
	spanScanManyMark             // Indicates one or more scan bits may be set relative to the mark bits.

	// "ManyMark" need not be exactly the value it has. In practice we just
	// want to distinguish "none" from "one" from "many," so a comparison is
	// sufficient (as opposed to a bit test) to check between these cases.
)

// load atomically loads from a pointer to a spanScanOwnership.
func (o *spanScanOwnership) load() spanScanOwnership {
	return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
}

func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
	// N.B. We round down the address and use Or32 because Or8 doesn't
	// return a result, and it's strictly necessary for this protocol.
	//
	// Making Or8 return a result, while making the code look nicer, would
	// not be strictly better on any supported platform, as an Or8 that
	// returns a result is not a common instruction. On many platforms it
	// would be implemented exactly as it is here, and since Or8 is
	// exclusively used in the runtime and a hot function, we want to keep
	// using its no-result version elsewhere for performance.
	o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
	off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
	if goarch.BigEndian {
		off = 32 - off - 8
	}
	return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
}

func (imb *spanInlineMarkBits) init(class spanClass, needzero bool) {
	if imb == nil {
		// This nil check and throw is almost pointless. Normally we would
		// expect imb to never be nil. However, this is called on potentially
		// freshly-allocated virtual memory. As of 2025, the compiler-inserted
		// nil check is not a branch but a memory read that we expect to fault
		// if the pointer really is nil.
		//
		// However, this causes a read of the page, and operating systems may
		// take it as a hint to back the accessed memory with a read-only zero
		// page. However, we immediately write to this memory, which can then
		// force operating systems to have to update the page table and flush
		// the TLB, causing a lot of churn for programs that are short-lived
		// and monotonically grow in size.
		//
		// This nil check is thus an explicit branch instead of what the compiler
		// would insert circa 2025, which is a memory read instruction.
		//
		// See go.dev/issue/74375 for details.
		throw("runtime: span inline mark bits nil?")
	}
	if needzero {
		// Use memclrNoHeapPointers to avoid having the compiler make a worse
		// decision. We know that imb is both aligned and a nice power-of-two
		// size that works well for wider SIMD instructions. The compiler likely
		// has no idea that imb is aligned to 128 bytes.
		memclrNoHeapPointers(unsafe.Pointer(imb), unsafe.Sizeof(spanInlineMarkBits{}))
	}
	imb.class = class
}

// tryAcquire attempts to acquire the span for scanning. On success, the caller
// must queue the span for scanning or scan the span immediately.
func (imb *spanInlineMarkBits) tryAcquire() bool {
	switch imb.owned.load() {
	case spanScanUnowned:
		// Try to mark the span as having only one object marked.
		if imb.owned.or(spanScanOneMark) == spanScanUnowned {
			return true
		}
		// If we didn't see an old value of spanScanUnowned, then we must
		// have raced with someone else and seen spanScanOneMark or greater.
		// Fall through and try to set spanScanManyMark.
		fallthrough
	case spanScanOneMark:
		// We may be the first to set *any* bit on owned. In such a case,
		// we still need to make sure the span is queued.
		return imb.owned.or(spanScanManyMark) == spanScanUnowned
	}
	return false
}

// release releases the span for scanning, allowing another thread to queue the span.
//
// Returns an upper bound on the number of mark bits set since the span was queued. The
// upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or
// without spanScanOneMark). If the return value indicates only one mark bit was set, the
// caller can be certain that it was the same mark bit that caused the span to get queued.
// Take note of the fact that this is *only* an upper-bound. In particular, it may still
// turn out that only one mark bit was set, even if the return value indicates "many".
func (imb *spanInlineMarkBits) release() spanScanOwnership {
	return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned)))
}

// spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base.
//
// The span must be gcUsesSpanInlineMarkBits(span.elemsize).
func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits {
	return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})))
}

// initInlineMarkBits initializes the inlineMarkBits stored at the end of the span.
func (s *mspan) initInlineMarkBits() {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	// Zeroing is only necessary if this span wasn't just freshly allocated from the OS.
	s.inlineMarkBits().init(s.spanclass, s.needzero != 0)
}

// moveInlineMarks merges the span's inline mark bits into dst and clears them.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) moveInlineMarks(dst *gcBits) {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	bytes := divRoundUp(uintptr(s.nelems), 8)
	imb := s.inlineMarkBits()
	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
		marks := bswapIfBigEndian(imbMarks[i/goarch.PtrSize])
		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
		}
		*(*uintptr)(unsafe.Pointer(dst.bytep(i))) |= bswapIfBigEndian(marks)
	}
	if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans {
		throw("marks don't match scans for span with pointer")
	}

	// Reset the inline mark bits.
	imb.init(s.spanclass, true /* We know these bits are always dirty now. */)
}

// inlineMarkBits returns the inline mark bits for the span.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) inlineMarkBits() *spanInlineMarkBits {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	return spanInlineMarkBitsFromBase(s.base())
}

func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) {
	if gcUsesSpanInlineMarkBits(s.elemsize) {
		bits.bytep = &s.inlineMarkBits().marks[objIndex/8]
	} else {
		bits.bytep = s.gcmarkBits.bytep(objIndex / 8)
	}
	bits.mask = uint8(1) << (objIndex % 8)
	bits.index = objIndex
	return
}

func (s *mspan) markBitsForBase() markBits {
	if gcUsesSpanInlineMarkBits(s.elemsize) {
		return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0}
	}
	return markBits{&s.gcmarkBits.x, uint8(1), 0}
}

// scannedBitsForIndex returns a markBits representing the scanned bit
// for objIndex in the inline mark bits.
func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits {
	return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex}
}

// gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size
// has inline mark bits. size must be the span's elemsize.
//
// nosplit because this is called from gcmarknewobject, which is nosplit.
//
//go:nosplit
func gcUsesSpanInlineMarkBits(size uintptr) bool {
	return heapBitsInSpan(size) && size >= 16
}

// tryDeferToSpanScan tries to queue p on the span it points to, if it
// points to a small object span (gcUsesSpanQueue size).
func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool {
	if useCheckmark {
		return false
	}

	// Quickly check to see if this is a span that has inline mark bits.
	ha := heapArenaOf(p)
	if ha == nil {
		return false
	}
	pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse))
	pageMask := byte(1 << ((p / pageSize) % 8))
	if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 {
		return false
	}

	// Find the object's index from the span class info stored in the inline mark bits.
	base := alignDown(p, gc.PageSize)
	q := spanInlineMarkBitsFromBase(base)
	objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32)

	// Set mark bit.
	idx, mask := objIndex/8, uint8(1)<<(objIndex%8)
	if atomic.Load8(&q.marks[idx])&mask != 0 {
		return true
	}
	atomic.Or8(&q.marks[idx], mask)

	// Fast-track noscan objects.
	if q.class.noscan() {
		gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()])
		return true
	}

	// Queue up the pointer (as a representative for its span).
	if q.tryAcquire() {
		if gcw.spanq.put(makeObjPtr(base, objIndex)) {
			if gcphase == _GCmark {
				// This is intentionally racy; the bit set here might get
				// stomped on by a stealing P. See the comment in tryStealSpan
				// for an explanation as to why this is OK.
				if !work.spanqMask.read(uint32(gcw.id)) {
					work.spanqMask.set(gcw.id)
				}
				gcw.mayNeedWorker = true
			}
			gcw.flushedWork = true
		}
	}
	return true
}

// tryGetSpanFast attempts to get an entire span to scan.
func (w *gcWork) tryGetSpanFast() objptr {
	return w.spanq.tryGetFast()
}

// tryGetSpan attempts to get an entire span to scan.
func (w *gcWork) tryGetSpan() objptr {
	if s := w.spanq.tryGetFast(); s != 0 {
		return s
	}
	// "Steal" from ourselves.
	if s := w.spanq.steal(&w.spanq); s != 0 {
		return s
	}
	// We failed to get any local work, so we're fresh out.
	// Nobody else is going to add work for us. Clear our bit.
	if work.spanqMask.read(uint32(w.id)) {
		work.spanqMask.clear(w.id)
	}
	return 0
}

// spanQueue is a P-local stealable span queue.
type spanQueue struct {
	// head, tail, and ring represent a local non-thread-safe ring buffer.
	head, tail uint32
	ring       [256]objptr

	// putsSinceDrain counts the number of put calls since the last drain.
	putsSinceDrain int

	// chain contains state visible to other Ps.
	//
	// In particular, that means a linked chain of single-producer multi-consumer
	// ring buffers where the single producer is this P only.
	//
	// This linked chain structure is based off the sync.Pool dequeue.
	chain struct {
		// head is the spanSPMC to put to. This is only accessed
		// by the producer, so doesn't need to be synchronized.
		head *spanSPMC

		// tail is the spanSPMC to steal from. This is accessed
		// by consumers, so reads and writes must be atomic.
		tail atomic.UnsafePointer // *spanSPMC
	}
}

// putFast tries to put s onto the queue, but may fail if it's full.
func (q *spanQueue) putFast(s objptr) (ok bool) {
	if q.tail-q.head == uint32(len(q.ring)) {
		return false
	}
	q.ring[q.tail%uint32(len(q.ring))] = s
	q.tail++
	return true
}

// put puts s onto the queue.
//
// Returns whether the caller should spin up a new worker.
func (q *spanQueue) put(s objptr) bool {
	// The constants below define the period of and volume of
	// spans we spill to the spmc chain when the local queue is
	// not full.
	//
	// spillPeriod must be > spillMax, otherwise that sets the
	// effective maximum size of our local span queue. Even if
	// we have a span ring of size N, if we flush K spans every
	// K puts, then K becomes our effective maximum length.
	// When spillPeriod > spillMax, then we're always spilling spans
	// at a slower rate than we're accumulating them.
	const (
		// spillPeriod defines how often to check if we should
		// spill some spans, counted in the number of calls to put.
		spillPeriod = 64

		// spillMax defines, at most, how many spans to drain with
		// each spill.
		spillMax = 16
	)
	if q.putFast(s) {
		// Occasionally try to spill some work to generate parallelism.
		q.putsSinceDrain++
		if q.putsSinceDrain >= spillPeriod {
			// Reset even if we don't drain, so we don't check every time.
			q.putsSinceDrain = 0

			// Try to drain some spans. Don't bother if there's very
			// few of them or there's already spans in the spmc chain.
			n := min((q.tail-q.head)/2, spillMax)
			if n > 4 && q.chainEmpty() {
				q.drain(n)
				return true
			}
		}
		return false
	}

	// We're out of space. Drain out our local spans.
	q.drain(uint32(len(q.ring)) / 2)
	if !q.putFast(s) {
		throw("failed putFast after drain")
	}
	return true
}

// flush publishes all spans in the local queue to the spmc chain.
func (q *spanQueue) flush() {
	n := q.tail - q.head
	if n == 0 {
		return
	}
	q.drain(n)
}

// empty returns true if there's no more work on the queue.
//
// Not thread-safe. Must only be called by the owner of q.
func (q *spanQueue) empty() bool {
	// Check the local queue for work.
	if q.tail-q.head > 0 {
		return false
	}
	return q.chainEmpty()
}

// chainEmpty returns true if the spmc chain is empty.
//
// Thread-safe.
func (q *spanQueue) chainEmpty() bool {
	// Check the rest of the rings for work.
	r := (*spanSPMC)(q.chain.tail.Load())
	for r != nil {
		if !r.empty() {
			return false
		}
		r = (*spanSPMC)(r.prev.Load())
	}
	return true
}

// drain publishes n spans from the local queue to the spmc chain.
func (q *spanQueue) drain(n uint32) {
	q.putsSinceDrain = 0

	if q.chain.head == nil {
		// N.B. We target 1024, but this may be bigger if the physical
		// page size is bigger, or if we can fit more uintptrs into a
		// physical page. See newSpanSPMC docs.
		r := newSpanSPMC(1024)
		q.chain.head = r
		q.chain.tail.StoreNoWB(unsafe.Pointer(r))
	}

	// Try to drain some of the queue to the head spmc.
	if q.tryDrain(q.chain.head, n) {
		return
	}

	// No space. Create a bigger spmc and add it to the chain.

	// Double the size of the next one, up to a maximum.
	//
	// We double each time so we can avoid taking this slow path
	// in the future, which involves a global lock. Ideally we want
	// to hit a steady-state where the deepest any queue goes during
	// a mark phase can fit in the ring.
	//
	// However, we still set a maximum on this. We set the maximum
	// to something large to amortize the cost of lock acquisition, but
	// still at a reasonable size for big heaps and/or a lot of Ps (which
	// tend to be correlated).
	//
	// It's not too bad to burn relatively large-but-fixed amounts of per-P
	// memory if we need to deal with really, really deep queues, since the
	// constants of proportionality are small. Simultaneously, we want to
	// avoid a situation where a single worker ends up queuing O(heap)
	// work and then forever retains a queue of that size.
	const maxCap = 1 << 20 / goarch.PtrSize
	newCap := q.chain.head.cap * 2
	if newCap > maxCap {
		newCap = maxCap
	}
	r := newSpanSPMC(newCap)
	if !q.tryDrain(r, n) {
		throw("failed to put span on newly-allocated spanSPMC")
	}
	q.chain.head.prev.StoreNoWB(unsafe.Pointer(r))
	q.chain.head = r
}

// tryDrain attempts to drain n spans from q's local queue to the chain.
//
// Returns whether it succeeded.
func (q *spanQueue) tryDrain(r *spanSPMC, n uint32) bool {
	if q.head+n > q.tail {
		throw("attempt to drain too many elements")
	}
	h := r.head.Load() // synchronize with consumers
	t := r.tail.Load()
	used := t - h
	if used+n <= r.cap {
		for i := uint32(0); i < n; i++ {
			*r.slot(t+i) = q.ring[(q.head+i)%uint32(len(q.ring))]
		}
		r.tail.Store(t + n) // Makes the items available for consumption.
		q.head += n
		return true
	}
	return false
}

// tryGetFast attempts to get a span from the local queue, but may fail if it's empty,
// returning false.
func (q *spanQueue) tryGetFast() objptr {
	if q.tail-q.head == 0 {
		return 0
	}
	s := q.ring[q.head%uint32(len(q.ring))]
	q.head++
	return s
}

// steal takes some spans from the ring chain of another span queue.
//
// q == q2 is OK.
func (q *spanQueue) steal(q2 *spanQueue) objptr {
	r := (*spanSPMC)(q2.chain.tail.Load())
	if r == nil {
		return 0
	}
	for {
		// It's important that we load the next pointer
		// *before* popping the tail. In general, r may be
		// transiently empty, but if next is non-nil before
		// the pop and the pop fails, then r is permanently
		// empty, which is the only condition under which it's
		// safe to drop r from the chain.
		r2 := (*spanSPMC)(r.prev.Load())

		// Try to refill from one of the rings.
		if s := q.refill(r); s != 0 {
			return s
		}

		if r2 == nil {
			// This is the only ring. It's empty right
			// now, but could be pushed to in the future.
			return 0
		}

		// The tail of the chain has been drained, so move on
		// to the next ring. Try to drop it from the chain
		// so the next consumer doesn't have to look at the empty
		// ring again.
		if q2.chain.tail.CompareAndSwapNoWB(unsafe.Pointer(r), unsafe.Pointer(r2)) {
			r.dead.Store(true)
		}
		r = r2
	}
}

// refill takes some spans from r and puts them into q's local queue.
//
// One span is removed from the stolen spans and returned on success.
// Failure to steal returns a zero objptr.
//
// refill is thread-safe with respect to r.
func (q *spanQueue) refill(r *spanSPMC) objptr {
	if q.tail-q.head != 0 {
		throw("steal with local work available")
	}
	// Steal some spans.
	var n uint32
	for {
		h := r.head.Load() // load-acquire, synchronize with other consumers
		t := r.tail.Load() // load-acquire, synchronize with the producer
		n = t - h
		n = n - n/2
		if n == 0 {
			return 0
		}
		if n > r.cap { // read inconsistent h and t
			continue
		}
		n = min(n, uint32(len(q.ring)/2))
		for i := uint32(0); i < n; i++ {
			q.ring[i] = *r.slot(h + i)
		}
		if r.head.CompareAndSwap(h, h+n) {
			break
		}
	}

	// Update local queue head and tail to reflect new buffered values.
	q.head = 0
	q.tail = n

	// Pop off the head of the queue and return it.
	return q.tryGetFast()
}

// destroy frees all chains in an empty spanQueue.
//
// Preconditions:
//   - World is stopped.
//   - GC is outside of the mark phase.
//   - (Therefore) the queue is empty.
func (q *spanQueue) destroy() {
	assertWorldStopped()

	if gcphase != _GCoff {
		throw("spanQueue.destroy during the mark phase")
	}
	if !q.empty() {
		throw("spanQueue.destroy on non-empty queue")
	}
	lock(&work.spanSPMCs.lock)

	// Remove, deinitialize, and free each ring. Grab the next ring
	// before deinit, since deinit clears the prev pointer.
	for r := (*spanSPMC)(q.chain.tail.Load()); r != nil; {
		next := (*spanSPMC)(r.prev.Load())
		work.spanSPMCs.list.remove(unsafe.Pointer(r))
		r.deinit()
		mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
		r = next
	}
	q.chain.head = nil
	q.chain.tail.Store(nil)
	q.putsSinceDrain = 0
	unlock(&work.spanSPMCs.lock)
}

// spanSPMC is a ring buffer of objptrs that represent spans.
// Accessed without a lock.
//
// Single-producer, multi-consumer. The only producer is the P that owns this
// queue, but any other P may consume from it.
//
// ## Invariants for memory management
//
//  1. All spanSPMCs are allocated from mheap_.spanSPMCAlloc.
//  2. All allocated spanSPMCs must be on the work.spanSPMCs list.
//  3. spanSPMCs may only be allocated if gcphase != _GCoff.
//  4. spanSPMCs may only be deallocated if gcphase == _GCoff.
//
// Invariants (3) and (4) ensure that we do not need to concern ourselves with
// tricky reuse issues that stem from not knowing when a thread is truly done
// with a spanSPMC. For example, two threads could load the same spanSPMC from
// the tail of the chain.
// One thread is then paused while the other steals the last few elements off
// of it. It's not safe to free at that point since the other thread will still
// inspect that spanSPMC, and we have no way of knowing without more complex
// and/or heavyweight synchronization.
//
// Instead, we rely on the global synchronization inherent to GC phases, and
// the fact that spanSPMCs are only ever used during the mark phase, to ensure
// memory safety. This means we temporarily waste some memory, but it's only
// until the end of the mark phase.
type spanSPMC struct {
	_ sys.NotInHeap

	// allnode is the linked list node for the work.spanSPMCs list. This is
	// used to find and free dead spanSPMCs. Protected by
	// work.spanSPMCs.lock.
	allnode listNodeManual

	// dead indicates whether the spanSPMC is no longer in use.
	// Protected by the CAS to the prev field of the spanSPMC pointing
	// to this spanSPMC. That is, whoever wins that CAS takes ownership
	// of marking this spanSPMC as dead. See spanQueue.steal for details.
	dead atomic.Bool

	// prev is the next link up a spanQueue's SPMC chain, from tail to head,
	// hence the name "prev." Set by a spanQueue's producer, cleared by a
	// CAS in spanQueue.steal.
	prev atomic.UnsafePointer // *spanSPMC

	// head, tail, cap, and ring together represent a fixed-size SPMC lock-free
	// ring buffer of size cap. The ring buffer contains objptr values.
	head atomic.Uint32
	tail atomic.Uint32
	cap  uint32 // cap(ring)
	ring *objptr
}

// newSpanSPMC allocates and initializes a new spmc with the provided capacity.
//
// newSpanSPMC may override the capacity with a larger one if the provided one would
// waste memory.
func newSpanSPMC(cap uint32) *spanSPMC {
	lock(&work.spanSPMCs.lock)
	r := (*spanSPMC)(mheap_.spanSPMCAlloc.alloc())
	work.spanSPMCs.list.push(unsafe.Pointer(r))
	unlock(&work.spanSPMCs.lock)

	// If cap < the capacity of a single physical page, round up.
	pageCap := uint32(physPageSize / goarch.PtrSize) // capacity of a single page
	if cap < pageCap {
		cap = pageCap
	}
	if cap&(cap-1) != 0 {
		throw("spmc capacity must be a power of 2")
	}
	r.cap = cap
	ring := sysAlloc(uintptr(cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys, "GC span queue")
	atomic.StorepNoWB(unsafe.Pointer(&r.ring), ring)
	return r
}

// empty returns true if the spmc is empty.
//
// empty is thread-safe.
func (r *spanSPMC) empty() bool {
	h := r.head.Load()
	t := r.tail.Load()
	return h == t
}

// deinit frees any resources the spanSPMC is holding onto and zeroes it.
func (r *spanSPMC) deinit() {
	sysFree(unsafe.Pointer(r.ring), uintptr(r.cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys)
	r.ring = nil
	r.dead.Store(false)
	r.prev.StoreNoWB(nil)
	r.head.Store(0)
	r.tail.Store(0)
	r.cap = 0
	r.allnode = listNodeManual{}
}

// slot returns a pointer to slot i%r.cap.
func (r *spanSPMC) slot(i uint32) *objptr {
	idx := uintptr(i & (r.cap - 1))
	return (*objptr)(unsafe.Add(unsafe.Pointer(r.ring), idx*unsafe.Sizeof(objptr(0))))
}

// freeDeadSpanSPMCs frees dead spanSPMCs back to the OS.
func freeDeadSpanSPMCs() {
	// According to the SPMC memory management invariants, we can only free
	// spanSPMCs outside of the mark phase. We ensure we do this in two ways.
	//
	// 1. We take the work.spanSPMCs lock, which we need anyway. This ensures
	//    that we are non-preemptible. If this path becomes lock-free, we will
	//    need to become non-preemptible in some other way.
	// 2. Once we are non-preemptible, we check the gcphase, and back out if
	//    it's not safe.
	//
	// This way, we ensure that we don't start freeing if we're in the wrong
	// phase, and the phase can't change on us while we're freeing.
	//
	// TODO(go.dev/issue/75771): Due to the grow semantics in
	// spanQueue.drain, we expect a steady-state of around one spanSPMC per
	// P, with some spikes higher when Ps have more than one. For high
	// GOMAXPROCS, or if this list otherwise gets long, it would be nice to
	// have a way to batch work that allows preemption during processing.
	lock(&work.spanSPMCs.lock)
	if gcphase != _GCoff || work.spanSPMCs.list.empty() {
		unlock(&work.spanSPMCs.lock)
		return
	}
	r := (*spanSPMC)(work.spanSPMCs.list.head())
	for r != nil {
		next := (*spanSPMC)(unsafe.Pointer(r.allnode.next))
		if r.dead.Load() {
			// It's dead. Remove, deinitialize and free it.
			work.spanSPMCs.list.remove(unsafe.Pointer(r))
			r.deinit()
			mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
		}
		r = next
	}
	unlock(&work.spanSPMCs.lock)
}

// tryStealSpan attempts to steal a span from another P's local queue.
//
// Returns a non-zero objptr on success.
func (w *gcWork) tryStealSpan() objptr {
	pp := getg().m.p.ptr()

	for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() {
		if !work.spanqMask.read(enum.position()) {
			continue
		}
		p2 := allp[enum.position()]
		if pp == p2 {
			continue
		}
		if s := w.spanq.steal(&p2.gcw.spanq); s != 0 {
			return s
		}
		// N.B. This is intentionally racy. We may stomp on a mask set by
		// a P that just put a bunch of work into its local queue.
		//
		// This is OK because the ragged barrier in gcMarkDone will set
		// the bit on each P if there's local work we missed. This race
		// should generally be rare, since the window between noticing
		// an empty local queue and this bit being set is quite small.
		work.spanqMask.clear(int32(enum.position()))
	}
	return 0
}

// objptr consists of a span base and the index of the object in the span.
type objptr uintptr

// makeObjPtr creates an objptr from a span base address and an object index.
func makeObjPtr(spanBase uintptr, objIndex uint16) objptr {
	if doubleCheckGreenTea && spanBase&((1<<gc.PageShift)-1) != 0 {
		throw("created objptr with address that is incorrectly aligned")
	}
	return objptr(spanBase | uintptr(objIndex))
}

func (p objptr) spanBase() uintptr {
	return uintptr(p) &^ ((1 << gc.PageShift) - 1)
}

func (p objptr) objIndex() uint16 {
	return uint16(p) & ((1 << gc.PageShift) - 1)
}

// scanSpan scans the objects indicated by marks&^scans on the span, queuing
// the resulting pointers into gcw.
func scanSpan(p objptr, gcw *gcWork) {
	spanBase := p.spanBase()
	imb := spanInlineMarkBitsFromBase(spanBase)
	spanclass := imb.class
	if spanclass.noscan() {
		throw("noscan object in scanSpan")
	}
	elemsize := uintptr(gc.SizeClassToSize[spanclass.sizeclass()])

	// Release span.
	if imb.release() == spanScanOneMark {
		// Nobody else set any mark bits on this span while it was acquired.
		// That means p is the sole object we need to handle. Fast-track it.
		objIndex := p.objIndex()
		bytep := &imb.scans[objIndex/8]
		mask := uint8(1) << (objIndex % 8)
		if atomic.Load8(bytep)&mask != 0 {
			return
		}
		atomic.Or8(bytep, mask)
		gcw.bytesMarked += uint64(elemsize)
		if debug.gctrace > 1 {
			gcw.stats[spanclass.sizeclass()].sparseObjsScanned++
		}
		b := spanBase + uintptr(objIndex)*elemsize
		scanObjectSmall(spanBase, b, elemsize, gcw)
		return
	}

	// Compute nelems.
	divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()])
	usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))
	if !spanclass.noscan() {
		usableSpanSize -= gc.PageSize / goarch.PtrSize / 8
	}
	nelems := uint16((usableSpanSize * divMagic) >> 32)

	// Grey objects and return if there's nothing else to do.
	var toScan gc.ObjMask
	objsMarked := spanSetScans(spanBase, nelems, imb, &toScan)
	if objsMarked == 0 {
		return
	}
	gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize)

	// Check if we have enough density to make a dartboard scan
	// worthwhile. If not, just do what scanobject does, but
	// localized to the span, using the dartboard.
	if !scan.HasFastScanSpanPacked() || objsMarked < int(nelems/8) {
		if debug.gctrace > 1 {
			gcw.stats[spanclass.sizeclass()].spansSparseScanned++
			gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned += uint64(objsMarked)
		}
		scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan)
		return
	}

	// Scan the span.
	//
	// N.B. Use gcw.ptrBuf as the output buffer.
	// This is a bit different from scanObjectsSmall, which fills gcw.ptrBuf
	// with addresses to dereference. ScanSpanPacked, on the other hand, fills
	// gcw.ptrBuf with already-dereferenced pointers.
	nptrs := scan.ScanSpanPacked(
		unsafe.Pointer(spanBase),
		&gcw.ptrBuf[0],
		&toScan,
		uintptr(spanclass.sizeclass()),
		spanPtrMaskUnsafe(spanBase),
	)
	gcw.heapScanWork += int64(objsMarked) * int64(elemsize)

	if debug.gctrace > 1 {
		// Write down some statistics.
		gcw.stats[spanclass.sizeclass()].spansDenseScanned++
		gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked)
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}

// spanSetScans sets any unset scan bits whose corresponding mark bits are set
// in the inline mark bits.
//
// toScan is populated with bits indicating whether a particular mark bit was set.
//
// Returns the number of objects marked, which could be zero.
func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int {
	arena, pageIdx, pageMask := pageIndexOf(spanBase)
	if arena.pageMarks[pageIdx]&pageMask == 0 {
		atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
	}

	bytes := divRoundUp(uintptr(nelems), 8)
	objsMarked := 0

	// Careful: these two structures alias since ObjMask is much bigger
	// than marks or scans. We do these unsafe shenanigans so that we can
	// access the marks and scans by uintptrs rather than by byte.
	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
	imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans))

	// Iterate over one uintptr-sized chunk at a time, computing both
	// the union and intersection of marks and scans. Store the union
	// into scans, and the intersection into toScan.
	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
		scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize])
		marks := imbMarks[i/goarch.PtrSize]
		scans = bswapIfBigEndian(scans)
		marks = bswapIfBigEndian(marks)
		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
			scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned
			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
		}
		toGrey := marks &^ scans
		toScan[i/goarch.PtrSize] = toGrey

		// If there's anything left to grey, do it.
		if toGrey != 0 {
			toGrey = bswapIfBigEndian(toGrey)
			if goarch.PtrSize == 4 {
				atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey))
			} else {
				atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey))
			}
		}
		objsMarked += sys.OnesCount64(uint64(toGrey))
	}
	return objsMarked
}

func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) {
	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, objSize)
	hbits := (*byte)(unsafe.Pointer(hbitsBase))
	objBits := extractHeapBitsSmall(hbits, spanBase, b, objSize)
	gcw.heapScanWork += int64(sys.Len64(uint64(objBits)) * goarch.PtrSize)

	nptrs := 0
	n := sys.OnesCount64(uint64(objBits))
	for range n {
		i := sys.TrailingZeros64(uint64(objBits))
		objBits &^= 1 << i
		addr := b + uintptr(i)*goarch.PtrSize

		// Prefetch addr since we're about to use it. This point for prefetching
		// was chosen empirically.
		sys.Prefetch(addr)

		// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
		gcw.ptrBuf[nptrs] = addr
		nptrs++
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		p = *(*uintptr)(unsafe.Pointer(p))
		if p == 0 {
			continue
		}
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}

func scanObjectsSmall(spanBase, elemsize uintptr, nelems uint16, gcw *gcWork, toScan *gc.ObjMask) {
	nptrs := 0
	for i, bits := range toScan {
		if i*(goarch.PtrSize*8) > int(nelems) {
			break
		}
		n := sys.OnesCount64(uint64(bits))
		hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
		hbits := (*byte)(unsafe.Pointer(hbitsBase))
		for range n {
			j := sys.TrailingZeros64(uint64(bits))
			bits &^= 1 << j

			b := spanBase + uintptr(i*(goarch.PtrSize*8)+j)*elemsize
			objBits := extractHeapBitsSmall(hbits, spanBase, b, elemsize)
			gcw.heapScanWork += int64(sys.Len64(uint64(objBits)) * goarch.PtrSize)

			m := sys.OnesCount64(uint64(objBits))
			for range m {
				k := sys.TrailingZeros64(uint64(objBits))
				objBits &^= 1 << k

				addr := b + uintptr(k)*goarch.PtrSize

				// Prefetch addr since we're about to use it. This point for prefetching
				// was chosen empirically.
				sys.Prefetch(addr)
				// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
				gcw.ptrBuf[nptrs] = addr
				nptrs++
			}
		}
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		p = *(*uintptr)(unsafe.Pointer(p))
		if p == 0 {
			continue
		}
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}

func extractHeapBitsSmall(hbits *byte, spanBase, addr, elemsize uintptr) uintptr {
	// These objects are always small enough that their bitmaps
	// fit in a single word, so just load the word or two we need.
	//
	// Mirrors mspan.writeHeapBitsSmall.
	//
	// We should be using heapBits(), but unfortunately it introduces
	// both bounds check panics and a throw, which causes us to exceed
	// the nosplit limit in quite a few cases.
	i := (addr - spanBase) / goarch.PtrSize / ptrBits
	j := (addr - spanBase) / goarch.PtrSize % ptrBits
	bits := elemsize / goarch.PtrSize
	word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0))))
	word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1))))

	var read uintptr
	if j+bits > ptrBits {
		// Two reads.
		bits0 := ptrBits - j
		bits1 := bits - bits0
		read = *word0 >> j
		read |= (*word1 & ((1 << bits1) - 1)) << bits0
	} else {
		// One read.
		read = (*word0 >> j) & ((1 << bits) - 1)
	}
	return read
}

// spanPtrMaskUnsafe returns the pointer mask for a span with inline mark bits.
//
// The caller must ensure spanBase is the base of a span that:
//   - Is 1 page in size,
//   - Uses inline mark bits,
//   - Contains pointers.
func spanPtrMaskUnsafe(spanBase uintptr) *gc.PtrMask {
	base := spanBase + gc.PageSize - unsafe.Sizeof(gc.PtrMask{}) - unsafe.Sizeof(spanInlineMarkBits{})
	return (*gc.PtrMask)(unsafe.Pointer(base))
}

type sizeClassScanStats struct {
	spansDenseScanned     uint64 // Spans scanned with ScanSpanPacked.
	spanObjsDenseScanned  uint64 // Objects scanned with ScanSpanPacked.
	spansSparseScanned    uint64 // Spans scanned with scanObjectsSmall.
	spanObjsSparseScanned uint64 // Objects scanned with scanObjectsSmall.
	sparseObjsScanned     uint64 // Objects scanned with scanobject or scanObjectSmall.

	// Note: sparseObjsScanned is sufficient for both cases because
	// a particular size class either uses scanobject or scanObjectSmall,
	// not both.
	// In the latter case, we also know that there was one object scanned
	// per span, so no need for a span counter.
}

func dumpScanStats() {
	var (
		spansDenseScanned     uint64
		spanObjsDenseScanned  uint64
		spansSparseScanned    uint64
		spanObjsSparseScanned uint64
		sparseObjsScanned     uint64
	)
	for _, stats := range memstats.lastScanStats {
		spansDenseScanned += stats.spansDenseScanned
		spanObjsDenseScanned += stats.spanObjsDenseScanned
		spansSparseScanned += stats.spansSparseScanned
		spanObjsSparseScanned += stats.spanObjsSparseScanned
		sparseObjsScanned += stats.sparseObjsScanned
	}
	totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned
	totalSpans := spansSparseScanned + spansDenseScanned
	print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs")
	print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n")
	for i, stats := range memstats.lastScanStats {
		if stats == (sizeClassScanStats{}) {
			continue
		}
		totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned
		totalSpans := stats.spansSparseScanned + stats.spansDenseScanned
		if i == 0 {
			print("scan: class L ")
		} else {
			print("scan: class ", gc.SizeClassToSize[i], "B ")
		}
		print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs")
		print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n")
	}
}

func (w *gcWork) flushScanStats(dst *[gc.NumSizeClasses]sizeClassScanStats) {
	for i := range w.stats {
		dst[i].spansDenseScanned += w.stats[i].spansDenseScanned
		dst[i].spanObjsDenseScanned += w.stats[i].spanObjsDenseScanned
		dst[i].spansSparseScanned += w.stats[i].spansSparseScanned
		dst[i].spanObjsSparseScanned += w.stats[i].spanObjsSparseScanned
		dst[i].sparseObjsScanned += w.stats[i].sparseObjsScanned
	}
	clear(w.stats[:])
}

// gcMarkWorkAvailable reports whether there's any non-local work available to do.
//
// This is a heavyweight check and must only be used for correctness, not
// as a hint.
func gcMarkWorkAvailable() bool {
	if !work.full.empty() {
		return true // global work available
	}
	if work.markrootNext.Load() < work.markrootJobs.Load() {
		return true // root scan work available
	}
	if work.spanqMask.any() {
		return true // stealable local work available
	}
	return false
}

// scanObject scans the object starting at b, adding pointers to gcw.
// b must point to the beginning of a heap object or an oblet.
// scanObject consults the GC bitmap for the pointer mask and the
// spans for the size of the object.
//
// Used only for !gcUsesSpanInlineMarkBits spans, but supports all
// object sizes and is safe to be called on all heap objects.
//
//go:nowritebarrier
func scanObject(b uintptr, gcw *gcWork) {
	// Prefetch object before we scan it.
	//
	// This will overlap fetching the beginning of the object with initial
	// setup before we start scanning the object.
	sys.Prefetch(b)

	// Find the bits for b and the size of the object at b.
	//
	// b is either the beginning of an object, in which case this
	// is the size of the object to scan, or it points to an
	// oblet, in which case we compute the size to scan below.
	s := spanOfUnchecked(b)
	n := s.elemsize
	if n == 0 {
		throw("scanObject n == 0")
	}
	if s.spanclass.noscan() {
		// Correctness-wise this is ok, but it's inefficient
		// if noscan objects reach here.
		throw("scanObject of a noscan object")
	}
	var tp typePointers
	if n > maxObletBytes {
		// Large object. Break into oblets for better
		// parallelism and lower latency.
		if b == s.base() {
			// Enqueue the other oblets to scan later.
			// Some oblets may be in b's scalar tail, but
			// these will be marked as "no more pointers",
			// so we'll drop out immediately when we go to
			// scan those.
			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
				if !gcw.putObjFast(oblet) {
					gcw.putObj(oblet)
				}
			}
		}

		// Compute the size of the oblet. Since this object
		// must be a large object, s.base() is the beginning
		// of the object.
		n = s.base() + s.elemsize - b
		n = min(n, maxObletBytes)
		tp = s.typePointersOfUnchecked(s.base())
		tp = tp.fastForward(b-tp.addr, b+n)
	} else {
		tp = s.typePointersOfUnchecked(b)
	}

	var scanSize uintptr
	for {
		var addr uintptr
		if tp, addr = tp.nextFast(); addr == 0 {
			if tp, addr = tp.next(b + n); addr == 0 {
				break
			}
		}

		// Keep track of farthest pointer we found, so we can
		// update heapScanWork. TODO: is there a better metric,
		// now that we can skip scalar portions pretty efficiently?
		scanSize = addr - b + goarch.PtrSize

		// Work here is duplicated in scanblock and above.
		// If you make changes here, make changes there too.
		obj := *(*uintptr)(unsafe.Pointer(addr))

		// At this point we have extracted the next potential pointer.
		// Quickly filter out nil and pointers back to the current object.
		if obj != 0 && obj-b >= n {
			// Test if obj points into the Go heap and, if so,
			// mark the object.
			//
			// Note that it's possible for findObject to
			// fail if obj points to a just-allocated heap
			// object because of a race with growing the
			// heap. In this case, we know the object was
			// just allocated and hence will be marked by
			// allocation itself.
			if !tryDeferToSpanScan(obj, gcw) {
				if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
					greyobject(obj, b, addr-b, span, gcw, objIndex)
				}
			}
		}
	}
	gcw.bytesMarked += uint64(n)
	gcw.heapScanWork += int64(scanSize)
	if debug.gctrace > 1 {
		gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++
	}
}
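
// The function below is an illustrative sketch added by the editor and is not
// part of the original runtime source. It shows how an objptr packs a
// gc.PageSize-aligned span base together with an object index in the low bits,
// and how spanBase and objIndex recover them. The function name and the
// constants are hypothetical examples; nothing calls this function.
func exampleObjPtrRoundTrip() {
	base := uintptr(0x1234) << gc.PageShift // an example page-aligned span base
	idx := uint16(42)                       // an example object index within the span
	p := makeObjPtr(base, idx)
	if p.spanBase() != base || p.objIndex() != idx {
		throw("objptr round-trip mismatch")
	}
}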
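
// The function below is another illustrative sketch added by the editor, not
// part of the original runtime source. It restates the core word-at-a-time
// merge performed by spanSetScans, with the atomics, byte swapping, and the
// class/owned-byte masking stripped away: the union of marks and scans becomes
// the new scan bits, while marks&^scans identifies the objects that still need
// scanning, which is what keeps the GC precise. The function name is
// hypothetical and nothing calls it.
func exampleMergeMarksIntoScans(marks, scans uintptr) (newScans, toScan uintptr) {
	toScan = marks &^ scans // marked since the span was queued, but not yet scanned
	newScans = marks | scans
	return
}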