// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import 

// gcCPULimiter is a mechanism to limit GC CPU utilization in situations
// where it might become excessive and inhibit application progress (e.g.
// a death spiral).
//
// The core of the limiter is a leaky bucket mechanism that fills with GC
// CPU time and drains with mutator time. Because the bucket fills and
// drains with time directly (i.e. without any weighting), this effectively
// sets a very conservative limit of 50%. This limit could be enforced directly,
// however, but the purpose of the bucket is to accommodate spikes in GC CPU
// utilization without hurting throughput.
//
// Note that the bucket in the leaky bucket mechanism can never go negative,
// so the GC never gets credit for a lot of CPU time spent without the GC
// running. This is intentional, as an application that stays idle for, say,
// an entire day, could build up enough credit to fail to prevent a death
// spiral the following day. The bucket's capacity is the GC's only leeway.
//
// The capacity thus also sets the window the limiter considers. For example,
// if the capacity of the bucket is 1 cpu-second, then the limiter will not
// kick in until at least 1 full cpu-second in the last 2 cpu-second window
// is spent on GC CPU time.
var gcCPULimiter gcCPULimiterState

type gcCPULimiterState struct {
	lock atomic.Uint32

	enabled atomic.Bool

	// gcEnabled is an internal copy of gcBlackenEnabled that determines
	// whether the limiter tracks total assist time.
	//
	// gcBlackenEnabled isn't used directly so as to keep this structure
	// unit-testable.
	gcEnabled bool

	// transitioning is true when the GC is in a STW and transitioning between
	// the mark and sweep phases.
	transitioning bool

	// test indicates whether this instance of the struct was made for testing purposes.
	test bool

	bucket struct {
		// Invariants:
		// - fill >= 0
		// - capacity >= 0
		// - fill <= capacity
		fill, capacity uint64
	}
	// overflow is the cumulative amount of GC CPU time that we tried to fill the
	// bucket with but exceeded its capacity.
	overflow uint64

	// assistTimePool is the accumulated assist time since the last update.
	assistTimePool atomic.Int64

	// idleMarkTimePool is the accumulated idle mark time since the last update.
	idleMarkTimePool atomic.Int64

	// idleTimePool is the accumulated time Ps spent on the idle list since the last update.
	idleTimePool atomic.Int64

	// lastUpdate is the nanotime timestamp of the last time update was called.
	//
	// Updated under lock, but may be read concurrently.
	lastUpdate atomic.Int64

	// lastEnabledCycle is the GC cycle that last had the limiter enabled.
	lastEnabledCycle atomic.Uint32

	// nprocs is an internal copy of gomaxprocs, used to determine total available
	// CPU time.
	//
	// gomaxprocs isn't used directly so as to keep this structure unit-testable.
	nprocs int32
}

// limiting returns true if the CPU limiter is currently enabled, meaning the Go GC
// should take action to limit CPU utilization.
//
// It is safe to call concurrently with other operations.
func ( *gcCPULimiterState) () bool {
	return .enabled.Load()
}

// startGCTransition notifies the limiter of a GC transition.
//
// This call takes ownership of the limiter and disables all other means of
// updating the limiter. Release ownership by calling finishGCTransition.
//
// It is safe to call concurrently with other operations.
func ( *gcCPULimiterState) ( bool,  int64) {
	if !.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to start a GC transition")
	}
	if .gcEnabled ==  {
		throw("transitioning GC to the same state as before?")
	}
	// Flush whatever was left between the last update and now.
	.updateLocked()
	.gcEnabled = 
	.transitioning = true
	// N.B. finishGCTransition releases the lock.
	//
	// We don't release here to increase the chance that if there's a failure
	// to finish the transition, that we throw on failing to acquire the lock.
}

// finishGCTransition notifies the limiter that the GC transition is complete
// and releases ownership of it. It also accumulates STW time in the bucket.
// now must be the timestamp from the end of the STW pause.
func ( *gcCPULimiterState) ( int64) {
	if !.transitioning {
		throw("finishGCTransition called without starting one?")
	}
	// Count the full nprocs set of CPU time because the world is stopped
	// between startGCTransition and finishGCTransition. Even though the GC
	// isn't running on all CPUs, it is preventing user code from doing so,
	// so it might as well be.
	if  := .lastUpdate.Load();  >=  {
		.accumulate(0, (-)*int64(.nprocs))
	}
	.lastUpdate.Store()
	.transitioning = false
	.unlock()
}

// gcCPULimiterUpdatePeriod dictates the maximum amount of wall-clock time
// we can go before updating the limiter.
const gcCPULimiterUpdatePeriod = 10e6 // 10ms

// needUpdate returns true if the limiter's maximum update period has been
// exceeded, and so would benefit from an update.
func ( *gcCPULimiterState) ( int64) bool {
	return -.lastUpdate.Load() > gcCPULimiterUpdatePeriod
}

// addAssistTime notifies the limiter of additional assist time. It will be
// included in the next update.
func ( *gcCPULimiterState) ( int64) {
	.assistTimePool.Add()
}

// addIdleTime notifies the limiter of additional time a P spent on the idle list. It will be
// subtracted from the total CPU time in the next update.
func ( *gcCPULimiterState) ( int64) {
	.idleTimePool.Add()
}

// update updates the bucket given runtime-specific information. now is the
// current monotonic time in nanoseconds.
//
// This is safe to call concurrently with other operations, except *GCTransition.
func ( *gcCPULimiterState) ( int64) {
	if !.tryLock() {
		// We failed to acquire the lock, which means something else is currently
		// updating. Just drop our update, the next one to update will include
		// our total assist time.
		return
	}
	if .transitioning {
		throw("update during transition")
	}
	.updateLocked()
	.unlock()
}

// updateLocked is the implementation of update. l.lock must be held.
func ( *gcCPULimiterState) ( int64) {
	 := .lastUpdate.Load()
	if  <  {
		// Defensively avoid overflow. This isn't even the latest update anyway.
		return
	}
	 := ( - ) * int64(.nprocs)
	.lastUpdate.Store()

	// Drain the pool of assist time.
	 := .assistTimePool.Load()
	if  != 0 {
		.assistTimePool.Add(-)
	}

	// Drain the pool of idle time.
	 := .idleTimePool.Load()
	if  != 0 {
		.idleTimePool.Add(-)
	}

	if !.test {
		// Consume time from in-flight events. Make sure we're not preemptible so allp can't change.
		//
		// The reason we do this instead of just waiting for those events to finish and push updates
		// is to ensure that all the time we're accounting for happened sometime between lastUpdate
		// and now. This dramatically simplifies reasoning about the limiter because we're not at
		// risk of extra time being accounted for in this window than actually happened in this window,
		// leading to all sorts of weird transient behavior.
		 := acquirem()
		for ,  := range allp {
			,  := .limiterEvent.consume()
			switch  {
			case limiterEventIdleMarkWork:
				fallthrough
			case limiterEventIdle:
				 += 
				sched.idleTime.Add()
			case limiterEventMarkAssist:
				fallthrough
			case limiterEventScavengeAssist:
				 += 
			case limiterEventNone:
				break
			default:
				throw("invalid limiter event type found")
			}
		}
		releasem()
	}

	// Compute total GC time.
	 := 
	if .gcEnabled {
		 += int64(float64() * gcBackgroundUtilization)
	}

	// Subtract out all idle time from the total time. Do this after computing
	// GC time, because the background utilization is dependent on the *real*
	// total time, not the total time after idle time is subtracted.
	//
	// Idle time is counted as any time that a P is on the P idle list plus idle mark
	// time. Idle mark workers soak up time that the application spends idle.
	//
	// On a heavily undersubscribed system, any additional idle time can skew GC CPU
	// utilization, because the GC might be executing continuously and thrashing,
	// yet the CPU utilization with respect to GOMAXPROCS will be quite low, so
	// the limiter fails to turn on. By subtracting idle time, we're removing time that
	// we know the application was idle giving a more accurate picture of whether
	// the GC is thrashing.
	//
	// Note that this can cause the limiter to turn on even if it's not needed. For
	// instance, on a system with 32 Ps but only 1 running goroutine, each GC will have
	// 8 dedicated GC workers. Assuming the GC cycle is half mark phase and half sweep
	// phase, then the GC CPU utilization over that cycle, with idle time removed, will
	// be 8/(8+2) = 80%. Even though the limiter turns on, though, assist should be
	// unnecessary, as the GC has way more CPU time to outpace the 1 goroutine that's
	// running.
	 -= 

	.accumulate(-, )
}

// accumulate adds time to the bucket and signals whether the limiter is enabled.
//
// This is an internal function that deals just with the bucket. Prefer update.
// l.lock must be held.
func ( *gcCPULimiterState) (,  int64) {
	 := .bucket.capacity - .bucket.fill
	 :=  == 0

	// Let's be careful about three things here:
	// 1. The addition and subtraction, for the invariants.
	// 2. Overflow.
	// 3. Excessive mutation of l.enabled, which is accessed
	//    by all assists, potentially more than once.
	 :=  - 

	// Handle limiting case.
	if  > 0 &&  <= uint64() {
		.overflow += uint64() - 
		.bucket.fill = .bucket.capacity
		if ! {
			.enabled.Store(true)
			.lastEnabledCycle.Store(memstats.numgc + 1)
		}
		return
	}

	// Handle non-limiting cases.
	if  < 0 && .bucket.fill <= uint64(-) {
		// Bucket emptied.
		.bucket.fill = 0
	} else {
		// All other cases.
		.bucket.fill -= uint64(-)
	}
	if  != 0 &&  {
		.enabled.Store(false)
	}
}

// tryLock attempts to lock l. Returns true on success.
func ( *gcCPULimiterState) () bool {
	return .lock.CompareAndSwap(0, 1)
}

// unlock releases the lock on l. Must be called if tryLock returns true.
func ( *gcCPULimiterState) () {
	 := .lock.Swap(0)
	if  != 1 {
		throw("double unlock")
	}
}

// capacityPerProc is the limiter's bucket capacity for each P in GOMAXPROCS.
const capacityPerProc = 1e9 // 1 second in nanoseconds

// resetCapacity updates the capacity based on GOMAXPROCS. Must not be called
// while the GC is enabled.
//
// It is safe to call concurrently with other operations.
func ( *gcCPULimiterState) ( int64,  int32) {
	if !.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to reset capacity")
	}
	// Flush the rest of the time for this period.
	.updateLocked()
	.nprocs = 

	.bucket.capacity = uint64() * capacityPerProc
	if .bucket.fill > .bucket.capacity {
		.bucket.fill = .bucket.capacity
		.enabled.Store(true)
		.lastEnabledCycle.Store(memstats.numgc + 1)
	} else if .bucket.fill < .bucket.capacity {
		.enabled.Store(false)
	}
	.unlock()
}

// limiterEventType indicates the type of an event occurring on some P.
//
// These events represent the full set of events that the GC CPU limiter tracks
// to execute its function.
//
// This type may use no more than limiterEventBits bits of information.
type limiterEventType uint8

const (
	limiterEventNone           limiterEventType = iota // None of the following events.
	limiterEventIdleMarkWork                           // Refers to an idle mark worker (see gcMarkWorkerMode).
	limiterEventMarkAssist                             // Refers to mark assist (see gcAssistAlloc).
	limiterEventScavengeAssist                         // Refers to a scavenge assist (see allocSpan).
	limiterEventIdle                                   // Refers to time a P spent on the idle list.

	limiterEventBits = 3
)

// limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent
// the event type. The rest of the bits of that field represent a timestamp.
const (
	limiterEventTypeMask  = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits)
	limiterEventStampNone = limiterEventStamp(0)
)

// limiterEventStamp is a nanotime timestamp packed with a limiterEventType.
type limiterEventStamp uint64

// makeLimiterEventStamp creates a new stamp from the event type and the current timestamp.
func makeLimiterEventStamp( limiterEventType,  int64) limiterEventStamp {
	return limiterEventStamp(uint64()<<(64-limiterEventBits) | (uint64() &^ limiterEventTypeMask))
}

// duration computes the difference between now and the start time stored in the stamp.
//
// Returns 0 if the difference is negative, which may happen if now is stale or if the
// before and after timestamps cross a 2^(64-limiterEventBits) boundary.
func ( limiterEventStamp) ( int64) int64 {
	// The top limiterEventBits bits of the timestamp are derived from the current time
	// when computing a duration.
	 := int64((uint64() & limiterEventTypeMask) | (uint64() &^ limiterEventTypeMask))
	if  <  {
		return 0
	}
	return  - 
}

// type extracts the event type from the stamp.
func ( limiterEventStamp) () limiterEventType {
	return limiterEventType( >> (64 - limiterEventBits))
}

// limiterEvent represents tracking state for an event tracked by the GC CPU limiter.
type limiterEvent struct {
	stamp atomic.Uint64 // Stores a limiterEventStamp.
}

// start begins tracking a new limiter event of the current type. If an event
// is already in flight, then a new event cannot begin because the current time is
// already being attributed to that event. In this case, this function returns false.
// Otherwise, it returns true.
//
// The caller must be non-preemptible until at least stop is called or this function
// returns false. Because this is trying to measure "on-CPU" time of some event, getting
// scheduled away during it can mean that whatever we're measuring isn't a reflection
// of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as
// close of an approximation as we can.
func ( *limiterEvent) ( limiterEventType,  int64) bool {
	if limiterEventStamp(.stamp.Load()).typ() != limiterEventNone {
		return false
	}
	.stamp.Store(uint64(makeLimiterEventStamp(, )))
	return true
}

// consume acquires the partial event CPU time from any in-flight event.
// It achieves this by storing the current time as the new event time.
//
// Returns the type of the in-flight event, as well as how long it's currently been
// executing for. Returns limiterEventNone if no event is active.
func ( *limiterEvent) ( int64) ( limiterEventType,  int64) {
	// Read the limiter event timestamp and update it to now.
	for {
		 := limiterEventStamp(.stamp.Load())
		 = .typ()
		if  == limiterEventNone {
			// There's no in-flight event, so just push that up.
			return
		}
		 = .duration()
		if  == 0 {
			// We might have a stale now value, or this crossed the
			// 2^(64-limiterEventBits) boundary in the clock readings.
			// Just ignore it.
			return limiterEventNone, 0
		}
		 := makeLimiterEventStamp(, )
		if .stamp.CompareAndSwap(uint64(), uint64()) {
			break
		}
	}
	return
}

// stop stops the active limiter event. Throws if the
//
// The caller must be non-preemptible across the event. See start as to why.
func ( *limiterEvent) ( limiterEventType,  int64) {
	var  limiterEventStamp
	for {
		 = limiterEventStamp(.stamp.Load())
		if .typ() !=  {
			print("runtime: want=", , " got=", .typ(), "\n")
			throw("limiterEvent.stop: found wrong event in p's limiter event slot")
		}
		if .stamp.CompareAndSwap(uint64(), uint64(limiterEventStampNone)) {
			break
		}
	}
	 := .duration()
	if  == 0 {
		// It's possible that we're missing time because we crossed a
		// 2^(64-limiterEventBits) boundary between the start and end.
		// In this case, we're dropping that information. This is OK because
		// at worst it'll cause a transient hiccup that will quickly resolve
		// itself as all new timestamps begin on the other side of the boundary.
		// Such a hiccup should be incredibly rare.
		return
	}
	// Account for the event.
	switch  {
	case limiterEventIdleMarkWork:
		gcCPULimiter.addIdleTime()
	case limiterEventIdle:
		gcCPULimiter.addIdleTime()
		sched.idleTime.Add()
	case limiterEventMarkAssist:
		fallthrough
	case limiterEventScavengeAssist:
		gcCPULimiter.addAssistTime()
	default:
		throw("limiterEvent.stop: invalid limiter event type found")
	}
}