// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cgroup

import (
	
	
	
)

var (
	// ErrNoCgroup reports that no CPU cgroup could be found for the current
	// process. In this file it is returned when /proc/self/cgroup or
	// /proc/self/mountinfo does not exist (ENOENT), or when parsing those
	// files finds no matching entry.
	ErrNoCgroup error = stringError("not in a cgroup")

	// errMalformedFile reports that a cgroup or /proc file did not have the
	// expected format (missing separator, missing newline, unparsable
	// number, or contents too large for the read buffer).
	errMalformedFile error = stringError("malformed file")
)

// _PATH_MAX is the maximum path length, in bytes, from which the buffer size
// constants in this file are derived.
//
// NOTE(review): presumably mirrors Linux's PATH_MAX — confirm.
const _PATH_MAX = 4096

const (
	// ScratchSize is the required amount of scratch space for CPULimit.
	// OpenCPU also requires its scratch buffer to have exactly this length.
	// It is PathSize + ParseSize = 4096 + 65536 = 69632 bytes (68 KiB).
	//
	// TODO(prattmic): This is shockingly large (~70KiB) due to the (very
	// unlikely) combination of extremely long paths consisting mostly of
	// escaped characters. The scratch buffer ends up in .bss in package
	// runtime, so it doesn't contribute to binary size and generally won't
	// be faulted in, but it would still be nice to shrink this. A more
	// complex parser that did not need to keep entire lines in memory
	// could get away with much less. Alternatively, we could do a one-off
	// mmap allocation for this buffer, which is only mapped larger if we
	// actually need the extra space.
	ScratchSize = PathSize + ParseSize

	// PathSize is the required space to store a path of the cgroup in the
	// filesystem (4096 bytes).
	PathSize = _PATH_MAX

	// escapedPathMax is the maximum length of a path as it appears, escaped,
	// in /proc/self/mountinfo. Escape sequences there are 4 characters
	// long, so a path consisting entirely of escaped characters could be 4
	// times larger than the unescaped path (4 * 4096 = 16384 bytes).
	escapedPathMax = 4 * _PATH_MAX

	// ParseSize is the required space to parse /proc/self/mountinfo and
	// /proc/self/cgroup (4 * 16384 = 65536 bytes).
	// See parseCPUMount and parseCPURelativePath.
	ParseSize = 4 * escapedPathMax
)

// Names of the cgroup control files that OpenCPU opens, relative to the
// cgroup directory.
//
// Each name starts with '/' so that it can be copied directly after the cgroup
// directory path, and includes an explicit NUL to be sure the terminator ends
// up in the slice whose first byte is passed to syscall.Open.
const (
	// cgroup v2: quota and period in a single file.
	v2MaxFile = "/cpu.max\x00"
	// cgroup v1: quota file.
	v1QuotaFile = "/cpu.cfs_quota_us\x00"
	// cgroup v1: period file.
	v1PeriodFile = "/cpu.cfs_period_us\x00"
)

// Version indicates the cgroup version.
type Version int

// The numeric values are significant: V1 is 1 and V2 is 2. Code in this file
// uses the untyped literals 1 and 2 interchangeably with these names (in
// switch cases and when filling in CPU.version), so the order must not change.
const (
	// VersionUnknown is the zero value; it is what the Find* functions
	// return alongside an error.
	VersionUnknown Version = iota
	// V1 is cgroup v1 (value 1).
	V1
	// V2 is cgroup v2 (value 2).
	V2
)

// CPU owns the FDs required to read the CPU limit from a cgroup.
//
// Values are produced by OpenCPU. The zero value has version VersionUnknown;
// the code in this file that switches on version throws for any value other
// than V1 or V2.
type CPU struct {
	// version selects which files the FDs below refer to.
	version Version

	// For cgroup v1, this is cpu.cfs_quota_us.
	// For cgroup v2, this is cpu.max.
	quotaFD int

	// For cgroup v1, this is cpu.cfs_period_us.
	// For cgroup v2, this is unused; OpenCPU sets it to -1.
	periodFD int
}

// Closes the FDs owned by the CPU: both quotaFD and periodFD for cgroup v1,
// and only quotaFD for cgroup v2 (periodFD is unused there).
//
// Any result from syscall.Close is discarded. Throws if the version is
// neither V1 nor V2, which includes the zero CPU value.
func ( CPU) () {
	switch .version {
	case V1:
		syscall.Close(.quotaFD)
		syscall.Close(.periodFD)
	case V2:
		syscall.Close(.quotaFD)
	default:
		throw("impossible cgroup version")
	}
}

// checkBufferSize verifies that the given buffer has exactly the expected
// length (not merely at least that length).
//
// On mismatch it prints the actual and wanted lengths and then throws; a
// wrong buffer size is treated as a programming error rather than being
// returned to the caller.
func checkBufferSize( []byte,  int) {
	if len() !=  {
		println("runtime: cgroup buffer length", len(), "want", )
		throw("runtime: cgroup invalid buffer length")
	}
}

// OpenCPU returns a CPU for the CPU cgroup containing the current process, or
// ErrNoCgroup if the process is not in a CPU cgroup.
//
// scratch must have length ScratchSize; any other length throws (see
// checkBufferSize).
//
// Errors from FindCPU are returned unchanged. If one of the cgroup control
// files cannot be opened, errSyscallFailed is returned. On any error the
// returned CPU is the zero value.
func ( []byte) (CPU, error) {
	checkBufferSize(, ScratchSize)

	// Split the buffer into the two pieces FindCPU requires: a PathSize
	// prefix and the remaining ParseSize bytes.
	 := [:PathSize]
	 := [PathSize:]

	, ,  := FindCPU(, )
	if  != nil {
		return CPU{}, 
	}

	switch  {
	case 1:
		// cgroup v1: quota and period live in separate files. Build each
		// NUL-terminated file path in place after the cgroup directory
		// path and open it read-only.
		 := copy([:], v1QuotaFile)
		 := [:+]
		,  := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
		if  != 0 {
			// This may fail if this process was migrated out of
			// the cgroup found by FindCPU and that cgroup has been
			// deleted.
			return CPU{}, errSyscallFailed
		}

		// Overwrite the quota file name with the period file name.
		 = copy([:], v1PeriodFile)
		 = [:+]
		,  := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
		if  != 0 {
			// This may fail if this process was migrated out of
			// the cgroup found by FindCPU and that cgroup has been
			// deleted.
			//
			// NOTE(review): the quota FD opened above is not closed
			// on this path, so it appears to leak.
			return CPU{}, errSyscallFailed
		}

		 := CPU{
			version:  1,
			quotaFD:  ,
			periodFD: ,
		}
		return , nil
	case 2:
		// cgroup v2: a single cpu.max file holds both quota and period.
		 := copy([:], v2MaxFile)
		 := [:+]
		,  := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
		if  != 0 {
			// This may fail if this process was migrated out of
			// the cgroup found by FindCPU and that cgroup has been
			// deleted.
			return CPU{}, errSyscallFailed
		}

		 := CPU{
			version:  2,
			quotaFD:  ,
			periodFD: -1,
		}
		return , nil
	default:
		// FindCPU only documents versions 1 and 2 on success.
		throw("impossible cgroup version")
		// Presumably throw never returns; the panic keeps the compiler
		// from reporting a missing return.
		panic("unreachable")
	}
}

// Returns average CPU throughput limit from the cgroup, or ok false if there
// is no limit.
//
// The limit is quota divided by period, as a float64.
//
// For cgroup v1, any failure to read either file is reported as
// errMalformedFile, including read failures that readV1Number itself reports
// as errSyscallFailed. For cgroup v2, the result of readV2Limit is returned
// unchanged. Throws if the version is neither 1 nor 2.
func ( CPU) (float64, bool, error) {
	switch .version {
	case 1:
		,  := readV1Number(.quotaFD)
		if  != nil {
			return 0, false, errMalformedFile
		}

		if  < 0 {
			// No limit. The period file is not read in this case.
			return 0, false, nil
		}

		,  := readV1Number(.periodFD)
		if  != nil {
			return 0, false, errMalformedFile
		}

		// Floating-point division: a zero period would yield +Inf or NaN
		// rather than a panic.
		return float64() / float64(), true, nil
	case 2:
		// quotaFD is the cpu.max FD.
		return readV2Limit(.quotaFD)
	default:
		throw("impossible cgroup version")
		// Presumably throw never returns; the panic keeps the compiler
		// from reporting a missing return.
		panic("unreachable")
	}
}

// Returns the value from the quota/period file.
//
// Returns errSyscallFailed if the read fails, and errMalformedFile if the
// read fills the entire buffer or the contents cannot be parsed (see
// parseV1Number).
func readV1Number( int) (int64, error) {
	// The format of the file is "<value>\n" where the value is in
	// int64 microseconds and, if quota, may be -1 to indicate no limit.
	//
	// MaxInt64 requires 19 bytes to display in base 10, thus the
	// conservative max size of this file is 19 + 1 (newline) = 20 bytes.
	// We'll provide a bit more for good measure.
	//
	// Always read from the beginning of the file to get a fresh value.
	var  [64]byte
	,  := syscall.Pread(, [:], 0)
	if  != 0 {
		return 0, errSyscallFailed
	}
	if  == len() {
		// A completely full buffer means the file may be longer than
		// the buffer, which the format above does not allow.
		return 0, errMalformedFile
	}

	// Parse only the bytes actually read.
	 := [:]
	return parseV1Number()
}

// parseV1Number parses the contents of a cgroup v1 quota/period file, which
// must be a base-10 integer followed by a newline.
//
// Everything from the first newline onward is discarded. Returns
// errMalformedFile if there is no newline or if the text before it is not
// accepted by strconv.Atoi64.
func parseV1Number( []byte) (int64, error) {
	// Strip the trailing newline, which is required.
	 := bytealg.IndexByte(, '\n')
	if  < 0 {
		return 0, errMalformedFile
	}
	 = [:]

	,  := strconv.Atoi64(string())
	if ! {
		return 0, errMalformedFile
	}

	return , nil
}

// Returns CPU throughput limit, or ok false if there is no limit.
//
// Returns errSyscallFailed if the read fails, and errMalformedFile if the
// read fills the entire buffer or the contents cannot be parsed (see
// parseV2Limit).
func readV2Limit( int) (float64, bool, error) {
	// The format of the file is "<quota> <period>\n" where quota and
	// period are microseconds and quota may be "max" to indicate no limit.
	//
	// Note that the kernel is inconsistent about whether the values are
	// uint64 or int64: values are parsed as uint64 but printed as int64.
	// See kernel/sched/core.c:cpu_max_{show,write}.
	//
	// In practice, the kernel limits the period to 1s (1000000us) (see
	// max_cfs_quota_period), and the quota to (1<<44)us (see
	// max_cfs_runtime), so these values can't get large enough for the
	// distinction to matter.
	//
	// MaxInt64 requires 19 bytes to display in base 10, thus the
	// conservative max size of this file is 19 + 19 + 1 (space) + 1
	// (newline) = 40 bytes. We'll provide a bit more for good measure.
	//
	// Always read from the beginning of the file to get a fresh value.
	var  [64]byte
	,  := syscall.Pread(, [:], 0)
	if  != 0 {
		return 0, false, errSyscallFailed
	}
	if  == len() {
		// A completely full buffer means the file may be longer than
		// the buffer, which the format above does not allow.
		return 0, false, errMalformedFile
	}

	// Parse only the bytes actually read.
	 := [:]
	return parseV2Limit()
}

// parseV2Limit parses the contents of a cgroup v2 cpu.max file, which has the
// form "<quota> <period>\n".
//
// If the quota field is exactly "max" there is no limit and ok is false; the
// period is not examined in that case. Otherwise the result is
// quota / period with ok true.
//
// Returns errMalformedFile if the space separator or the trailing newline is
// missing, or if either field is not accepted by strconv.Atoi64.
func parseV2Limit( []byte) (float64, bool, error) {
	// The quota is everything before the first space.
	 := bytealg.IndexByte(, ' ')
	if  < 0 {
		return 0, false, errMalformedFile
	}

	 := [:]
	if bytealg.Compare(, []byte("max")) == 0 {
		// No limit.
		return 0, false, nil
	}

	 := [+1:]
	// Strip the trailing newline. It is required: a missing newline is
	// treated as a malformed file.
	 = bytealg.IndexByte(, '\n')
	if  < 0 {
		return 0, false, errMalformedFile
	}
	 = [:]

	,  := strconv.Atoi64(string())
	if ! {
		return 0, false, errMalformedFile
	}

	,  := strconv.Atoi64(string())
	if ! {
		return 0, false, errMalformedFile
	}

	// Floating-point division: a zero period would yield +Inf or NaN
	// rather than a panic.
	return float64() / float64(), true, nil
}

// FindCPU finds the path to the CPU cgroup that this process is a member of
// and places it in out. scratch is a scratch buffer for internal use.
//
// out must have length PathSize. scratch must have length ParseSize. Any
// other length throws (see checkBufferSize).
//
// Returns the number of bytes written to out and the cgroup version (1 or 2).
//
// Returns ErrNoCgroup if the process is not in a CPU cgroup. Errors from
// FindCPUMountPoint and FindCPURelativePath are returned unchanged, with a
// zero length and version.
func ( []byte,  []byte) (int, Version, error) {
	checkBufferSize(, PathSize)
	checkBufferSize(, ParseSize)

	// The cgroup path is <cgroup mount point> + <relative path>.
	//
	// This is racy if our cgroup is changed while this runs. For example,
	// initially there is only a cgroup v2 mount and we are not in a
	// cgroup. Afterwards, there is a cgroup v1 mount with a CPU controller
	// and we are placed in a cgroup in this hierarchy. In that case,
	// FindCPUMountPoint could pick the v2 mount, and FindCPURelativePath
	// could find the v2 relative path.
	//
	// In this case we'll later fail to read the cgroup files and fall back
	// to assuming no cgroup.

	,  := FindCPUMountPoint(, )
	if  != nil {
		return 0, 0, 
	}

	// The relative path always starts with /, so we can directly append it
	// to the mount point: the relative path is written into the part of
	// the output buffer that follows the mount point.
	, ,  := FindCPURelativePath([:], )
	if  != nil {
		return 0, 0, 
	}
	// Total length is mount point plus relative path.
	 += 

	return , , nil
}

// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of
// relative to the root of the cgroup mount and places it in out. scratch is a
// scratch buffer for internal use.
//
// out must have length PathSize minus the size of the cgroup mount root (if
// known). scratch must have length ParseSize. Unlike FindCPU and
// FindCPUMountPoint, this function does not verify either length with
// checkBufferSize.
//
// Returns the number of bytes written to out and the cgroup version (1 or 2).
//
// Returns ErrNoCgroup if the process is not in a CPU cgroup or if
// /proc/self/cgroup does not exist, and errSyscallFailed if opening that file
// fails for any other reason. Errors from parseCPURelativePath are returned
// unchanged.
func ( []byte,  []byte) (int, Version, error) {
	 := []byte("/proc/self/cgroup\x00")
	,  := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
	if  == syscall.ENOENT {
		return 0, 0, ErrNoCgroup
	} else if  != 0 {
		return 0, 0, errSyscallFailed
	}

	// The relative path always starts with /, so we can directly append it
	// to the mount point.
	, ,  := parseCPURelativePath(, syscall.Read, [:], )
	if  != nil {
		// Close the file on the error path too.
		syscall.Close()
		return 0, 0, 
	}

	syscall.Close()
	return , , nil
}

// Finds the path of the current process's CPU cgroup relative to the cgroup
// mount and writes it to out.
//
// The file contents are obtained line by line through newLineReader, which is
// given the file descriptor, the read function, and the scratch buffer.
// Taking the read function as a parameter presumably lets tests substitute
// their own reader; the caller in this file passes syscall.Read.
//
// Returns the number of bytes written and the cgroup version (1 or 2).
//
// Returns ErrNoCgroup if no usable entry is found, errMalformedFile if a line
// lacks either ':' separator, and any error from the line reader other than
// end-of-file (including errIncompleteLine) unchanged.
func parseCPURelativePath( int,  func( int,  []byte) (int, uintptr),  []byte,  []byte) (int, Version, error) {
	// The format of each line is
	//
	//   hierarchy-ID:controller-list:cgroup-path
	//
	// controller-list is comma-separated.
	// See man 5 cgroup for more details.
	//
	// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
	// is the CPU controller. Otherwise the v2 hierarchy (if any) is the
	// CPU controller.
	//
	// hierarchy-ID and controller-list have relatively small maximum
	// sizes, and the path can be up to _PATH_MAX, so we need a bit more
	// than 1 _PATH_MAX of scratch space.

	 := newLineReader(, , )

	// Bytes written to out. This stays 0 until a v2 line is seen.
	 := 0

	for {
		 := .next()
		if  == errIncompleteLine {
			// Don't allow incomplete lines. While in theory the
			// incomplete line may be for a controller we don't
			// care about, in practice all lines should be of
			// similar length, so we should just have a buffer big
			// enough for any.
			return 0, 0, 
		} else if  == errEOF {
			break
		} else if  != nil {
			return 0, 0, 
		}

		 := .line()

		// First field: hierarchy-ID, up to the first ':'.
		 := bytealg.IndexByte(, ':')
		if  < 0 {
			return 0, 0, errMalformedFile
		}

		 := [:]
		 = [+1:]

		// Second field: controller-list, up to the next ':'.
		 = bytealg.IndexByte(, ':')
		if  < 0 {
			return 0, 0, errMalformedFile
		}

		 := [:]
		 = [+1:]

		// Whatever remains is the cgroup-path; later ':' characters
		// are part of the path.
		 := 

		if string() == "0" {
			// v2 hierarchy.
			//
			// copy writes at most len(out) bytes, so a longer path
			// would be truncated silently.
			 = copy(, )
			// Keep searching, we might find a v1 hierarchy with a
			// CPU controller, which takes precedence.
		} else {
			// v1 hierarchy
			if containsCPU() {
				// Found a v1 CPU controller. This must be the
				// only one, so we're done. This overwrites any
				// v2 path copied earlier.
				return copy(, ), V1, nil
			}
		}
	}

	if  == 0 {
		// Found nothing. A v2 entry with an empty path also ends up
		// here.
		return 0, 0, ErrNoCgroup
	}

	// Must be v2, v1 returns above.
	return , V2, nil
}

// Returns true if comma-separated list b contains "cpu".
//
// Elements are compared exactly: an element such as "cpuacct" or "cpuset"
// does not match. An empty list returns false.
func containsCPU( []byte) bool {
	for len() > 0 {
		 := bytealg.IndexByte(, ',')
		if  < 0 {
			// No more commas: what remains is the last element.
			//
			// Neither cmd/compile nor gccgo allocates for these string conversions.
			return string() == "cpu"
		}

		// Split into the current element and the rest of the list.
		 := [:]
		 := [+1:]

		if string() == "cpu" {
			return true
		}

		 = 
	}

	return false
}

// FindCPUMountPoint finds the root of the CPU cgroup mount and places it in
// out. scratch is a scratch buffer for internal use.
//
// out must have length PathSize. scratch must have length ParseSize. Any
// other length throws (see checkBufferSize).
//
// Returns the number of bytes written to out.
//
// Returns ErrNoCgroup if the process is not in a CPU cgroup or if
// /proc/self/mountinfo does not exist, and errSyscallFailed if opening that
// file fails for any other reason. Errors from parseCPUMount are returned
// unchanged.
func ( []byte,  []byte) (int, error) {
	checkBufferSize(, PathSize)
	checkBufferSize(, ParseSize)

	 := []byte("/proc/self/mountinfo\x00")
	,  := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
	if  == syscall.ENOENT {
		return 0, ErrNoCgroup
	} else if  != 0 {
		return 0, errSyscallFailed
	}

	,  := parseCPUMount(, syscall.Read, , )
	if  != nil {
		// Close the file on the error path too.
		syscall.Close()
		return 0, 
	}
	syscall.Close()

	return , nil
}

// Returns the mount point for the cpu cgroup controller (v1 or v2) from
// /proc/self/mountinfo.
//
// The unescaped mount point is written to out and its length is returned.
// The file contents are obtained line by line through newLineReader, which is
// given the file descriptor, the read function, and the scratch buffer.
//
// Returns ErrNoCgroup if no matching mount is found, errMalformedFile if a
// line is missing an expected field, and any error from unescapePath or from
// the line reader (other than end-of-file and errIncompleteLine) unchanged.
func parseCPUMount( int,  func( int,  []byte) (int, uintptr),  []byte,  []byte) (int, error) {
	// The format of each line is:
	//
	// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
	// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
	//
	// (1) mount ID:  unique identifier of the mount (may be reused after umount)
	// (2) parent ID:  ID of parent (or of self for the top of the mount tree)
	// (3) major:minor:  value of st_dev for files on filesystem
	// (4) root:  root of the mount within the filesystem
	// (5) mount point:  mount point relative to the process's root
	// (6) mount options:  per mount options
	// (7) optional fields:  zero or more fields of the form "tag[:value]"
	// (8) separator:  marks the end of the optional fields
	// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
	// (10) mount source:  filesystem specific information or "none"
	// (11) super options:  per super block options
	//
	// See man 5 proc_pid_mountinfo for more details.
	//
	// Note that emitted paths will not contain space, tab, newline, or
	// carriage return. Those are escaped. See Linux show_mountinfo ->
	// show_path. We must unescape before returning.
	//
	// We return the mount point (5) if the filesystem type (9) is cgroup2,
	// or cgroup with "cpu" in the super options (11).
	//
	// (4), (5), and (10) are paths of up to _PATH_MAX bytes before
	// escaping, so up to escapedPathMax bytes each as emitted. The
	// remaining fields have a small fixed maximum size, so ParseSize
	// (4*escapedPathMax) is plenty of scratch space.
	// Note that non-cgroup mounts may have arbitrarily long (11), but we
	// can skip those when parsing.

	 := newLineReader(, , )

	// Bytes written to out. This stays 0 until a cgroup2 mount is seen.
	 := 0

	for {
		 := .next()
		if  == errIncompleteLine {
			// An incomplete line is fine as long as it doesn't
			// impede parsing the fields we need. It shouldn't be
			// possible for any mount to use more than 3*PATH_MAX
			// before (9) because there are two paths and all other
			// earlier fields have bounded options. Only (11) has
			// unbounded options.
			//
			// Fall through and parse whatever part of the line
			// is available.
		} else if  == errEOF {
			break
		} else if  != nil {
			return 0, 
		}

		 := .line()

		// Skip first four fields.
		for range 4 {
			 := bytealg.IndexByte(, ' ')
			if  < 0 {
				return 0, errMalformedFile
			}
			 = [+1:]
		}

		// (5) mount point:  mount point relative to the process's root
		 := bytealg.IndexByte(, ' ')
		if  < 0 {
			return 0, errMalformedFile
		}
		 := [:]
		 = [+1:]

		// Skip ahead past optional fields, delimited by " - ".
		//
		// Each iteration finds the next space and checks whether the
		// three bytes starting there are " - "; if not, it drops
		// everything up to and including that space and tries again.
		for {
			 = bytealg.IndexByte(, ' ')
			if  < 0 {
				return 0, errMalformedFile
			}
			if +3 >= len() {
				// Not enough room for the separator plus at
				// least one byte of filesystem type.
				return 0, errMalformedFile
			}
			 := [ : +3]
			if string() == " - " {
				 = [+3:]
				break
			}
			 = [+1:]
		}

		// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
		 = bytealg.IndexByte(, ' ')
		if  < 0 {
			return 0, errMalformedFile
		}
		 := [:]
		 = [+1:]

		if string() != "cgroup" && string() != "cgroup2" {
			// Not a cgroup mount; the rest of the line is ignored.
			continue
		}

		// As in parseCPURelativePath, cgroup v1 with a CPU controller
		// takes precedence over cgroup v2.
		if string() == "cgroup2" {
			// v2 hierarchy.
			,  = unescapePath(, )
			if  != nil {
				// Don't keep searching on error. The kernel
				// should never produce broken escaping.
				return , 
			}
			// Keep searching, we might find a v1 hierarchy with a
			// CPU controller, which takes precedence.
			continue
		}

		// (10) mount source:  filesystem specific information or "none"
		 = bytealg.IndexByte(, ' ')
		if  < 0 {
			return 0, errMalformedFile
		}
		// Don't care about mount source.
		 = [+1:]

		// (11) super options:  per super block options
		 := 

		// v1 hierarchy
		if containsCPU() {
			// Found a v1 CPU controller. This must be the
			// only one, so we're done. This overwrites any v2
			// mount point written to out earlier.
			return unescapePath(, )
		}
	}

	if  == 0 {
		// Found nothing.
		return 0, ErrNoCgroup
	}

	// Only a v2 mount was found.
	return , nil
}

// errInvalidEscape is returned by unescapePath when a backslash is not
// followed by three digits.
var errInvalidEscape error = stringError("invalid path escape sequence")

// unescapePath copies in to out, unescaping escape sequences generated by
// Linux's show_path.
//
// That is, the kernel converts '\', ' ', '\t', and '\n' to octal escape
// sequences, like '\040' for space, and this function converts each
// four-character sequence back into the single byte it encodes. All other
// bytes are copied unchanged.
//
// out must be at least as large as in; otherwise this throws.
//
// Returns the number of bytes written to out. On an invalid or truncated
// escape sequence it returns errInvalidEscape together with the number of
// bytes written before the bad sequence.
//
// Also see escapePath in cgroup_linux_test.go.
func unescapePath( []byte,  []byte) (int, error) {
	// Not strictly necessary, but simplifies the implementation and will
	// always hold in users.
	if len() < len() {
		throw("output too small")
	}

	// Separate read and write positions: the output never grows faster
	// than the input is consumed.
	var ,  int
	for  < len() {
		 := []
		if  != '\\' {
			// Ordinary byte: copy through.
			[] = 
			++
			++
			continue
		}

		// Start of escape sequence.

		// Escape sequence is always 4 characters: one slash and three
		// digits.
		if +3 >= len() {
			return , errInvalidEscape
		}

		// Accumulate the three digits as a base-8 number.
		//
		// NOTE(review): the range check below accepts '8' and '9',
		// which are not octal digits, and the byte accumulator wraps
		// for values above 255 (e.g. "\777"). The kernel presumably
		// never emits either — confirm.
		var  byte
		for  := range 3 {
			 := [+1+]
			if  < '0' ||  > '9' {
				return , errInvalidEscape
			}

			 *= 8
			 +=  - '0'
		}

		[] = 
		++

		// Skip the whole four-character sequence.
		 += 4
	}

	return , nil
}