Source File
cgroup_linux.go
Belonging Package
internal/runtime/cgroup
// Copyright 2025 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package cgroupimport ()var (ErrNoCgroup error = stringError("not in a cgroup")errMalformedFile error = stringError("malformed file"))const _PATH_MAX = 4096const (// Required amount of scratch space for CPULimit.//// TODO(prattmic): This is shockingly large (~70KiB) due to the (very// unlikely) combination of extremely long paths consisting mostly// escaped characters. The scratch buffer ends up in .bss in package// runtime, so it doesn't contribute to binary size and generally won't// be faulted in, but it would still be nice to shrink this. A more// complex parser that did not need to keep entire lines in memory// could get away with much less. Alternatively, we could do a one-off// mmap allocation for this buffer, which is only mapped larger if we// actually need the extra space.ScratchSize = PathSize + ParseSize// Required space to store a path of the cgroup in the filesystem.PathSize = _PATH_MAX// /proc/self/mountinfo path escape sequences are 4 characters long, so// a path consisting entirely of escaped characters could be 4 times// larger.escapedPathMax = 4 * _PATH_MAX// Required space to parse /proc/self/mountinfo and /proc/self/cgroup.// See findCPUMount and findCPURelativePath.ParseSize = 4 * escapedPathMax)// Include explicit NUL to be sure we include it in the slice.const (v2MaxFile = "/cpu.max\x00"v1QuotaFile = "/cpu.cfs_quota_us\x00"v1PeriodFile = "/cpu.cfs_period_us\x00")// Version indicates the cgroup version.type Version intconst (VersionUnknown Version = iotaV1V2)// CPU owns the FDs required to read the CPU limit from a cgroup.type CPU struct {version Version// For cgroup v1, this is cpu.cfs_quota_us.// For cgroup v2, this is cpu.max.quotaFD int// For cgroup v1, this is cpu.cfs_period_us.// For cgroup v2, this is unused.periodFD int}func ( CPU) () {switch .version {case V1:syscall.Close(.quotaFD)syscall.Close(.periodFD)case V2:syscall.Close(.quotaFD)default:throw("impossible cgroup version")}}func checkBufferSize( []byte, int) {if len() != {println("runtime: cgroup buffer length", len(), "want", )throw("runtime: cgroup invalid buffer length")}}// OpenCPU returns a CPU for the CPU cgroup containing the current process, or// ErrNoCgroup if the process is not in a CPU cgroup.//// scratch must have length ScratchSize.func ( []byte) (CPU, error) {checkBufferSize(, ScratchSize):= [:PathSize]:= [PathSize:], , := FindCPU(, )if != nil {return CPU{},}switch {case 1::= copy([:], v1QuotaFile):= [:+], := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)if != 0 {// This may fail if this process was migrated out of// the cgroup found by FindCPU and that cgroup has been// deleted.return CPU{}, errSyscallFailed}= copy([:], v1PeriodFile)= [:+], := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)if != 0 {// This may fail if this process was migrated out of// the cgroup found by FindCPU and that cgroup has been// deleted.return CPU{}, errSyscallFailed}:= CPU{version: 1,quotaFD: ,periodFD: ,}return , nilcase 2::= copy([:], v2MaxFile):= [:+], := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)if != 0 {// This may fail if this process was migrated out of// the cgroup found by FindCPU and that cgroup has been// deleted.return CPU{}, errSyscallFailed}:= CPU{version: 2,quotaFD: ,periodFD: -1,}return , nildefault:throw("impossible cgroup version")panic("unreachable")}}// Returns average CPU throughput limit from the cgroup, or ok false if there// is no limit.func ( CPU) (float64, bool, error) {switch .version {case 1:, := readV1Number(.quotaFD)if != nil {return 0, false, errMalformedFile}if < 0 {// No limit.return 0, false, nil}, := readV1Number(.periodFD)if != nil {return 0, false, errMalformedFile}return float64() / float64(), true, nilcase 2:// quotaFD is the cpu.max FD.return readV2Limit(.quotaFD)default:throw("impossible cgroup version")panic("unreachable")}}// Returns the value from the quota/period file.func readV1Number( int) (int64, error) {// The format of the file is "<value>\n" where the value is in// int64 microseconds and, if quota, may be -1 to indicate no limit.//// MaxInt64 requires 19 bytes to display in base 10, thus the// conservative max size of this file is 19 + 1 (newline) = 20 bytes.// We'll provide a bit more for good measure.//// Always read from the beginning of the file to get a fresh value.var [64]byte, := syscall.Pread(, [:], 0)if != 0 {return 0, errSyscallFailed}if == len() {return 0, errMalformedFile}:= [:]return parseV1Number()}func parseV1Number( []byte) (int64, error) {// Ignore trailing newline.:= bytealg.IndexByte(, '\n')if < 0 {return 0, errMalformedFile}= [:], := strconv.Atoi64(string())if ! {return 0, errMalformedFile}return , nil}// Returns CPU throughput limit, or ok false if there is no limit.func readV2Limit( int) (float64, bool, error) {// The format of the file is "<quota> <period>\n" where quota and// period are microseconds and quota may be "max" to indicate no limit.//// Note that the kernel is inconsistent about whether the values are// uint64 or int64: values are parsed as uint64 but printed as int64.// See kernel/sched/core.c:cpu_max_{show,write}.//// In practice, the kernel limits the period to 1s (1000000us) (see// max_cfs_quota_period), and the quota to (1<<44)us (see// max_cfs_runtime), so these values can't get large enough for the// distinction to matter.//// MaxInt64 requires 19 bytes to display in base 10, thus the// conservative max size of this file is 19 + 19 + 1 (space) + 1// (newline) = 40 bytes. We'll provide a bit more for good measure.//// Always read from the beginning of the file to get a fresh value.var [64]byte, := syscall.Pread(, [:], 0)if != 0 {return 0, false, errSyscallFailed}if == len() {return 0, false, errMalformedFile}:= [:]return parseV2Limit()}func parseV2Limit( []byte) (float64, bool, error) {:= bytealg.IndexByte(, ' ')if < 0 {return 0, false, errMalformedFile}:= [:]if bytealg.Compare(, []byte("max")) == 0 {// No limit.return 0, false, nil}:= [+1:]// Ignore trailing newline, if any.= bytealg.IndexByte(, '\n')if < 0 {return 0, false, errMalformedFile}= [:], := strconv.Atoi64(string())if ! {return 0, false, errMalformedFile}, := strconv.Atoi64(string())if ! {return 0, false, errMalformedFile}return float64() / float64(), true, nil}// FindCPU finds the path to the CPU cgroup that this process is a member of// and places it in out. scratch is a scratch buffer for internal use.//// out must have length PathSize. scratch must have length ParseSize.//// Returns the number of bytes written to out and the cgroup version (1 or 2).//// Returns ErrNoCgroup if the process is not in a CPU cgroup.func ( []byte, []byte) (int, Version, error) {checkBufferSize(, PathSize)checkBufferSize(, ParseSize)// The cgroup path is <cgroup mount point> + <relative path>.//// This is racy if our cgroup is changed while this runs. For example,// initially there is only a cgroup v2 mount and we are not in a// cgroup. After, there a cgroup v1 mount with a CPU controller and we// are placed in a cgroup in this hierarchy. In that case, findCPUMount// could pick the v2 mount, and findCPURelativePath could find the v2// relative path.//// In this case we'll later fail to read the cgroup files and fall back// to assuming no cgroup., := FindCPUMountPoint(, )if != nil {return 0, 0,}// The relative path always starts with /, so we can directly append it// to the mount point., , := FindCPURelativePath([:], )if != nil {return 0, 0,}+=return , , nil}// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of// relative to the root of the cgroup mount and places it in out. scratch is a// scratch buffer for internal use.//// out must have length PathSize minus the size of the cgroup mount root (if// known). scratch must have length ParseSize.//// Returns the number of bytes written to out and the cgroup version (1 or 2).//// Returns ErrNoCgroup if the process is not in a CPU cgroup.func ( []byte, []byte) (int, Version, error) {:= []byte("/proc/self/cgroup\x00"), := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)if == syscall.ENOENT {return 0, 0, ErrNoCgroup} else if != 0 {return 0, 0, errSyscallFailed}// The relative path always starts with /, so we can directly append it// to the mount point., , := parseCPURelativePath(, syscall.Read, [:], )if != nil {syscall.Close()return 0, 0,}syscall.Close()return , , nil}// Finds the path of the current process's CPU cgroup relative to the cgroup// mount and writes it to out.//// Returns the number of bytes written and the cgroup version (1 or 2).func parseCPURelativePath( int, func( int, []byte) (int, uintptr), []byte, []byte) (int, Version, error) {// The format of each line is//// hierarchy-ID:controller-list:cgroup-path//// controller-list is comma-separated.// See man 5 cgroup for more details.//// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that// is the CPU controller. Otherwise the v2 hierarchy (if any) is the// CPU controller.//// hierarchy-ID and controller-list have relatively small maximum// sizes, and the path can be up to _PATH_MAX, so we need a bit more// than 1 _PATH_MAX of scratch space.:= newLineReader(, , )// Bytes written to out.:= 0for {:= .next()if == errIncompleteLine {// Don't allow incomplete lines. While in theory the// incomplete line may be for a controller we don't// care about, in practice all lines should be of// similar length, so we should just have a buffer big// enough for any.return 0, 0,} else if == errEOF {break} else if != nil {return 0, 0,}:= .line()// The format of each line is//// hierarchy-ID:controller-list:cgroup-path//// controller-list is comma-separated.// See man 5 cgroup for more details.:= bytealg.IndexByte(, ':')if < 0 {return 0, 0, errMalformedFile}:= [:]= [+1:]= bytealg.IndexByte(, ':')if < 0 {return 0, 0, errMalformedFile}:= [:]= [+1:]:=if string() == "0" {// v2 hierarchy.= copy(, )// Keep searching, we might find a v1 hierarchy with a// CPU controller, which takes precedence.} else {// v1 hierarchyif containsCPU() {// Found a v1 CPU controller. This must be the// only one, so we're done.return copy(, ), V1, nil}}}if == 0 {// Found nothing.return 0, 0, ErrNoCgroup}// Must be v2, v1 returns above.return , V2, nil}// Returns true if comma-separated list b contains "cpu".func containsCPU( []byte) bool {for len() > 0 {:= bytealg.IndexByte(, ',')if < 0 {// Neither cmd/compile nor gccgo allocates for these string conversions.return string() == "cpu"}:= [:]:= [+1:]if string() == "cpu" {return true}=}return false}// FindCPUMountPoint finds the root of the CPU cgroup mount places it in out.// scratch is a scratch buffer for internal use.//// out must have length PathSize. scratch must have length ParseSize.//// Returns the number of bytes written to out.//// Returns ErrNoCgroup if the process is not in a CPU cgroup.func ( []byte, []byte) (int, error) {checkBufferSize(, PathSize)checkBufferSize(, ParseSize):= []byte("/proc/self/mountinfo\x00"), := syscall.Open(&[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)if == syscall.ENOENT {return 0, ErrNoCgroup} else if != 0 {return 0, errSyscallFailed}, := parseCPUMount(, syscall.Read, , )if != nil {syscall.Close()return 0,}syscall.Close()return , nil}// Returns the mount point for the cpu cgroup controller (v1 or v2) from// /proc/self/mountinfo.func parseCPUMount( int, func( int, []byte) (int, uintptr), []byte, []byte) (int, error) {// The format of each line is://// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue// (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)//// (1) mount ID: unique identifier of the mount (may be reused after umount)// (2) parent ID: ID of parent (or of self for the top of the mount tree)// (3) major:minor: value of st_dev for files on filesystem// (4) root: root of the mount within the filesystem// (5) mount point: mount point relative to the process's root// (6) mount options: per mount options// (7) optional fields: zero or more fields of the form "tag[:value]"// (8) separator: marks the end of the optional fields// (9) filesystem type: name of filesystem of the form "type[.subtype]"// (10) mount source: filesystem specific information or "none"// (11) super options: per super block options//// See man 5 proc_pid_mountinfo for more details.//// Note that emitted paths will not contain space, tab, newline, or// carriage return. Those are escaped. See Linux show_mountinfo ->// show_path. We must unescape before returning.//// We return the mount point (5) if the filesystem type (9) is cgroup2,// or cgroup with "cpu" in the super options (11).//// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.// Note that non-cgroup mounts may have arbitrarily long (11), but we// can skip those when parsing.:= newLineReader(, , )// Bytes written to out.:= 0for {//incomplete := false:= .next()if == errIncompleteLine {// An incomplete line is fine as long as it doesn't// impede parsing the fields we need. It shouldn't be// possible for any mount to use more than 3*PATH_MAX// before (9) because there are two paths and all other// earlier fields have bounded options. Only (11) has// unbounded options.} else if == errEOF {break} else if != nil {return 0,}:= .line()// Skip first four fields.for range 4 {:= bytealg.IndexByte(, ' ')if < 0 {return 0, errMalformedFile}= [+1:]}// (5) mount point: mount point relative to the process's root:= bytealg.IndexByte(, ' ')if < 0 {return 0, errMalformedFile}:= [:]= [+1:]// Skip ahead past optional fields, delimited by " - ".for {= bytealg.IndexByte(, ' ')if < 0 {return 0, errMalformedFile}if +3 >= len() {return 0, errMalformedFile}:= [ : +3]if string() == " - " {= [+3:]break}= [+1:]}// (9) filesystem type: name of filesystem of the form "type[.subtype]"= bytealg.IndexByte(, ' ')if < 0 {return 0, errMalformedFile}:= [:]= [+1:]if string() != "cgroup" && string() != "cgroup2" {continue}// As in findCPUPath, cgroup v1 with a CPU controller takes// precendence over cgroup v2.if string() == "cgroup2" {// v2 hierarchy., = unescapePath(, )if != nil {// Don't keep searching on error. The kernel// should never produce broken escaping.return ,}// Keep searching, we might find a v1 hierarchy with a// CPU controller, which takes precedence.continue}// (10) mount source: filesystem specific information or "none"= bytealg.IndexByte(, ' ')if < 0 {return 0, errMalformedFile}// Don't care about mount source.= [+1:]// (11) super options: per super block options:=// v1 hierarchyif containsCPU() {// Found a v1 CPU controller. This must be the// only one, so we're done.return unescapePath(, )}}if == 0 {// Found nothing.return 0, ErrNoCgroup}return , nil}var errInvalidEscape error = stringError("invalid path escape sequence")// unescapePath copies in to out, unescaping escape sequences generated by// Linux's show_path.//// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,// like '\040' for space.//// out must be at least as large as in.//// Returns the number of bytes written to out.//// Also see escapePath in cgroup_linux_test.go.func unescapePath( []byte, []byte) (int, error) {// Not strictly necessary, but simplifies the implementation and will// always hold in users.if len() < len() {throw("output too small")}var , intfor < len() {:= []if != '\\' {[] =++++continue}// Start of escape sequence.// Escape sequence is always 4 characters: one slash and three// digits.if +3 >= len() {return , errInvalidEscape}var bytefor := range 3 {:= [+1+]if < '0' || > '9' {return , errInvalidEscape}*= 8+= - '0'}[] =+++= 4}return , nil}
![]() |
The pages are generated with Golds v0.7.9-preview. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |