// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.//go:build linuxpackage syscallimport ()// Linux unshare/clone/clone2/clone3 flags, architecture-independent,// copied from linux/sched.h.const (CLONE_VM = 0x00000100// set if VM shared between processesCLONE_FS = 0x00000200// set if fs info shared between processesCLONE_FILES = 0x00000400// set if open files shared between processesCLONE_SIGHAND = 0x00000800// set if signal handlers and blocked signals sharedCLONE_PIDFD = 0x00001000// set if a pidfd should be placed in parentCLONE_PTRACE = 0x00002000// set if we want to let tracing continue on the child tooCLONE_VFORK = 0x00004000// set if the parent wants the child to wake it up on mm_releaseCLONE_PARENT = 0x00008000// set if we want to have the same parent as the clonerCLONE_THREAD = 0x00010000// Same thread group?CLONE_NEWNS = 0x00020000// New mount namespace groupCLONE_SYSVSEM = 0x00040000// share system V SEM_UNDO semanticsCLONE_SETTLS = 0x00080000// create a new TLS for the childCLONE_PARENT_SETTID = 0x00100000// set the TID in the parentCLONE_CHILD_CLEARTID = 0x00200000// clear the TID in the childCLONE_DETACHED = 0x00400000// Unused, ignoredCLONE_UNTRACED = 0x00800000// set if the tracing process can't force CLONE_PTRACE on this cloneCLONE_CHILD_SETTID = 0x01000000// set the TID in the childCLONE_NEWCGROUP = 0x02000000// New cgroup namespaceCLONE_NEWUTS = 0x04000000// New utsname namespaceCLONE_NEWIPC = 0x08000000// New ipc namespaceCLONE_NEWUSER = 0x10000000// New user namespaceCLONE_NEWPID = 0x20000000// New pid namespaceCLONE_NEWNET = 0x40000000// New network namespaceCLONE_IO = 0x80000000// Clone io context// Flags for the clone3() syscall.CLONE_CLEAR_SIGHAND = 0x100000000// Clear any signal handler and reset to SIG_DFL.CLONE_INTO_CGROUP = 0x200000000// Clone into a specific cgroup given the right permissions.// Cloning flags intersect with CSIGNAL 
so can be used with unshare and clone3 // syscalls only:CLONE_NEWTIME = 0x00000080// New time namespace)// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.// See user_namespaces(7).//// Note that User Namespaces are not available on a number of popular Linux// versions (due to security issues), or are available but subject to AppArmor// restrictions like in Ubuntu 24.04.typeSysProcIDMapstruct { ContainerID int// Container ID. HostID int// Host ID. Size int// Size.}typeSysProcAttrstruct { Chroot string// Chroot. Credential *Credential// Credential.// Ptrace tells the child to call ptrace(PTRACE_TRACEME). // Call runtime.LockOSThread before starting a process with this set, // and don't call UnlockOSThread until done with PtraceSyscall calls. Ptrace bool Setsid bool// Create session.// Setpgid sets the process group ID of the child to Pgid, // or, if Pgid == 0, to the new child's process ID. Setpgid bool// Setctty sets the controlling terminal of the child to // file descriptor Ctty. Ctty must be a descriptor number // in the child process: an index into ProcAttr.Files. // This is only meaningful if Setsid is true. Setctty bool Noctty bool// Detach fd 0 from controlling terminal. Ctty int// Controlling TTY fd.// Foreground places the child process group in the foreground. // This implies Setpgid. The Ctty field must be set to // the descriptor of the controlling TTY. // Unlike Setctty, in this case Ctty must be a descriptor // number in the parent process. Foreground bool Pgid int// Child's process group ID if Setpgid.// Pdeathsig, if non-zero, is a signal that the kernel will send to // the child process when the creating thread dies. Note that the signal // is sent on thread termination, which may happen before process termination. // There are more details at https://go.dev/issue/27505. Pdeathsig Signal Cloneflags uintptr// Flags for clone calls. Unshareflags uintptr// Flags for unshare calls. 
UidMappings []SysProcIDMap// User ID mappings for user namespaces. GidMappings []SysProcIDMap// Group ID mappings for user namespaces.// GidMappingsEnableSetgroups enabling setgroups syscall. // If false, then setgroups syscall will be disabled for the child process. // This parameter is no-op if GidMappings == nil. Otherwise for unprivileged // users this should be set to false for mappings work. GidMappingsEnableSetgroups bool AmbientCaps []uintptr// Ambient capabilities. UseCgroupFD bool// Whether to make use of the CgroupFD field. CgroupFD int// File descriptor of a cgroup to put the new process into.// PidFD, if not nil, is used to store the pidfd of a child, if the // functionality is supported by the kernel, or -1. Note *PidFD is // changed only if the process starts successfully. PidFD *int}var ( none = [...]byte{'n', 'o', 'n', 'e', 0} slash = [...]byte{'/', 0} forceClone3 = false// Used by unit tests only.)// Implemented in runtime package.func runtime_BeforeFork()func runtime_AfterFork()func runtime_AfterForkInChild()// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.// If a dup or exec fails, write the errno error to pipe.// (Pipe is close-on-exec so if exec succeeds, it will be closed.)// In the child, this function must not acquire any locks, because// they might have been locked at the time of the fork. This means// no rescheduling, no malloc calls, and no new stack segments.// For the same reason compiler does not race instrument it.// The calls to RawSyscall are okay because they are assembly// functions that do not grow the stack.////go:noracefunc forkAndExecInChild( *byte, , []*byte, , *byte, *ProcAttr, *SysProcAttr, int) ( int, Errno) {// Set up and fork. This returns immediately in the parent or // if there's an error. 
, , , , := forkAndExecInChild1(, , , , , , , )if {runtime_AfterFork() }if != 0 {return0, }// parent; return PID = int()if .PidFD != nil { *.PidFD = int() }if .UidMappings != nil || .GidMappings != nil {Close([0])varErrno// uid/gid mappings will be written after fork and unshare(2) for user // namespaces.if .Unshareflags&CLONE_NEWUSER == 0 {if := writeUidGidMappings(, ); != nil { = .(Errno) } }RawSyscall(SYS_WRITE, uintptr([1]), uintptr(unsafe.Pointer(&)), unsafe.Sizeof())Close([1]) }return , 0}const _LINUX_CAPABILITY_VERSION_3 = 0x20080522type capHeader struct { version uint32 pid int32}type capData struct { effective uint32 permitted uint32 inheritable uint32}type caps struct { hdr capHeader data [2]capData}// See CAP_TO_INDEX in linux/capability.h:func capToIndex( uintptr) uintptr { return >> 5 }// See CAP_TO_MASK in linux/capability.h:func capToMask( uintptr) uint32 { return1 << uint(&31) }// cloneArgs holds arguments for clone3 Linux syscall.type cloneArgs struct { flags uint64// Flags bit mask pidFD uint64// Where to store PID file descriptor (int *) childTID uint64// Where to store child TID, in child's memory (pid_t *) parentTID uint64// Where to store child TID, in parent's memory (pid_t *) exitSignal uint64// Signal to deliver to parent on child termination stack uint64// Pointer to lowest byte of stack stackSize uint64// Size of stack tls uint64// Location of new TLS setTID uint64// Pointer to a pid_t array (since Linux 5.5) setTIDSize uint64// Number of elements in set_tid (since Linux 5.5) cgroup uint64// File descriptor for target cgroup of child (since Linux 5.7)}// forkAndExecInChild1 implements the body of forkAndExecInChild up to// the parent's post-fork path. 
This is a separate function so we can// separate the child's and parent's stack frames if we're using// vfork.//// This is go:noinline because the point is to keep the stack frames// of this and forkAndExecInChild separate.////go:noinline//go:norace//go:nocheckptrfunc forkAndExecInChild1( *byte, , []*byte, , *byte, *ProcAttr, *SysProcAttr, int) ( uintptr, int32, Errno, [2]int, bool) {// Defined in linux/prctl.h starting with Linux 4.3.const ( = 0x2f = 0x2 )// vfork requires that the child not touch any of the parent's // active stack frames. Hence, the child does all post-fork // processing in this stack frame and never returns, while the // parent returns immediately from this frame and does all // post-fork processing in the outer frame. // // Declare all variables at top in case any // declarations require heap allocation (e.g., err2). // ":=" should not be used to declare any variable after // the call to runtime_BeforeFork. // // NOTE(bcmills): The allocation behavior described in the above comment // seems to lack a corresponding test, and it may be rendered invalid // by an otherwise-correct change in the compiler.var (Errnointintcaps , uintptr , , []byte , , []byte *cloneArgsint32int *Credential , uintptruintptr ) = -1 := origRlimitNofile.Load()if .UidMappings != nil { = []byte("/proc/self/uid_map\000") = formatIDMappings(.UidMappings) }if .GidMappings != nil { = []byte("/proc/self/setgroups\000") = []byte("/proc/self/gid_map\000")if .GidMappingsEnableSetgroups { = []byte("allow\000") } else { = []byte("deny\000") } = formatIDMappings(.GidMappings) }// Record parent PID so child can test if it has died. , := rawSyscallNoError(SYS_GETPID, 0, 0, 0)// Guard against side effects of shuffling fds below. // Make sure that nextfd is beyond any currently open files so // that we can't run the risk of overwriting any of them. 
:= make([]int, len(.Files)) = len(.Files)for , := range .Files {if < int() { = int() } [] = int() } ++// Allocate another pipe for parent to child communication for // synchronizing writing of User ID/Group ID mappings.if .UidMappings != nil || .GidMappings != nil {if := forkExecPipe([:]); != nil { = .(Errno)return } } = .Cloneflagsif .Cloneflags&CLONE_NEWUSER == 0 && .Unshareflags&CLONE_NEWUSER == 0 { |= CLONE_VFORK | CLONE_VM }if .PidFD != nil { |= CLONE_PIDFD }// Whether to use clone3.if .UseCgroupFD || &CLONE_NEWTIME != 0 || forceClone3 { = &cloneArgs{flags: uint64(),exitSignal: uint64(SIGCHLD), }if .UseCgroupFD { .flags |= CLONE_INTO_CGROUP .cgroup = uint64(.CgroupFD) }if .PidFD != nil { .pidFD = uint64(uintptr(unsafe.Pointer(&))) } }// About to call fork. // No more allocation or calls of non-assembly functions.runtime_BeforeFork() = trueif != nil { , = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer()), unsafe.Sizeof(*), 0) } else { |= uintptr(SIGCHLD)ifruntime.GOARCH == "s390x" {// On Linux/s390, the first two arguments of clone(2) are swapped. , = rawVforkSyscall(SYS_CLONE, 0, , uintptr(unsafe.Pointer(&))) } else { , = rawVforkSyscall(SYS_CLONE, , 0, uintptr(unsafe.Pointer(&))) } }if != 0 || != 0 {// If we're in the parent, we must return immediately // so we're not in the same stack frame as the child. 
// This can at most use the return PC, which the child // will not modify, and the results of // rawVforkSyscall, which must have been written after // the child was replaced.return }// Fork succeeded, now in child.// Enable the "keep capabilities" flag to set ambient capabilities later.iflen(.AmbientCaps) > 0 { _, _, = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)if != 0 {goto } }// Wait for User ID/Group ID mappings to be written.if .UidMappings != nil || .GidMappings != nil {if _, _, = RawSyscall(SYS_CLOSE, uintptr([1]), 0, 0); != 0 {goto } , _, = RawSyscall(SYS_READ, uintptr([0]), uintptr(unsafe.Pointer(&)), unsafe.Sizeof())if != 0 {goto }if != unsafe.Sizeof() { = EINVALgoto }if != 0 { = goto } }// Session IDif .Setsid { _, _, = RawSyscall(SYS_SETSID, 0, 0, 0)if != 0 {goto } }// Set process groupif .Setpgid || .Foreground {// Place child in process group. _, _, = RawSyscall(SYS_SETPGID, 0, uintptr(.Pgid), 0)if != 0 {goto } }if .Foreground { = int32(.Pgid)if == 0 { , _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0) = int32() }// Place process group in foreground. _, _, = RawSyscall(SYS_IOCTL, uintptr(.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&)))if != 0 {goto } }// Restore the signal mask. 
We do this after TIOCSPGRP to avoid // having the kernel send a SIGTTOU signal to the process group.runtime_AfterForkInChild()// Unshareif .Unshareflags != 0 { _, _, = RawSyscall(SYS_UNSHARE, .Unshareflags, 0, 0)if != 0 {goto }if .Unshareflags&CLONE_NEWUSER != 0 && .GidMappings != nil { = int(_AT_FDCWD)if , _, = RawSyscall6(SYS_OPENAT, uintptr(), uintptr(unsafe.Pointer(&[0])), uintptr(O_WRONLY), 0, 0, 0); != 0 {goto } , _, = RawSyscall(SYS_WRITE, , uintptr(unsafe.Pointer(&[0])), uintptr(len()))if != 0 {goto }if _, _, = RawSyscall(SYS_CLOSE, , 0, 0); != 0 {goto }if , _, = RawSyscall6(SYS_OPENAT, uintptr(), uintptr(unsafe.Pointer(&[0])), uintptr(O_WRONLY), 0, 0, 0); != 0 {goto } , _, = RawSyscall(SYS_WRITE, , uintptr(unsafe.Pointer(&[0])), uintptr(len()))if != 0 {goto }if _, _, = RawSyscall(SYS_CLOSE, , 0, 0); != 0 {goto } }if .Unshareflags&CLONE_NEWUSER != 0 && .UidMappings != nil { = int(_AT_FDCWD)if , _, = RawSyscall6(SYS_OPENAT, uintptr(), uintptr(unsafe.Pointer(&[0])), uintptr(O_WRONLY), 0, 0, 0); != 0 {goto } , _, = RawSyscall(SYS_WRITE, , uintptr(unsafe.Pointer(&[0])), uintptr(len()))if != 0 {goto }if _, _, = RawSyscall(SYS_CLOSE, , 0, 0); != 0 {goto } }// The unshare system call in Linux doesn't unshare mount points // mounted with --shared. Systemd mounts / with --shared. For a // long discussion of the pros and cons of this see debian bug 739593. // The Go model of unsharing is more like Plan 9, where you ask // to unshare and the namespaces are unconditionally unshared. // To make this model work we must further mark / as MS_PRIVATE. 
// This is what the standard unshare command does.if .Unshareflags&CLONE_NEWNS == CLONE_NEWNS { _, _, = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)if != 0 {goto } } }// Chrootif != nil { _, _, = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer()), 0, 0)if != 0 {goto } }// User and groupsif = .Credential; != nil { = uintptr(len(.Groups)) = uintptr(0)if > 0 { = uintptr(unsafe.Pointer(&.Groups[0])) }if !(.GidMappings != nil && !.GidMappingsEnableSetgroups && == 0) && !.NoSetGroups { _, _, = RawSyscall(_SYS_setgroups, , , 0)if != 0 {goto } } _, _, = RawSyscall(sys_SETGID, uintptr(.Gid), 0, 0)if != 0 {goto } _, _, = RawSyscall(sys_SETUID, uintptr(.Uid), 0, 0)if != 0 {goto } }iflen(.AmbientCaps) != 0 {// Ambient capabilities were added in the 4.3 kernel, // so it is safe to always use _LINUX_CAPABILITY_VERSION_3. .hdr.version = _LINUX_CAPABILITY_VERSION_3if _, _, = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&.hdr)), uintptr(unsafe.Pointer(&.data[0])), 0); != 0 {goto }for _, = range .AmbientCaps {// Add the c capability to the permitted and inheritable capability mask, // otherwise we will not be able to add it to the ambient capability mask. .data[capToIndex()].permitted |= capToMask() .data[capToIndex()].inheritable |= capToMask() }if _, _, = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&.hdr)), uintptr(unsafe.Pointer(&.data[0])), 0); != 0 {goto }for _, = range .AmbientCaps { _, _, = RawSyscall6(SYS_PRCTL, , uintptr(), , 0, 0, 0)if != 0 {goto } } }// Chdirif != nil { _, _, = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer()), 0, 0)if != 0 {goto } }// Parent death signalif .Pdeathsig != 0 { _, _, = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(.Pdeathsig), 0, 0, 0, 0)if != 0 {goto }// Signal self if parent is already dead. This might cause a // duplicate signal in rare cases, but it won't matter when // using SIGKILL. 
, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)if != { , _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0) _, _, = RawSyscall(SYS_KILL, , uintptr(.Pdeathsig), 0)if != 0 {goto } } }// Pass 1: look for fd[i] < i and move those up above len(fd) // so that pass 2 won't stomp on an fd it needs later.if < { _, _, = RawSyscall(SYS_DUP3, uintptr(), uintptr(), O_CLOEXEC)if != 0 {goto } = ++ }for = 0; < len(); ++ {if [] >= 0 && [] < {if == { // don't stomp on pipe ++ } _, _, = RawSyscall(SYS_DUP3, uintptr([]), uintptr(), O_CLOEXEC)if != 0 {goto } [] = ++ } }// Pass 2: dup fd[i] down onto i.for = 0; < len(); ++ {if [] == -1 {RawSyscall(SYS_CLOSE, uintptr(), 0, 0)continue }if [] == {// dup2(i, i) won't clear close-on-exec flag on Linux, // probably not elsewhere either. _, _, = RawSyscall(fcntl64Syscall, uintptr([]), F_SETFD, 0)if != 0 {goto }continue }// The new fd is created NOT close-on-exec, // which is exactly what we want. _, _, = RawSyscall(SYS_DUP3, uintptr([]), uintptr(), 0)if != 0 {goto } }// By convention, we don't close-on-exec the fds we are // started with, so if len(fd) < 3, close 0, 1, 2 as needed. // Programs that know they inherit fds >= 3 will need // to set them close-on-exec.for = len(); < 3; ++ {RawSyscall(SYS_CLOSE, uintptr(), 0, 0) }// Detach fd 0 from ttyif .Noctty { _, _, = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)if != 0 {goto } }// Set the controlling TTY to Cttyif .Setctty { _, _, = RawSyscall(SYS_IOCTL, uintptr(.Ctty), uintptr(TIOCSCTTY), 1)if != 0 {goto } }// Restore original rlimit.if != nil {rawSetrlimit(RLIMIT_NOFILE, ) }// Enable tracing if requested. // Do this right before exec so that we don't unnecessarily trace the runtime // setting up after the fork. See issue #21428.if .Ptrace { _, _, = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)if != 0 {goto } }// Time to exec. 
_, _, = RawSyscall(SYS_EXECVE,uintptr(unsafe.Pointer()),uintptr(unsafe.Pointer(&[0])),uintptr(unsafe.Pointer(&[0]))):// send error code on pipeRawSyscall(SYS_WRITE, uintptr(), uintptr(unsafe.Pointer(&)), unsafe.Sizeof())for {RawSyscall(SYS_EXIT, 253, 0, 0) }}func formatIDMappings( []SysProcIDMap) []byte {var []bytefor , := range { = append(, itoa.Itoa(.ContainerID)+" "+itoa.Itoa(.HostID)+" "+itoa.Itoa(.Size)+"\n"...) }return}// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.func writeIDMappings( string, []SysProcIDMap) error { , := Open(, O_RDWR, 0)if != nil {return }if , := Write(, formatIDMappings()); != nil {Close()return }if := Close(); != nil {return }returnnil}// writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false// and "allow" if enable is true.// This is needed since kernel 3.19, because you can't write gid_map without// disabling setgroups() system call.func writeSetgroups( int, bool) error { := "/proc/" + itoa.Itoa() + "/setgroups" , := Open(, O_RDWR, 0)if != nil {return }var []byteif { = []byte("allow") } else { = []byte("deny") }if , := Write(, ); != nil {Close()return }returnClose()}// writeUidGidMappings writes User ID and Group ID mappings for user namespaces// for a process and it is called from the parent process.func writeUidGidMappings( int, *SysProcAttr) error {if .UidMappings != nil { := "/proc/" + itoa.Itoa() + "/uid_map"if := writeIDMappings(, .UidMappings); != nil {return } }if .GidMappings != nil {// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.if := writeSetgroups(, .GidMappingsEnableSetgroups); != nil && != ENOENT {return } := "/proc/" + itoa.Itoa() + "/gid_map"if := writeIDMappings(, .GidMappings); != nil {return } }returnnil}
The pages are generated with Golds v0.7.0-preview. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.