runc/libcontainer/configs/config.go中定义了container对应的Namespaces。另外对于User Namespaces,还定义了UidMappings和GidMappings for user map。
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
...
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
...
}
runC中namespace的源码主要在: runc/libcontainer/configs/namespaces_unix.go
runC支持的namespce type包括($nsName) “net”、”mnt”、”pid”、”ipc”、”user”、”uts”:
const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
)
除了验证 Namespce Type是否在以上常量中,还要去验证 /proc/self/ns/$nsName是否存在并且可以read,都通过时,才认为该Namespace是在当前系统中是被支持的。
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
...
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
...
// 除了验证 Namespce Type是都在指定列表中,还要去验证 /proc/self/ns/$nsName是否存在并且可以read
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
supported = err == nil
...
return supported
}
如下是NameSpace的完整定义,很简单,只包括NamespaceType 和对应的Path。
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
Type NamespaceType `json:"type"`
Path string `json:"path"`
}
从Namespace的GetPath方法中可见,一个pid对应的namespace path为 /proc/ pid/ns/ nsName。
func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
除此之外,还定义了以下常用方法:
func (n *Namespaces) Remove(t NamespaceType) bool {...}
func (n *Namespaces) Add(t NamespaceType, path string) {...}
func (n *Namespaces) index(t NamespaceType) int {...}
func (n *Namespaces) Contains(t NamespaceType) bool {...}
func (n *Namespaces) PathOf(t NamespaceType) string {...}
在runc/libcontainer/configs/namespaces_syscall.go中,定义了linux clone时这些namespace对应的clone flags。
var namespaceInfo = map[NamespaceType]int{
NEWNET: syscall.CLONE_NEWNET,
NEWNS: syscall.CLONE_NEWNS,
NEWUSER: syscall.CLONE_NEWUSER,
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return uintptr(flag)
}
上面的CloneFlags()方法是用来解析linuxContainer的config中的namespace相关的参数,生成clone flags,提供给linuxContainer.bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) 来封装。
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write cloneFlags
r.AddData(&Int32msg{
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
return bytes.NewReader(r.Serialize()), nil
}
linuxContainer.newInitProcess(…)最终会使用linuxContainer.bootstrapData封装的clone flags数据,完成initProcess的构建。
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
p.consoleChan = make(chan *os.File, 1)
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
newInitProcess(…)在整个container create的流程中的位置,请参考:runC源码分析之Create/Run Container —— 王涛
如此,namespace在整个container create/run中的源码分析就完整了。
补充:runC中container的Spec是从bundle/config.json中解析得到的,见runC的create.go中的setupSpec(context)的调用。
Action: func(context *cli.Context) error {
if context.NArg() != 1 {
fmt.Printf("Incorrect Usage.\n\n")
cli.ShowCommandHelp(context, "create")
return fmt.Errorf("runc: \"create\" requires exactly one argument")
}
if err := revisePidFile(context); err != nil {
return err
}
spec, err := setupSpec(context)
if err != nil {
return err
}
status, err := startContainer(context, spec, true)
if err != nil {
return err
}
setupSepc(context)会去loadSpec(“config.json”):
// setupSpec performs initial setup based on the cli.Context for the container
func setupSpec(context *cli.Context) (*specs.Spec, error) {
bundle := context.String("bundle")
if bundle != "" {
if err := os.Chdir(bundle); err != nil {
return nil, err
}
}
spec, err := loadSpec(specConfig)
if err != nil {
return nil, err
}
notifySocket := os.Getenv("NOTIFY_SOCKET")
if notifySocket != "" {
setupSdNotify(spec, notifySocket)
}
if os.Geteuid() != 0 {
return nil, fmt.Errorf("runc should be run as root")
}
return spec, nil
}
config.json样例如下,namespace部分见 “.linux.namespaces”。
{
"ociVersion": "0.4.0",
"platform": {
"os": "linux",
"arch": "amd64"
},
"process": {
"terminal": true,
"user": {},
"args": [
"redis-server",
"--bind",
"0.0.0.0"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs",
"readonly": true
},
"hostname": "runc",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"nosuid",
"noexec",
"nodev",
"relatime",
"ro"
]
}
],
"hooks": {},
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
}
]
},
"namespaces": [
{
"type": "pid"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
}
],
"devices": null
}
}