本文用于和同事们分享我学习 runc 的笔记。
首先,runc 支持如下命令(command);下面从 runc 的 create、start 两个命令说起。
COMMANDS:
checkpoint checkpoint a running container
create create a container
delete delete any resources held by one or more containers often used with detached containers
events display container events such as OOM notifications, cpu, memory, and IO usage statistics
exec execute new process inside the container
init initialize the namespaces and launch the process (do not call it outside of runc)
kill kill sends the specified signal (default: SIGTERM) to the container's init process
list lists containers started by runc with the given root
pause pause suspends all processes inside the container
ps ps displays the processes running inside a container
restore restore a container from a previous checkpoint
resume resumes all processes that have been previously paused
run create and run a container
spec create a new specification file
start executes the user defined process in a created container
state output the state of a container
update update container resource constraints
help, h Shows a list of commands or help for one command
GLOBAL OPTIONS:
--debug enable debug output for logging
--log value set the log file path where internal debug information is written (default: "/dev/null")
--log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
--root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc")
--criu value path to the criu binary used for checkpoint and restore (default: "criu")
--systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
--help, -h show help
--version, -v print the version
1. create:
create.go#createCommand
这里的spec里边存着config.json的信息。context存着cli命令行的信息。
Action:
func
(context *cli.Context)
error
{
// spec contains the info of config.json
spec
,
err
:=
setupSpec
(context)
if
err !=
nil
{
return
err
}
status
,
err
:=
startContainer
(context, spec,
true
)
if
err !=
nil
{
return
err
}
// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
os.
Exit
(status)
return
nil
},
runc\utils_linux.go#
startContainer
根据容器id参数和spec信息用工厂模式创建了一个linux container实例。然后对listen fd做一些初始化操作,用于支持socket activation(按需把监听的文件描述符传给容器的init进程)。最后构建一个runner对象,并调用runner.run。
func
startContainer
(context *cli.Context, spec *specs.Spec, create
bool
) (
int
,
error
) {
id
:= context.
Args
().
First
()
if
id ==
""
{
return
-
1
, errEmptyID
}
container
,
err
:=
createContainer
(context, id, spec)
if
err !=
nil
{
return
-
1
, err
}
//Maxx check if detach mode
detach
:= context.
Bool
(
"detach"
)
// Support on-demand socket activation by passing file descriptors into the container init process.
listenFDs
:= []*os.File{}
if
os.
Getenv
(
"LISTEN_FDS"
) !=
""
{
listenFDs
= activation.
Files
(
false
)
}
r
:= &runner{
enableSubreaper: !context.
Bool
(
"no-subreaper"
),
shouldDestroy:
true
,
container: container,
listenFDs: listenFDs,
console: context.
String
(
"console"
),
detach: detach,
pidFile: context.
String
(
"pid-file"
),
create: create,
}
return
r.
run
(&spec.Process)
}
runc\utils_linux.go#
run
这个run是 create、start、run三个命令入口公用的,下面主要描述create流程。
根据spec里面process的配置信息调用newProcess创建process对象。如果存在listen fd,则把LISTEN_FDS、LISTEN_PID加入process的环境变量,并把这些fd加入需要在新进程中保持打开的文件列表(ExtraFiles)。调用setupIO来进行io和tty相关配置;对于create来说,这里就是dup当前进程的io,并chown为相应的用户/组。创建一个signalHandler来处理tty和signal。调用container.Start(process)来启动process进程。对于create来说,接下来处理一下pid-file、tty回收等便返回了。
func
(r *runner)
run
(config *specs.Process) (
int
,
error
) {
process
,
err
:=
newProcess
(*config)
if
err !=
nil
{
r.
destroy
()
return
-
1
, err
}
if
len
(r.listenFDs) >
0
{
process.
Env
=
append
(process.Env, fmt.
Sprintf
(
"LISTEN_FDS=%d"
,
len
(r.listenFDs)),
"LISTEN_PID=1"
)
process.
ExtraFiles
=
append
(process.ExtraFiles, r.listenFDs...)
}
rootuid
,
err
:= r.container.
Config
().
HostUID
()
if
err !=
nil
{
r.
destroy
()
return
-
1
, err
}
rootgid
,
err
:= r.container.
Config
().
HostGID
()
if
err !=
nil
{
r.
destroy
()
return
-
1
, err
}
tty
,
err
:=
setupIO
(process, rootuid, rootgid, r.console, config.Terminal, r.detach || r.create)
if
err !=
nil
{
r.
destroy
()
return
-
1
, err
}
handler
:=
newSignalHandler
(tty, r.enableSubreaper) //Max no here, if the command is not create. call container.Run, else call container.Start //reate、start、run三个命令入口公用此入口,在此区分不同的动作,在此我们是create命令,转到container.Start
startFn
:= r.container.Start
if
!r.create {
startFn
= r.container.Run
}
defer
tty.
Close
()
if
err
:=
startFn
(process); err !=
nil
{
r.
destroy
()
return
-
1
, err
}
if
err
:= tty.
ClosePostStart
(); err !=
nil
{
r.
terminate
(process)
r.
destroy
()
return
-
1
, err
}
if
r.pidFile !=
""
{
if
err
:=
createPidFile
(r.pidFile, process); err !=
nil
{
r.
terminate
(process)
r.
destroy
()
return
-
1
, err
}
}
if
r.detach || r.create {
return
0
,
nil
}
status
,
err
:= handler.
forward
(process)
if
err !=
nil
{
r.
terminate
(process)
}
r.
destroy
()
return
status, err
}
container_linux.go#Start
func
(c *linuxContainer)
Start
(process *Process)
error
{
c.m.
Lock
()
defer
c.m.
Unlock
()
status
,
err
:= c.
currentStatus
()
if
err !=
nil
{
return
err
}
return
c.
start
(process, status == Stopped)
}
封装函数,仅是获取了当前容器状态(目前未创建前是stopped),并调用了容器的
start(process, true)。
container_linux.go#start
func
(c *linuxContainer)
start
(process *Process, isInit
bool
)
error
{ //Max: note here isInit is true. // in the function newParentProcess. 建立了pipe
//parentPipe
,
childPipe
,
err
:=
newPipe
()
// 同时
// return
c.
newInitProcess
(p, cmd, parentPipe, childPipe, rootDir)
// 创建parent 进程
创建一个initProcess,里面既有init进程的信息,也有spec里面指定的process的信息。
创建一对pipe——parentPipe和childPipe,打开rootDir。创建一个command,命令为runc init自身(通过/proc/self/exe软链接实现);标准io为当前进程的;工作目录为Rootfs;用ExtraFiles在新进程中保持打开childPipe和rootDir,并添加对应的环境变量。调用newInitProcess进一步将parent process和command封装为initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid映射等配置信息用bootstrapData封装为一个io.Reader等。
newInitProcess是这样的
func
(c *linuxContainer)
newInitProcess
(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess,
error
) {
cmd.
Env
=
append
(cmd.Env,
"_LIBCONTAINER_INITTYPE="
+
string
(initStandard))
nsMaps
:=
make
(
map
[configs.NamespaceType]
string
)
for
_
,
ns
:=
range
c.config.Namespaces {
if
ns.Path !=
""
{
nsMaps[ns.Type] = ns.Path
}
}
_
,
sharePidns
:= nsMaps[configs.NEWPID]
data
,
err
:= c.
bootstrapData
(c.config.Namespaces.
CloneFlags
(), nsMaps,
""
)
if
err !=
nil
{
return
nil
, err
}
return
&initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.
newInitConfig
(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
},
nil
}
initProcess.start():后面会给出start的完整定义,这里先说明它做了哪些事情。
调用cmd.Start()异步启动init进程(等同于执行runc init)。将spec中process的ops设置为该initProcess。将前面创建的bootstrapData通过parentPipe写出去(init进程会从childPipe接收到这些数据,解析出写入的内容,进行namespace相关的配置)。调用execSetns():这个方法名看似是进行namespace的配置,实际上则是等待上面init进程的执行,并在parentPipe上等待、解析出从childPipe传回的pid(谁的pid?按源码注释,是C代码fork出的子进程的pid),找到该pid对应的进程,并将cmd.Process替换为该进程。为checkpoint做准备,保存cmd.Process进程的标准IO文件描述符。应用cgroup配置。创建容器中的network interface。将容器的配置文件内容spec从parentPipe发送给init进程。下面与init进程进行同步:一个for循环状态机,通过解析parentPipe传回的sync type来执行相应的操作。按正常的时间顺序,如下:
procReady:继续配置cgroup(Set与Apply的区别?)、oom、rlimits;如果配置中没有mount namespace(Why?),则执行prestart钩子;最后往parentPipe写入procRun状态。procHooks:执行prestart钩子,然后往parentPipe写入procResume状态(这个应该不是标准create的流程,resume?)。procError:读取错误信息(just error and exit),然后退出循环。循环结束后,对是否成功发送run和resume进行判断,并进行错误处理。最后关闭parentPipe,返回nil或err。
// back to start code
parent
,
err
:= c.
newParentProcess
(process, isInit)
if
err !=
nil
{
return
newSystemErrorWithCause
(err,
"creating new parent process"
)
} // 调用parent.start()异步启动parent进程
if
err
:= parent.
start
(); err !=
nil
{
// terminate the process to ensure that it properly is reaped.
if
err
:= parent.
terminate
(); err !=
nil
{
logrus.
Warn
(err)
}
return
newSystemErrorWithCause
(err,
"starting container process"
)
}
// generate a timestamp indicating when the container was started
c.
created
= time.
Now
().
UTC
()
c.
state
= &runningState{
c: c,
}
if
isInit { // 根据parent进程的状态更新容器的状态为Created。
c.
state
= &createdState{
c: c,
}
state
,
err
:= c.
updateState
(parent)
if
err !=
nil
{
return
err
}
c.
initProcessStartTime
= state.InitProcessStartTime
if
c.config.Hooks !=
nil
{
s
:= configs.HookState{ //Max
Version: c.config.Version,
ID: c.id,
Pid: parent.
pid
(),
Root: c.config.Rootfs,
BundlePath: utils.
SearchLabels
(c.config.Labels,
"bundle"
),
}
for
i
,
hook
:=
range
c.config.Hooks.Poststart {
if
err
:= hook.
Run
(s); err !=
nil
{
if
err
:= parent.
terminate
(); err !=
nil
{
logrus.
Warn
(err)
}
return
newSystemErrorWithCausef
(err,
"running poststart hook %d"
, i)
}
}
}
}
return
nil
}
首先调用newParentProcess来创建init的parent进程。调用parent.start()异步启动parent进程。根据parent进程的状态更新容器的状态为Created。遍历spec里面的Poststart hook,分别调用。 题外补充##newParentProcess是长这样滴
func
(c *linuxContainer)
newParentProcess
(p *Process, doInit
bool
) (parentProcess,
error
) {
parentPipe
,
childPipe
,
err
:=
newPipe
()
if
err !=
nil
{
return
nil
,
newSystemErrorWithCause
(err,
"creating new init pipe"
)
}
rootDir
,
err
:= os.
Open
(c.root)
if
err !=
nil
{
return
nil
, err
}
cmd
,
err
:= c.
commandTemplate
(p, childPipe, rootDir)
if
err !=
nil
{
return
nil
,
newSystemErrorWithCause
(err,
"creating new command template"
)
}
if
!doInit {
return
c.
newSetnsProcess
(p, cmd, parentPipe, childPipe, rootDir)
}
return
c.
newInitProcess
(p, cmd, parentPipe, childPipe, rootDir)
}
## start是长这样滴
func
(p *initProcess)
start
()
error
{
defer
p.parentPipe.
Close
() //Max:异步调用
err
:= p.cmd.
Start
() //将spec中process指定的ops指定为initProcess。
p.process.
ops
= p
p.childPipe.
Close
()
p.rootDir.
Close
()
if
err !=
nil
{
p.process.
ops
=
nil
return
newSystemErrorWithCause
(err,
"starting init process command"
)
} // 写管道 将前面创建bootstrapData从parentPipe传出去(init进程会从childPipe接收到这些数据,reverse出写入的内容,进行namespace相关的配置)
if
_
,
err
:= io.
Copy
(p.parentPipe, p.bootstrapData); err !=
nil
{
return
err
} // 调用execSetns(),这个方法名看似是进行namespace的配置,实际上则是等待上面init进程的执行,并在parentPipe等待并解析出从childPipe传回的pid(谁的pid),找到该pid对应的进程,并将cmd.Process对应的进程替换为该进程。
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
if
err
:= p.
execSetns
(); err !=
nil
{
return
newSystemErrorWithCause
(err,
"running exec setns process for init"
)
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds
,
err
:=
getPipeFds
(p.
pid
())
if
err !=
nil
{
return
newSystemErrorWithCausef
(err,
"getting pipe fds for pid %d"
, p.
pid
())
}
p.
setExternalDescriptors
(fds)
// Do this before syncing with child so that no children
// can escape the cgroup // Max
// applying cgroup configuration for process
if
err
:= p.manager.
Apply
(p.
pid
()); err !=
nil
{
return
newSystemErrorWithCause
(err,
"applying cgroup configuration for process"
)
}
defer
func
() {
if
err !=
nil
{
// TODO: should not be the responsibility to call here
p.manager.
Destroy
()
}
}() // Max 创建容器中的network interface。
if
err
:= p.
createNetworkInterfaces
(); err !=
nil
{
return
newSystemErrorWithCause
(err,
"creating nework interfaces"
)
} // 将容器的配置文件内容spec从parentPipe发送给init进程。
if
err
:= p.
sendConfig
(); err !=
nil
{
return
newSystemErrorWithCause
(err,
"sending config to init process"
)
}
var
(
procSync
syncT
sentRun
bool
sentResume
bool
ierr
*genericError
)
dec
:= json.
NewDecoder
(p.parentPipe)
loop: // Max 下面与init进程进行同步,一个for循环状态机,通过解析parentPipe传回的sync Type来执行相应的操作。按正常的时间顺序,如下:
procReady,继续配置cgroup(Set与Apply的区别?)、oom、rlimits;如果配置中没有mount namespace(Why?),则执行prestart钩子;往parentPipe写入procRun状态。procHooks,执行prestart钩子,往parentPipe写入procResume状态。(这个应该不是标准create的流程,resume?)procError,just error and exit
for
{
if
err
:= dec.
Decode
(&procSync); err !=
nil
{
if
err == io.EOF {
break
loop
}
return
newSystemErrorWithCause
(err,
"decoding sync type from init pipe"
)
}
switch
procSync.Type {
case
procReady:
if
err
:= p.manager.
Set
(p.config.Config); err !=
nil
{
return
newSystemErrorWithCause
(err,
"setting cgroup config for ready process"
)
}
// set oom_score_adj
if
err
:=
setOomScoreAdj
(p.config.Config.OomScoreAdj, p.
pid
()); err !=
nil
{
return
newSystemErrorWithCause
(err,
"setting oom score for ready process"
)
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if
err
:=
setupRlimits
(p.config.Rlimits, p.
pid
()); err !=
nil
{
return
newSystemErrorWithCause
(err,
"setting rlimits for ready process"
)
}
// call prestart hooks
if
!p.config.Config.Namespaces.
Contains
(configs.NEWNS) {
if
p.config.Config.Hooks !=
nil
{
s
:= configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.
pid
(),
Root: p.config.Config.Rootfs,
}
for
i
,
hook
:=
range
p.config.Config.Hooks.Prestart {
if
err
:= hook.
Run
(s); err !=
nil
{
return
newSystemErrorWithCausef
(err,
"running prestart hook %d"
, i)
}
}
}
}
// Sync with child.
if
err
:= utils.
WriteJSON
(p.parentPipe, syncT{procRun}); err !=
nil
{
return
newSystemErrorWithCause
(err,
"reading syncT run type"
)
}
sentRun
=
true
case
procHooks:
if
p.config.Config.Hooks !=
nil
{
s
:= configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.
pid
(),
Root: p.config.Config.Rootfs,
BundlePath: utils.
SearchLabels
(p.config.Config.Labels,
"bundle"
),
}
for
i
,
hook
:=
range
p.config.Config.Hooks.Prestart {
if
err
:= hook.
Run
(s); err !=
nil
{
return
newSystemErrorWithCausef
(err,
"running prestart hook %d"
, i)
}
}
}
// Sync with child.
if
err
:= utils.
WriteJSON
(p.parentPipe, syncT{procResume}); err !=
nil
{
return
newSystemErrorWithCause
(err,
"reading syncT resume type"
)
}
sentResume
=
true
case
procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if
err
:= dec.
Decode
(&ierr); err !=
nil
&& err != io.EOF {
return
newSystemErrorWithCause
(err,
"decoding proc error from init"
)
}
if
ierr !=
nil
{
break
loop
}
// Programmer error.
panic
(
"No error following JSON procError payload."
)
default
:
return
newSystemError
(fmt.
Errorf
(
"invalid JSON payload from child"
))
}
}
if
!sentRun {
return
newSystemErrorWithCause
(ierr,
"container init"
)
}
if
p.config.Config.Namespaces.
Contains
(configs.NEWNS) && !sentResume {
return
newSystemError
(fmt.
Errorf
(
"could not synchronise after executing prestart hooks with container process"
))
}
if
err
:= syscall.
Shutdown
(
int
(p.parentPipe.
Fd
()), syscall.SHUT_WR); err !=
nil
{
return
newSystemErrorWithCause
(err,
"shutting down init pipe"
)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if
ierr !=
nil
{
p.
wait
()
return
ierr
}
return
nil
}
到此parent端已经完事, 记住这里起了pipe。我们来看init进程
转载请注明原文地址: https://ju.6miu.com/read-2846.html