晚上差点没赶上班车。明天要做分享,继续准备。
## main.go -> initCommand
## main_unix.go
var initCommand = cli.Command{ Name: "init" , Usage: `initialize the namespaces and launch the process (do not call it outside of runc)` , Action: func (context *cli.Context) error { factory , _ := libcontainer. New ( "" ) if err := factory. StartInitialization (); err != nil { // as the error is sent back to the parent there is no need to log // or write it to stderr because the parent process will handle this os. Exit ( 1 ) } panic ( "libcontainer: container init failed to exec" ) }, } ##factory_linux.go 从环境变量中解析出childPipe、rootDir的fd以及initType(默认为standard,有时间看一下还有其他什么特别的初始化方式),并清除当前进程的所有环境变量。设置一个trap以及panic recover,如果初始化容器失败,会往childPipe中写入procError。调用newContainerInit创建一个init对象(两种类型,standard or setns,下面以standard为例),首先从childPipe中获取config配置文件,从配置文件中读取环境变量并设置到当前进程。构造一个linuxStandardInit对象,主要包括pipe、parentPid、config和rootDir等字段。调用linuxStandardInit对象的Init方法进行初始化。 // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization () (err error ) { var pipefd , rootfd int for _ , pair := range [] struct { k string v * int }{ { "_LIBCONTAINER_INITPIPE" , &pipefd}, { "_LIBCONTAINER_STATEDIR" , &rootfd}, } { s := os. Getenv (pair.k) i , err := strconv. Atoi (s) if err != nil { return fmt. Errorf ( "unable to convert %s=%s to int" , pair.k, s) } *pair. v = i } var ( pipe = os. NewFile ( uintptr (pipefd), "pipe" ) it = initType (os. Getenv ( "_LIBCONTAINER_INITTYPE" )) ) // clear the current process's environment to clean any libcontainer // specific env vars. os. Clearenv () var i initer defer func () { // We have an error during the initialization of the container's init, // send it back to the parent process in the form of an initError. // If container's init successed, syscall.Exec will not return, hence // this defer function will never be called. 
if _ , ok := i.(*linuxStandardInit); ok { // Synchronisation only necessary for standard init. if werr := utils. WriteJSON (pipe, syncT{procError}); werr != nil { panic (err) } } if werr := utils. WriteJSON (pipe, newSystemError (err)); werr != nil { panic (err) } // ensure that this pipe is always closed pipe. Close () }() defer func () { if e := recover (); e != nil { err = fmt. Errorf ( "panic from initialization: %v, %v" , e, string (debug. Stack ())) } }() i , err = newContainerInit (it, pipe, rootfd) if err != nil { return err } return i. Init () } ##init_linux.go func newContainerInit (t initType, pipe *os.File, stateDirFD int ) (initer, error ) { var config *initConfig if err := json. NewDecoder (pipe). Decode (&config); err != nil { return nil , err } if err := populateProcessEnvironment (config.Env); err != nil { return nil , err } switch t { case initSetns: return &linuxSetnsInit{ config: config, }, nil case initStandard: return &linuxStandardInit{ pipe: pipe, parentPid: syscall. Getppid (), config: config, stateDirFD: stateDirFD, }, nil } return nil , fmt. Errorf ( "unknown init type %q" , t) }##standard_init_linux.go
func (l *linuxStandardInit) Init () error { // Max首先是针对Session keyring的一些配置,不是很清楚这里的Session是什么? if !l.config.Config.NoNewKeyring { ringname , keepperms , newperms := l. getSessionRingParams () // do not inherit the parent's session keyring sessKeyId , err := keys. JoinSessionKeyring (ringname) if err != nil { return err } // make session keyring searcheable if err := keys. ModKeyringPerm (sessKeyId, keepperms, newperms); err != nil { return err } } // Max 配置console和tty。如果配置文件中指定有Console字段,则从该字段中获取tty的slave路径创建一个linuxConsole对象,调用其 dupStdio打开slave设备,将其fd复制(dup3)到当前进程的标准IO。如果console对象创建好以后,便调用ioctl的TIOCSCTTY分配控制终端 var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath (l.config.Console) if err := console. dupStdio (); err != nil { return err } } if console != nil { if err := system. Setctty (); err != nil { return err } } // Max 调用setupNetwork配置容器的网络。奇怪网络不是在前面配置过了吗,还是调用同样的函数 if err := setupNetwork (l.config); err != nil { return err } // Max 调用setupRoute配置容器的静态路由信息。 if err := setupRoute (l.config.Config); err != nil { return err } // Max selinux,调用label.Init()检查selinux是否被启动以及是否检查过,并将结果存入全局变量。此处的label并非是用户label,而是selinux相关的processLabel。 label. Init () // InitializeMountNamespace() can be executed only for a new mount namespace // Max 如果设置了mount namespace,则调用setupRootfs在新的mount namespace中配置设备、挂载点以及文件系统。 if l.config.Config.Namespaces. Contains (configs.NEWNS) { if err := setupRootfs (l.config.Config, console, l.pipe); err != nil { return err } } // Max 根据需要配置hostname、apparmor、processLabel、sysctl、readonlyPath、maskPath。这些都是一些feature,对容器启动本身没有太多影响。 if hostname := l.config.Config.Hostname; hostname != "" { if err := syscall. Sethostname ([] byte (hostname)); err != nil { return err } } if err := apparmor. ApplyProfile (l.config.AppArmorProfile); err != nil { return err } if err := label. 
SetProcessLabel (l.config.ProcessLabel); err != nil { return err } for key , value := range l.config.Config.Sysctl { if err := writeSystemProperty (key, value); err != nil { return err } } for _ , path := range l.config.Config.ReadonlyPaths { if err := remountReadonly (path); err != nil { return err } } for _ , path := range l.config.Config.MaskPaths { if err := maskPath (path); err != nil { return err } } pdeath , err := system. GetParentDeathSignal () if err != nil { return err } // Max 获取父进程的退出信号量。 if l.config.NoNewPrivileges { if err := system. Prctl (PR_SET_NO_NEW_PRIVS, 1 , 0 , 0 , 0 ); err != nil { return err } } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. // Max 通过管道与父进程进行同步,先发出procReady再等待procRun。 if err := syncParentReady (l.pipe); err != nil { return err } // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. // Max 初始化seccomp。 if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { if err := seccomp. InitSeccomp (l.config.Config.Seccomp); err != nil { return err } } // Max 调用finalizeNamespace根据config配置将需要的特权capabilities加入白名单,设置user namespace,关闭不需要的文件描述符。 if err := finalizeNamespace (l.config); err != nil { return err } // finalizeNamespace can change user/group which clears the parent death // signal, so we restore it here. // 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,自杀 if err := pdeath. Restore (); err != nil { return err } // compare the parent from the inital start of the init process and make sure that it did not change. // if the parent changes that means it died and we were reparented to something else so we should // just kill ourself and not cause problems for someone else. 
// Max 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,kill ourself。。。 if syscall. Getppid () != l.parentPid { return syscall. Kill (syscall. Getpid (), syscall.SIGKILL) } // check for the arg before waiting to make sure it exists and it is returned // as a create time error. name , err := exec. LookPath (l.config.Args[ 0 ]) if err != nil { return err } // close the pipe to signal that we have completed our init. // Max 与父进程之间的同步已经完成,关闭pipe。 l.pipe. Close () // wait for the fifo to be opened on the other side before // exec'ing the users process. // Max 尝试以只写方式打开fifo管道,并往管道中写入“0” 。该操作会一直保持阻塞,直到管道的另一端以读方式打开,并读取内容。至此,create操作流程已经结束 fd , err := syscall. Openat (l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0 ) if err != nil { return newSystemErrorWithCause (err, "openat exec fifo" ) } if _ , err := syscall. Write (fd, [] byte ( "0" )); err != nil { return newSystemErrorWithCause (err, "write 0 exec fifo" ) } // Max 下面实际上是start的时候才会触发的操作了,阻塞清除后,根据config配置初始化seccomp,并调用syscall.Exec执行config里面指定的命令。 if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { if err := seccomp. InitSeccomp (l.config.Config.Seccomp); err != nil { return newSystemErrorWithCause (err, "init seccomp" ) } } if err := syscall. Exec (name, l.config.Args[ 0 :], os. Environ ()); err != nil { return newSystemErrorWithCause (err, "exec user process" ) } return nil }