1// +build linux 2 3package libcontainer 4 5import ( 6 "bytes" 7 "encoding/json" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "net" 12 "os" 13 "strings" 14 "unsafe" 15 16 "github.com/containerd/console" 17 "github.com/opencontainers/runc/libcontainer/capabilities" 18 "github.com/opencontainers/runc/libcontainer/cgroups" 19 "github.com/opencontainers/runc/libcontainer/configs" 20 "github.com/opencontainers/runc/libcontainer/system" 21 "github.com/opencontainers/runc/libcontainer/user" 22 "github.com/opencontainers/runc/libcontainer/utils" 23 "github.com/opencontainers/runtime-spec/specs-go" 24 "github.com/pkg/errors" 25 "github.com/sirupsen/logrus" 26 "github.com/vishvananda/netlink" 27 "golang.org/x/sys/unix" 28) 29 30type initType string 31 32const ( 33 initSetns initType = "setns" 34 initStandard initType = "standard" 35) 36 37type pid struct { 38 Pid int `json:"pid"` 39 PidFirstChild int `json:"pid_first"` 40} 41 42// network is an internal struct used to setup container networks. 43type network struct { 44 configs.Network 45 46 // TempVethPeerName is a unique temporary veth peer name that was placed into 47 // the container's namespace. 48 TempVethPeerName string `json:"temp_veth_peer_name"` 49} 50 51// initConfig is used for transferring parameters from Exec() to Init() 52type initConfig struct { 53 Args []string `json:"args"` 54 Env []string `json:"env"` 55 Cwd string `json:"cwd"` 56 Capabilities *configs.Capabilities `json:"capabilities"` 57 ProcessLabel string `json:"process_label"` 58 AppArmorProfile string `json:"apparmor_profile"` 59 NoNewPrivileges bool `json:"no_new_privileges"` 60 User string `json:"user"` 61 AdditionalGroups []string `json:"additional_groups"` 62 Config *configs.Config `json:"config"` 63 Networks []*network `json:"network"` 64 PassedFilesCount int `json:"passed_files_count"` 65 ContainerId string `json:"containerid"` 66 Rlimits []configs.Rlimit `json:"rlimits"` 67 CreateConsole bool `json:"create_console"` 68 ConsoleWidth uint16 `json:"console_width"` 69 ConsoleHeight uint16 `json:"console_height"` 70 RootlessEUID bool `json:"rootless_euid,omitempty"` 71 RootlessCgroups bool `json:"rootless_cgroups,omitempty"` 72 SpecState *specs.State `json:"spec_state,omitempty"` 73} 74 75type initer interface { 76 Init() error 77} 78 79func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { 80 var config *initConfig 81 if err := json.NewDecoder(pipe).Decode(&config); err != nil { 82 return nil, err 83 } 84 if err := populateProcessEnvironment(config.Env); err != nil { 85 return nil, err 86 } 87 switch t { 88 case initSetns: 89 return &linuxSetnsInit{ 90 pipe: pipe, 91 consoleSocket: consoleSocket, 92 config: config, 93 }, nil 94 case initStandard: 95 return &linuxStandardInit{ 96 pipe: pipe, 97 consoleSocket: consoleSocket, 98 parentPid: unix.Getppid(), 99 config: config, 100 fifoFd: fifoFd, 101 }, nil 102 } 103 return nil, fmt.Errorf("unknown init type %q", t) 104} 105 106// populateProcessEnvironment loads the provided environment variables into the 107// current processes's environment. 108func populateProcessEnvironment(env []string) error { 109 for _, pair := range env { 110 p := strings.SplitN(pair, "=", 2) 111 if len(p) < 2 { 112 return fmt.Errorf("invalid environment '%v'", pair) 113 } 114 if err := os.Setenv(p[0], p[1]); err != nil { 115 return err 116 } 117 } 118 return nil 119} 120 121// finalizeNamespace drops the caps, sets the correct user 122// and working dir, and closes any leaked file descriptors 123// before executing the command inside the namespace 124func finalizeNamespace(config *initConfig) error { 125 // Ensure that all unwanted fds we may have accidentally 126 // inherited are marked close-on-exec so they stay out of the 127 // container 128 if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { 129 return errors.Wrap(err, "close exec fds") 130 } 131 132 caps := &configs.Capabilities{} 133 if config.Capabilities != nil { 134 caps = config.Capabilities 135 } else if config.Config.Capabilities != nil { 136 caps = config.Config.Capabilities 137 } 138 w, err := capabilities.New(caps) 139 if err != nil { 140 return err 141 } 142 // drop capabilities in bounding set before changing user 143 if err := w.ApplyBoundingSet(); err != nil { 144 return errors.Wrap(err, "apply bounding set") 145 } 146 // preserve existing capabilities while we change users 147 if err := system.SetKeepCaps(); err != nil { 148 return errors.Wrap(err, "set keep caps") 149 } 150 if err := setupUser(config); err != nil { 151 return errors.Wrap(err, "setup user") 152 } 153 // Change working directory AFTER the user has been set up. 154 // Otherwise, if the cwd is also a volume that's been chowned to the container user (and not the user running runc), 155 // this command will EPERM. 156 if config.Cwd != "" { 157 if err := unix.Chdir(config.Cwd); err != nil { 158 return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err) 159 } 160 } 161 if err := system.ClearKeepCaps(); err != nil { 162 return errors.Wrap(err, "clear keep caps") 163 } 164 if err := w.ApplyCaps(); err != nil { 165 return errors.Wrap(err, "apply caps") 166 } 167 return nil 168} 169 170// setupConsole sets up the console from inside the container, and sends the 171// master pty fd to the config.Pipe (using cmsg). This is done to ensure that 172// consoles are scoped to a container properly (see runc#814 and the many 173// issues related to that). This has to be run *after* we've pivoted to the new 174// rootfs (and the users' configuration is entirely set up). 175func setupConsole(socket *os.File, config *initConfig, mount bool) error { 176 defer socket.Close() 177 // At this point, /dev/ptmx points to something that we would expect. We 178 // used to change the owner of the slave path, but since the /dev/pts mount 179 // can have gid=X set (at the users' option). So touching the owner of the 180 // slave PTY is not necessary, as the kernel will handle that for us. Note 181 // however, that setupUser (specifically fixStdioPermissions) *will* change 182 // the UID owner of the console to be the user the process will run as (so 183 // they can actually control their console). 184 185 pty, slavePath, err := console.NewPty() 186 if err != nil { 187 return err 188 } 189 190 // After we return from here, we don't need the console anymore. 191 defer pty.Close() 192 193 if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { 194 err = pty.Resize(console.WinSize{ 195 Height: config.ConsoleHeight, 196 Width: config.ConsoleWidth, 197 }) 198 199 if err != nil { 200 return err 201 } 202 } 203 204 // Mount the console inside our rootfs. 205 if mount { 206 if err := mountConsole(slavePath); err != nil { 207 return err 208 } 209 } 210 // While we can access console.master, using the API is a good idea. 211 if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { 212 return err 213 } 214 // Now, dup over all the things. 215 return dupStdio(slavePath) 216} 217 218// syncParentReady sends to the given pipe a JSON payload which indicates that 219// the init is ready to Exec the child process. It then waits for the parent to 220// indicate that it is cleared to Exec. 221func syncParentReady(pipe io.ReadWriter) error { 222 // Tell parent. 223 if err := writeSync(pipe, procReady); err != nil { 224 return err 225 } 226 227 // Wait for parent to give the all-clear. 228 return readSync(pipe, procRun) 229} 230 231// syncParentHooks sends to the given pipe a JSON payload which indicates that 232// the parent should execute pre-start hooks. It then waits for the parent to 233// indicate that it is cleared to resume. 234func syncParentHooks(pipe io.ReadWriter) error { 235 // Tell parent. 236 if err := writeSync(pipe, procHooks); err != nil { 237 return err 238 } 239 240 // Wait for parent to give the all-clear. 241 return readSync(pipe, procResume) 242} 243 244// setupUser changes the groups, gid, and uid for the user inside the container 245func setupUser(config *initConfig) error { 246 // Set up defaults. 247 defaultExecUser := user.ExecUser{ 248 Uid: 0, 249 Gid: 0, 250 Home: "/", 251 } 252 253 passwdPath, err := user.GetPasswdPath() 254 if err != nil { 255 return err 256 } 257 258 groupPath, err := user.GetGroupPath() 259 if err != nil { 260 return err 261 } 262 263 execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) 264 if err != nil { 265 return err 266 } 267 268 var addGroups []int 269 if len(config.AdditionalGroups) > 0 { 270 addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) 271 if err != nil { 272 return err 273 } 274 } 275 276 // Rather than just erroring out later in setuid(2) and setgid(2), check 277 // that the user is mapped here. 278 if _, err := config.Config.HostUID(execUser.Uid); err != nil { 279 return errors.New("cannot set uid to unmapped user in user namespace") 280 } 281 if _, err := config.Config.HostGID(execUser.Gid); err != nil { 282 return errors.New("cannot set gid to unmapped user in user namespace") 283 } 284 285 if config.RootlessEUID { 286 // We cannot set any additional groups in a rootless container and thus 287 // we bail if the user asked us to do so. TODO: We currently can't do 288 // this check earlier, but if libcontainer.Process.User was typesafe 289 // this might work. 290 if len(addGroups) > 0 { 291 return errors.New("cannot set any additional groups in a rootless container") 292 } 293 } 294 295 // Before we change to the container's user make sure that the processes 296 // STDIO is correctly owned by the user that we are switching to. 297 if err := fixStdioPermissions(config, execUser); err != nil { 298 return err 299 } 300 301 setgroups, err := ioutil.ReadFile("/proc/self/setgroups") 302 if err != nil && !os.IsNotExist(err) { 303 return err 304 } 305 306 // This isn't allowed in an unprivileged user namespace since Linux 3.19. 307 // There's nothing we can do about /etc/group entries, so we silently 308 // ignore setting groups here (since the user didn't explicitly ask us to 309 // set the group). 310 allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" 311 312 if allowSupGroups { 313 suppGroups := append(execUser.Sgids, addGroups...) 314 if err := unix.Setgroups(suppGroups); err != nil { 315 return err 316 } 317 } 318 319 if err := system.Setgid(execUser.Gid); err != nil { 320 return err 321 } 322 if err := system.Setuid(execUser.Uid); err != nil { 323 return err 324 } 325 326 // if we didn't get HOME already, set it based on the user's HOME 327 if envHome := os.Getenv("HOME"); envHome == "" { 328 if err := os.Setenv("HOME", execUser.Home); err != nil { 329 return err 330 } 331 } 332 return nil 333} 334 335// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. 336// The ownership needs to match because it is created outside of the container and needs to be 337// localized. 338func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { 339 var null unix.Stat_t 340 if err := unix.Stat("/dev/null", &null); err != nil { 341 return err 342 } 343 for _, fd := range []uintptr{ 344 os.Stdin.Fd(), 345 os.Stderr.Fd(), 346 os.Stdout.Fd(), 347 } { 348 var s unix.Stat_t 349 if err := unix.Fstat(int(fd), &s); err != nil { 350 return err 351 } 352 353 // Skip chown of /dev/null if it was used as one of the STDIO fds. 354 if s.Rdev == null.Rdev { 355 continue 356 } 357 358 // We only change the uid owner (as it is possible for the mount to 359 // prefer a different gid, and there's no reason for us to change it). 360 // The reason why we don't just leave the default uid=X mount setup is 361 // that users expect to be able to actually use their console. Without 362 // this code, you couldn't effectively run as a non-root user inside a 363 // container and also have a console set up. 364 if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { 365 // If we've hit an EINVAL then s.Gid isn't mapped in the user 366 // namespace. If we've hit an EPERM then the inode's current owner 367 // is not mapped in our user namespace (in particular, 368 // privileged_wrt_inode_uidgid() has failed). In either case, we 369 // are in a configuration where it's better for us to just not 370 // touch the stdio rather than bail at this point. 371 if err == unix.EINVAL || err == unix.EPERM { 372 continue 373 } 374 return err 375 } 376 } 377 return nil 378} 379 380// setupNetwork sets up and initializes any network interface inside the container. 381func setupNetwork(config *initConfig) error { 382 for _, config := range config.Networks { 383 strategy, err := getStrategy(config.Type) 384 if err != nil { 385 return err 386 } 387 if err := strategy.initialize(config); err != nil { 388 return err 389 } 390 } 391 return nil 392} 393 394func setupRoute(config *configs.Config) error { 395 for _, config := range config.Routes { 396 _, dst, err := net.ParseCIDR(config.Destination) 397 if err != nil { 398 return err 399 } 400 src := net.ParseIP(config.Source) 401 if src == nil { 402 return fmt.Errorf("Invalid source for route: %s", config.Source) 403 } 404 gw := net.ParseIP(config.Gateway) 405 if gw == nil { 406 return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) 407 } 408 l, err := netlink.LinkByName(config.InterfaceName) 409 if err != nil { 410 return err 411 } 412 route := &netlink.Route{ 413 Scope: netlink.SCOPE_UNIVERSE, 414 Dst: dst, 415 Src: src, 416 Gw: gw, 417 LinkIndex: l.Attrs().Index, 418 } 419 if err := netlink.RouteAdd(route); err != nil { 420 return err 421 } 422 } 423 return nil 424} 425 426func setupRlimits(limits []configs.Rlimit, pid int) error { 427 for _, rlimit := range limits { 428 if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { 429 return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) 430 } 431 } 432 return nil 433} 434 435const _P_PID = 1 436 437//nolint:structcheck,unused 438type siginfo struct { 439 si_signo int32 440 si_errno int32 441 si_code int32 442 // below here is a union; si_pid is the only field we use 443 si_pid int32 444 // Pad to 128 bytes as detailed in blockUntilWaitable 445 pad [96]byte 446} 447 448// isWaitable returns true if the process has exited false otherwise. 449// Its based off blockUntilWaitable in src/os/wait_waitid.go 450func isWaitable(pid int) (bool, error) { 451 si := &siginfo{} 452 _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) 453 if e != 0 { 454 return false, os.NewSyscallError("waitid", e) 455 } 456 457 return si.si_pid != 0, nil 458} 459 460// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise 461func isNoChildren(err error) bool { 462 switch err := err.(type) { 463 case unix.Errno: 464 if err == unix.ECHILD { 465 return true 466 } 467 case *os.SyscallError: 468 if err.Err == unix.ECHILD { 469 return true 470 } 471 } 472 return false 473} 474 475// signalAllProcesses freezes then iterates over all the processes inside the 476// manager's cgroups sending the signal s to them. 477// If s is SIGKILL then it will wait for each process to exit. 478// For all other signals it will check if the process is ready to report its 479// exit status and only if it is will a wait be performed. 480func signalAllProcesses(m cgroups.Manager, s os.Signal) error { 481 var procs []*os.Process 482 if err := m.Freeze(configs.Frozen); err != nil { 483 logrus.Warn(err) 484 } 485 pids, err := m.GetAllPids() 486 if err != nil { 487 if err := m.Freeze(configs.Thawed); err != nil { 488 logrus.Warn(err) 489 } 490 return err 491 } 492 for _, pid := range pids { 493 p, err := os.FindProcess(pid) 494 if err != nil { 495 logrus.Warn(err) 496 continue 497 } 498 procs = append(procs, p) 499 if err := p.Signal(s); err != nil { 500 logrus.Warn(err) 501 } 502 } 503 if err := m.Freeze(configs.Thawed); err != nil { 504 logrus.Warn(err) 505 } 506 507 subreaper, err := system.GetSubreaper() 508 if err != nil { 509 // The error here means that PR_GET_CHILD_SUBREAPER is not 510 // supported because this code might run on a kernel older 511 // than 3.4. We don't want to throw an error in that case, 512 // and we simplify things, considering there is no subreaper 513 // set. 514 subreaper = 0 515 } 516 517 for _, p := range procs { 518 if s != unix.SIGKILL { 519 if ok, err := isWaitable(p.Pid); err != nil { 520 if !isNoChildren(err) { 521 logrus.Warn("signalAllProcesses: ", p.Pid, err) 522 } 523 continue 524 } else if !ok { 525 // Not ready to report so don't wait 526 continue 527 } 528 } 529 530 // In case a subreaper has been setup, this code must not 531 // wait for the process. Otherwise, we cannot be sure the 532 // current process will be reaped by the subreaper, while 533 // the subreaper might be waiting for this process in order 534 // to retrieve its exit code. 535 if subreaper == 0 { 536 if _, err := p.Wait(); err != nil { 537 if !isNoChildren(err) { 538 logrus.Warn("wait: ", err) 539 } 540 } 541 } 542 } 543 return nil 544} 545