1// +build linux 2 3package libcontainer 4 5import ( 6 "bytes" 7 "encoding/json" 8 "errors" 9 "fmt" 10 "io" 11 "io/ioutil" 12 "net" 13 "os" 14 "os/exec" 15 "path/filepath" 16 "reflect" 17 "strconv" 18 "strings" 19 "sync" 20 "time" 21 22 securejoin "github.com/cyphar/filepath-securejoin" 23 "github.com/opencontainers/runc/libcontainer/cgroups" 24 "github.com/opencontainers/runc/libcontainer/configs" 25 "github.com/opencontainers/runc/libcontainer/intelrdt" 26 "github.com/opencontainers/runc/libcontainer/system" 27 "github.com/opencontainers/runc/libcontainer/utils" 28 "github.com/opencontainers/runtime-spec/specs-go" 29 30 "github.com/checkpoint-restore/go-criu/v5" 31 criurpc "github.com/checkpoint-restore/go-criu/v5/rpc" 32 errorsf "github.com/pkg/errors" 33 "github.com/sirupsen/logrus" 34 "github.com/vishvananda/netlink/nl" 35 "golang.org/x/sys/unix" 36 "google.golang.org/protobuf/proto" 37) 38 39const stdioFdCount = 3 40 41type linuxContainer struct { 42 id string 43 root string 44 config *configs.Config 45 cgroupManager cgroups.Manager 46 intelRdtManager intelrdt.Manager 47 initPath string 48 initArgs []string 49 initProcess parentProcess 50 initProcessStartTime uint64 51 criuPath string 52 newuidmapPath string 53 newgidmapPath string 54 m sync.Mutex 55 criuVersion int 56 state containerState 57 created time.Time 58 fifo *os.File 59} 60 61// State represents a running container's state 62type State struct { 63 BaseState 64 65 // Platform specific fields below here 66 67 // Specified if the container was started under the rootless mode. 68 // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups 69 Rootless bool `json:"rootless"` 70 71 // Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths 72 // 73 // For cgroup v1, a key is cgroup subsystem name, and the value is the path 74 // to the cgroup for this subsystem. 75 // 76 // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. 77 CgroupPaths map[string]string `json:"cgroup_paths"` 78 79 // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type 80 // with the value as the path. 81 NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` 82 83 // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore 84 ExternalDescriptors []string `json:"external_descriptors,omitempty"` 85 86 // Intel RDT "resource control" filesystem path 87 IntelRdtPath string `json:"intel_rdt_path"` 88} 89 90// Container is a libcontainer container object. 91// 92// Each container is thread-safe within the same process. Since a container can 93// be destroyed by a separate process, any function may return that the container 94// was not found. 95type Container interface { 96 BaseContainer 97 98 // Methods below here are platform specific 99 100 // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. 101 // 102 // errors: 103 // Systemerror - System error. 104 Checkpoint(criuOpts *CriuOpts) error 105 106 // Restore restores the checkpointed container to a running state using the criu(8) utility. 107 // 108 // errors: 109 // Systemerror - System error. 110 Restore(process *Process, criuOpts *CriuOpts) error 111 112 // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses 113 // the execution of any user processes. Asynchronously, when the container finished being paused the 114 // state is changed to PAUSED. 115 // If the Container state is PAUSED, do nothing. 116 // 117 // errors: 118 // ContainerNotExists - Container no longer exists, 119 // ContainerNotRunning - Container not running or created, 120 // Systemerror - System error. 121 Pause() error 122 123 // If the Container state is PAUSED, resumes the execution of any user processes in the 124 // Container before setting the Container state to RUNNING. 125 // If the Container state is RUNNING, do nothing. 126 // 127 // errors: 128 // ContainerNotExists - Container no longer exists, 129 // ContainerNotPaused - Container is not paused, 130 // Systemerror - System error. 131 Resume() error 132 133 // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. 134 // 135 // errors: 136 // Systemerror - System error. 137 NotifyOOM() (<-chan struct{}, error) 138 139 // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level 140 // 141 // errors: 142 // Systemerror - System error. 143 NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) 144} 145 146// ID returns the container's unique ID 147func (c *linuxContainer) ID() string { 148 return c.id 149} 150 151// Config returns the container's configuration 152func (c *linuxContainer) Config() configs.Config { 153 return *c.config 154} 155 156func (c *linuxContainer) Status() (Status, error) { 157 c.m.Lock() 158 defer c.m.Unlock() 159 return c.currentStatus() 160} 161 162func (c *linuxContainer) State() (*State, error) { 163 c.m.Lock() 164 defer c.m.Unlock() 165 return c.currentState() 166} 167 168func (c *linuxContainer) OCIState() (*specs.State, error) { 169 c.m.Lock() 170 defer c.m.Unlock() 171 return c.currentOCIState() 172} 173 174func (c *linuxContainer) Processes() ([]int, error) { 175 var pids []int 176 status, err := c.currentStatus() 177 if err != nil { 178 return pids, err 179 } 180 // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited 181 if status == Stopped && !c.cgroupManager.Exists() { 182 return pids, nil 183 } 184 185 pids, err = c.cgroupManager.GetAllPids() 186 if err != nil { 187 return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") 188 } 189 return pids, nil 190} 191 192func (c *linuxContainer) Stats() (*Stats, error) { 193 var ( 194 err error 195 stats = &Stats{} 196 ) 197 if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { 198 return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") 199 } 200 if c.intelRdtManager != nil { 201 if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { 202 return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") 203 } 204 } 205 for _, iface := range c.config.Networks { 206 switch iface.Type { 207 case "veth": 208 istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) 209 if err != nil { 210 return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) 211 } 212 stats.Interfaces = append(stats.Interfaces, istats) 213 } 214 } 215 return stats, nil 216} 217 218func (c *linuxContainer) Set(config configs.Config) error { 219 c.m.Lock() 220 defer c.m.Unlock() 221 status, err := c.currentStatus() 222 if err != nil { 223 return err 224 } 225 if status == Stopped { 226 return newGenericError(errors.New("container not running"), ContainerNotRunning) 227 } 228 if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil { 229 // Set configs back 230 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { 231 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 232 } 233 return err 234 } 235 if c.intelRdtManager != nil { 236 if err := c.intelRdtManager.Set(&config); err != nil { 237 // Set configs back 238 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { 239 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 240 } 241 if err2 := c.intelRdtManager.Set(c.config); err2 != nil { 242 logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 243 } 244 return err 245 } 246 } 247 // After config setting succeed, update config and states 248 c.config = &config 249 _, err = c.updateState(nil) 250 return err 251} 252 253func (c *linuxContainer) Start(process *Process) error { 254 c.m.Lock() 255 defer c.m.Unlock() 256 if c.config.Cgroups.Resources.SkipDevices { 257 return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid) 258 } 259 if process.Init { 260 if err := c.createExecFifo(); err != nil { 261 return err 262 } 263 } 264 if err := c.start(process); err != nil { 265 if process.Init { 266 c.deleteExecFifo() 267 } 268 return err 269 } 270 return nil 271} 272 273func (c *linuxContainer) Run(process *Process) error { 274 if err := c.Start(process); err != nil { 275 return err 276 } 277 if process.Init { 278 return c.exec() 279 } 280 return nil 281} 282 283func (c *linuxContainer) Exec() error { 284 c.m.Lock() 285 defer c.m.Unlock() 286 return c.exec() 287} 288 289func (c *linuxContainer) exec() error { 290 path := filepath.Join(c.root, execFifoFilename) 291 pid := c.initProcess.pid() 292 blockingFifoOpenCh := awaitFifoOpen(path) 293 for { 294 select { 295 case result := <-blockingFifoOpenCh: 296 return handleFifoResult(result) 297 298 case <-time.After(time.Millisecond * 100): 299 stat, err := system.Stat(pid) 300 if err != nil || stat.State == system.Zombie { 301 // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check. 302 // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete). 303 if err := handleFifoResult(fifoOpen(path, false)); err != nil { 304 return errors.New("container process is already dead") 305 } 306 return nil 307 } 308 } 309 } 310} 311 312func readFromExecFifo(execFifo io.Reader) error { 313 data, err := ioutil.ReadAll(execFifo) 314 if err != nil { 315 return err 316 } 317 if len(data) <= 0 { 318 return errors.New("cannot start an already running container") 319 } 320 return nil 321} 322 323func awaitFifoOpen(path string) <-chan openResult { 324 fifoOpened := make(chan openResult) 325 go func() { 326 result := fifoOpen(path, true) 327 fifoOpened <- result 328 }() 329 return fifoOpened 330} 331 332func fifoOpen(path string, block bool) openResult { 333 flags := os.O_RDONLY 334 if !block { 335 flags |= unix.O_NONBLOCK 336 } 337 f, err := os.OpenFile(path, flags, 0) 338 if err != nil { 339 return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} 340 } 341 return openResult{file: f} 342} 343 344func handleFifoResult(result openResult) error { 345 if result.err != nil { 346 return result.err 347 } 348 f := result.file 349 defer f.Close() 350 if err := readFromExecFifo(f); err != nil { 351 return err 352 } 353 return os.Remove(f.Name()) 354} 355 356type openResult struct { 357 file *os.File 358 err error 359} 360 361func (c *linuxContainer) start(process *Process) (retErr error) { 362 parent, err := c.newParentProcess(process) 363 if err != nil { 364 return newSystemErrorWithCause(err, "creating new parent process") 365 } 366 367 logsDone := parent.forwardChildLogs() 368 if logsDone != nil { 369 defer func() { 370 // Wait for log forwarder to finish. This depends on 371 // runc init closing the _LIBCONTAINER_LOGPIPE log fd. 372 err := <-logsDone 373 if err != nil && retErr == nil { 374 retErr = newSystemErrorWithCause(err, "forwarding init logs") 375 } 376 }() 377 } 378 379 if err := parent.start(); err != nil { 380 return newSystemErrorWithCause(err, "starting container process") 381 } 382 383 if process.Init { 384 c.fifo.Close() 385 if c.config.Hooks != nil { 386 s, err := c.currentOCIState() 387 if err != nil { 388 return err 389 } 390 391 if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil { 392 if err := ignoreTerminateErrors(parent.terminate()); err != nil { 393 logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook")) 394 } 395 return err 396 } 397 } 398 } 399 return nil 400} 401 402func (c *linuxContainer) Signal(s os.Signal, all bool) error { 403 c.m.Lock() 404 defer c.m.Unlock() 405 status, err := c.currentStatus() 406 if err != nil { 407 return err 408 } 409 if all { 410 // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited 411 if status == Stopped && !c.cgroupManager.Exists() { 412 return nil 413 } 414 return signalAllProcesses(c.cgroupManager, s) 415 } 416 // to avoid a PID reuse attack 417 if status == Running || status == Created || status == Paused { 418 if err := c.initProcess.signal(s); err != nil { 419 return newSystemErrorWithCause(err, "signaling init process") 420 } 421 return nil 422 } 423 return newGenericError(errors.New("container not running"), ContainerNotRunning) 424} 425 426func (c *linuxContainer) createExecFifo() error { 427 rootuid, err := c.Config().HostRootUID() 428 if err != nil { 429 return err 430 } 431 rootgid, err := c.Config().HostRootGID() 432 if err != nil { 433 return err 434 } 435 436 fifoName := filepath.Join(c.root, execFifoFilename) 437 if _, err := os.Stat(fifoName); err == nil { 438 return fmt.Errorf("exec fifo %s already exists", fifoName) 439 } 440 oldMask := unix.Umask(0o000) 441 if err := unix.Mkfifo(fifoName, 0o622); err != nil { 442 unix.Umask(oldMask) 443 return err 444 } 445 unix.Umask(oldMask) 446 return os.Chown(fifoName, rootuid, rootgid) 447} 448 449func (c *linuxContainer) deleteExecFifo() { 450 fifoName := filepath.Join(c.root, execFifoFilename) 451 os.Remove(fifoName) 452} 453 454// includeExecFifo opens the container's execfifo as a pathfd, so that the 455// container cannot access the statedir (and the FIFO itself remains 456// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited 457// fd, with _LIBCONTAINER_FIFOFD set to its fd number. 458func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { 459 fifoName := filepath.Join(c.root, execFifoFilename) 460 fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) 461 if err != nil { 462 return err 463 } 464 c.fifo = fifo 465 466 cmd.ExtraFiles = append(cmd.ExtraFiles, fifo) 467 cmd.Env = append(cmd.Env, 468 "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) 469 return nil 470} 471 472func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { 473 parentInitPipe, childInitPipe, err := utils.NewSockPair("init") 474 if err != nil { 475 return nil, newSystemErrorWithCause(err, "creating new init pipe") 476 } 477 messageSockPair := filePair{parentInitPipe, childInitPipe} 478 479 parentLogPipe, childLogPipe, err := os.Pipe() 480 if err != nil { 481 return nil, fmt.Errorf("Unable to create the log pipe: %s", err) 482 } 483 logFilePair := filePair{parentLogPipe, childLogPipe} 484 485 cmd := c.commandTemplate(p, childInitPipe, childLogPipe) 486 if !p.Init { 487 return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) 488 } 489 490 // We only set up fifoFd if we're not doing a `runc exec`. The historic 491 // reason for this is that previously we would pass a dirfd that allowed 492 // for container rootfs escape (and not doing it in `runc exec` avoided 493 // that problem), but we no longer do that. However, there's no need to do 494 // this for `runc exec` so we just keep it this way to be safe. 495 if err := c.includeExecFifo(cmd); err != nil { 496 return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") 497 } 498 return c.newInitProcess(p, cmd, messageSockPair, logFilePair) 499} 500 501func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { 502 cmd := exec.Command(c.initPath, c.initArgs[1:]...) 503 cmd.Args[0] = c.initArgs[0] 504 cmd.Stdin = p.Stdin 505 cmd.Stdout = p.Stdout 506 cmd.Stderr = p.Stderr 507 cmd.Dir = c.config.Rootfs 508 if cmd.SysProcAttr == nil { 509 cmd.SysProcAttr = &unix.SysProcAttr{} 510 } 511 cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS")) 512 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) 513 if p.ConsoleSocket != nil { 514 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) 515 cmd.Env = append(cmd.Env, 516 "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 517 ) 518 } 519 cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) 520 cmd.Env = append(cmd.Env, 521 "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 522 "_LIBCONTAINER_STATEDIR="+c.root, 523 ) 524 525 cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) 526 cmd.Env = append(cmd.Env, 527 "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 528 "_LIBCONTAINER_LOGLEVEL="+p.LogLevel, 529 ) 530 531 // NOTE: when running a container with no PID namespace and the parent process spawning the container is 532 // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason 533 // even with the parent still running. 534 if c.config.ParentDeathSignal > 0 { 535 cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) 536 } 537 return cmd 538} 539 540func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { 541 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) 542 nsMaps := make(map[configs.NamespaceType]string) 543 for _, ns := range c.config.Namespaces { 544 if ns.Path != "" { 545 nsMaps[ns.Type] = ns.Path 546 } 547 } 548 _, sharePidns := nsMaps[configs.NEWPID] 549 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) 550 if err != nil { 551 return nil, err 552 } 553 init := &initProcess{ 554 cmd: cmd, 555 messageSockPair: messageSockPair, 556 logFilePair: logFilePair, 557 manager: c.cgroupManager, 558 intelRdtManager: c.intelRdtManager, 559 config: c.newInitConfig(p), 560 container: c, 561 process: p, 562 bootstrapData: data, 563 sharePidns: sharePidns, 564 } 565 c.initProcess = init 566 return init, nil 567} 568 569func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { 570 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) 571 state, err := c.currentState() 572 if err != nil { 573 return nil, newSystemErrorWithCause(err, "getting container's current state") 574 } 575 // for setns process, we don't have to set cloneflags as the process namespaces 576 // will only be set via setns syscall 577 data, err := c.bootstrapData(0, state.NamespacePaths) 578 if err != nil { 579 return nil, err 580 } 581 return &setnsProcess{ 582 cmd: cmd, 583 cgroupPaths: state.CgroupPaths, 584 rootlessCgroups: c.config.RootlessCgroups, 585 intelRdtPath: state.IntelRdtPath, 586 messageSockPair: messageSockPair, 587 logFilePair: logFilePair, 588 manager: c.cgroupManager, 589 config: c.newInitConfig(p), 590 process: p, 591 bootstrapData: data, 592 initProcessPid: state.InitProcessPid, 593 }, nil 594} 595 596func (c *linuxContainer) newInitConfig(process *Process) *initConfig { 597 cfg := &initConfig{ 598 Config: c.config, 599 Args: process.Args, 600 Env: process.Env, 601 User: process.User, 602 AdditionalGroups: process.AdditionalGroups, 603 Cwd: process.Cwd, 604 Capabilities: process.Capabilities, 605 PassedFilesCount: len(process.ExtraFiles), 606 ContainerId: c.ID(), 607 NoNewPrivileges: c.config.NoNewPrivileges, 608 RootlessEUID: c.config.RootlessEUID, 609 RootlessCgroups: c.config.RootlessCgroups, 610 AppArmorProfile: c.config.AppArmorProfile, 611 ProcessLabel: c.config.ProcessLabel, 612 Rlimits: c.config.Rlimits, 613 CreateConsole: process.ConsoleSocket != nil, 614 ConsoleWidth: process.ConsoleWidth, 615 ConsoleHeight: process.ConsoleHeight, 616 } 617 if process.NoNewPrivileges != nil { 618 cfg.NoNewPrivileges = *process.NoNewPrivileges 619 } 620 if process.AppArmorProfile != "" { 621 cfg.AppArmorProfile = process.AppArmorProfile 622 } 623 if process.Label != "" { 624 cfg.ProcessLabel = process.Label 625 } 626 if len(process.Rlimits) > 0 { 627 cfg.Rlimits = process.Rlimits 628 } 629 if cgroups.IsCgroup2UnifiedMode() { 630 cfg.Cgroup2Path = c.cgroupManager.Path("") 631 } 632 633 return cfg 634} 635 636func (c *linuxContainer) Destroy() error { 637 c.m.Lock() 638 defer c.m.Unlock() 639 return c.state.destroy() 640} 641 642func (c *linuxContainer) Pause() error { 643 c.m.Lock() 644 defer c.m.Unlock() 645 status, err := c.currentStatus() 646 if err != nil { 647 return err 648 } 649 switch status { 650 case Running, Created: 651 if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { 652 return err 653 } 654 return c.state.transition(&pausedState{ 655 c: c, 656 }) 657 } 658 return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) 659} 660 661func (c *linuxContainer) Resume() error { 662 c.m.Lock() 663 defer c.m.Unlock() 664 status, err := c.currentStatus() 665 if err != nil { 666 return err 667 } 668 if status != Paused { 669 return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) 670 } 671 if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { 672 return err 673 } 674 return c.state.transition(&runningState{ 675 c: c, 676 }) 677} 678 679func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { 680 // XXX(cyphar): This requires cgroups. 681 if c.config.RootlessCgroups { 682 logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") 683 } 684 path := c.cgroupManager.Path("memory") 685 if cgroups.IsCgroup2UnifiedMode() { 686 return notifyOnOOMV2(path) 687 } 688 return notifyOnOOM(path) 689} 690 691func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { 692 // XXX(cyphar): This requires cgroups. 693 if c.config.RootlessCgroups { 694 logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") 695 } 696 return notifyMemoryPressure(c.cgroupManager.Path("memory"), level) 697} 698 699var criuFeatures *criurpc.CriuFeatures 700 701func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { 702 t := criurpc.CriuReqType_FEATURE_CHECK 703 704 // make sure the features we are looking for are really not from 705 // some previous check 706 criuFeatures = nil 707 708 req := &criurpc.CriuReq{ 709 Type: &t, 710 // Theoretically this should not be necessary but CRIU 711 // segfaults if Opts is empty. 712 // Fixed in CRIU 2.12 713 Opts: rpcOpts, 714 Features: criuFeat, 715 } 716 717 err := c.criuSwrk(nil, req, criuOpts, nil) 718 if err != nil { 719 logrus.Debugf("%s", err) 720 return errors.New("CRIU feature check failed") 721 } 722 723 missingFeatures := false 724 725 // The outer if checks if the fields actually exist 726 if (criuFeat.MemTrack != nil) && 727 (criuFeatures.MemTrack != nil) { 728 // The inner if checks if they are set to true 729 if *criuFeat.MemTrack && !*criuFeatures.MemTrack { 730 missingFeatures = true 731 logrus.Debugf("CRIU does not support MemTrack") 732 } 733 } 734 735 // This needs to be repeated for every new feature check. 736 // Is there a way to put this in a function. Reflection? 737 if (criuFeat.LazyPages != nil) && 738 (criuFeatures.LazyPages != nil) { 739 if *criuFeat.LazyPages && !*criuFeatures.LazyPages { 740 missingFeatures = true 741 logrus.Debugf("CRIU does not support LazyPages") 742 } 743 } 744 745 if missingFeatures { 746 return errors.New("CRIU is missing features") 747 } 748 749 return nil 750} 751 752func compareCriuVersion(criuVersion int, minVersion int) error { 753 // simple function to perform the actual version compare 754 if criuVersion < minVersion { 755 return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion) 756 } 757 758 return nil 759} 760 761// checkCriuVersion checks Criu version greater than or equal to minVersion 762func (c *linuxContainer) checkCriuVersion(minVersion int) error { 763 // If the version of criu has already been determined there is no need 764 // to ask criu for the version again. Use the value from c.criuVersion. 765 if c.criuVersion != 0 { 766 return compareCriuVersion(c.criuVersion, minVersion) 767 } 768 769 criu := criu.MakeCriu() 770 criu.SetCriuPath(c.criuPath) 771 var err error 772 c.criuVersion, err = criu.GetCriuVersion() 773 if err != nil { 774 return fmt.Errorf("CRIU version check failed: %s", err) 775 } 776 777 return compareCriuVersion(c.criuVersion, minVersion) 778} 779 780const descriptorsFilename = "descriptors.json" 781 782func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { 783 mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) 784 extMnt := &criurpc.ExtMountMap{ 785 Key: proto.String(mountDest), 786 Val: proto.String(mountDest), 787 } 788 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 789} 790 791func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { 792 for _, path := range c.config.MaskPaths { 793 fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) 794 if err != nil { 795 if os.IsNotExist(err) { 796 continue 797 } 798 return err 799 } 800 if fi.IsDir() { 801 continue 802 } 803 804 extMnt := &criurpc.ExtMountMap{ 805 Key: proto.String(path), 806 Val: proto.String("/dev/null"), 807 } 808 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 809 } 810 return nil 811} 812 813func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { 814 // CRIU will evaluate a configuration starting with release 3.11. 815 // Settings in the configuration file will overwrite RPC settings. 816 // Look for annotations. The annotation 'org.criu.config' 817 // specifies if CRIU should use a different, container specific 818 // configuration file. 819 _, annotations := utils.Annotations(c.config.Labels) 820 configFile, exists := annotations["org.criu.config"] 821 if exists { 822 // If the annotation 'org.criu.config' exists and is set 823 // to a non-empty string, tell CRIU to use that as a 824 // configuration file. If the file does not exist, CRIU 825 // will just ignore it. 826 if configFile != "" { 827 rpcOpts.ConfigFile = proto.String(configFile) 828 } 829 // If 'org.criu.config' exists and is set to an empty 830 // string, a runc specific CRIU configuration file will 831 // be not set at all. 832 } else { 833 // If the mentioned annotation has not been found, specify 834 // a default CRIU configuration file. 835 rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") 836 } 837} 838 839func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool { 840 var minVersion int 841 switch t { 842 case configs.NEWNET: 843 // CRIU supports different external namespace with different released CRIU versions. 844 // For network namespaces to work we need at least criu 3.11.0 => 31100. 845 minVersion = 31100 846 case configs.NEWPID: 847 // For PID namespaces criu 31500 is needed. 848 minVersion = 31500 849 default: 850 return false 851 } 852 return c.checkCriuVersion(minVersion) == nil 853} 854 855func criuNsToKey(t configs.NamespaceType) string { 856 return "extRoot" + strings.Title(configs.NsName(t)) + "NS" 857} 858 859func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { 860 if !c.criuSupportsExtNS(t) { 861 return nil 862 } 863 864 nsPath := c.config.Namespaces.PathOf(t) 865 if nsPath == "" { 866 return nil 867 } 868 // CRIU expects the information about an external namespace 869 // like this: --external <TYPE>[<inode>]:<key> 870 // This <key> is always 'extRoot<TYPE>NS'. 871 var ns unix.Stat_t 872 if err := unix.Stat(nsPath, &ns); err != nil { 873 return err 874 } 875 criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t)) 876 rpcOpts.External = append(rpcOpts.External, criuExternal) 877 878 return nil 879} 880 881func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error { 882 for _, ns := range c.config.Namespaces { 883 switch ns.Type { 884 case configs.NEWNET, configs.NEWPID: 885 // If the container is running in a network or PID namespace and has 886 // a path to the network or PID namespace configured, we will dump 887 // that network or PID namespace as an external namespace and we 888 // will expect that the namespace exists during restore. 889 // This basically means that CRIU will ignore the namespace 890 // and expect it to be setup correctly. 891 if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil { 892 return err 893 } 894 default: 895 // For all other namespaces except NET and PID CRIU has 896 // a simpler way of joining the existing namespace if set 897 nsPath := c.config.Namespaces.PathOf(ns.Type) 898 if nsPath == "" { 899 continue 900 } 901 if ns.Type == configs.NEWCGROUP { 902 // CRIU has no code to handle NEWCGROUP 903 return fmt.Errorf("Do not know how to handle namespace %v", ns.Type) 904 } 905 // CRIU has code to handle NEWTIME, but it does not seem to be defined in runc 906 907 // CRIU will issue a warning for NEWUSER: 908 // criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous' 909 rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{ 910 Ns: proto.String(configs.NsName(ns.Type)), 911 NsFile: proto.String(nsPath), 912 }) 913 } 914 } 915 916 return nil 917} 918 919func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error { 920 if !c.criuSupportsExtNS(t) { 921 return nil 922 } 923 924 nsPath := c.config.Namespaces.PathOf(t) 925 if nsPath == "" { 926 return nil 927 } 928 // CRIU wants the information about an existing namespace 929 // like this: --inherit-fd fd[<fd>]:<key> 930 // The <key> needs to be the same as during checkpointing. 931 // We are always using 'extRoot<TYPE>NS' as the key in this. 932 nsFd, err := os.Open(nsPath) 933 if err != nil { 934 logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) 935 return fmt.Errorf("Requested network namespace %v does not exist", nsPath) 936 } 937 inheritFd := &criurpc.InheritFd{ 938 Key: proto.String(criuNsToKey(t)), 939 // The offset of four is necessary because 0, 1, 2 and 3 are 940 // already used by stdin, stdout, stderr, 'criu swrk' socket. 941 Fd: proto.Int32(int32(4 + len(*extraFiles))), 942 } 943 rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) 944 // All open FDs need to be transferred to CRIU via extraFiles 945 *extraFiles = append(*extraFiles, nsFd) 946 947 return nil 948} 949 950func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { 951 c.m.Lock() 952 defer c.m.Unlock() 953 954 // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). 955 // (CLI prints a warning) 956 // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has 957 // support for doing unprivileged dumps, but the setup of 958 // rootless containers might make this complicated. 959 960 // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 961 if err := c.checkCriuVersion(30000); err != nil { 962 return err 963 } 964 965 if criuOpts.ImagesDirectory == "" { 966 return errors.New("invalid directory to save checkpoint") 967 } 968 969 // Since a container can be C/R'ed multiple times, 970 // the checkpoint directory may already exist. 971 if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) { 972 return err 973 } 974 975 if criuOpts.WorkDirectory == "" { 976 criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") 977 } 978 979 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { 980 return err 981 } 982 983 workDir, err := os.Open(criuOpts.WorkDirectory) 984 if err != nil { 985 return err 986 } 987 defer workDir.Close() 988 989 imageDir, err := os.Open(criuOpts.ImagesDirectory) 990 if err != nil { 991 return err 992 } 993 defer imageDir.Close() 994 995 rpcOpts := criurpc.CriuOpts{ 996 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 997 WorkDirFd: proto.Int32(int32(workDir.Fd())), 998 LogLevel: proto.Int32(4), 999 LogFile: proto.String("dump.log"), 1000 Root: proto.String(c.config.Rootfs), 1001 ManageCgroups: proto.Bool(true), 1002 NotifyScripts: proto.Bool(true), 1003 Pid: proto.Int32(int32(c.initProcess.pid())), 1004 ShellJob: proto.Bool(criuOpts.ShellJob), 1005 LeaveRunning: proto.Bool(criuOpts.LeaveRunning), 1006 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 1007 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 1008 FileLocks: proto.Bool(criuOpts.FileLocks), 1009 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 1010 OrphanPtsMaster: proto.Bool(true), 1011 AutoDedup: proto.Bool(criuOpts.AutoDedup), 1012 LazyPages: proto.Bool(criuOpts.LazyPages), 1013 } 1014 1015 c.handleCriuConfigurationFile(&rpcOpts) 1016 1017 // If the container is running in a network namespace and has 1018 // a path to the network namespace configured, we will dump 1019 // that network namespace as an external namespace and we 1020 // will expect that the namespace exists during restore. 1021 // This basically means that CRIU will ignore the namespace 1022 // and expect to be setup correctly. 1023 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil { 1024 return err 1025 } 1026 1027 // Same for possible external PID namespaces 1028 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil { 1029 return err 1030 } 1031 1032 // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup 1033 // is not set, CRIU uses ptrace() to pause the processes. 1034 // Note cgroup v2 freezer is only supported since CRIU release 3.14. 1035 if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { 1036 if fcg := c.cgroupManager.Path("freezer"); fcg != "" { 1037 rpcOpts.FreezeCgroup = proto.String(fcg) 1038 } 1039 } 1040 1041 // append optional criu opts, e.g., page-server and port 1042 if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { 1043 rpcOpts.Ps = &criurpc.CriuPageServerInfo{ 1044 Address: proto.String(criuOpts.PageServer.Address), 1045 Port: proto.Int32(criuOpts.PageServer.Port), 1046 } 1047 } 1048 1049 // pre-dump may need parentImage param to complete iterative migration 1050 if criuOpts.ParentImage != "" { 1051 rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) 1052 rpcOpts.TrackMem = proto.Bool(true) 1053 } 1054 1055 // append optional manage cgroups mode 1056 if criuOpts.ManageCgroupsMode != 0 { 1057 mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) 1058 rpcOpts.ManageCgroupsMode = &mode 1059 } 1060 1061 var t criurpc.CriuReqType 1062 if criuOpts.PreDump { 1063 feat := criurpc.CriuFeatures{ 1064 MemTrack: proto.Bool(true), 1065 } 1066 1067 if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { 1068 return err 1069 } 1070 1071 t = criurpc.CriuReqType_PRE_DUMP 1072 } else { 1073 t = criurpc.CriuReqType_DUMP 1074 } 1075 1076 if criuOpts.LazyPages { 1077 // lazy migration requested; check if criu supports it 1078 feat := criurpc.CriuFeatures{ 1079 LazyPages: proto.Bool(true), 1080 } 1081 if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { 1082 return err 1083 } 1084 1085 if fd := criuOpts.StatusFd; fd != -1 { 1086 // check that the FD is valid 1087 flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) 1088 if err != nil { 1089 return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) 1090 } 1091 // and writable 1092 if flags&unix.O_WRONLY == 0 { 1093 return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) 1094 } 1095 1096 if c.checkCriuVersion(31500) != nil { 1097 // For criu 3.15+, use notifications (see case "status-ready" 1098 // in criuNotifications). Otherwise, rely on criu status fd. 1099 rpcOpts.StatusFd = proto.Int32(int32(fd)) 1100 } 1101 } 1102 } 1103 1104 req := &criurpc.CriuReq{ 1105 Type: &t, 1106 Opts: &rpcOpts, 1107 } 1108 1109 // no need to dump all this in pre-dump 1110 if !criuOpts.PreDump { 1111 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) 1112 for _, m := range c.config.Mounts { 1113 switch m.Device { 1114 case "bind": 1115 c.addCriuDumpMount(req, m) 1116 case "cgroup": 1117 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { 1118 // real mount(s) 1119 continue 1120 } 1121 // a set of "external" bind mounts 1122 binds, err := getCgroupMounts(m) 1123 if err != nil { 1124 return err 1125 } 1126 for _, b := range binds { 1127 c.addCriuDumpMount(req, b) 1128 } 1129 } 1130 } 1131 1132 if err := c.addMaskPaths(req); err != nil { 1133 return err 1134 } 1135 1136 for _, node := range c.config.Devices { 1137 m := &configs.Mount{Destination: node.Path, Source: node.Path} 1138 c.addCriuDumpMount(req, m) 1139 } 1140 1141 // Write the FD info to a file in the image directory 1142 fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) 1143 if err != nil { 1144 return err 1145 } 1146 1147 err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600) 1148 if err != nil { 1149 return err 1150 } 1151 } 1152 1153 err = c.criuSwrk(nil, req, criuOpts, nil) 1154 if err != nil { 1155 return err 1156 } 1157 return nil 1158} 1159 1160func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { 1161 mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) 1162 extMnt := &criurpc.ExtMountMap{ 1163 Key: proto.String(mountDest), 1164 Val: proto.String(m.Source), 1165 } 1166 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 1167} 1168 1169func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { 1170 for _, iface := range c.config.Networks { 1171 switch iface.Type { 1172 case "veth": 1173 veth := new(criurpc.CriuVethPair) 1174 veth.IfOut = proto.String(iface.HostInterfaceName) 1175 veth.IfIn = proto.String(iface.Name) 1176 req.Opts.Veths = append(req.Opts.Veths, veth) 1177 case "loopback": 1178 // Do nothing 1179 } 1180 } 1181 for _, i := range criuOpts.VethPairs { 1182 veth := new(criurpc.CriuVethPair) 1183 veth.IfOut = proto.String(i.HostInterfaceName) 1184 veth.IfIn = proto.String(i.ContainerInterfaceName) 1185 req.Opts.Veths = append(req.Opts.Veths, veth) 1186 } 1187} 1188 1189// makeCriuRestoreMountpoints makes the actual mountpoints for the 1190// restore using CRIU. This function is inspired from the code in 1191// rootfs_linux.go 1192func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { 1193 switch m.Device { 1194 case "cgroup": 1195 // No mount point(s) need to be created: 1196 // 1197 // * for v1, mount points are saved by CRIU because 1198 // /sys/fs/cgroup is a tmpfs mount 1199 // 1200 // * for v2, /sys/fs/cgroup is a real mount, but 1201 // the mountpoint appears as soon as /sys is mounted 1202 return nil 1203 case "bind": 1204 // The prepareBindMount() function checks if source 1205 // exists. So it cannot be used for other filesystem types. 1206 if err := prepareBindMount(m, c.config.Rootfs); err != nil { 1207 return err 1208 } 1209 default: 1210 // for all other filesystems just create the mountpoints 1211 dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) 1212 if err != nil { 1213 return err 1214 } 1215 if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { 1216 return err 1217 } 1218 if err := os.MkdirAll(dest, 0o755); err != nil { 1219 return err 1220 } 1221 } 1222 return nil 1223} 1224 1225// isPathInPrefixList is a small function for CRIU restore to make sure 1226// mountpoints, which are on a tmpfs, are not created in the roofs 1227func isPathInPrefixList(path string, prefix []string) bool { 1228 for _, p := range prefix { 1229 if strings.HasPrefix(path, p+"/") { 1230 return true 1231 } 1232 } 1233 return false 1234} 1235 1236// prepareCriuRestoreMounts tries to set up the rootfs of the 1237// container to be restored in the same way runc does it for 1238// initial container creation. Even for a read-only rootfs container 1239// runc modifies the rootfs to add mountpoints which do not exist. 1240// This function also creates missing mountpoints as long as they 1241// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. 1242func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error { 1243 // First get a list of a all tmpfs mounts 1244 tmpfs := []string{} 1245 for _, m := range mounts { 1246 switch m.Device { 1247 case "tmpfs": 1248 tmpfs = append(tmpfs, m.Destination) 1249 } 1250 } 1251 // Now go through all mounts and create the mountpoints 1252 // if the mountpoints are not on a tmpfs, as CRIU will 1253 // restore the complete tmpfs content from its checkpoint. 1254 umounts := []string{} 1255 defer func() { 1256 for _, u := range umounts { 1257 _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error { 1258 if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil { 1259 if e != unix.EINVAL { 1260 // Ignore EINVAL as it means 'target is not a mount point.' 1261 // It probably has already been unmounted. 1262 logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e) 1263 } 1264 } 1265 return nil 1266 }) 1267 } 1268 }() 1269 for _, m := range mounts { 1270 if !isPathInPrefixList(m.Destination, tmpfs) { 1271 if err := c.makeCriuRestoreMountpoints(m); err != nil { 1272 return err 1273 } 1274 // If the mount point is a bind mount, we need to mount 1275 // it now so that runc can create the necessary mount 1276 // points for mounts in bind mounts. 1277 // This also happens during initial container creation. 1278 // Without this CRIU restore will fail 1279 // See: https://github.com/opencontainers/runc/issues/2748 1280 // It is also not necessary to order the mount points 1281 // because during initial container creation mounts are 1282 // set up in the order they are configured. 1283 if m.Device == "bind" { 1284 if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error { 1285 if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { 1286 return errorsf.Wrapf(err, "unable to bind mount %q to %q (through %q)", m.Source, m.Destination, procfd) 1287 } 1288 return nil 1289 }); err != nil { 1290 return err 1291 } 1292 umounts = append(umounts, m.Destination) 1293 } 1294 } 1295 } 1296 return nil 1297} 1298 1299func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { 1300 c.m.Lock() 1301 defer c.m.Unlock() 1302 1303 var extraFiles []*os.File 1304 1305 // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). 1306 // (CLI prints a warning) 1307 // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have 1308 // support for unprivileged restore at the moment. 1309 1310 // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 1311 if err := c.checkCriuVersion(30000); err != nil { 1312 return err 1313 } 1314 if criuOpts.WorkDirectory == "" { 1315 criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") 1316 } 1317 // Since a container can be C/R'ed multiple times, 1318 // the work directory may already exist. 1319 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { 1320 return err 1321 } 1322 workDir, err := os.Open(criuOpts.WorkDirectory) 1323 if err != nil { 1324 return err 1325 } 1326 defer workDir.Close() 1327 if criuOpts.ImagesDirectory == "" { 1328 return errors.New("invalid directory to restore checkpoint") 1329 } 1330 imageDir, err := os.Open(criuOpts.ImagesDirectory) 1331 if err != nil { 1332 return err 1333 } 1334 defer imageDir.Close() 1335 // CRIU has a few requirements for a root directory: 1336 // * it must be a mount point 1337 // * its parent must not be overmounted 1338 // c.config.Rootfs is bind-mounted to a temporary directory 1339 // to satisfy these requirements. 1340 root := filepath.Join(c.root, "criu-root") 1341 if err := os.Mkdir(root, 0o755); err != nil { 1342 return err 1343 } 1344 defer os.Remove(root) 1345 root, err = filepath.EvalSymlinks(root) 1346 if err != nil { 1347 return err 1348 } 1349 err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") 1350 if err != nil { 1351 return err 1352 } 1353 defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck 1354 t := criurpc.CriuReqType_RESTORE 1355 req := &criurpc.CriuReq{ 1356 Type: &t, 1357 Opts: &criurpc.CriuOpts{ 1358 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 1359 WorkDirFd: proto.Int32(int32(workDir.Fd())), 1360 EvasiveDevices: proto.Bool(true), 1361 LogLevel: proto.Int32(4), 1362 LogFile: proto.String("restore.log"), 1363 RstSibling: proto.Bool(true), 1364 Root: proto.String(root), 1365 ManageCgroups: proto.Bool(true), 1366 NotifyScripts: proto.Bool(true), 1367 ShellJob: proto.Bool(criuOpts.ShellJob), 1368 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 1369 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 1370 FileLocks: proto.Bool(criuOpts.FileLocks), 1371 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 1372 OrphanPtsMaster: proto.Bool(true), 1373 AutoDedup: proto.Bool(criuOpts.AutoDedup), 1374 LazyPages: proto.Bool(criuOpts.LazyPages), 1375 }, 1376 } 1377 1378 if criuOpts.LsmProfile != "" { 1379 // CRIU older than 3.16 has a bug which breaks the possibility 1380 // to set a different LSM profile. 1381 if err := c.checkCriuVersion(31600); err != nil { 1382 return errors.New("--lsm-profile requires at least CRIU 3.16") 1383 } 1384 req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile) 1385 } 1386 1387 c.handleCriuConfigurationFile(req.Opts) 1388 1389 if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { 1390 return err 1391 } 1392 1393 // This will modify the rootfs of the container in the same way runc 1394 // modifies the container during initial creation. 1395 if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { 1396 return err 1397 } 1398 1399 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) 1400 for _, m := range c.config.Mounts { 1401 switch m.Device { 1402 case "bind": 1403 c.addCriuRestoreMount(req, m) 1404 case "cgroup": 1405 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { 1406 continue 1407 } 1408 // cgroup v1 is a set of bind mounts, unless cgroupns is used 1409 binds, err := getCgroupMounts(m) 1410 if err != nil { 1411 return err 1412 } 1413 for _, b := range binds { 1414 c.addCriuRestoreMount(req, b) 1415 } 1416 } 1417 } 1418 1419 if len(c.config.MaskPaths) > 0 { 1420 m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} 1421 c.addCriuRestoreMount(req, m) 1422 } 1423 1424 for _, node := range c.config.Devices { 1425 m := &configs.Mount{Destination: node.Path, Source: node.Path} 1426 c.addCriuRestoreMount(req, m) 1427 } 1428 1429 if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { 1430 c.restoreNetwork(req, criuOpts) 1431 } 1432 1433 // append optional manage cgroups mode 1434 if criuOpts.ManageCgroupsMode != 0 { 1435 mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) 1436 req.Opts.ManageCgroupsMode = &mode 1437 } 1438 1439 var ( 1440 fds []string 1441 fdJSON []byte 1442 ) 1443 if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { 1444 return err 1445 } 1446 1447 if err := json.Unmarshal(fdJSON, &fds); err != nil { 1448 return err 1449 } 1450 for i := range fds { 1451 if s := fds[i]; strings.Contains(s, "pipe:") { 1452 inheritFd := new(criurpc.InheritFd) 1453 inheritFd.Key = proto.String(s) 1454 inheritFd.Fd = proto.Int32(int32(i)) 1455 req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) 1456 } 1457 } 1458 err = c.criuSwrk(process, req, criuOpts, extraFiles) 1459 1460 // Now that CRIU is done let's close all opened FDs CRIU needed. 1461 for _, fd := range extraFiles { 1462 fd.Close() 1463 } 1464 1465 return err 1466} 1467 1468func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { 1469 // need to apply cgroups only on restore 1470 if req.GetType() != criurpc.CriuReqType_RESTORE { 1471 return nil 1472 } 1473 1474 // XXX: Do we need to deal with this case? AFAIK criu still requires root. 1475 if err := c.cgroupManager.Apply(pid); err != nil { 1476 return err 1477 } 1478 1479 if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil { 1480 return newSystemError(err) 1481 } 1482 1483 if cgroups.IsCgroup2UnifiedMode() { 1484 return nil 1485 } 1486 // the stuff below is cgroupv1-specific 1487 1488 path := fmt.Sprintf("/proc/%d/cgroup", pid) 1489 cgroupsPaths, err := cgroups.ParseCgroupFile(path) 1490 if err != nil { 1491 return err 1492 } 1493 1494 for c, p := range cgroupsPaths { 1495 cgroupRoot := &criurpc.CgroupRoot{ 1496 Ctrl: proto.String(c), 1497 Path: proto.String(p), 1498 } 1499 req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) 1500 } 1501 1502 return nil 1503} 1504 1505func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error { 1506 fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) 1507 if err != nil { 1508 return err 1509 } 1510 1511 var logPath string 1512 if opts != nil { 1513 logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) 1514 } else { 1515 // For the VERSION RPC 'opts' is set to 'nil' and therefore 1516 // opts.WorkDirectory does not exist. Set logPath to "". 1517 logPath = "" 1518 } 1519 criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") 1520 criuClientFileCon, err := net.FileConn(criuClient) 1521 criuClient.Close() 1522 if err != nil { 1523 return err 1524 } 1525 1526 criuClientCon := criuClientFileCon.(*net.UnixConn) 1527 defer criuClientCon.Close() 1528 1529 criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") 1530 defer criuServer.Close() 1531 1532 args := []string{"swrk", "3"} 1533 if c.criuVersion != 0 { 1534 // If the CRIU Version is still '0' then this is probably 1535 // the initial CRIU run to detect the version. Skip it. 1536 logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) 1537 } 1538 cmd := exec.Command(c.criuPath, args...) 1539 if process != nil { 1540 cmd.Stdin = process.Stdin 1541 cmd.Stdout = process.Stdout 1542 cmd.Stderr = process.Stderr 1543 } 1544 cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) 1545 if extraFiles != nil { 1546 cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) 1547 } 1548 1549 if err := cmd.Start(); err != nil { 1550 return err 1551 } 1552 // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang. 1553 criuServer.Close() 1554 // cmd.Process will be replaced by a restored init. 1555 criuProcess := cmd.Process 1556 1557 var criuProcessState *os.ProcessState 1558 defer func() { 1559 if criuProcessState == nil { 1560 criuClientCon.Close() 1561 _, err := criuProcess.Wait() 1562 if err != nil { 1563 logrus.Warnf("wait on criuProcess returned %v", err) 1564 } 1565 } 1566 }() 1567 1568 if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil { 1569 return err 1570 } 1571 1572 var extFds []string 1573 if process != nil { 1574 extFds, err = getPipeFds(criuProcess.Pid) 1575 if err != nil { 1576 return err 1577 } 1578 } 1579 1580 logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) 1581 // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() 1582 // should be empty. For older CRIU versions it still will be 1583 // available but empty. criurpc.CriuReqType_VERSION actually 1584 // has no req.GetOpts(). 1585 if logrus.GetLevel() >= logrus.DebugLevel && 1586 !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || 1587 req.GetType() == criurpc.CriuReqType_VERSION) { 1588 1589 val := reflect.ValueOf(req.GetOpts()) 1590 v := reflect.Indirect(val) 1591 for i := 0; i < v.NumField(); i++ { 1592 st := v.Type() 1593 name := st.Field(i).Name 1594 if 'A' <= name[0] && name[0] <= 'Z' { 1595 value := val.MethodByName("Get" + name).Call([]reflect.Value{}) 1596 logrus.Debugf("CRIU option %s with value %v", name, value[0]) 1597 } 1598 } 1599 } 1600 data, err := proto.Marshal(req) 1601 if err != nil { 1602 return err 1603 } 1604 _, err = criuClientCon.Write(data) 1605 if err != nil { 1606 return err 1607 } 1608 1609 buf := make([]byte, 10*4096) 1610 oob := make([]byte, 4096) 1611 for { 1612 n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) 1613 if req.Opts != nil && req.Opts.StatusFd != nil { 1614 // Close status_fd as soon as we got something back from criu, 1615 // assuming it has consumed (reopened) it by this time. 1616 // Otherwise it will might be left open forever and whoever 1617 // is waiting on it will wait forever. 1618 fd := int(*req.Opts.StatusFd) 1619 _ = unix.Close(fd) 1620 req.Opts.StatusFd = nil 1621 } 1622 if err != nil { 1623 return err 1624 } 1625 if n == 0 { 1626 return errors.New("unexpected EOF") 1627 } 1628 if n == len(buf) { 1629 return errors.New("buffer is too small") 1630 } 1631 1632 resp := new(criurpc.CriuResp) 1633 err = proto.Unmarshal(buf[:n], resp) 1634 if err != nil { 1635 return err 1636 } 1637 if !resp.GetSuccess() { 1638 typeString := req.GetType().String() 1639 return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) 1640 } 1641 1642 t := resp.GetType() 1643 switch { 1644 case t == criurpc.CriuReqType_FEATURE_CHECK: 1645 logrus.Debugf("Feature check says: %s", resp) 1646 criuFeatures = resp.GetFeatures() 1647 case t == criurpc.CriuReqType_NOTIFY: 1648 if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil { 1649 return err 1650 } 1651 t = criurpc.CriuReqType_NOTIFY 1652 req = &criurpc.CriuReq{ 1653 Type: &t, 1654 NotifySuccess: proto.Bool(true), 1655 } 1656 data, err = proto.Marshal(req) 1657 if err != nil { 1658 return err 1659 } 1660 _, err = criuClientCon.Write(data) 1661 if err != nil { 1662 return err 1663 } 1664 continue 1665 case t == criurpc.CriuReqType_RESTORE: 1666 case t == criurpc.CriuReqType_DUMP: 1667 case t == criurpc.CriuReqType_PRE_DUMP: 1668 default: 1669 return fmt.Errorf("unable to parse the response %s", resp.String()) 1670 } 1671 1672 break 1673 } 1674 1675 _ = criuClientCon.CloseWrite() 1676 // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. 1677 // Here we want to wait only the CRIU process. 1678 criuProcessState, err = criuProcess.Wait() 1679 if err != nil { 1680 return err 1681 } 1682 1683 // In pre-dump mode CRIU is in a loop and waits for 1684 // the final DUMP command. 1685 // The current runc pre-dump approach, however, is 1686 // start criu in PRE_DUMP once for a single pre-dump 1687 // and not the whole series of pre-dump, pre-dump, ...m, dump 1688 // If we got the message CriuReqType_PRE_DUMP it means 1689 // CRIU was successful and we need to forcefully stop CRIU 1690 if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { 1691 return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath) 1692 } 1693 return nil 1694} 1695 1696// block any external network activity 1697func lockNetwork(config *configs.Config) error { 1698 for _, config := range config.Networks { 1699 strategy, err := getStrategy(config.Type) 1700 if err != nil { 1701 return err 1702 } 1703 1704 if err := strategy.detach(config); err != nil { 1705 return err 1706 } 1707 } 1708 return nil 1709} 1710 1711func unlockNetwork(config *configs.Config) error { 1712 for _, config := range config.Networks { 1713 strategy, err := getStrategy(config.Type) 1714 if err != nil { 1715 return err 1716 } 1717 if err = strategy.attach(config); err != nil { 1718 return err 1719 } 1720 } 1721 return nil 1722} 1723 1724func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error { 1725 notify := resp.GetNotify() 1726 if notify == nil { 1727 return fmt.Errorf("invalid response: %s", resp.String()) 1728 } 1729 script := notify.GetScript() 1730 logrus.Debugf("notify: %s\n", script) 1731 switch script { 1732 case "post-dump": 1733 f, err := os.Create(filepath.Join(c.root, "checkpoint")) 1734 if err != nil { 1735 return err 1736 } 1737 f.Close() 1738 case "network-unlock": 1739 if err := unlockNetwork(c.config); err != nil { 1740 return err 1741 } 1742 case "network-lock": 1743 if err := lockNetwork(c.config); err != nil { 1744 return err 1745 } 1746 case "setup-namespaces": 1747 if c.config.Hooks != nil { 1748 s, err := c.currentOCIState() 1749 if err != nil { 1750 return nil 1751 } 1752 s.Pid = int(notify.GetPid()) 1753 1754 if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil { 1755 return err 1756 } 1757 if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil { 1758 return err 1759 } 1760 } 1761 case "post-restore": 1762 pid := notify.GetPid() 1763 1764 p, err := os.FindProcess(int(pid)) 1765 if err != nil { 1766 return err 1767 } 1768 cmd.Process = p 1769 1770 r, err := newRestoredProcess(cmd, fds) 1771 if err != nil { 1772 return err 1773 } 1774 process.ops = r 1775 if err := c.state.transition(&restoredState{ 1776 imageDir: opts.ImagesDirectory, 1777 c: c, 1778 }); err != nil { 1779 return err 1780 } 1781 // create a timestamp indicating when the restored checkpoint was started 1782 c.created = time.Now().UTC() 1783 if _, err := c.updateState(r); err != nil { 1784 return err 1785 } 1786 if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { 1787 if !os.IsNotExist(err) { 1788 logrus.Error(err) 1789 } 1790 } 1791 case "orphan-pts-master": 1792 scm, err := unix.ParseSocketControlMessage(oob) 1793 if err != nil { 1794 return err 1795 } 1796 fds, err := unix.ParseUnixRights(&scm[0]) 1797 if err != nil { 1798 return err 1799 } 1800 1801 master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") 1802 defer master.Close() 1803 1804 // While we can access console.master, using the API is a good idea. 1805 if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { 1806 return err 1807 } 1808 case "status-ready": 1809 if opts.StatusFd != -1 { 1810 // write \0 to status fd to notify that lazy page server is ready 1811 _, err := unix.Write(opts.StatusFd, []byte{0}) 1812 if err != nil { 1813 logrus.Warnf("can't write \\0 to status fd: %v", err) 1814 } 1815 _ = unix.Close(opts.StatusFd) 1816 opts.StatusFd = -1 1817 } 1818 } 1819 return nil 1820} 1821 1822func (c *linuxContainer) updateState(process parentProcess) (*State, error) { 1823 if process != nil { 1824 c.initProcess = process 1825 } 1826 state, err := c.currentState() 1827 if err != nil { 1828 return nil, err 1829 } 1830 err = c.saveState(state) 1831 if err != nil { 1832 return nil, err 1833 } 1834 return state, nil 1835} 1836 1837func (c *linuxContainer) saveState(s *State) (retErr error) { 1838 tmpFile, err := ioutil.TempFile(c.root, "state-") 1839 if err != nil { 1840 return err 1841 } 1842 1843 defer func() { 1844 if retErr != nil { 1845 tmpFile.Close() 1846 os.Remove(tmpFile.Name()) 1847 } 1848 }() 1849 1850 err = utils.WriteJSON(tmpFile, s) 1851 if err != nil { 1852 return err 1853 } 1854 err = tmpFile.Close() 1855 if err != nil { 1856 return err 1857 } 1858 1859 stateFilePath := filepath.Join(c.root, stateFilename) 1860 return os.Rename(tmpFile.Name(), stateFilePath) 1861} 1862 1863func (c *linuxContainer) currentStatus() (Status, error) { 1864 if err := c.refreshState(); err != nil { 1865 return -1, err 1866 } 1867 return c.state.status(), nil 1868} 1869 1870// refreshState needs to be called to verify that the current state on the 1871// container is what is true. Because consumers of libcontainer can use it 1872// out of process we need to verify the container's status based on runtime 1873// information and not rely on our in process info. 1874func (c *linuxContainer) refreshState() error { 1875 paused, err := c.isPaused() 1876 if err != nil { 1877 return err 1878 } 1879 if paused { 1880 return c.state.transition(&pausedState{c: c}) 1881 } 1882 t := c.runType() 1883 switch t { 1884 case Created: 1885 return c.state.transition(&createdState{c: c}) 1886 case Running: 1887 return c.state.transition(&runningState{c: c}) 1888 } 1889 return c.state.transition(&stoppedState{c: c}) 1890} 1891 1892func (c *linuxContainer) runType() Status { 1893 if c.initProcess == nil { 1894 return Stopped 1895 } 1896 pid := c.initProcess.pid() 1897 stat, err := system.Stat(pid) 1898 if err != nil { 1899 return Stopped 1900 } 1901 if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { 1902 return Stopped 1903 } 1904 // We'll create exec fifo and blocking on it after container is created, 1905 // and delete it after start container. 1906 if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { 1907 return Created 1908 } 1909 return Running 1910} 1911 1912func (c *linuxContainer) isPaused() (bool, error) { 1913 state, err := c.cgroupManager.GetFreezerState() 1914 if err != nil { 1915 return false, err 1916 } 1917 return state == configs.Frozen, nil 1918} 1919 1920func (c *linuxContainer) currentState() (*State, error) { 1921 var ( 1922 startTime uint64 1923 externalDescriptors []string 1924 pid = -1 1925 ) 1926 if c.initProcess != nil { 1927 pid = c.initProcess.pid() 1928 startTime, _ = c.initProcess.startTime() 1929 externalDescriptors = c.initProcess.externalDescriptors() 1930 } 1931 intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) 1932 if err != nil { 1933 intelRdtPath = "" 1934 } 1935 state := &State{ 1936 BaseState: BaseState{ 1937 ID: c.ID(), 1938 Config: *c.config, 1939 InitProcessPid: pid, 1940 InitProcessStartTime: startTime, 1941 Created: c.created, 1942 }, 1943 Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, 1944 CgroupPaths: c.cgroupManager.GetPaths(), 1945 IntelRdtPath: intelRdtPath, 1946 NamespacePaths: make(map[configs.NamespaceType]string), 1947 ExternalDescriptors: externalDescriptors, 1948 } 1949 if pid > 0 { 1950 for _, ns := range c.config.Namespaces { 1951 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 1952 } 1953 for _, nsType := range configs.NamespaceTypes() { 1954 if !configs.IsNamespaceSupported(nsType) { 1955 continue 1956 } 1957 if _, ok := state.NamespacePaths[nsType]; !ok { 1958 ns := configs.Namespace{Type: nsType} 1959 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 1960 } 1961 } 1962 } 1963 return state, nil 1964} 1965 1966func (c *linuxContainer) currentOCIState() (*specs.State, error) { 1967 bundle, annotations := utils.Annotations(c.config.Labels) 1968 state := &specs.State{ 1969 Version: specs.Version, 1970 ID: c.ID(), 1971 Bundle: bundle, 1972 Annotations: annotations, 1973 } 1974 status, err := c.currentStatus() 1975 if err != nil { 1976 return nil, err 1977 } 1978 state.Status = specs.ContainerState(status.String()) 1979 if status != Stopped { 1980 if c.initProcess != nil { 1981 state.Pid = c.initProcess.pid() 1982 } 1983 } 1984 return state, nil 1985} 1986 1987// orderNamespacePaths sorts namespace paths into a list of paths that we 1988// can setns in order. 1989func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { 1990 paths := []string{} 1991 for _, ns := range configs.NamespaceTypes() { 1992 1993 // Remove namespaces that we don't need to join. 1994 if !c.config.Namespaces.Contains(ns) { 1995 continue 1996 } 1997 1998 if p, ok := namespaces[ns]; ok && p != "" { 1999 // check if the requested namespace is supported 2000 if !configs.IsNamespaceSupported(ns) { 2001 return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) 2002 } 2003 // only set to join this namespace if it exists 2004 if _, err := os.Lstat(p); err != nil { 2005 return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) 2006 } 2007 // do not allow namespace path with comma as we use it to separate 2008 // the namespace paths 2009 if strings.ContainsRune(p, ',') { 2010 return nil, newSystemError(fmt.Errorf("invalid path %s", p)) 2011 } 2012 paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) 2013 } 2014 2015 } 2016 2017 return paths, nil 2018} 2019 2020func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { 2021 data := bytes.NewBuffer(nil) 2022 for _, im := range idMap { 2023 line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) 2024 if _, err := data.WriteString(line); err != nil { 2025 return nil, err 2026 } 2027 } 2028 return data.Bytes(), nil 2029} 2030 2031// bootstrapData encodes the necessary data in netlink binary format 2032// as a io.Reader. 2033// Consumer can write the data to a bootstrap program 2034// such as one that uses nsenter package to bootstrap the container's 2035// init process correctly, i.e. with correct namespaces, uid/gid 2036// mapping etc. 2037func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { 2038 // create the netlink message 2039 r := nl.NewNetlinkRequest(int(InitMsg), 0) 2040 2041 // write cloneFlags 2042 r.AddData(&Int32msg{ 2043 Type: CloneFlagsAttr, 2044 Value: uint32(cloneFlags), 2045 }) 2046 2047 // write custom namespace paths 2048 if len(nsMaps) > 0 { 2049 nsPaths, err := c.orderNamespacePaths(nsMaps) 2050 if err != nil { 2051 return nil, err 2052 } 2053 r.AddData(&Bytemsg{ 2054 Type: NsPathsAttr, 2055 Value: []byte(strings.Join(nsPaths, ",")), 2056 }) 2057 } 2058 2059 // write namespace paths only when we are not joining an existing user ns 2060 _, joinExistingUser := nsMaps[configs.NEWUSER] 2061 if !joinExistingUser { 2062 // write uid mappings 2063 if len(c.config.UidMappings) > 0 { 2064 if c.config.RootlessEUID && c.newuidmapPath != "" { 2065 r.AddData(&Bytemsg{ 2066 Type: UidmapPathAttr, 2067 Value: []byte(c.newuidmapPath), 2068 }) 2069 } 2070 b, err := encodeIDMapping(c.config.UidMappings) 2071 if err != nil { 2072 return nil, err 2073 } 2074 r.AddData(&Bytemsg{ 2075 Type: UidmapAttr, 2076 Value: b, 2077 }) 2078 } 2079 2080 // write gid mappings 2081 if len(c.config.GidMappings) > 0 { 2082 b, err := encodeIDMapping(c.config.GidMappings) 2083 if err != nil { 2084 return nil, err 2085 } 2086 r.AddData(&Bytemsg{ 2087 Type: GidmapAttr, 2088 Value: b, 2089 }) 2090 if c.config.RootlessEUID && c.newgidmapPath != "" { 2091 r.AddData(&Bytemsg{ 2092 Type: GidmapPathAttr, 2093 Value: []byte(c.newgidmapPath), 2094 }) 2095 } 2096 if requiresRootOrMappingTool(c.config) { 2097 r.AddData(&Boolmsg{ 2098 Type: SetgroupAttr, 2099 Value: true, 2100 }) 2101 } 2102 } 2103 } 2104 2105 if c.config.OomScoreAdj != nil { 2106 // write oom_score_adj 2107 r.AddData(&Bytemsg{ 2108 Type: OomScoreAdjAttr, 2109 Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)), 2110 }) 2111 } 2112 2113 // write rootless 2114 r.AddData(&Boolmsg{ 2115 Type: RootlessEUIDAttr, 2116 Value: c.config.RootlessEUID, 2117 }) 2118 2119 return bytes.NewReader(r.Serialize()), nil 2120} 2121 2122// ignoreTerminateErrors returns nil if the given err matches an error known 2123// to indicate that the terminate occurred successfully or err was nil, otherwise 2124// err is returned unaltered. 2125func ignoreTerminateErrors(err error) error { 2126 if err == nil { 2127 return nil 2128 } 2129 // terminate() might return an error from ether Kill or Wait. 2130 // The (*Cmd).Wait documentation says: "If the command fails to run 2131 // or doesn't complete successfully, the error is of type *ExitError". 2132 // Filter out such errors (like "exit status 1" or "signal: killed"). 2133 var exitErr *exec.ExitError 2134 if errors.As(err, &exitErr) { 2135 return nil 2136 } 2137 // TODO: use errors.Is(err, os.ErrProcessDone) here and 2138 // remove "process already finished" string comparison below 2139 // once go 1.16 is minimally supported version. 2140 2141 s := err.Error() 2142 if strings.Contains(s, "process already finished") || 2143 strings.Contains(s, "Wait was already called") { 2144 return nil 2145 } 2146 return err 2147} 2148 2149func requiresRootOrMappingTool(c *configs.Config) bool { 2150 gidMap := []configs.IDMap{ 2151 {ContainerID: 0, HostID: os.Getegid(), Size: 1}, 2152 } 2153 return !reflect.DeepEqual(c.GidMappings, gidMap) 2154} 2155