1// +build linux
2
3package libcontainer
4
5import (
6	"bytes"
7	"encoding/json"
8	"errors"
9	"fmt"
10	"io"
11	"io/ioutil"
12	"net"
13	"os"
14	"os/exec"
15	"path/filepath"
16	"reflect"
17	"strconv"
18	"strings"
19	"sync"
20	"time"
21
22	securejoin "github.com/cyphar/filepath-securejoin"
23	"github.com/opencontainers/runc/libcontainer/cgroups"
24	"github.com/opencontainers/runc/libcontainer/configs"
25	"github.com/opencontainers/runc/libcontainer/intelrdt"
26	"github.com/opencontainers/runc/libcontainer/system"
27	"github.com/opencontainers/runc/libcontainer/utils"
28	"github.com/opencontainers/runtime-spec/specs-go"
29
30	"github.com/checkpoint-restore/go-criu/v5"
31	criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
32	errorsf "github.com/pkg/errors"
33	"github.com/sirupsen/logrus"
34	"github.com/vishvananda/netlink/nl"
35	"golang.org/x/sys/unix"
36	"google.golang.org/protobuf/proto"
37)
38
// stdioFdCount is the number of standard descriptors (stdin, stdout, stderr)
// that come before cmd.ExtraFiles in a child process; it is the offset added
// to an ExtraFiles index when computing the fd number passed to the child.
const stdioFdCount = 3
40
// linuxContainer is the Linux implementation of a container.
//
// Notable fields, as used in this file:
//   - root is the container's state directory; the exec FIFO lives in it and
//     it is exported to the init process as _LIBCONTAINER_STATEDIR.
//   - initPath and initArgs form the command line used to spawn init/exec
//     processes (see commandTemplate).
//   - m serializes the public methods that take it (Status, Set, Start, ...).
//   - criuVersion caches the CRIU version once determined; zero means it has
//     not been queried yet (see checkCriuVersion).
//   - fifo is the O_PATH handle to the exec FIFO opened by includeExecFifo.
type linuxContainer struct {
	id                   string
	root                 string
	config               *configs.Config
	cgroupManager        cgroups.Manager
	intelRdtManager      intelrdt.Manager
	initPath             string
	initArgs             []string
	initProcess          parentProcess
	initProcessStartTime uint64
	criuPath             string
	newuidmapPath        string
	newgidmapPath        string
	m                    sync.Mutex
	criuVersion          int
	state                containerState
	created              time.Time
	fifo                 *os.File
}
60
// State represents a running container's state. It extends the
// platform-independent BaseState with Linux-specific fields and is
// serialized as JSON.
type State struct {
	BaseState

	// Platform specific fields below here

	// Rootless records whether the container was started in rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// CgroupPaths holds the paths to all the container's cgroups, as
	// returned by (*cgroups.Manager).GetPaths.
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors lists the container's standard descriptors
	// (std{in,out,err}), needed for checkpoint and restore.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// IntelRdtPath is the Intel RDT "resource control" filesystem path.
	IntelRdtPath string `json:"intel_rdt_path"`
}
89
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause pauses the execution of any user processes in the container.
	//
	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
	// the execution of any user processes. Asynchronously, when the container finished being paused the
	// state is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotRunning - Container not running or created,
	// Systemerror - System error.
	Pause() error

	// Resume resumes the execution of a paused container.
	//
	// If the Container state is PAUSED, resumes the execution of any user processes in the
	// Container before setting the Container state to RUNNING.
	// If the Container state is RUNNING, do nothing.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotPaused - Container is not paused,
	// Systemerror - System error.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	//
	// errors:
	// Systemerror - System error.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the
	// container reaches the given memory pressure level.
	//
	// errors:
	// Systemerror - System error.
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
145
146// ID returns the container's unique ID
147func (c *linuxContainer) ID() string {
148	return c.id
149}
150
151// Config returns the container's configuration
152func (c *linuxContainer) Config() configs.Config {
153	return *c.config
154}
155
156func (c *linuxContainer) Status() (Status, error) {
157	c.m.Lock()
158	defer c.m.Unlock()
159	return c.currentStatus()
160}
161
162func (c *linuxContainer) State() (*State, error) {
163	c.m.Lock()
164	defer c.m.Unlock()
165	return c.currentState()
166}
167
168func (c *linuxContainer) OCIState() (*specs.State, error) {
169	c.m.Lock()
170	defer c.m.Unlock()
171	return c.currentOCIState()
172}
173
174func (c *linuxContainer) Processes() ([]int, error) {
175	var pids []int
176	status, err := c.currentStatus()
177	if err != nil {
178		return pids, err
179	}
180	// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
181	if status == Stopped && !c.cgroupManager.Exists() {
182		return pids, nil
183	}
184
185	pids, err = c.cgroupManager.GetAllPids()
186	if err != nil {
187		return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
188	}
189	return pids, nil
190}
191
192func (c *linuxContainer) Stats() (*Stats, error) {
193	var (
194		err   error
195		stats = &Stats{}
196	)
197	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
198		return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
199	}
200	if c.intelRdtManager != nil {
201		if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
202			return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
203		}
204	}
205	for _, iface := range c.config.Networks {
206		switch iface.Type {
207		case "veth":
208			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
209			if err != nil {
210				return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
211			}
212			stats.Interfaces = append(stats.Interfaces, istats)
213		}
214	}
215	return stats, nil
216}
217
218func (c *linuxContainer) Set(config configs.Config) error {
219	c.m.Lock()
220	defer c.m.Unlock()
221	status, err := c.currentStatus()
222	if err != nil {
223		return err
224	}
225	if status == Stopped {
226		return newGenericError(errors.New("container not running"), ContainerNotRunning)
227	}
228	if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
229		// Set configs back
230		if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
231			logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
232		}
233		return err
234	}
235	if c.intelRdtManager != nil {
236		if err := c.intelRdtManager.Set(&config); err != nil {
237			// Set configs back
238			if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
239				logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
240			}
241			if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
242				logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
243			}
244			return err
245		}
246	}
247	// After config setting succeed, update config and states
248	c.config = &config
249	_, err = c.updateState(nil)
250	return err
251}
252
253func (c *linuxContainer) Start(process *Process) error {
254	c.m.Lock()
255	defer c.m.Unlock()
256	if c.config.Cgroups.Resources.SkipDevices {
257		return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
258	}
259	if process.Init {
260		if err := c.createExecFifo(); err != nil {
261			return err
262		}
263	}
264	if err := c.start(process); err != nil {
265		if process.Init {
266			c.deleteExecFifo()
267		}
268		return err
269	}
270	return nil
271}
272
273func (c *linuxContainer) Run(process *Process) error {
274	if err := c.Start(process); err != nil {
275		return err
276	}
277	if process.Init {
278		return c.exec()
279	}
280	return nil
281}
282
283func (c *linuxContainer) Exec() error {
284	c.m.Lock()
285	defer c.m.Unlock()
286	return c.exec()
287}
288
// exec opens the container's exec FIFO for reading, then drains and removes
// it (see handleFifoResult).
//
// A goroutine performs a blocking open of the FIFO while this loop wakes up
// every 100ms to check on the init process. If the process can no longer be
// stat'ed or is a zombie, the FIFO is opened once more in non-blocking mode
// to pick up anything the process wrote before exiting; if that also fails,
// "container process is already dead" is returned.
func (c *linuxContainer) exec() error {
	path := filepath.Join(c.root, execFifoFilename)
	pid := c.initProcess.pid()
	blockingFifoOpenCh := awaitFifoOpen(path)
	for {
		select {
		case result := <-blockingFifoOpenCh:
			// The blocking open returned (successfully or not).
			return handleFifoResult(result)

		case <-time.After(time.Millisecond * 100):
			// Periodic liveness check of the init process.
			stat, err := system.Stat(pid)
			if err != nil || stat.State == system.Zombie {
				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
					return errors.New("container process is already dead")
				}
				return nil
			}
		}
	}
}
311
// readFromExecFifo drains execFifo completely. It returns the read error if
// there is one, and an error if nothing at all could be read; otherwise nil.
func readFromExecFifo(execFifo io.Reader) error {
	contents, readErr := ioutil.ReadAll(execFifo)
	switch {
	case readErr != nil:
		return readErr
	case len(contents) == 0:
		// Nothing was written to the FIFO.
		return errors.New("cannot start an already running container")
	default:
		return nil
	}
}
322
323func awaitFifoOpen(path string) <-chan openResult {
324	fifoOpened := make(chan openResult)
325	go func() {
326		result := fifoOpen(path, true)
327		fifoOpened <- result
328	}()
329	return fifoOpened
330}
331
332func fifoOpen(path string, block bool) openResult {
333	flags := os.O_RDONLY
334	if !block {
335		flags |= unix.O_NONBLOCK
336	}
337	f, err := os.OpenFile(path, flags, 0)
338	if err != nil {
339		return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
340	}
341	return openResult{file: f}
342}
343
344func handleFifoResult(result openResult) error {
345	if result.err != nil {
346		return result.err
347	}
348	f := result.file
349	defer f.Close()
350	if err := readFromExecFifo(f); err != nil {
351		return err
352	}
353	return os.Remove(f.Name())
354}
355
// openResult carries the outcome of fifoOpen: either the opened file or the
// error that prevented opening it.
type openResult struct {
	file *os.File
	err  error
}
360
// start creates the parent process for process and starts it.
//
// If log forwarding is active, the function does not return before the log
// forwarder has finished; a forwarding error is reported through retErr only
// when no other error occurred. For an init process the container's exec
// FIFO handle is closed after a successful start and the Poststart hooks are
// run; if a hook fails, the parent process is terminated and the hook error
// is returned.
func (c *linuxContainer) start(process *Process) (retErr error) {
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}

	logsDone := parent.forwardChildLogs()
	if logsDone != nil {
		defer func() {
			// Wait for log forwarder to finish. This depends on
			// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
			err := <-logsDone
			if err != nil && retErr == nil {
				retErr = newSystemErrorWithCause(err, "forwarding init logs")
			}
		}()
	}

	if err := parent.start(); err != nil {
		return newSystemErrorWithCause(err, "starting container process")
	}

	if process.Init {
		// The handle was opened by includeExecFifo for the child to inherit.
		c.fifo.Close()
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				return err
			}

			if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
				// The hook error is what gets returned; a failure to
				// terminate the parent is only logged.
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
					logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
				}
				return err
			}
		}
	}
	return nil
}
401
402func (c *linuxContainer) Signal(s os.Signal, all bool) error {
403	c.m.Lock()
404	defer c.m.Unlock()
405	status, err := c.currentStatus()
406	if err != nil {
407		return err
408	}
409	if all {
410		// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
411		if status == Stopped && !c.cgroupManager.Exists() {
412			return nil
413		}
414		return signalAllProcesses(c.cgroupManager, s)
415	}
416	// to avoid a PID reuse attack
417	if status == Running || status == Created || status == Paused {
418		if err := c.initProcess.signal(s); err != nil {
419			return newSystemErrorWithCause(err, "signaling init process")
420		}
421		return nil
422	}
423	return newGenericError(errors.New("container not running"), ContainerNotRunning)
424}
425
426func (c *linuxContainer) createExecFifo() error {
427	rootuid, err := c.Config().HostRootUID()
428	if err != nil {
429		return err
430	}
431	rootgid, err := c.Config().HostRootGID()
432	if err != nil {
433		return err
434	}
435
436	fifoName := filepath.Join(c.root, execFifoFilename)
437	if _, err := os.Stat(fifoName); err == nil {
438		return fmt.Errorf("exec fifo %s already exists", fifoName)
439	}
440	oldMask := unix.Umask(0o000)
441	if err := unix.Mkfifo(fifoName, 0o622); err != nil {
442		unix.Umask(oldMask)
443		return err
444	}
445	unix.Umask(oldMask)
446	return os.Chown(fifoName, rootuid, rootgid)
447}
448
449func (c *linuxContainer) deleteExecFifo() {
450	fifoName := filepath.Join(c.root, execFifoFilename)
451	os.Remove(fifoName)
452}
453
454// includeExecFifo opens the container's execfifo as a pathfd, so that the
455// container cannot access the statedir (and the FIFO itself remains
456// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
457// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
458func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
459	fifoName := filepath.Join(c.root, execFifoFilename)
460	fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
461	if err != nil {
462		return err
463	}
464	c.fifo = fifo
465
466	cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
467	cmd.Env = append(cmd.Env,
468		"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
469	return nil
470}
471
472func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
473	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
474	if err != nil {
475		return nil, newSystemErrorWithCause(err, "creating new init pipe")
476	}
477	messageSockPair := filePair{parentInitPipe, childInitPipe}
478
479	parentLogPipe, childLogPipe, err := os.Pipe()
480	if err != nil {
481		return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
482	}
483	logFilePair := filePair{parentLogPipe, childLogPipe}
484
485	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
486	if !p.Init {
487		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
488	}
489
490	// We only set up fifoFd if we're not doing a `runc exec`. The historic
491	// reason for this is that previously we would pass a dirfd that allowed
492	// for container rootfs escape (and not doing it in `runc exec` avoided
493	// that problem), but we no longer do that. However, there's no need to do
494	// this for `runc exec` so we just keep it this way to be safe.
495	if err := c.includeExecFifo(cmd); err != nil {
496		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
497	}
498	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
499}
500
501func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
502	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
503	cmd.Args[0] = c.initArgs[0]
504	cmd.Stdin = p.Stdin
505	cmd.Stdout = p.Stdout
506	cmd.Stderr = p.Stderr
507	cmd.Dir = c.config.Rootfs
508	if cmd.SysProcAttr == nil {
509		cmd.SysProcAttr = &unix.SysProcAttr{}
510	}
511	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
512	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
513	if p.ConsoleSocket != nil {
514		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
515		cmd.Env = append(cmd.Env,
516			"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
517		)
518	}
519	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
520	cmd.Env = append(cmd.Env,
521		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
522		"_LIBCONTAINER_STATEDIR="+c.root,
523	)
524
525	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
526	cmd.Env = append(cmd.Env,
527		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
528		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
529	)
530
531	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
532	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
533	// even with the parent still running.
534	if c.config.ParentDeathSignal > 0 {
535		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
536	}
537	return cmd
538}
539
540func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
541	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
542	nsMaps := make(map[configs.NamespaceType]string)
543	for _, ns := range c.config.Namespaces {
544		if ns.Path != "" {
545			nsMaps[ns.Type] = ns.Path
546		}
547	}
548	_, sharePidns := nsMaps[configs.NEWPID]
549	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
550	if err != nil {
551		return nil, err
552	}
553	init := &initProcess{
554		cmd:             cmd,
555		messageSockPair: messageSockPair,
556		logFilePair:     logFilePair,
557		manager:         c.cgroupManager,
558		intelRdtManager: c.intelRdtManager,
559		config:          c.newInitConfig(p),
560		container:       c,
561		process:         p,
562		bootstrapData:   data,
563		sharePidns:      sharePidns,
564	}
565	c.initProcess = init
566	return init, nil
567}
568
569func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
570	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
571	state, err := c.currentState()
572	if err != nil {
573		return nil, newSystemErrorWithCause(err, "getting container's current state")
574	}
575	// for setns process, we don't have to set cloneflags as the process namespaces
576	// will only be set via setns syscall
577	data, err := c.bootstrapData(0, state.NamespacePaths)
578	if err != nil {
579		return nil, err
580	}
581	return &setnsProcess{
582		cmd:             cmd,
583		cgroupPaths:     state.CgroupPaths,
584		rootlessCgroups: c.config.RootlessCgroups,
585		intelRdtPath:    state.IntelRdtPath,
586		messageSockPair: messageSockPair,
587		logFilePair:     logFilePair,
588		manager:         c.cgroupManager,
589		config:          c.newInitConfig(p),
590		process:         p,
591		bootstrapData:   data,
592		initProcessPid:  state.InitProcessPid,
593	}, nil
594}
595
596func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
597	cfg := &initConfig{
598		Config:           c.config,
599		Args:             process.Args,
600		Env:              process.Env,
601		User:             process.User,
602		AdditionalGroups: process.AdditionalGroups,
603		Cwd:              process.Cwd,
604		Capabilities:     process.Capabilities,
605		PassedFilesCount: len(process.ExtraFiles),
606		ContainerId:      c.ID(),
607		NoNewPrivileges:  c.config.NoNewPrivileges,
608		RootlessEUID:     c.config.RootlessEUID,
609		RootlessCgroups:  c.config.RootlessCgroups,
610		AppArmorProfile:  c.config.AppArmorProfile,
611		ProcessLabel:     c.config.ProcessLabel,
612		Rlimits:          c.config.Rlimits,
613		CreateConsole:    process.ConsoleSocket != nil,
614		ConsoleWidth:     process.ConsoleWidth,
615		ConsoleHeight:    process.ConsoleHeight,
616	}
617	if process.NoNewPrivileges != nil {
618		cfg.NoNewPrivileges = *process.NoNewPrivileges
619	}
620	if process.AppArmorProfile != "" {
621		cfg.AppArmorProfile = process.AppArmorProfile
622	}
623	if process.Label != "" {
624		cfg.ProcessLabel = process.Label
625	}
626	if len(process.Rlimits) > 0 {
627		cfg.Rlimits = process.Rlimits
628	}
629	if cgroups.IsCgroup2UnifiedMode() {
630		cfg.Cgroup2Path = c.cgroupManager.Path("")
631	}
632
633	return cfg
634}
635
636func (c *linuxContainer) Destroy() error {
637	c.m.Lock()
638	defer c.m.Unlock()
639	return c.state.destroy()
640}
641
642func (c *linuxContainer) Pause() error {
643	c.m.Lock()
644	defer c.m.Unlock()
645	status, err := c.currentStatus()
646	if err != nil {
647		return err
648	}
649	switch status {
650	case Running, Created:
651		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
652			return err
653		}
654		return c.state.transition(&pausedState{
655			c: c,
656		})
657	}
658	return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
659}
660
661func (c *linuxContainer) Resume() error {
662	c.m.Lock()
663	defer c.m.Unlock()
664	status, err := c.currentStatus()
665	if err != nil {
666		return err
667	}
668	if status != Paused {
669		return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
670	}
671	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
672		return err
673	}
674	return c.state.transition(&runningState{
675		c: c,
676	})
677}
678
679func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
680	// XXX(cyphar): This requires cgroups.
681	if c.config.RootlessCgroups {
682		logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
683	}
684	path := c.cgroupManager.Path("memory")
685	if cgroups.IsCgroup2UnifiedMode() {
686		return notifyOnOOMV2(path)
687	}
688	return notifyOnOOM(path)
689}
690
691func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
692	// XXX(cyphar): This requires cgroups.
693	if c.config.RootlessCgroups {
694		logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
695	}
696	return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
697}
698
// criuFeatures holds the feature set consulted by checkCriuFeatures, which
// resets it to nil before issuing a feature-check request and reads it once
// the request has completed.
// NOTE(review): presumably filled in by criuSwrk while handling the
// FEATURE_CHECK response — confirm. It is package-level state shared by all
// containers in the process.
var criuFeatures *criurpc.CriuFeatures
700
701func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
702	t := criurpc.CriuReqType_FEATURE_CHECK
703
704	// make sure the features we are looking for are really not from
705	// some previous check
706	criuFeatures = nil
707
708	req := &criurpc.CriuReq{
709		Type: &t,
710		// Theoretically this should not be necessary but CRIU
711		// segfaults if Opts is empty.
712		// Fixed in CRIU  2.12
713		Opts:     rpcOpts,
714		Features: criuFeat,
715	}
716
717	err := c.criuSwrk(nil, req, criuOpts, nil)
718	if err != nil {
719		logrus.Debugf("%s", err)
720		return errors.New("CRIU feature check failed")
721	}
722
723	missingFeatures := false
724
725	// The outer if checks if the fields actually exist
726	if (criuFeat.MemTrack != nil) &&
727		(criuFeatures.MemTrack != nil) {
728		// The inner if checks if they are set to true
729		if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
730			missingFeatures = true
731			logrus.Debugf("CRIU does not support MemTrack")
732		}
733	}
734
735	// This needs to be repeated for every new feature check.
736	// Is there a way to put this in a function. Reflection?
737	if (criuFeat.LazyPages != nil) &&
738		(criuFeatures.LazyPages != nil) {
739		if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
740			missingFeatures = true
741			logrus.Debugf("CRIU does not support LazyPages")
742		}
743	}
744
745	if missingFeatures {
746		return errors.New("CRIU is missing features")
747	}
748
749	return nil
750}
751
// compareCriuVersion returns nil when criuVersion is at least minVersion and
// a descriptive error otherwise.
func compareCriuVersion(criuVersion int, minVersion int) error {
	if criuVersion >= minVersion {
		return nil
	}
	return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
}
760
761// checkCriuVersion checks Criu version greater than or equal to minVersion
762func (c *linuxContainer) checkCriuVersion(minVersion int) error {
763	// If the version of criu has already been determined there is no need
764	// to ask criu for the version again. Use the value from c.criuVersion.
765	if c.criuVersion != 0 {
766		return compareCriuVersion(c.criuVersion, minVersion)
767	}
768
769	criu := criu.MakeCriu()
770	criu.SetCriuPath(c.criuPath)
771	var err error
772	c.criuVersion, err = criu.GetCriuVersion()
773	if err != nil {
774		return fmt.Errorf("CRIU version check failed: %s", err)
775	}
776
777	return compareCriuVersion(c.criuVersion, minVersion)
778}
779
// descriptorsFilename is the name of a JSON file.
// NOTE(review): presumably it stores the container's external descriptors
// for checkpoint/restore — its users are outside this section; confirm.
const descriptorsFilename = "descriptors.json"
781
782func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
783	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
784	extMnt := &criurpc.ExtMountMap{
785		Key: proto.String(mountDest),
786		Val: proto.String(mountDest),
787	}
788	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
789}
790
791func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
792	for _, path := range c.config.MaskPaths {
793		fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
794		if err != nil {
795			if os.IsNotExist(err) {
796				continue
797			}
798			return err
799		}
800		if fi.IsDir() {
801			continue
802		}
803
804		extMnt := &criurpc.ExtMountMap{
805			Key: proto.String(path),
806			Val: proto.String("/dev/null"),
807		}
808		req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
809	}
810	return nil
811}
812
813func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
814	// CRIU will evaluate a configuration starting with release 3.11.
815	// Settings in the configuration file will overwrite RPC settings.
816	// Look for annotations. The annotation 'org.criu.config'
817	// specifies if CRIU should use a different, container specific
818	// configuration file.
819	_, annotations := utils.Annotations(c.config.Labels)
820	configFile, exists := annotations["org.criu.config"]
821	if exists {
822		// If the annotation 'org.criu.config' exists and is set
823		// to a non-empty string, tell CRIU to use that as a
824		// configuration file. If the file does not exist, CRIU
825		// will just ignore it.
826		if configFile != "" {
827			rpcOpts.ConfigFile = proto.String(configFile)
828		}
829		// If 'org.criu.config' exists and is set to an empty
830		// string, a runc specific CRIU configuration file will
831		// be not set at all.
832	} else {
833		// If the mentioned annotation has not been found, specify
834		// a default CRIU configuration file.
835		rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
836	}
837}
838
839func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
840	var minVersion int
841	switch t {
842	case configs.NEWNET:
843		// CRIU supports different external namespace with different released CRIU versions.
844		// For network namespaces to work we need at least criu 3.11.0 => 31100.
845		minVersion = 31100
846	case configs.NEWPID:
847		// For PID namespaces criu 31500 is needed.
848		minVersion = 31500
849	default:
850		return false
851	}
852	return c.checkCriuVersion(minVersion) == nil
853}
854
855func criuNsToKey(t configs.NamespaceType) string {
856	return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
857}
858
859func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
860	if !c.criuSupportsExtNS(t) {
861		return nil
862	}
863
864	nsPath := c.config.Namespaces.PathOf(t)
865	if nsPath == "" {
866		return nil
867	}
868	// CRIU expects the information about an external namespace
869	// like this: --external <TYPE>[<inode>]:<key>
870	// This <key> is always 'extRoot<TYPE>NS'.
871	var ns unix.Stat_t
872	if err := unix.Stat(nsPath, &ns); err != nil {
873		return err
874	}
875	criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
876	rpcOpts.External = append(rpcOpts.External, criuExternal)
877
878	return nil
879}
880
881func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
882	for _, ns := range c.config.Namespaces {
883		switch ns.Type {
884		case configs.NEWNET, configs.NEWPID:
885			// If the container is running in a network or PID namespace and has
886			// a path to the network or PID namespace configured, we will dump
887			// that network or PID namespace as an external namespace and we
888			// will expect that the namespace exists during restore.
889			// This basically means that CRIU will ignore the namespace
890			// and expect it to be setup correctly.
891			if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
892				return err
893			}
894		default:
895			// For all other namespaces except NET and PID CRIU has
896			// a simpler way of joining the existing namespace if set
897			nsPath := c.config.Namespaces.PathOf(ns.Type)
898			if nsPath == "" {
899				continue
900			}
901			if ns.Type == configs.NEWCGROUP {
902				// CRIU has no code to handle NEWCGROUP
903				return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
904			}
905			// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
906
907			// CRIU will issue a warning for NEWUSER:
908			// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
909			rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
910				Ns:     proto.String(configs.NsName(ns.Type)),
911				NsFile: proto.String(nsPath),
912			})
913		}
914	}
915
916	return nil
917}
918
919func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
920	if !c.criuSupportsExtNS(t) {
921		return nil
922	}
923
924	nsPath := c.config.Namespaces.PathOf(t)
925	if nsPath == "" {
926		return nil
927	}
928	// CRIU wants the information about an existing namespace
929	// like this: --inherit-fd fd[<fd>]:<key>
930	// The <key> needs to be the same as during checkpointing.
931	// We are always using 'extRoot<TYPE>NS' as the key in this.
932	nsFd, err := os.Open(nsPath)
933	if err != nil {
934		logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
935		return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
936	}
937	inheritFd := &criurpc.InheritFd{
938		Key: proto.String(criuNsToKey(t)),
939		// The offset of four is necessary because 0, 1, 2 and 3 are
940		// already used by stdin, stdout, stderr, 'criu swrk' socket.
941		Fd: proto.Int32(int32(4 + len(*extraFiles))),
942	}
943	rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
944	// All open FDs need to be transferred to CRIU via extraFiles
945	*extraFiles = append(*extraFiles, nsFd)
946
947	return nil
948}
949
950func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
951	c.m.Lock()
952	defer c.m.Unlock()
953
954	// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
955	// (CLI prints a warning)
956	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
957	//               support for doing unprivileged dumps, but the setup of
958	//               rootless containers might make this complicated.
959
960	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
961	if err := c.checkCriuVersion(30000); err != nil {
962		return err
963	}
964
965	if criuOpts.ImagesDirectory == "" {
966		return errors.New("invalid directory to save checkpoint")
967	}
968
969	// Since a container can be C/R'ed multiple times,
970	// the checkpoint directory may already exist.
971	if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
972		return err
973	}
974
975	if criuOpts.WorkDirectory == "" {
976		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
977	}
978
979	if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
980		return err
981	}
982
983	workDir, err := os.Open(criuOpts.WorkDirectory)
984	if err != nil {
985		return err
986	}
987	defer workDir.Close()
988
989	imageDir, err := os.Open(criuOpts.ImagesDirectory)
990	if err != nil {
991		return err
992	}
993	defer imageDir.Close()
994
995	rpcOpts := criurpc.CriuOpts{
996		ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
997		WorkDirFd:       proto.Int32(int32(workDir.Fd())),
998		LogLevel:        proto.Int32(4),
999		LogFile:         proto.String("dump.log"),
1000		Root:            proto.String(c.config.Rootfs),
1001		ManageCgroups:   proto.Bool(true),
1002		NotifyScripts:   proto.Bool(true),
1003		Pid:             proto.Int32(int32(c.initProcess.pid())),
1004		ShellJob:        proto.Bool(criuOpts.ShellJob),
1005		LeaveRunning:    proto.Bool(criuOpts.LeaveRunning),
1006		TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
1007		ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
1008		FileLocks:       proto.Bool(criuOpts.FileLocks),
1009		EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
1010		OrphanPtsMaster: proto.Bool(true),
1011		AutoDedup:       proto.Bool(criuOpts.AutoDedup),
1012		LazyPages:       proto.Bool(criuOpts.LazyPages),
1013	}
1014
1015	c.handleCriuConfigurationFile(&rpcOpts)
1016
1017	// If the container is running in a network namespace and has
1018	// a path to the network namespace configured, we will dump
1019	// that network namespace as an external namespace and we
1020	// will expect that the namespace exists during restore.
1021	// This basically means that CRIU will ignore the namespace
1022	// and expect to be setup correctly.
1023	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
1024		return err
1025	}
1026
1027	// Same for possible external PID namespaces
1028	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
1029		return err
1030	}
1031
1032	// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
1033	// is not set, CRIU uses ptrace() to pause the processes.
1034	// Note cgroup v2 freezer is only supported since CRIU release 3.14.
1035	if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
1036		if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
1037			rpcOpts.FreezeCgroup = proto.String(fcg)
1038		}
1039	}
1040
1041	// append optional criu opts, e.g., page-server and port
1042	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
1043		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
1044			Address: proto.String(criuOpts.PageServer.Address),
1045			Port:    proto.Int32(criuOpts.PageServer.Port),
1046		}
1047	}
1048
1049	// pre-dump may need parentImage param to complete iterative migration
1050	if criuOpts.ParentImage != "" {
1051		rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
1052		rpcOpts.TrackMem = proto.Bool(true)
1053	}
1054
1055	// append optional manage cgroups mode
1056	if criuOpts.ManageCgroupsMode != 0 {
1057		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
1058		rpcOpts.ManageCgroupsMode = &mode
1059	}
1060
1061	var t criurpc.CriuReqType
1062	if criuOpts.PreDump {
1063		feat := criurpc.CriuFeatures{
1064			MemTrack: proto.Bool(true),
1065		}
1066
1067		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
1068			return err
1069		}
1070
1071		t = criurpc.CriuReqType_PRE_DUMP
1072	} else {
1073		t = criurpc.CriuReqType_DUMP
1074	}
1075
1076	if criuOpts.LazyPages {
1077		// lazy migration requested; check if criu supports it
1078		feat := criurpc.CriuFeatures{
1079			LazyPages: proto.Bool(true),
1080		}
1081		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
1082			return err
1083		}
1084
1085		if fd := criuOpts.StatusFd; fd != -1 {
1086			// check that the FD is valid
1087			flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
1088			if err != nil {
1089				return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
1090			}
1091			// and writable
1092			if flags&unix.O_WRONLY == 0 {
1093				return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
1094			}
1095
1096			if c.checkCriuVersion(31500) != nil {
1097				// For criu 3.15+, use notifications (see case "status-ready"
1098				// in criuNotifications). Otherwise, rely on criu status fd.
1099				rpcOpts.StatusFd = proto.Int32(int32(fd))
1100			}
1101		}
1102	}
1103
1104	req := &criurpc.CriuReq{
1105		Type: &t,
1106		Opts: &rpcOpts,
1107	}
1108
1109	// no need to dump all this in pre-dump
1110	if !criuOpts.PreDump {
1111		hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
1112		for _, m := range c.config.Mounts {
1113			switch m.Device {
1114			case "bind":
1115				c.addCriuDumpMount(req, m)
1116			case "cgroup":
1117				if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
1118					// real mount(s)
1119					continue
1120				}
1121				// a set of "external" bind mounts
1122				binds, err := getCgroupMounts(m)
1123				if err != nil {
1124					return err
1125				}
1126				for _, b := range binds {
1127					c.addCriuDumpMount(req, b)
1128				}
1129			}
1130		}
1131
1132		if err := c.addMaskPaths(req); err != nil {
1133			return err
1134		}
1135
1136		for _, node := range c.config.Devices {
1137			m := &configs.Mount{Destination: node.Path, Source: node.Path}
1138			c.addCriuDumpMount(req, m)
1139		}
1140
1141		// Write the FD info to a file in the image directory
1142		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
1143		if err != nil {
1144			return err
1145		}
1146
1147		err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
1148		if err != nil {
1149			return err
1150		}
1151	}
1152
1153	err = c.criuSwrk(nil, req, criuOpts, nil)
1154	if err != nil {
1155		return err
1156	}
1157	return nil
1158}
1159
1160func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
1161	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
1162	extMnt := &criurpc.ExtMountMap{
1163		Key: proto.String(mountDest),
1164		Val: proto.String(m.Source),
1165	}
1166	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
1167}
1168
1169func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
1170	for _, iface := range c.config.Networks {
1171		switch iface.Type {
1172		case "veth":
1173			veth := new(criurpc.CriuVethPair)
1174			veth.IfOut = proto.String(iface.HostInterfaceName)
1175			veth.IfIn = proto.String(iface.Name)
1176			req.Opts.Veths = append(req.Opts.Veths, veth)
1177		case "loopback":
1178			// Do nothing
1179		}
1180	}
1181	for _, i := range criuOpts.VethPairs {
1182		veth := new(criurpc.CriuVethPair)
1183		veth.IfOut = proto.String(i.HostInterfaceName)
1184		veth.IfIn = proto.String(i.ContainerInterfaceName)
1185		req.Opts.Veths = append(req.Opts.Veths, veth)
1186	}
1187}
1188
1189// makeCriuRestoreMountpoints makes the actual mountpoints for the
1190// restore using CRIU. This function is inspired from the code in
1191// rootfs_linux.go
1192func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
1193	switch m.Device {
1194	case "cgroup":
1195		// No mount point(s) need to be created:
1196		//
1197		// * for v1, mount points are saved by CRIU because
1198		//   /sys/fs/cgroup is a tmpfs mount
1199		//
1200		// * for v2, /sys/fs/cgroup is a real mount, but
1201		//   the mountpoint appears as soon as /sys is mounted
1202		return nil
1203	case "bind":
1204		// The prepareBindMount() function checks if source
1205		// exists. So it cannot be used for other filesystem types.
1206		if err := prepareBindMount(m, c.config.Rootfs); err != nil {
1207			return err
1208		}
1209	default:
1210		// for all other filesystems just create the mountpoints
1211		dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
1212		if err != nil {
1213			return err
1214		}
1215		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
1216			return err
1217		}
1218		if err := os.MkdirAll(dest, 0o755); err != nil {
1219			return err
1220		}
1221	}
1222	return nil
1223}
1224
// isPathInPrefixList reports whether path lies strictly below one of the
// directories in prefix, i.e. whether path starts with "<entry>/" for at
// least one entry. CRIU restore uses it to make sure mountpoints which are
// on a tmpfs are not created in the rootfs.
func isPathInPrefixList(path string, prefix []string) bool {
	found := false
	for i := 0; i < len(prefix) && !found; i++ {
		found = strings.HasPrefix(path, prefix[i]+"/")
	}
	return found
}
1235
1236// prepareCriuRestoreMounts tries to set up the rootfs of the
1237// container to be restored in the same way runc does it for
1238// initial container creation. Even for a read-only rootfs container
1239// runc modifies the rootfs to add mountpoints which do not exist.
1240// This function also creates missing mountpoints as long as they
1241// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
1242func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
1243	// First get a list of a all tmpfs mounts
1244	tmpfs := []string{}
1245	for _, m := range mounts {
1246		switch m.Device {
1247		case "tmpfs":
1248			tmpfs = append(tmpfs, m.Destination)
1249		}
1250	}
1251	// Now go through all mounts and create the mountpoints
1252	// if the mountpoints are not on a tmpfs, as CRIU will
1253	// restore the complete tmpfs content from its checkpoint.
1254	umounts := []string{}
1255	defer func() {
1256		for _, u := range umounts {
1257			_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
1258				if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
1259					if e != unix.EINVAL {
1260						// Ignore EINVAL as it means 'target is not a mount point.'
1261						// It probably has already been unmounted.
1262						logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
1263					}
1264				}
1265				return nil
1266			})
1267		}
1268	}()
1269	for _, m := range mounts {
1270		if !isPathInPrefixList(m.Destination, tmpfs) {
1271			if err := c.makeCriuRestoreMountpoints(m); err != nil {
1272				return err
1273			}
1274			// If the mount point is a bind mount, we need to mount
1275			// it now so that runc can create the necessary mount
1276			// points for mounts in bind mounts.
1277			// This also happens during initial container creation.
1278			// Without this CRIU restore will fail
1279			// See: https://github.com/opencontainers/runc/issues/2748
1280			// It is also not necessary to order the mount points
1281			// because during initial container creation mounts are
1282			// set up in the order they are configured.
1283			if m.Device == "bind" {
1284				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
1285					if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
1286						return errorsf.Wrapf(err, "unable to bind mount %q to %q (through %q)", m.Source, m.Destination, procfd)
1287					}
1288					return nil
1289				}); err != nil {
1290					return err
1291				}
1292				umounts = append(umounts, m.Destination)
1293			}
1294		}
1295	}
1296	return nil
1297}
1298
1299func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
1300	c.m.Lock()
1301	defer c.m.Unlock()
1302
1303	var extraFiles []*os.File
1304
1305	// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
1306	// (CLI prints a warning)
1307	// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
1308	//               support for unprivileged restore at the moment.
1309
1310	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
1311	if err := c.checkCriuVersion(30000); err != nil {
1312		return err
1313	}
1314	if criuOpts.WorkDirectory == "" {
1315		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
1316	}
1317	// Since a container can be C/R'ed multiple times,
1318	// the work directory may already exist.
1319	if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
1320		return err
1321	}
1322	workDir, err := os.Open(criuOpts.WorkDirectory)
1323	if err != nil {
1324		return err
1325	}
1326	defer workDir.Close()
1327	if criuOpts.ImagesDirectory == "" {
1328		return errors.New("invalid directory to restore checkpoint")
1329	}
1330	imageDir, err := os.Open(criuOpts.ImagesDirectory)
1331	if err != nil {
1332		return err
1333	}
1334	defer imageDir.Close()
1335	// CRIU has a few requirements for a root directory:
1336	// * it must be a mount point
1337	// * its parent must not be overmounted
1338	// c.config.Rootfs is bind-mounted to a temporary directory
1339	// to satisfy these requirements.
1340	root := filepath.Join(c.root, "criu-root")
1341	if err := os.Mkdir(root, 0o755); err != nil {
1342		return err
1343	}
1344	defer os.Remove(root)
1345	root, err = filepath.EvalSymlinks(root)
1346	if err != nil {
1347		return err
1348	}
1349	err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
1350	if err != nil {
1351		return err
1352	}
1353	defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
1354	t := criurpc.CriuReqType_RESTORE
1355	req := &criurpc.CriuReq{
1356		Type: &t,
1357		Opts: &criurpc.CriuOpts{
1358			ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
1359			WorkDirFd:       proto.Int32(int32(workDir.Fd())),
1360			EvasiveDevices:  proto.Bool(true),
1361			LogLevel:        proto.Int32(4),
1362			LogFile:         proto.String("restore.log"),
1363			RstSibling:      proto.Bool(true),
1364			Root:            proto.String(root),
1365			ManageCgroups:   proto.Bool(true),
1366			NotifyScripts:   proto.Bool(true),
1367			ShellJob:        proto.Bool(criuOpts.ShellJob),
1368			ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
1369			TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
1370			FileLocks:       proto.Bool(criuOpts.FileLocks),
1371			EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
1372			OrphanPtsMaster: proto.Bool(true),
1373			AutoDedup:       proto.Bool(criuOpts.AutoDedup),
1374			LazyPages:       proto.Bool(criuOpts.LazyPages),
1375		},
1376	}
1377
1378	if criuOpts.LsmProfile != "" {
1379		// CRIU older than 3.16 has a bug which breaks the possibility
1380		// to set a different LSM profile.
1381		if err := c.checkCriuVersion(31600); err != nil {
1382			return errors.New("--lsm-profile requires at least CRIU 3.16")
1383		}
1384		req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
1385	}
1386
1387	c.handleCriuConfigurationFile(req.Opts)
1388
1389	if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
1390		return err
1391	}
1392
1393	// This will modify the rootfs of the container in the same way runc
1394	// modifies the container during initial creation.
1395	if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
1396		return err
1397	}
1398
1399	hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
1400	for _, m := range c.config.Mounts {
1401		switch m.Device {
1402		case "bind":
1403			c.addCriuRestoreMount(req, m)
1404		case "cgroup":
1405			if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
1406				continue
1407			}
1408			// cgroup v1 is a set of bind mounts, unless cgroupns is used
1409			binds, err := getCgroupMounts(m)
1410			if err != nil {
1411				return err
1412			}
1413			for _, b := range binds {
1414				c.addCriuRestoreMount(req, b)
1415			}
1416		}
1417	}
1418
1419	if len(c.config.MaskPaths) > 0 {
1420		m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
1421		c.addCriuRestoreMount(req, m)
1422	}
1423
1424	for _, node := range c.config.Devices {
1425		m := &configs.Mount{Destination: node.Path, Source: node.Path}
1426		c.addCriuRestoreMount(req, m)
1427	}
1428
1429	if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
1430		c.restoreNetwork(req, criuOpts)
1431	}
1432
1433	// append optional manage cgroups mode
1434	if criuOpts.ManageCgroupsMode != 0 {
1435		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
1436		req.Opts.ManageCgroupsMode = &mode
1437	}
1438
1439	var (
1440		fds    []string
1441		fdJSON []byte
1442	)
1443	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
1444		return err
1445	}
1446
1447	if err := json.Unmarshal(fdJSON, &fds); err != nil {
1448		return err
1449	}
1450	for i := range fds {
1451		if s := fds[i]; strings.Contains(s, "pipe:") {
1452			inheritFd := new(criurpc.InheritFd)
1453			inheritFd.Key = proto.String(s)
1454			inheritFd.Fd = proto.Int32(int32(i))
1455			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
1456		}
1457	}
1458	err = c.criuSwrk(process, req, criuOpts, extraFiles)
1459
1460	// Now that CRIU is done let's close all opened FDs CRIU needed.
1461	for _, fd := range extraFiles {
1462		fd.Close()
1463	}
1464
1465	return err
1466}
1467
1468func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
1469	// need to apply cgroups only on restore
1470	if req.GetType() != criurpc.CriuReqType_RESTORE {
1471		return nil
1472	}
1473
1474	// XXX: Do we need to deal with this case? AFAIK criu still requires root.
1475	if err := c.cgroupManager.Apply(pid); err != nil {
1476		return err
1477	}
1478
1479	if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
1480		return newSystemError(err)
1481	}
1482
1483	if cgroups.IsCgroup2UnifiedMode() {
1484		return nil
1485	}
1486	// the stuff below is cgroupv1-specific
1487
1488	path := fmt.Sprintf("/proc/%d/cgroup", pid)
1489	cgroupsPaths, err := cgroups.ParseCgroupFile(path)
1490	if err != nil {
1491		return err
1492	}
1493
1494	for c, p := range cgroupsPaths {
1495		cgroupRoot := &criurpc.CgroupRoot{
1496			Ctrl: proto.String(c),
1497			Path: proto.String(p),
1498		}
1499		req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
1500	}
1501
1502	return nil
1503}
1504
1505func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
1506	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
1507	if err != nil {
1508		return err
1509	}
1510
1511	var logPath string
1512	if opts != nil {
1513		logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
1514	} else {
1515		// For the VERSION RPC 'opts' is set to 'nil' and therefore
1516		// opts.WorkDirectory does not exist. Set logPath to "".
1517		logPath = ""
1518	}
1519	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
1520	criuClientFileCon, err := net.FileConn(criuClient)
1521	criuClient.Close()
1522	if err != nil {
1523		return err
1524	}
1525
1526	criuClientCon := criuClientFileCon.(*net.UnixConn)
1527	defer criuClientCon.Close()
1528
1529	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
1530	defer criuServer.Close()
1531
1532	args := []string{"swrk", "3"}
1533	if c.criuVersion != 0 {
1534		// If the CRIU Version is still '0' then this is probably
1535		// the initial CRIU run to detect the version. Skip it.
1536		logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
1537	}
1538	cmd := exec.Command(c.criuPath, args...)
1539	if process != nil {
1540		cmd.Stdin = process.Stdin
1541		cmd.Stdout = process.Stdout
1542		cmd.Stderr = process.Stderr
1543	}
1544	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
1545	if extraFiles != nil {
1546		cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
1547	}
1548
1549	if err := cmd.Start(); err != nil {
1550		return err
1551	}
1552	// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
1553	criuServer.Close()
1554	// cmd.Process will be replaced by a restored init.
1555	criuProcess := cmd.Process
1556
1557	var criuProcessState *os.ProcessState
1558	defer func() {
1559		if criuProcessState == nil {
1560			criuClientCon.Close()
1561			_, err := criuProcess.Wait()
1562			if err != nil {
1563				logrus.Warnf("wait on criuProcess returned %v", err)
1564			}
1565		}
1566	}()
1567
1568	if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
1569		return err
1570	}
1571
1572	var extFds []string
1573	if process != nil {
1574		extFds, err = getPipeFds(criuProcess.Pid)
1575		if err != nil {
1576			return err
1577		}
1578	}
1579
1580	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
1581	// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
1582	// should be empty. For older CRIU versions it still will be
1583	// available but empty. criurpc.CriuReqType_VERSION actually
1584	// has no req.GetOpts().
1585	if logrus.GetLevel() >= logrus.DebugLevel &&
1586		!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
1587			req.GetType() == criurpc.CriuReqType_VERSION) {
1588
1589		val := reflect.ValueOf(req.GetOpts())
1590		v := reflect.Indirect(val)
1591		for i := 0; i < v.NumField(); i++ {
1592			st := v.Type()
1593			name := st.Field(i).Name
1594			if 'A' <= name[0] && name[0] <= 'Z' {
1595				value := val.MethodByName("Get" + name).Call([]reflect.Value{})
1596				logrus.Debugf("CRIU option %s with value %v", name, value[0])
1597			}
1598		}
1599	}
1600	data, err := proto.Marshal(req)
1601	if err != nil {
1602		return err
1603	}
1604	_, err = criuClientCon.Write(data)
1605	if err != nil {
1606		return err
1607	}
1608
1609	buf := make([]byte, 10*4096)
1610	oob := make([]byte, 4096)
1611	for {
1612		n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
1613		if req.Opts != nil && req.Opts.StatusFd != nil {
1614			// Close status_fd as soon as we got something back from criu,
1615			// assuming it has consumed (reopened) it by this time.
1616			// Otherwise it will might be left open forever and whoever
1617			// is waiting on it will wait forever.
1618			fd := int(*req.Opts.StatusFd)
1619			_ = unix.Close(fd)
1620			req.Opts.StatusFd = nil
1621		}
1622		if err != nil {
1623			return err
1624		}
1625		if n == 0 {
1626			return errors.New("unexpected EOF")
1627		}
1628		if n == len(buf) {
1629			return errors.New("buffer is too small")
1630		}
1631
1632		resp := new(criurpc.CriuResp)
1633		err = proto.Unmarshal(buf[:n], resp)
1634		if err != nil {
1635			return err
1636		}
1637		if !resp.GetSuccess() {
1638			typeString := req.GetType().String()
1639			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
1640		}
1641
1642		t := resp.GetType()
1643		switch {
1644		case t == criurpc.CriuReqType_FEATURE_CHECK:
1645			logrus.Debugf("Feature check says: %s", resp)
1646			criuFeatures = resp.GetFeatures()
1647		case t == criurpc.CriuReqType_NOTIFY:
1648			if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
1649				return err
1650			}
1651			t = criurpc.CriuReqType_NOTIFY
1652			req = &criurpc.CriuReq{
1653				Type:          &t,
1654				NotifySuccess: proto.Bool(true),
1655			}
1656			data, err = proto.Marshal(req)
1657			if err != nil {
1658				return err
1659			}
1660			_, err = criuClientCon.Write(data)
1661			if err != nil {
1662				return err
1663			}
1664			continue
1665		case t == criurpc.CriuReqType_RESTORE:
1666		case t == criurpc.CriuReqType_DUMP:
1667		case t == criurpc.CriuReqType_PRE_DUMP:
1668		default:
1669			return fmt.Errorf("unable to parse the response %s", resp.String())
1670		}
1671
1672		break
1673	}
1674
1675	_ = criuClientCon.CloseWrite()
1676	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
1677	// Here we want to wait only the CRIU process.
1678	criuProcessState, err = criuProcess.Wait()
1679	if err != nil {
1680		return err
1681	}
1682
1683	// In pre-dump mode CRIU is in a loop and waits for
1684	// the final DUMP command.
1685	// The current runc pre-dump approach, however, is
1686	// start criu in PRE_DUMP once for a single pre-dump
1687	// and not the whole series of pre-dump, pre-dump, ...m, dump
1688	// If we got the message CriuReqType_PRE_DUMP it means
1689	// CRIU was successful and we need to forcefully stop CRIU
1690	if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
1691		return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
1692	}
1693	return nil
1694}
1695
1696// block any external network activity
1697func lockNetwork(config *configs.Config) error {
1698	for _, config := range config.Networks {
1699		strategy, err := getStrategy(config.Type)
1700		if err != nil {
1701			return err
1702		}
1703
1704		if err := strategy.detach(config); err != nil {
1705			return err
1706		}
1707	}
1708	return nil
1709}
1710
1711func unlockNetwork(config *configs.Config) error {
1712	for _, config := range config.Networks {
1713		strategy, err := getStrategy(config.Type)
1714		if err != nil {
1715			return err
1716		}
1717		if err = strategy.attach(config); err != nil {
1718			return err
1719		}
1720	}
1721	return nil
1722}
1723
1724func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
1725	notify := resp.GetNotify()
1726	if notify == nil {
1727		return fmt.Errorf("invalid response: %s", resp.String())
1728	}
1729	script := notify.GetScript()
1730	logrus.Debugf("notify: %s\n", script)
1731	switch script {
1732	case "post-dump":
1733		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
1734		if err != nil {
1735			return err
1736		}
1737		f.Close()
1738	case "network-unlock":
1739		if err := unlockNetwork(c.config); err != nil {
1740			return err
1741		}
1742	case "network-lock":
1743		if err := lockNetwork(c.config); err != nil {
1744			return err
1745		}
1746	case "setup-namespaces":
1747		if c.config.Hooks != nil {
1748			s, err := c.currentOCIState()
1749			if err != nil {
1750				return nil
1751			}
1752			s.Pid = int(notify.GetPid())
1753
1754			if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
1755				return err
1756			}
1757			if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
1758				return err
1759			}
1760		}
1761	case "post-restore":
1762		pid := notify.GetPid()
1763
1764		p, err := os.FindProcess(int(pid))
1765		if err != nil {
1766			return err
1767		}
1768		cmd.Process = p
1769
1770		r, err := newRestoredProcess(cmd, fds)
1771		if err != nil {
1772			return err
1773		}
1774		process.ops = r
1775		if err := c.state.transition(&restoredState{
1776			imageDir: opts.ImagesDirectory,
1777			c:        c,
1778		}); err != nil {
1779			return err
1780		}
1781		// create a timestamp indicating when the restored checkpoint was started
1782		c.created = time.Now().UTC()
1783		if _, err := c.updateState(r); err != nil {
1784			return err
1785		}
1786		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
1787			if !os.IsNotExist(err) {
1788				logrus.Error(err)
1789			}
1790		}
1791	case "orphan-pts-master":
1792		scm, err := unix.ParseSocketControlMessage(oob)
1793		if err != nil {
1794			return err
1795		}
1796		fds, err := unix.ParseUnixRights(&scm[0])
1797		if err != nil {
1798			return err
1799		}
1800
1801		master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
1802		defer master.Close()
1803
1804		// While we can access console.master, using the API is a good idea.
1805		if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
1806			return err
1807		}
1808	case "status-ready":
1809		if opts.StatusFd != -1 {
1810			// write \0 to status fd to notify that lazy page server is ready
1811			_, err := unix.Write(opts.StatusFd, []byte{0})
1812			if err != nil {
1813				logrus.Warnf("can't write \\0 to status fd: %v", err)
1814			}
1815			_ = unix.Close(opts.StatusFd)
1816			opts.StatusFd = -1
1817		}
1818	}
1819	return nil
1820}
1821
1822func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
1823	if process != nil {
1824		c.initProcess = process
1825	}
1826	state, err := c.currentState()
1827	if err != nil {
1828		return nil, err
1829	}
1830	err = c.saveState(state)
1831	if err != nil {
1832		return nil, err
1833	}
1834	return state, nil
1835}
1836
1837func (c *linuxContainer) saveState(s *State) (retErr error) {
1838	tmpFile, err := ioutil.TempFile(c.root, "state-")
1839	if err != nil {
1840		return err
1841	}
1842
1843	defer func() {
1844		if retErr != nil {
1845			tmpFile.Close()
1846			os.Remove(tmpFile.Name())
1847		}
1848	}()
1849
1850	err = utils.WriteJSON(tmpFile, s)
1851	if err != nil {
1852		return err
1853	}
1854	err = tmpFile.Close()
1855	if err != nil {
1856		return err
1857	}
1858
1859	stateFilePath := filepath.Join(c.root, stateFilename)
1860	return os.Rename(tmpFile.Name(), stateFilePath)
1861}
1862
1863func (c *linuxContainer) currentStatus() (Status, error) {
1864	if err := c.refreshState(); err != nil {
1865		return -1, err
1866	}
1867	return c.state.status(), nil
1868}
1869
1870// refreshState needs to be called to verify that the current state on the
1871// container is what is true.  Because consumers of libcontainer can use it
1872// out of process we need to verify the container's status based on runtime
1873// information and not rely on our in process info.
1874func (c *linuxContainer) refreshState() error {
1875	paused, err := c.isPaused()
1876	if err != nil {
1877		return err
1878	}
1879	if paused {
1880		return c.state.transition(&pausedState{c: c})
1881	}
1882	t := c.runType()
1883	switch t {
1884	case Created:
1885		return c.state.transition(&createdState{c: c})
1886	case Running:
1887		return c.state.transition(&runningState{c: c})
1888	}
1889	return c.state.transition(&stoppedState{c: c})
1890}
1891
1892func (c *linuxContainer) runType() Status {
1893	if c.initProcess == nil {
1894		return Stopped
1895	}
1896	pid := c.initProcess.pid()
1897	stat, err := system.Stat(pid)
1898	if err != nil {
1899		return Stopped
1900	}
1901	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
1902		return Stopped
1903	}
1904	// We'll create exec fifo and blocking on it after container is created,
1905	// and delete it after start container.
1906	if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
1907		return Created
1908	}
1909	return Running
1910}
1911
1912func (c *linuxContainer) isPaused() (bool, error) {
1913	state, err := c.cgroupManager.GetFreezerState()
1914	if err != nil {
1915		return false, err
1916	}
1917	return state == configs.Frozen, nil
1918}
1919
1920func (c *linuxContainer) currentState() (*State, error) {
1921	var (
1922		startTime           uint64
1923		externalDescriptors []string
1924		pid                 = -1
1925	)
1926	if c.initProcess != nil {
1927		pid = c.initProcess.pid()
1928		startTime, _ = c.initProcess.startTime()
1929		externalDescriptors = c.initProcess.externalDescriptors()
1930	}
1931	intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
1932	if err != nil {
1933		intelRdtPath = ""
1934	}
1935	state := &State{
1936		BaseState: BaseState{
1937			ID:                   c.ID(),
1938			Config:               *c.config,
1939			InitProcessPid:       pid,
1940			InitProcessStartTime: startTime,
1941			Created:              c.created,
1942		},
1943		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
1944		CgroupPaths:         c.cgroupManager.GetPaths(),
1945		IntelRdtPath:        intelRdtPath,
1946		NamespacePaths:      make(map[configs.NamespaceType]string),
1947		ExternalDescriptors: externalDescriptors,
1948	}
1949	if pid > 0 {
1950		for _, ns := range c.config.Namespaces {
1951			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
1952		}
1953		for _, nsType := range configs.NamespaceTypes() {
1954			if !configs.IsNamespaceSupported(nsType) {
1955				continue
1956			}
1957			if _, ok := state.NamespacePaths[nsType]; !ok {
1958				ns := configs.Namespace{Type: nsType}
1959				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
1960			}
1961		}
1962	}
1963	return state, nil
1964}
1965
1966func (c *linuxContainer) currentOCIState() (*specs.State, error) {
1967	bundle, annotations := utils.Annotations(c.config.Labels)
1968	state := &specs.State{
1969		Version:     specs.Version,
1970		ID:          c.ID(),
1971		Bundle:      bundle,
1972		Annotations: annotations,
1973	}
1974	status, err := c.currentStatus()
1975	if err != nil {
1976		return nil, err
1977	}
1978	state.Status = specs.ContainerState(status.String())
1979	if status != Stopped {
1980		if c.initProcess != nil {
1981			state.Pid = c.initProcess.pid()
1982		}
1983	}
1984	return state, nil
1985}
1986
1987// orderNamespacePaths sorts namespace paths into a list of paths that we
1988// can setns in order.
1989func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
1990	paths := []string{}
1991	for _, ns := range configs.NamespaceTypes() {
1992
1993		// Remove namespaces that we don't need to join.
1994		if !c.config.Namespaces.Contains(ns) {
1995			continue
1996		}
1997
1998		if p, ok := namespaces[ns]; ok && p != "" {
1999			// check if the requested namespace is supported
2000			if !configs.IsNamespaceSupported(ns) {
2001				return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
2002			}
2003			// only set to join this namespace if it exists
2004			if _, err := os.Lstat(p); err != nil {
2005				return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
2006			}
2007			// do not allow namespace path with comma as we use it to separate
2008			// the namespace paths
2009			if strings.ContainsRune(p, ',') {
2010				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
2011			}
2012			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
2013		}
2014
2015	}
2016
2017	return paths, nil
2018}
2019
2020func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
2021	data := bytes.NewBuffer(nil)
2022	for _, im := range idMap {
2023		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
2024		if _, err := data.WriteString(line); err != nil {
2025			return nil, err
2026		}
2027	}
2028	return data.Bytes(), nil
2029}
2030
2031// bootstrapData encodes the necessary data in netlink binary format
2032// as a io.Reader.
2033// Consumer can write the data to a bootstrap program
2034// such as one that uses nsenter package to bootstrap the container's
2035// init process correctly, i.e. with correct namespaces, uid/gid
2036// mapping etc.
2037func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
2038	// create the netlink message
2039	r := nl.NewNetlinkRequest(int(InitMsg), 0)
2040
2041	// write cloneFlags
2042	r.AddData(&Int32msg{
2043		Type:  CloneFlagsAttr,
2044		Value: uint32(cloneFlags),
2045	})
2046
2047	// write custom namespace paths
2048	if len(nsMaps) > 0 {
2049		nsPaths, err := c.orderNamespacePaths(nsMaps)
2050		if err != nil {
2051			return nil, err
2052		}
2053		r.AddData(&Bytemsg{
2054			Type:  NsPathsAttr,
2055			Value: []byte(strings.Join(nsPaths, ",")),
2056		})
2057	}
2058
2059	// write namespace paths only when we are not joining an existing user ns
2060	_, joinExistingUser := nsMaps[configs.NEWUSER]
2061	if !joinExistingUser {
2062		// write uid mappings
2063		if len(c.config.UidMappings) > 0 {
2064			if c.config.RootlessEUID && c.newuidmapPath != "" {
2065				r.AddData(&Bytemsg{
2066					Type:  UidmapPathAttr,
2067					Value: []byte(c.newuidmapPath),
2068				})
2069			}
2070			b, err := encodeIDMapping(c.config.UidMappings)
2071			if err != nil {
2072				return nil, err
2073			}
2074			r.AddData(&Bytemsg{
2075				Type:  UidmapAttr,
2076				Value: b,
2077			})
2078		}
2079
2080		// write gid mappings
2081		if len(c.config.GidMappings) > 0 {
2082			b, err := encodeIDMapping(c.config.GidMappings)
2083			if err != nil {
2084				return nil, err
2085			}
2086			r.AddData(&Bytemsg{
2087				Type:  GidmapAttr,
2088				Value: b,
2089			})
2090			if c.config.RootlessEUID && c.newgidmapPath != "" {
2091				r.AddData(&Bytemsg{
2092					Type:  GidmapPathAttr,
2093					Value: []byte(c.newgidmapPath),
2094				})
2095			}
2096			if requiresRootOrMappingTool(c.config) {
2097				r.AddData(&Boolmsg{
2098					Type:  SetgroupAttr,
2099					Value: true,
2100				})
2101			}
2102		}
2103	}
2104
2105	if c.config.OomScoreAdj != nil {
2106		// write oom_score_adj
2107		r.AddData(&Bytemsg{
2108			Type:  OomScoreAdjAttr,
2109			Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
2110		})
2111	}
2112
2113	// write rootless
2114	r.AddData(&Boolmsg{
2115		Type:  RootlessEUIDAttr,
2116		Value: c.config.RootlessEUID,
2117	})
2118
2119	return bytes.NewReader(r.Serialize()), nil
2120}
2121
// ignoreTerminateErrors filters out errors that merely indicate the process
// has already been terminated. It returns nil when err is nil, when err is
// (or wraps) an *exec.ExitError, or when its message contains one of the
// known "already finished" texts; any other error is returned unaltered.
func ignoreTerminateErrors(err error) error {
	if err == nil {
		return nil
	}

	// terminate() might return an error from either Kill or Wait.
	// The (*Cmd).Wait documentation says: "If the command fails to run
	// or doesn't complete successfully, the error is of type *ExitError".
	// Such errors (like "exit status 1" or "signal: killed") are dropped.
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		return nil
	}

	// TODO: use errors.Is(err, os.ErrProcessDone) here and
	// remove the "process already finished" string comparison below
	// once go 1.16 is the minimally supported version.
	msg := err.Error()
	for _, benign := range []string{
		"process already finished",
		"Wait was already called",
	} {
		if strings.Contains(msg, benign) {
			return nil
		}
	}
	return err
}
2148
2149func requiresRootOrMappingTool(c *configs.Config) bool {
2150	gidMap := []configs.IDMap{
2151		{ContainerID: 0, HostID: os.Getegid(), Size: 1},
2152	}
2153	return !reflect.DeepEqual(c.GidMappings, gidMap)
2154}
2155