1// +build linux
2
3package libcontainer
4
5import (
6	"bytes"
7	"encoding/json"
8	"fmt"
9	"io"
10	"io/ioutil"
11	"net"
12	"os"
13	"strings"
14	"unsafe"
15
16	"github.com/containerd/console"
17	"github.com/opencontainers/runc/libcontainer/capabilities"
18	"github.com/opencontainers/runc/libcontainer/cgroups"
19	"github.com/opencontainers/runc/libcontainer/configs"
20	"github.com/opencontainers/runc/libcontainer/system"
21	"github.com/opencontainers/runc/libcontainer/user"
22	"github.com/opencontainers/runc/libcontainer/utils"
23	"github.com/opencontainers/runtime-spec/specs-go"
24	"github.com/pkg/errors"
25	"github.com/sirupsen/logrus"
26	"github.com/vishvananda/netlink"
27	"golang.org/x/sys/unix"
28)
29
// initType names the kind of init implementation that newContainerInit
// should construct.
type initType string

// Known init types: newContainerInit returns a *linuxSetnsInit for initSetns
// and a *linuxStandardInit for initStandard; any other value is rejected with
// an "unknown init type" error.
const (
	initSetns    initType = "setns"
	initStandard initType = "standard"
)
36
// pid is a JSON-serializable pair of process IDs, encoded under the keys
// "pid" and "pid_first".
// NOTE(review): nothing in this part of the file produces or consumes it;
// presumably it is exchanged with the parent over the init pipe — confirm
// against the callers.
type pid struct {
	Pid           int `json:"pid"`
	PidFirstChild int `json:"pid_first"`
}
41
// network is an internal struct used to set up container networks. It embeds
// the user-facing configs.Network and adds state that is only meaningful while
// the container is being initialized. Values of this type are handed to the
// per-type network strategy by setupNetwork.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}
50
// initConfig is used for transferring parameters from Exec() to Init().
// newContainerInit decodes it from JSON read off the init pipe.
//
// How the fields are used in this file:
//   - Env is exported into the current process environment by
//     newContainerInit (each entry must have the form NAME=VALUE).
//   - Capabilities, when non-nil, is used by finalizeNamespace in preference
//     to Config.Capabilities.
//   - Cwd, when non-empty, is the directory finalizeNamespace changes into
//     after the user has been switched.
//   - User and AdditionalGroups are resolved by setupUser; AdditionalGroups
//     is rejected when RootlessEUID is set.
//   - Config supplies the uid/gid mapping checks in setupUser and the
//     fallback capability set in finalizeNamespace.
//   - Networks is iterated by setupNetwork.
//   - PassedFilesCount plus 3 is the first file descriptor that
//     finalizeNamespace marks close-on-exec.
//   - ConsoleWidth and ConsoleHeight, when both are non-zero, are applied to
//     the new pty by setupConsole.
//
// NOTE(review): the remaining fields are not read in this part of the file;
// presumably they are consumed by the init implementations — confirm there.
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerId      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
	SpecState        *specs.State          `json:"spec_state,omitempty"`
}
74
// initer is the common interface of the init implementations returned by
// newContainerInit (*linuxSetnsInit and *linuxStandardInit).
type initer interface {
	// Init runs the implementation's initialization sequence and reports any
	// failure. NOTE(review): the implementations live outside this part of
	// the file; presumably a successful Init ends by exec'ing the container
	// process — confirm there.
	Init() error
}
78
79func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
80	var config *initConfig
81	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
82		return nil, err
83	}
84	if err := populateProcessEnvironment(config.Env); err != nil {
85		return nil, err
86	}
87	switch t {
88	case initSetns:
89		return &linuxSetnsInit{
90			pipe:          pipe,
91			consoleSocket: consoleSocket,
92			config:        config,
93		}, nil
94	case initStandard:
95		return &linuxStandardInit{
96			pipe:          pipe,
97			consoleSocket: consoleSocket,
98			parentPid:     unix.Getppid(),
99			config:        config,
100			fifoFd:        fifoFd,
101		}, nil
102	}
103	return nil, fmt.Errorf("unknown init type %q", t)
104}
105
// populateProcessEnvironment exports every NAME=VALUE entry of env into the
// current process's environment. Only the first '=' separates the name from
// the value, so values may themselves contain '='. Processing stops at the
// first entry that has no '=' or that os.Setenv rejects; entries before it
// stay applied.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			return fmt.Errorf("invalid environment '%v'", entry)
		}
		name, value := entry[:sep], entry[sep+1:]
		if err := os.Setenv(name, value); err != nil {
			return err
		}
	}
	return nil
}
120
121// finalizeNamespace drops the caps, sets the correct user
122// and working dir, and closes any leaked file descriptors
123// before executing the command inside the namespace
124func finalizeNamespace(config *initConfig) error {
125	// Ensure that all unwanted fds we may have accidentally
126	// inherited are marked close-on-exec so they stay out of the
127	// container
128	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
129		return errors.Wrap(err, "close exec fds")
130	}
131
132	caps := &configs.Capabilities{}
133	if config.Capabilities != nil {
134		caps = config.Capabilities
135	} else if config.Config.Capabilities != nil {
136		caps = config.Config.Capabilities
137	}
138	w, err := capabilities.New(caps)
139	if err != nil {
140		return err
141	}
142	// drop capabilities in bounding set before changing user
143	if err := w.ApplyBoundingSet(); err != nil {
144		return errors.Wrap(err, "apply bounding set")
145	}
146	// preserve existing capabilities while we change users
147	if err := system.SetKeepCaps(); err != nil {
148		return errors.Wrap(err, "set keep caps")
149	}
150	if err := setupUser(config); err != nil {
151		return errors.Wrap(err, "setup user")
152	}
153	// Change working directory AFTER the user has been set up.
154	// Otherwise, if the cwd is also a volume that's been chowned to the container user (and not the user running runc),
155	// this command will EPERM.
156	if config.Cwd != "" {
157		if err := unix.Chdir(config.Cwd); err != nil {
158			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
159		}
160	}
161	if err := system.ClearKeepCaps(); err != nil {
162		return errors.Wrap(err, "clear keep caps")
163	}
164	if err := w.ApplyCaps(); err != nil {
165		return errors.Wrap(err, "apply caps")
166	}
167	return nil
168}
169
170// setupConsole sets up the console from inside the container, and sends the
171// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
172// consoles are scoped to a container properly (see runc#814 and the many
173// issues related to that). This has to be run *after* we've pivoted to the new
174// rootfs (and the users' configuration is entirely set up).
175func setupConsole(socket *os.File, config *initConfig, mount bool) error {
176	defer socket.Close()
177	// At this point, /dev/ptmx points to something that we would expect. We
178	// used to change the owner of the slave path, but since the /dev/pts mount
179	// can have gid=X set (at the users' option). So touching the owner of the
180	// slave PTY is not necessary, as the kernel will handle that for us. Note
181	// however, that setupUser (specifically fixStdioPermissions) *will* change
182	// the UID owner of the console to be the user the process will run as (so
183	// they can actually control their console).
184
185	pty, slavePath, err := console.NewPty()
186	if err != nil {
187		return err
188	}
189
190	// After we return from here, we don't need the console anymore.
191	defer pty.Close()
192
193	if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
194		err = pty.Resize(console.WinSize{
195			Height: config.ConsoleHeight,
196			Width:  config.ConsoleWidth,
197		})
198
199		if err != nil {
200			return err
201		}
202	}
203
204	// Mount the console inside our rootfs.
205	if mount {
206		if err := mountConsole(slavePath); err != nil {
207			return err
208		}
209	}
210	// While we can access console.master, using the API is a good idea.
211	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
212		return err
213	}
214	// Now, dup over all the things.
215	return dupStdio(slavePath)
216}
217
218// syncParentReady sends to the given pipe a JSON payload which indicates that
219// the init is ready to Exec the child process. It then waits for the parent to
220// indicate that it is cleared to Exec.
221func syncParentReady(pipe io.ReadWriter) error {
222	// Tell parent.
223	if err := writeSync(pipe, procReady); err != nil {
224		return err
225	}
226
227	// Wait for parent to give the all-clear.
228	return readSync(pipe, procRun)
229}
230
231// syncParentHooks sends to the given pipe a JSON payload which indicates that
232// the parent should execute pre-start hooks. It then waits for the parent to
233// indicate that it is cleared to resume.
234func syncParentHooks(pipe io.ReadWriter) error {
235	// Tell parent.
236	if err := writeSync(pipe, procHooks); err != nil {
237		return err
238	}
239
240	// Wait for parent to give the all-clear.
241	return readSync(pipe, procResume)
242}
243
244// setupUser changes the groups, gid, and uid for the user inside the container
245func setupUser(config *initConfig) error {
246	// Set up defaults.
247	defaultExecUser := user.ExecUser{
248		Uid:  0,
249		Gid:  0,
250		Home: "/",
251	}
252
253	passwdPath, err := user.GetPasswdPath()
254	if err != nil {
255		return err
256	}
257
258	groupPath, err := user.GetGroupPath()
259	if err != nil {
260		return err
261	}
262
263	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
264	if err != nil {
265		return err
266	}
267
268	var addGroups []int
269	if len(config.AdditionalGroups) > 0 {
270		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
271		if err != nil {
272			return err
273		}
274	}
275
276	// Rather than just erroring out later in setuid(2) and setgid(2), check
277	// that the user is mapped here.
278	if _, err := config.Config.HostUID(execUser.Uid); err != nil {
279		return errors.New("cannot set uid to unmapped user in user namespace")
280	}
281	if _, err := config.Config.HostGID(execUser.Gid); err != nil {
282		return errors.New("cannot set gid to unmapped user in user namespace")
283	}
284
285	if config.RootlessEUID {
286		// We cannot set any additional groups in a rootless container and thus
287		// we bail if the user asked us to do so. TODO: We currently can't do
288		// this check earlier, but if libcontainer.Process.User was typesafe
289		// this might work.
290		if len(addGroups) > 0 {
291			return errors.New("cannot set any additional groups in a rootless container")
292		}
293	}
294
295	// Before we change to the container's user make sure that the processes
296	// STDIO is correctly owned by the user that we are switching to.
297	if err := fixStdioPermissions(config, execUser); err != nil {
298		return err
299	}
300
301	setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
302	if err != nil && !os.IsNotExist(err) {
303		return err
304	}
305
306	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
307	// There's nothing we can do about /etc/group entries, so we silently
308	// ignore setting groups here (since the user didn't explicitly ask us to
309	// set the group).
310	allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
311
312	if allowSupGroups {
313		suppGroups := append(execUser.Sgids, addGroups...)
314		if err := unix.Setgroups(suppGroups); err != nil {
315			return err
316		}
317	}
318
319	if err := system.Setgid(execUser.Gid); err != nil {
320		return err
321	}
322	if err := system.Setuid(execUser.Uid); err != nil {
323		return err
324	}
325
326	// if we didn't get HOME already, set it based on the user's HOME
327	if envHome := os.Getenv("HOME"); envHome == "" {
328		if err := os.Setenv("HOME", execUser.Home); err != nil {
329			return err
330		}
331	}
332	return nil
333}
334
335// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
336// The ownership needs to match because it is created outside of the container and needs to be
337// localized.
338func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
339	var null unix.Stat_t
340	if err := unix.Stat("/dev/null", &null); err != nil {
341		return err
342	}
343	for _, fd := range []uintptr{
344		os.Stdin.Fd(),
345		os.Stderr.Fd(),
346		os.Stdout.Fd(),
347	} {
348		var s unix.Stat_t
349		if err := unix.Fstat(int(fd), &s); err != nil {
350			return err
351		}
352
353		// Skip chown of /dev/null if it was used as one of the STDIO fds.
354		if s.Rdev == null.Rdev {
355			continue
356		}
357
358		// We only change the uid owner (as it is possible for the mount to
359		// prefer a different gid, and there's no reason for us to change it).
360		// The reason why we don't just leave the default uid=X mount setup is
361		// that users expect to be able to actually use their console. Without
362		// this code, you couldn't effectively run as a non-root user inside a
363		// container and also have a console set up.
364		if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
365			// If we've hit an EINVAL then s.Gid isn't mapped in the user
366			// namespace. If we've hit an EPERM then the inode's current owner
367			// is not mapped in our user namespace (in particular,
368			// privileged_wrt_inode_uidgid() has failed). In either case, we
369			// are in a configuration where it's better for us to just not
370			// touch the stdio rather than bail at this point.
371			if err == unix.EINVAL || err == unix.EPERM {
372				continue
373			}
374			return err
375		}
376	}
377	return nil
378}
379
380// setupNetwork sets up and initializes any network interface inside the container.
381func setupNetwork(config *initConfig) error {
382	for _, config := range config.Networks {
383		strategy, err := getStrategy(config.Type)
384		if err != nil {
385			return err
386		}
387		if err := strategy.initialize(config); err != nil {
388			return err
389		}
390	}
391	return nil
392}
393
394func setupRoute(config *configs.Config) error {
395	for _, config := range config.Routes {
396		_, dst, err := net.ParseCIDR(config.Destination)
397		if err != nil {
398			return err
399		}
400		src := net.ParseIP(config.Source)
401		if src == nil {
402			return fmt.Errorf("Invalid source for route: %s", config.Source)
403		}
404		gw := net.ParseIP(config.Gateway)
405		if gw == nil {
406			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
407		}
408		l, err := netlink.LinkByName(config.InterfaceName)
409		if err != nil {
410			return err
411		}
412		route := &netlink.Route{
413			Scope:     netlink.SCOPE_UNIVERSE,
414			Dst:       dst,
415			Src:       src,
416			Gw:        gw,
417			LinkIndex: l.Attrs().Index,
418		}
419		if err := netlink.RouteAdd(route); err != nil {
420			return err
421		}
422	}
423	return nil
424}
425
426func setupRlimits(limits []configs.Rlimit, pid int) error {
427	for _, rlimit := range limits {
428		if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
429			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
430		}
431	}
432	return nil
433}
434
// _P_PID is the idtype argument that isWaitable passes to waitid(2) together
// with a process ID, i.e. it selects a child by pid.
const _P_PID = 1
436
const (
	// siginfoSize is the size in bytes of the kernel's siginfo_t.
	siginfoSize = 128
	// siginfoWordSize is the native word size, which is also the alignment of
	// the union that follows the three-int header of siginfo_t.
	siginfoWordSize = unsafe.Sizeof(uintptr(0))
	// siginfoUnionOffset is the offset of that union: the 12-byte header
	// rounded up to the word size (12 on 32-bit targets, 16 on 64-bit ones).
	siginfoUnionOffset = (12 + siginfoWordSize - 1) &^ (siginfoWordSize - 1)
	// siginfoPadSize is what remains of siginfo_t after the header, the
	// alignment gap and si_pid.
	siginfoPadSize = siginfoSize - siginfoUnionOffset - 4
)

// siginfo mirrors the part of the kernel's siginfo_t that waitid(2) fills in
// and that this package reads.
//
// The layout must match the kernel's exactly. On 64-bit targets the kernel
// inserts four bytes of padding between si_code and the union, so si_pid
// lives at offset 16 rather than 12. Declaring si_pid directly after si_code
// would make it alias that padding, which waitid never writes, and the field
// would always read as zero.
//
//nolint:structcheck,unused
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// Zero-sized field whose only purpose is to align what follows to the
	// native word size, reproducing the kernel's alignment of the union.
	_ [0]uintptr
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad to 128 bytes as detailed in blockUntilWaitable
	pad [siginfoPadSize]byte
}
447
448// isWaitable returns true if the process has exited false otherwise.
449// Its based off blockUntilWaitable in src/os/wait_waitid.go
450func isWaitable(pid int) (bool, error) {
451	si := &siginfo{}
452	_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
453	if e != 0 {
454		return false, os.NewSyscallError("waitid", e)
455	}
456
457	return si.si_pid != 0, nil
458}
459
460// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
461func isNoChildren(err error) bool {
462	switch err := err.(type) {
463	case unix.Errno:
464		if err == unix.ECHILD {
465			return true
466		}
467	case *os.SyscallError:
468		if err.Err == unix.ECHILD {
469			return true
470		}
471	}
472	return false
473}
474
475// signalAllProcesses freezes then iterates over all the processes inside the
476// manager's cgroups sending the signal s to them.
477// If s is SIGKILL then it will wait for each process to exit.
478// For all other signals it will check if the process is ready to report its
479// exit status and only if it is will a wait be performed.
480func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
481	var procs []*os.Process
482	if err := m.Freeze(configs.Frozen); err != nil {
483		logrus.Warn(err)
484	}
485	pids, err := m.GetAllPids()
486	if err != nil {
487		if err := m.Freeze(configs.Thawed); err != nil {
488			logrus.Warn(err)
489		}
490		return err
491	}
492	for _, pid := range pids {
493		p, err := os.FindProcess(pid)
494		if err != nil {
495			logrus.Warn(err)
496			continue
497		}
498		procs = append(procs, p)
499		if err := p.Signal(s); err != nil {
500			logrus.Warn(err)
501		}
502	}
503	if err := m.Freeze(configs.Thawed); err != nil {
504		logrus.Warn(err)
505	}
506
507	subreaper, err := system.GetSubreaper()
508	if err != nil {
509		// The error here means that PR_GET_CHILD_SUBREAPER is not
510		// supported because this code might run on a kernel older
511		// than 3.4. We don't want to throw an error in that case,
512		// and we simplify things, considering there is no subreaper
513		// set.
514		subreaper = 0
515	}
516
517	for _, p := range procs {
518		if s != unix.SIGKILL {
519			if ok, err := isWaitable(p.Pid); err != nil {
520				if !isNoChildren(err) {
521					logrus.Warn("signalAllProcesses: ", p.Pid, err)
522				}
523				continue
524			} else if !ok {
525				// Not ready to report so don't wait
526				continue
527			}
528		}
529
530		// In case a subreaper has been setup, this code must not
531		// wait for the process. Otherwise, we cannot be sure the
532		// current process will be reaped by the subreaper, while
533		// the subreaper might be waiting for this process in order
534		// to retrieve its exit code.
535		if subreaper == 0 {
536			if _, err := p.Wait(); err != nil {
537				if !isNoChildren(err) {
538					logrus.Warn("wait: ", err)
539				}
540			}
541		}
542	}
543	return nil
544}
545