1// +build linux
2
3package libcontainer
4
5import (
6	"encoding/json"
7	"fmt"
8	"io"
9	"io/ioutil"
10	"net"
11	"os"
12	"strconv"
13	"strings"
14	"syscall"
15
16	"github.com/Sirupsen/logrus"
17	"github.com/opencontainers/runc/libcontainer/cgroups"
18	"github.com/opencontainers/runc/libcontainer/configs"
19	"github.com/opencontainers/runc/libcontainer/system"
20	"github.com/opencontainers/runc/libcontainer/user"
21	"github.com/opencontainers/runc/libcontainer/utils"
22	"github.com/vishvananda/netlink"
23)
24
// initType names the flavour of container init that newContainerInit is asked
// to construct.
type initType string

// Known init flavours.
const (
	// initSetns makes newContainerInit return a linuxSetnsInit.
	initSetns = initType("setns")
	// initStandard makes newContainerInit return a linuxStandardInit.
	initStandard = initType("standard")
)
31
// pid is the JSON envelope for a single process id; it is encoded as an
// object with one "pid" member.
type pid struct {
	// Pid is the numeric process id carried in the payload.
	Pid int `json:"pid"`
}
35
36// network is an internal struct used to setup container networks.
37type network struct {
38	configs.Network
39
40	// TempVethPeerName is a unique temporary veth peer name that was placed into
41	// the container's namespace.
42	TempVethPeerName string `json:"temp_veth_peer_name"`
43}
44
45// initConfig is used for transferring parameters from Exec() to Init()
46type initConfig struct {
47	Args             []string         `json:"args"`
48	Env              []string         `json:"env"`
49	Cwd              string           `json:"cwd"`
50	Capabilities     []string         `json:"capabilities"`
51	ProcessLabel     string           `json:"process_label"`
52	AppArmorProfile  string           `json:"apparmor_profile"`
53	NoNewPrivileges  bool             `json:"no_new_privileges"`
54	User             string           `json:"user"`
55	Config           *configs.Config  `json:"config"`
56	Console          string           `json:"console"`
57	Networks         []*network       `json:"network"`
58	PassedFilesCount int              `json:"passed_files_count"`
59	ContainerId      string           `json:"containerid"`
60	Rlimits          []configs.Rlimit `json:"rlimits"`
61}
62
// initer is the interface satisfied by every init flavour that
// newContainerInit can return.
type initer interface {
	// Init performs the initialisation and reports any failure.
	Init() error
}
66
67func newContainerInit(t initType, pipe *os.File) (initer, error) {
68	var config *initConfig
69	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
70		return nil, err
71	}
72	if err := populateProcessEnvironment(config.Env); err != nil {
73		return nil, err
74	}
75	switch t {
76	case initSetns:
77		return &linuxSetnsInit{
78			config: config,
79		}, nil
80	case initStandard:
81		return &linuxStandardInit{
82			pipe:      pipe,
83			parentPid: syscall.Getppid(),
84			config:    config,
85		}, nil
86	}
87	return nil, fmt.Errorf("unknown init type %q", t)
88}
89
// populateProcessEnvironment exports every "KEY=value" entry of env into the
// environment of the current process. Entries are applied in order; the first
// entry that has no "=" or that os.Setenv rejects stops the loop and its error
// is returned.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			return fmt.Errorf("invalid environment '%v'", entry)
		}
		// Only the first "=" separates the name from the value; any later
		// ones belong to the value.
		name, value := entry[:sep], entry[sep+1:]
		if err := os.Setenv(name, value); err != nil {
			return err
		}
	}
	return nil
}
104
105// finalizeNamespace drops the caps, sets the correct user
106// and working dir, and closes any leaked file descriptors
107// before executing the command inside the namespace
108func finalizeNamespace(config *initConfig) error {
109	// Ensure that all unwanted fds we may have accidentally
110	// inherited are marked close-on-exec so they stay out of the
111	// container
112	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
113		return err
114	}
115
116	capabilities := config.Config.Capabilities
117	if config.Capabilities != nil {
118		capabilities = config.Capabilities
119	}
120	w, err := newCapWhitelist(capabilities)
121	if err != nil {
122		return err
123	}
124	// drop capabilities in bounding set before changing user
125	if err := w.dropBoundingSet(); err != nil {
126		return err
127	}
128	// preserve existing capabilities while we change users
129	if err := system.SetKeepCaps(); err != nil {
130		return err
131	}
132	if err := setupUser(config); err != nil {
133		return err
134	}
135	if err := system.ClearKeepCaps(); err != nil {
136		return err
137	}
138	// drop all other capabilities
139	if err := w.drop(); err != nil {
140		return err
141	}
142	if config.Cwd != "" {
143		if err := syscall.Chdir(config.Cwd); err != nil {
144			return err
145		}
146	}
147	return nil
148}
149
150// syncParentReady sends to the given pipe a JSON payload which indicates that
151// the init is ready to Exec the child process. It then waits for the parent to
152// indicate that it is cleared to Exec.
153func syncParentReady(pipe io.ReadWriter) error {
154	// Tell parent.
155	if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
156		return err
157	}
158	// Wait for parent to give the all-clear.
159	var procSync syncT
160	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
161		if err == io.EOF {
162			return fmt.Errorf("parent closed synchronisation channel")
163		}
164		if procSync.Type != procRun {
165			return fmt.Errorf("invalid synchronisation flag from parent")
166		}
167	}
168	return nil
169}
170
171// syncParentHooks sends to the given pipe a JSON payload which indicates that
172// the parent should execute pre-start hooks. It then waits for the parent to
173// indicate that it is cleared to resume.
174func syncParentHooks(pipe io.ReadWriter) error {
175	// Tell parent.
176	if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
177		return err
178	}
179	// Wait for parent to give the all-clear.
180	var procSync syncT
181	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
182		if err == io.EOF {
183			return fmt.Errorf("parent closed synchronisation channel")
184		}
185		if procSync.Type != procResume {
186			return fmt.Errorf("invalid synchronisation flag from parent")
187		}
188	}
189	return nil
190}
191
192// setupUser changes the groups, gid, and uid for the user inside the container
193func setupUser(config *initConfig) error {
194	// Set up defaults.
195	defaultExecUser := user.ExecUser{
196		Uid:  syscall.Getuid(),
197		Gid:  syscall.Getgid(),
198		Home: "/",
199	}
200	passwdPath, err := user.GetPasswdPath()
201	if err != nil {
202		return err
203	}
204	groupPath, err := user.GetGroupPath()
205	if err != nil {
206		return err
207	}
208	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
209	if err != nil {
210		return err
211	}
212
213	var addGroups []int
214	if len(config.Config.AdditionalGroups) > 0 {
215		addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath)
216		if err != nil {
217			return err
218		}
219	}
220	// before we change to the container's user make sure that the processes STDIO
221	// is correctly owned by the user that we are switching to.
222	if err := fixStdioPermissions(execUser); err != nil {
223		return err
224	}
225	suppGroups := append(execUser.Sgids, addGroups...)
226	if err := syscall.Setgroups(suppGroups); err != nil {
227		return err
228	}
229
230	if err := system.Setgid(execUser.Gid); err != nil {
231		return err
232	}
233	if err := system.Setuid(execUser.Uid); err != nil {
234		return err
235	}
236	// if we didn't get HOME already, set it based on the user's HOME
237	if envHome := os.Getenv("HOME"); envHome == "" {
238		if err := os.Setenv("HOME", execUser.Home); err != nil {
239			return err
240		}
241	}
242	return nil
243}
244
245// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
246// The ownership needs to match because it is created outside of the container and needs to be
247// localized.
248func fixStdioPermissions(u *user.ExecUser) error {
249	var null syscall.Stat_t
250	if err := syscall.Stat("/dev/null", &null); err != nil {
251		return err
252	}
253	for _, fd := range []uintptr{
254		os.Stdin.Fd(),
255		os.Stderr.Fd(),
256		os.Stdout.Fd(),
257	} {
258		var s syscall.Stat_t
259		if err := syscall.Fstat(int(fd), &s); err != nil {
260			return err
261		}
262		// skip chown of /dev/null if it was used as one of the STDIO fds.
263		if s.Rdev == null.Rdev {
264			continue
265		}
266		if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
267			return err
268		}
269	}
270	return nil
271}
272
273// setupNetwork sets up and initializes any network interface inside the container.
274func setupNetwork(config *initConfig) error {
275	for _, config := range config.Networks {
276		strategy, err := getStrategy(config.Type)
277		if err != nil {
278			return err
279		}
280		if err := strategy.initialize(config); err != nil {
281			return err
282		}
283	}
284	return nil
285}
286
287func setupRoute(config *configs.Config) error {
288	for _, config := range config.Routes {
289		_, dst, err := net.ParseCIDR(config.Destination)
290		if err != nil {
291			return err
292		}
293		src := net.ParseIP(config.Source)
294		if src == nil {
295			return fmt.Errorf("Invalid source for route: %s", config.Source)
296		}
297		gw := net.ParseIP(config.Gateway)
298		if gw == nil {
299			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
300		}
301		l, err := netlink.LinkByName(config.InterfaceName)
302		if err != nil {
303			return err
304		}
305		route := &netlink.Route{
306			Scope:     netlink.SCOPE_UNIVERSE,
307			Dst:       dst,
308			Src:       src,
309			Gw:        gw,
310			LinkIndex: l.Attrs().Index,
311		}
312		if err := netlink.RouteAdd(route); err != nil {
313			return err
314		}
315	}
316	return nil
317}
318
319func setupRlimits(limits []configs.Rlimit, pid int) error {
320	for _, rlimit := range limits {
321		if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
322			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
323		}
324	}
325	return nil
326}
327
// setOomScoreAdj writes oomScoreAdj, formatted as a decimal number, to
// /proc/<pid>/oom_score_adj and returns the error of that write, if any.
func setOomScoreAdj(oomScoreAdj int, pid int) error {
	target := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
	value := []byte(strconv.Itoa(oomScoreAdj))
	return ioutil.WriteFile(target, value, 0600)
}
333
334// killCgroupProcesses freezes then iterates over all the processes inside the
335// manager's cgroups sending a SIGKILL to each process then waiting for them to
336// exit.
337func killCgroupProcesses(m cgroups.Manager) error {
338	var procs []*os.Process
339	if err := m.Freeze(configs.Frozen); err != nil {
340		logrus.Warn(err)
341	}
342	pids, err := m.GetAllPids()
343	if err != nil {
344		m.Freeze(configs.Thawed)
345		return err
346	}
347	for _, pid := range pids {
348		p, err := os.FindProcess(pid)
349		if err != nil {
350			logrus.Warn(err)
351			continue
352		}
353		procs = append(procs, p)
354		if err := p.Kill(); err != nil {
355			logrus.Warn(err)
356		}
357	}
358	if err := m.Freeze(configs.Thawed); err != nil {
359		logrus.Warn(err)
360	}
361	for _, p := range procs {
362		if _, err := p.Wait(); err != nil {
363			logrus.Warn(err)
364		}
365	}
366	return nil
367}
368