1// +build linux
2
3package fs
4
5import (
6	"fmt"
7	"os"
8	"path/filepath"
9	"sync"
10
11	"github.com/opencontainers/runc/libcontainer/cgroups"
12	"github.com/opencontainers/runc/libcontainer/configs"
13	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
14	"github.com/pkg/errors"
15	"golang.org/x/sys/unix"
16)
17
var (
	// subsystems lists the cgroup controllers this package handles. Apply,
	// GetStats and Set each walk the slice in the order given here.
	subsystems = []subsystem{
		&CpusetGroup{},
		&DevicesGroup{},
		&MemoryGroup{},
		&CpuGroup{},
		&CpuacctGroup{},
		&PidsGroup{},
		&BlkioGroup{},
		&HugetlbGroup{},
		&NetClsGroup{},
		&NetPrioGroup{},
		&PerfEventGroup{},
		&FreezerGroup{},
		&NameGroup{GroupName: "name=systemd", Join: true},
	}
	// HugePageSizes holds the value returned by cgroups.GetHugePageSize,
	// evaluated once at package initialization. The error returned alongside
	// it is discarded, so a failed lookup is not reported to anyone.
	HugePageSizes, _ = cgroups.GetHugePageSize()
)
36
// errSubsystemDoesNotExist is a sentinel error describing a cgroup subsystem
// that does not exist.
// NOTE(review): nothing in the visible part of this file references it;
// presumably the per-subsystem code does -- confirm before removing.
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
38
// subsystem is the behavior every entry of the subsystems list must provide.
// The manager calls these methods with the per-subsystem cgroup directory it
// has recorded under the key returned by Name.
type subsystem interface {
	// Name returns the name of the subsystem.
	Name() string
	// GetStats fills 'stats' with the statistics of the cgroup under 'path'.
	GetStats(path string, stats *cgroups.Stats) error
	// Apply creates and joins the cgroup represented by 'cgroupData';
	// 'path' is the directory computed for this subsystem.
	Apply(path string, c *cgroupData) error
	// Set applies the settings described by 'cgroup' to the cgroup under
	// 'path'.
	Set(path string, cgroup *configs.Cgroup) error
}
49
// manager is the cgroups.Manager implementation returned by NewManager.
type manager struct {
	// mu is held by the methods that read or replace paths.
	mu       sync.Mutex
	// cgroups is the cgroup configuration; several methods treat nil as
	// "no cgroups configured" and return early.
	cgroups  *configs.Cgroup
	rootless bool // ignore permission-related errors
	// paths maps a subsystem name to the cgroup directory used for it.
	paths    map[string]string
}
56
57func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager {
58	return &manager{
59		cgroups:  cg,
60		paths:    paths,
61		rootless: rootless,
62	}
63}
64
// cgroupRootLock is held by getCgroupRoot while it reads or sets cgroupRoot.
var cgroupRootLock sync.Mutex
// cgroupRoot caches the absolute path to the root of the cgroup hierarchies.
// It stays empty until getCgroupRoot manages to determine the root.
var cgroupRoot string

// defaultCgroupRoot is the location probed first by tryDefaultCgroupRoot.
const defaultCgroupRoot = "/sys/fs/cgroup"
70
71func tryDefaultCgroupRoot() string {
72	var st, pst unix.Stat_t
73
74	// (1) it should be a directory...
75	err := unix.Lstat(defaultCgroupRoot, &st)
76	if err != nil || st.Mode&unix.S_IFDIR == 0 {
77		return ""
78	}
79
80	// (2) ... and a mount point ...
81	err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
82	if err != nil {
83		return ""
84	}
85
86	if st.Dev == pst.Dev {
87		// parent dir has the same dev -- not a mount point
88		return ""
89	}
90
91	// (3) ... of 'tmpfs' fs type.
92	var fst unix.Statfs_t
93	err = unix.Statfs(defaultCgroupRoot, &fst)
94	if err != nil || fst.Type != unix.TMPFS_MAGIC {
95		return ""
96	}
97
98	// (4) it should have at least 1 entry ...
99	dir, err := os.Open(defaultCgroupRoot)
100	if err != nil {
101		return ""
102	}
103	names, err := dir.Readdirnames(1)
104	if err != nil {
105		return ""
106	}
107	if len(names) < 1 {
108		return ""
109	}
110	// ... which is a cgroup mount point.
111	err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
112	if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
113		return ""
114	}
115
116	return defaultCgroupRoot
117}
118
119// Gets the cgroupRoot.
120func getCgroupRoot() (string, error) {
121	cgroupRootLock.Lock()
122	defer cgroupRootLock.Unlock()
123
124	if cgroupRoot != "" {
125		return cgroupRoot, nil
126	}
127
128	// fast path
129	cgroupRoot = tryDefaultCgroupRoot()
130	if cgroupRoot != "" {
131		return cgroupRoot, nil
132	}
133
134	// slow path: parse mountinfo
135	mi, err := cgroups.GetCgroupMounts(false)
136	if err != nil {
137		return "", err
138	}
139	if len(mi) < 1 {
140		return "", errors.New("no cgroup mount found in mountinfo")
141	}
142
143	// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
144	// use its parent directory.
145	root := filepath.Dir(mi[0].Mountpoint)
146
147	if _, err := os.Stat(root); err != nil {
148		return "", err
149	}
150
151	cgroupRoot = root
152	return cgroupRoot, nil
153}
154
// cgroupData bundles what is needed to compute per-subsystem cgroup paths
// for one container; it is built by getCgroupData.
type cgroupData struct {
	// root is the cgroup root as returned by getCgroupRoot.
	root      string
	// innerPath is the cleaned cgroup path: either the configured Path or
	// Parent joined with Name.
	innerPath string
	// config is the cgroup configuration this data was derived from.
	config    *configs.Cgroup
	// pid is the process id passed to getCgroupData.
	pid       int
}
161
162// isIgnorableError returns whether err is a permission error (in the loose
163// sense of the word). This includes EROFS (which for an unprivileged user is
164// basically a permission error) and EACCES (for similar reasons) as well as
165// the normal EPERM.
166func isIgnorableError(rootless bool, err error) bool {
167	// We do not ignore errors if we are root.
168	if !rootless {
169		return false
170	}
171	// TODO: rm errors.Cause once we switch to %w everywhere
172	err = errors.Cause(err)
173	// Is it an ordinary EPERM?
174	if errors.Is(err, os.ErrPermission) {
175		return true
176	}
177	// Handle some specific syscall errors.
178	var errno unix.Errno
179	if errors.As(err, &errno) {
180		return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
181	}
182	return false
183}
184
185func (m *manager) Apply(pid int) (err error) {
186	if m.cgroups == nil {
187		return nil
188	}
189	m.mu.Lock()
190	defer m.mu.Unlock()
191
192	c := m.cgroups
193	if c.Resources.Unified != nil {
194		return cgroups.ErrV1NoUnified
195	}
196
197	m.paths = make(map[string]string)
198	if c.Paths != nil {
199		cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
200		if err != nil {
201			return err
202		}
203		for name, path := range c.Paths {
204			// XXX(kolyshkin@): why this check is needed?
205			if _, ok := cgMap[name]; ok {
206				m.paths[name] = path
207			}
208		}
209		return cgroups.EnterPid(m.paths, pid)
210	}
211
212	d, err := getCgroupData(m.cgroups, pid)
213	if err != nil {
214		return err
215	}
216
217	for _, sys := range subsystems {
218		p, err := d.path(sys.Name())
219		if err != nil {
220			// The non-presence of the devices subsystem is
221			// considered fatal for security reasons.
222			if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
223				continue
224			}
225			return err
226		}
227		m.paths[sys.Name()] = p
228
229		if err := sys.Apply(p, d); err != nil {
230			// In the case of rootless (including euid=0 in userns), where an
231			// explicit cgroup path hasn't been set, we don't bail on error in
232			// case of permission problems. Cases where limits have been set
233			// (and we couldn't create our own cgroup) are handled by Set.
234			if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
235				delete(m.paths, sys.Name())
236				continue
237			}
238			return err
239		}
240
241	}
242	return nil
243}
244
245func (m *manager) Destroy() error {
246	if m.cgroups == nil || m.cgroups.Paths != nil {
247		return nil
248	}
249	m.mu.Lock()
250	defer m.mu.Unlock()
251	return cgroups.RemovePaths(m.paths)
252}
253
254func (m *manager) Path(subsys string) string {
255	m.mu.Lock()
256	defer m.mu.Unlock()
257	return m.paths[subsys]
258}
259
260func (m *manager) GetStats() (*cgroups.Stats, error) {
261	m.mu.Lock()
262	defer m.mu.Unlock()
263	stats := cgroups.NewStats()
264	for _, sys := range subsystems {
265		path := m.paths[sys.Name()]
266		if path == "" {
267			continue
268		}
269		if err := sys.GetStats(path, stats); err != nil {
270			return nil, err
271		}
272	}
273	return stats, nil
274}
275
276func (m *manager) Set(container *configs.Config) error {
277	if container.Cgroups == nil {
278		return nil
279	}
280
281	// If Paths are set, then we are just joining cgroups paths
282	// and there is no need to set any values.
283	if m.cgroups != nil && m.cgroups.Paths != nil {
284		return nil
285	}
286	if container.Cgroups.Resources.Unified != nil {
287		return cgroups.ErrV1NoUnified
288	}
289
290	m.mu.Lock()
291	defer m.mu.Unlock()
292	for _, sys := range subsystems {
293		path := m.paths[sys.Name()]
294		if err := sys.Set(path, container.Cgroups); err != nil {
295			if m.rootless && sys.Name() == "devices" {
296				continue
297			}
298			// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
299			// However, errors from other subsystems are not ignored.
300			// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
301			if path == "" {
302				// We never created a path for this cgroup, so we cannot set
303				// limits for it (though we have already tried at this point).
304				return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
305			}
306			return err
307		}
308	}
309
310	return nil
311}
312
313// Freeze toggles the container's freezer cgroup depending on the state
314// provided
315func (m *manager) Freeze(state configs.FreezerState) error {
316	path := m.Path("freezer")
317	if m.cgroups == nil || path == "" {
318		return errors.New("cannot toggle freezer: cgroups not configured for container")
319	}
320
321	prevState := m.cgroups.Resources.Freezer
322	m.cgroups.Resources.Freezer = state
323	freezer := &FreezerGroup{}
324	if err := freezer.Set(path, m.cgroups); err != nil {
325		m.cgroups.Resources.Freezer = prevState
326		return err
327	}
328	return nil
329}
330
331func (m *manager) GetPids() ([]int, error) {
332	return cgroups.GetPids(m.Path("devices"))
333}
334
335func (m *manager) GetAllPids() ([]int, error) {
336	return cgroups.GetAllPids(m.Path("devices"))
337}
338
339func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
340	root, err := getCgroupRoot()
341	if err != nil {
342		return nil, err
343	}
344
345	if (c.Name != "" || c.Parent != "") && c.Path != "" {
346		return nil, errors.New("cgroup: either Path or Name and Parent should be used")
347	}
348
349	// XXX: Do not remove this code. Path safety is important! -- cyphar
350	cgPath := libcontainerUtils.CleanPath(c.Path)
351	cgParent := libcontainerUtils.CleanPath(c.Parent)
352	cgName := libcontainerUtils.CleanPath(c.Name)
353
354	innerPath := cgPath
355	if innerPath == "" {
356		innerPath = filepath.Join(cgParent, cgName)
357	}
358
359	return &cgroupData{
360		root:      root,
361		innerPath: innerPath,
362		config:    c,
363		pid:       pid,
364	}, nil
365}
366
367func (raw *cgroupData) path(subsystem string) (string, error) {
368	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
369	if filepath.IsAbs(raw.innerPath) {
370		mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
371		// If we didn't mount the subsystem, there is no point we make the path.
372		if err != nil {
373			return "", err
374		}
375
376		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
377		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
378	}
379
380	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
381	// process could in container and shared pid namespace with host, and
382	// /proc/1/cgroup could point to whole other world of cgroups.
383	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
384	if err != nil {
385		return "", err
386	}
387
388	return filepath.Join(parentPath, raw.innerPath), nil
389}
390
391func join(path string, pid int) error {
392	if path == "" {
393		return nil
394	}
395	if err := os.MkdirAll(path, 0755); err != nil {
396		return err
397	}
398	return cgroups.WriteCgroupProc(path, pid)
399}
400
401func (m *manager) GetPaths() map[string]string {
402	m.mu.Lock()
403	defer m.mu.Unlock()
404	return m.paths
405}
406
407func (m *manager) GetCgroups() (*configs.Cgroup, error) {
408	return m.cgroups, nil
409}
410
411func (m *manager) GetFreezerState() (configs.FreezerState, error) {
412	dir := m.Path("freezer")
413	// If the container doesn't have the freezer cgroup, say it's undefined.
414	if dir == "" {
415		return configs.Undefined, nil
416	}
417	freezer := &FreezerGroup{}
418	return freezer.GetState(dir)
419}
420
421func (m *manager) Exists() bool {
422	return cgroups.PathExists(m.Path("devices"))
423}
424