1// +build linux 2 3package fs 4 5import ( 6 "fmt" 7 "os" 8 "path/filepath" 9 "sync" 10 11 "github.com/opencontainers/runc/libcontainer/cgroups" 12 "github.com/opencontainers/runc/libcontainer/configs" 13 libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" 14 "github.com/pkg/errors" 15 "golang.org/x/sys/unix" 16) 17 18var ( 19 subsystems = []subsystem{ 20 &CpusetGroup{}, 21 &DevicesGroup{}, 22 &MemoryGroup{}, 23 &CpuGroup{}, 24 &CpuacctGroup{}, 25 &PidsGroup{}, 26 &BlkioGroup{}, 27 &HugetlbGroup{}, 28 &NetClsGroup{}, 29 &NetPrioGroup{}, 30 &PerfEventGroup{}, 31 &FreezerGroup{}, 32 &NameGroup{GroupName: "name=systemd", Join: true}, 33 } 34 HugePageSizes, _ = cgroups.GetHugePageSize() 35) 36 37var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") 38 39type subsystem interface { 40 // Name returns the name of the subsystem. 41 Name() string 42 // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. 43 GetStats(path string, stats *cgroups.Stats) error 44 // Creates and joins the cgroup represented by 'cgroupData'. 45 Apply(path string, c *cgroupData) error 46 // Set the cgroup represented by cgroup. 47 Set(path string, cgroup *configs.Cgroup) error 48} 49 50type manager struct { 51 mu sync.Mutex 52 cgroups *configs.Cgroup 53 rootless bool // ignore permission-related errors 54 paths map[string]string 55} 56 57func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager { 58 return &manager{ 59 cgroups: cg, 60 paths: paths, 61 rootless: rootless, 62 } 63} 64 65// The absolute path to the root of the cgroup hierarchies. 66var cgroupRootLock sync.Mutex 67var cgroupRoot string 68 69const defaultCgroupRoot = "/sys/fs/cgroup" 70 71func tryDefaultCgroupRoot() string { 72 var st, pst unix.Stat_t 73 74 // (1) it should be a directory... 75 err := unix.Lstat(defaultCgroupRoot, &st) 76 if err != nil || st.Mode&unix.S_IFDIR == 0 { 77 return "" 78 } 79 80 // (2) ... and a mount point ... 81 err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) 82 if err != nil { 83 return "" 84 } 85 86 if st.Dev == pst.Dev { 87 // parent dir has the same dev -- not a mount point 88 return "" 89 } 90 91 // (3) ... of 'tmpfs' fs type. 92 var fst unix.Statfs_t 93 err = unix.Statfs(defaultCgroupRoot, &fst) 94 if err != nil || fst.Type != unix.TMPFS_MAGIC { 95 return "" 96 } 97 98 // (4) it should have at least 1 entry ... 99 dir, err := os.Open(defaultCgroupRoot) 100 if err != nil { 101 return "" 102 } 103 names, err := dir.Readdirnames(1) 104 if err != nil { 105 return "" 106 } 107 if len(names) < 1 { 108 return "" 109 } 110 // ... which is a cgroup mount point. 111 err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) 112 if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { 113 return "" 114 } 115 116 return defaultCgroupRoot 117} 118 119// Gets the cgroupRoot. 120func getCgroupRoot() (string, error) { 121 cgroupRootLock.Lock() 122 defer cgroupRootLock.Unlock() 123 124 if cgroupRoot != "" { 125 return cgroupRoot, nil 126 } 127 128 // fast path 129 cgroupRoot = tryDefaultCgroupRoot() 130 if cgroupRoot != "" { 131 return cgroupRoot, nil 132 } 133 134 // slow path: parse mountinfo 135 mi, err := cgroups.GetCgroupMounts(false) 136 if err != nil { 137 return "", err 138 } 139 if len(mi) < 1 { 140 return "", errors.New("no cgroup mount found in mountinfo") 141 } 142 143 // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), 144 // use its parent directory. 145 root := filepath.Dir(mi[0].Mountpoint) 146 147 if _, err := os.Stat(root); err != nil { 148 return "", err 149 } 150 151 cgroupRoot = root 152 return cgroupRoot, nil 153} 154 155type cgroupData struct { 156 root string 157 innerPath string 158 config *configs.Cgroup 159 pid int 160} 161 162// isIgnorableError returns whether err is a permission error (in the loose 163// sense of the word). This includes EROFS (which for an unprivileged user is 164// basically a permission error) and EACCES (for similar reasons) as well as 165// the normal EPERM. 166func isIgnorableError(rootless bool, err error) bool { 167 // We do not ignore errors if we are root. 168 if !rootless { 169 return false 170 } 171 // TODO: rm errors.Cause once we switch to %w everywhere 172 err = errors.Cause(err) 173 // Is it an ordinary EPERM? 174 if errors.Is(err, os.ErrPermission) { 175 return true 176 } 177 // Handle some specific syscall errors. 178 var errno unix.Errno 179 if errors.As(err, &errno) { 180 return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES 181 } 182 return false 183} 184 185func (m *manager) Apply(pid int) (err error) { 186 if m.cgroups == nil { 187 return nil 188 } 189 m.mu.Lock() 190 defer m.mu.Unlock() 191 192 c := m.cgroups 193 if c.Resources.Unified != nil { 194 return cgroups.ErrV1NoUnified 195 } 196 197 m.paths = make(map[string]string) 198 if c.Paths != nil { 199 cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup") 200 if err != nil { 201 return err 202 } 203 for name, path := range c.Paths { 204 // XXX(kolyshkin@): why this check is needed? 205 if _, ok := cgMap[name]; ok { 206 m.paths[name] = path 207 } 208 } 209 return cgroups.EnterPid(m.paths, pid) 210 } 211 212 d, err := getCgroupData(m.cgroups, pid) 213 if err != nil { 214 return err 215 } 216 217 for _, sys := range subsystems { 218 p, err := d.path(sys.Name()) 219 if err != nil { 220 // The non-presence of the devices subsystem is 221 // considered fatal for security reasons. 222 if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") { 223 continue 224 } 225 return err 226 } 227 m.paths[sys.Name()] = p 228 229 if err := sys.Apply(p, d); err != nil { 230 // In the case of rootless (including euid=0 in userns), where an 231 // explicit cgroup path hasn't been set, we don't bail on error in 232 // case of permission problems. Cases where limits have been set 233 // (and we couldn't create our own cgroup) are handled by Set. 234 if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" { 235 delete(m.paths, sys.Name()) 236 continue 237 } 238 return err 239 } 240 241 } 242 return nil 243} 244 245func (m *manager) Destroy() error { 246 if m.cgroups == nil || m.cgroups.Paths != nil { 247 return nil 248 } 249 m.mu.Lock() 250 defer m.mu.Unlock() 251 return cgroups.RemovePaths(m.paths) 252} 253 254func (m *manager) Path(subsys string) string { 255 m.mu.Lock() 256 defer m.mu.Unlock() 257 return m.paths[subsys] 258} 259 260func (m *manager) GetStats() (*cgroups.Stats, error) { 261 m.mu.Lock() 262 defer m.mu.Unlock() 263 stats := cgroups.NewStats() 264 for _, sys := range subsystems { 265 path := m.paths[sys.Name()] 266 if path == "" { 267 continue 268 } 269 if err := sys.GetStats(path, stats); err != nil { 270 return nil, err 271 } 272 } 273 return stats, nil 274} 275 276func (m *manager) Set(container *configs.Config) error { 277 if container.Cgroups == nil { 278 return nil 279 } 280 281 // If Paths are set, then we are just joining cgroups paths 282 // and there is no need to set any values. 283 if m.cgroups != nil && m.cgroups.Paths != nil { 284 return nil 285 } 286 if container.Cgroups.Resources.Unified != nil { 287 return cgroups.ErrV1NoUnified 288 } 289 290 m.mu.Lock() 291 defer m.mu.Unlock() 292 for _, sys := range subsystems { 293 path := m.paths[sys.Name()] 294 if err := sys.Set(path, container.Cgroups); err != nil { 295 if m.rootless && sys.Name() == "devices" { 296 continue 297 } 298 // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. 299 // However, errors from other subsystems are not ignored. 300 // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" 301 if path == "" { 302 // We never created a path for this cgroup, so we cannot set 303 // limits for it (though we have already tried at this point). 304 return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) 305 } 306 return err 307 } 308 } 309 310 return nil 311} 312 313// Freeze toggles the container's freezer cgroup depending on the state 314// provided 315func (m *manager) Freeze(state configs.FreezerState) error { 316 path := m.Path("freezer") 317 if m.cgroups == nil || path == "" { 318 return errors.New("cannot toggle freezer: cgroups not configured for container") 319 } 320 321 prevState := m.cgroups.Resources.Freezer 322 m.cgroups.Resources.Freezer = state 323 freezer := &FreezerGroup{} 324 if err := freezer.Set(path, m.cgroups); err != nil { 325 m.cgroups.Resources.Freezer = prevState 326 return err 327 } 328 return nil 329} 330 331func (m *manager) GetPids() ([]int, error) { 332 return cgroups.GetPids(m.Path("devices")) 333} 334 335func (m *manager) GetAllPids() ([]int, error) { 336 return cgroups.GetAllPids(m.Path("devices")) 337} 338 339func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { 340 root, err := getCgroupRoot() 341 if err != nil { 342 return nil, err 343 } 344 345 if (c.Name != "" || c.Parent != "") && c.Path != "" { 346 return nil, errors.New("cgroup: either Path or Name and Parent should be used") 347 } 348 349 // XXX: Do not remove this code. Path safety is important! -- cyphar 350 cgPath := libcontainerUtils.CleanPath(c.Path) 351 cgParent := libcontainerUtils.CleanPath(c.Parent) 352 cgName := libcontainerUtils.CleanPath(c.Name) 353 354 innerPath := cgPath 355 if innerPath == "" { 356 innerPath = filepath.Join(cgParent, cgName) 357 } 358 359 return &cgroupData{ 360 root: root, 361 innerPath: innerPath, 362 config: c, 363 pid: pid, 364 }, nil 365} 366 367func (raw *cgroupData) path(subsystem string) (string, error) { 368 // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. 369 if filepath.IsAbs(raw.innerPath) { 370 mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) 371 // If we didn't mount the subsystem, there is no point we make the path. 372 if err != nil { 373 return "", err 374 } 375 376 // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. 377 return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil 378 } 379 380 // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating 381 // process could in container and shared pid namespace with host, and 382 // /proc/1/cgroup could point to whole other world of cgroups. 383 parentPath, err := cgroups.GetOwnCgroupPath(subsystem) 384 if err != nil { 385 return "", err 386 } 387 388 return filepath.Join(parentPath, raw.innerPath), nil 389} 390 391func join(path string, pid int) error { 392 if path == "" { 393 return nil 394 } 395 if err := os.MkdirAll(path, 0755); err != nil { 396 return err 397 } 398 return cgroups.WriteCgroupProc(path, pid) 399} 400 401func (m *manager) GetPaths() map[string]string { 402 m.mu.Lock() 403 defer m.mu.Unlock() 404 return m.paths 405} 406 407func (m *manager) GetCgroups() (*configs.Cgroup, error) { 408 return m.cgroups, nil 409} 410 411func (m *manager) GetFreezerState() (configs.FreezerState, error) { 412 dir := m.Path("freezer") 413 // If the container doesn't have the freezer cgroup, say it's undefined. 414 if dir == "" { 415 return configs.Undefined, nil 416 } 417 freezer := &FreezerGroup{} 418 return freezer.GetState(dir) 419} 420 421func (m *manager) Exists() bool { 422 return cgroups.PathExists(m.Path("devices")) 423} 424