1// +build linux freebsd 2 3package daemon // import "github.com/docker/docker/daemon" 4 5import ( 6 "bufio" 7 "context" 8 "fmt" 9 "io/ioutil" 10 "net" 11 "os" 12 "path/filepath" 13 "runtime" 14 "runtime/debug" 15 "strconv" 16 "strings" 17 "time" 18 19 "github.com/containerd/cgroups" 20 statsV1 "github.com/containerd/cgroups/stats/v1" 21 statsV2 "github.com/containerd/cgroups/v2/stats" 22 "github.com/containerd/containerd/sys" 23 "github.com/docker/docker/api/types" 24 "github.com/docker/docker/api/types/blkiodev" 25 pblkiodev "github.com/docker/docker/api/types/blkiodev" 26 containertypes "github.com/docker/docker/api/types/container" 27 "github.com/docker/docker/container" 28 "github.com/docker/docker/daemon/config" 29 "github.com/docker/docker/daemon/initlayer" 30 "github.com/docker/docker/errdefs" 31 "github.com/docker/docker/opts" 32 "github.com/docker/docker/pkg/containerfs" 33 "github.com/docker/docker/pkg/idtools" 34 "github.com/docker/docker/pkg/parsers" 35 "github.com/docker/docker/pkg/parsers/kernel" 36 "github.com/docker/docker/pkg/sysinfo" 37 "github.com/docker/docker/runconfig" 38 volumemounts "github.com/docker/docker/volume/mounts" 39 "github.com/docker/libnetwork" 40 nwconfig "github.com/docker/libnetwork/config" 41 "github.com/docker/libnetwork/drivers/bridge" 42 "github.com/docker/libnetwork/netlabel" 43 "github.com/docker/libnetwork/netutils" 44 "github.com/docker/libnetwork/options" 45 lntypes "github.com/docker/libnetwork/types" 46 "github.com/moby/sys/mount" 47 specs "github.com/opencontainers/runtime-spec/specs-go" 48 "github.com/opencontainers/selinux/go-selinux" 49 "github.com/opencontainers/selinux/go-selinux/label" 50 "github.com/pkg/errors" 51 "github.com/sirupsen/logrus" 52 "github.com/vishvananda/netlink" 53 "golang.org/x/sys/unix" 54) 55 56const ( 57 isWindows = false 58 59 // DefaultShimBinary is the default shim to be used by containerd if none 60 // is specified 61 DefaultShimBinary = "containerd-shim" 62 63 // DefaultRuntimeBinary is the default runtime to be used by 64 // containerd if none is specified 65 DefaultRuntimeBinary = "runc" 66 67 // See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269 68 linuxMinCPUShares = 2 69 linuxMaxCPUShares = 262144 70 platformSupported = true 71 // It's not kernel limit, we want this 6M limit to account for overhead during startup, and to supply a reasonable functional container 72 linuxMinMemory = 6291456 73 // constants for remapped root settings 74 defaultIDSpecifier = "default" 75 defaultRemappedID = "dockremap" 76 77 // constant for cgroup drivers 78 cgroupFsDriver = "cgroupfs" 79 cgroupSystemdDriver = "systemd" 80 cgroupNoneDriver = "none" 81) 82 83type containerGetter interface { 84 GetContainer(string) (*container.Container, error) 85} 86 87func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { 88 memory := specs.LinuxMemory{} 89 90 if config.Memory > 0 { 91 memory.Limit = &config.Memory 92 } 93 94 if config.MemoryReservation > 0 { 95 memory.Reservation = &config.MemoryReservation 96 } 97 98 if config.MemorySwap > 0 { 99 memory.Swap = &config.MemorySwap 100 } 101 102 if config.MemorySwappiness != nil { 103 swappiness := uint64(*config.MemorySwappiness) 104 memory.Swappiness = &swappiness 105 } 106 107 if config.OomKillDisable != nil { 108 memory.DisableOOMKiller = config.OomKillDisable 109 } 110 111 if config.KernelMemory != 0 { 112 memory.Kernel = &config.KernelMemory 113 } 114 115 if config.KernelMemoryTCP != 0 { 116 memory.KernelTCP = &config.KernelMemoryTCP 117 } 118 119 return &memory 120} 121 122func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { 123 if config.PidsLimit == nil { 124 return nil 125 } 126 if *config.PidsLimit <= 0 { 127 // docker API allows 0 and negative values to unset this to be consistent 128 // with default values. When updating values, runc requires -1 to unset 129 // the previous limit. 130 return &specs.LinuxPids{Limit: -1} 131 } 132 return &specs.LinuxPids{Limit: *config.PidsLimit} 133} 134 135func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { 136 cpu := specs.LinuxCPU{} 137 138 if config.CPUShares < 0 { 139 return nil, fmt.Errorf("shares: invalid argument") 140 } 141 if config.CPUShares >= 0 { 142 shares := uint64(config.CPUShares) 143 cpu.Shares = &shares 144 } 145 146 if config.CpusetCpus != "" { 147 cpu.Cpus = config.CpusetCpus 148 } 149 150 if config.CpusetMems != "" { 151 cpu.Mems = config.CpusetMems 152 } 153 154 if config.NanoCPUs > 0 { 155 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 156 period := uint64(100 * time.Millisecond / time.Microsecond) 157 quota := config.NanoCPUs * int64(period) / 1e9 158 cpu.Period = &period 159 cpu.Quota = "a 160 } 161 162 if config.CPUPeriod != 0 { 163 period := uint64(config.CPUPeriod) 164 cpu.Period = &period 165 } 166 167 if config.CPUQuota != 0 { 168 q := config.CPUQuota 169 cpu.Quota = &q 170 } 171 172 if config.CPURealtimePeriod != 0 { 173 period := uint64(config.CPURealtimePeriod) 174 cpu.RealtimePeriod = &period 175 } 176 177 if config.CPURealtimeRuntime != 0 { 178 c := config.CPURealtimeRuntime 179 cpu.RealtimeRuntime = &c 180 } 181 182 return &cpu, nil 183} 184 185func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { 186 var stat unix.Stat_t 187 var blkioWeightDevices []specs.LinuxWeightDevice 188 189 for _, weightDevice := range config.BlkioWeightDevice { 190 if err := unix.Stat(weightDevice.Path, &stat); err != nil { 191 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: weightDevice.Path, Err: err}) 192 } 193 weight := weightDevice.Weight 194 d := specs.LinuxWeightDevice{Weight: &weight} 195 // The type is 32bit on mips. 196 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 197 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 198 blkioWeightDevices = append(blkioWeightDevices, d) 199 } 200 201 return blkioWeightDevices, nil 202} 203 204func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error { 205 container.NoNewPrivileges = daemon.configStore.NoNewPrivileges 206 return parseSecurityOpt(container, hostConfig) 207} 208 209func parseSecurityOpt(container *container.Container, config *containertypes.HostConfig) error { 210 var ( 211 labelOpts []string 212 err error 213 ) 214 215 for _, opt := range config.SecurityOpt { 216 if opt == "no-new-privileges" { 217 container.NoNewPrivileges = true 218 continue 219 } 220 if opt == "disable" { 221 labelOpts = append(labelOpts, "disable") 222 continue 223 } 224 225 var con []string 226 if strings.Contains(opt, "=") { 227 con = strings.SplitN(opt, "=", 2) 228 } else if strings.Contains(opt, ":") { 229 con = strings.SplitN(opt, ":", 2) 230 logrus.Warn("Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead.") 231 } 232 if len(con) != 2 { 233 return fmt.Errorf("invalid --security-opt 1: %q", opt) 234 } 235 236 switch con[0] { 237 case "label": 238 labelOpts = append(labelOpts, con[1]) 239 case "apparmor": 240 container.AppArmorProfile = con[1] 241 case "seccomp": 242 container.SeccompProfile = con[1] 243 case "no-new-privileges": 244 noNewPrivileges, err := strconv.ParseBool(con[1]) 245 if err != nil { 246 return fmt.Errorf("invalid --security-opt 2: %q", opt) 247 } 248 container.NoNewPrivileges = noNewPrivileges 249 default: 250 return fmt.Errorf("invalid --security-opt 2: %q", opt) 251 } 252 } 253 254 container.ProcessLabel, container.MountLabel, err = label.InitLabels(labelOpts) 255 return err 256} 257 258func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { 259 var throttleDevices []specs.LinuxThrottleDevice 260 var stat unix.Stat_t 261 262 for _, d := range devs { 263 if err := unix.Stat(d.Path, &stat); err != nil { 264 return nil, errors.WithStack(&os.PathError{Op: "stat", Path: d.Path, Err: err}) 265 } 266 d := specs.LinuxThrottleDevice{Rate: d.Rate} 267 // the type is 32bit on mips 268 d.Major = int64(unix.Major(uint64(stat.Rdev))) // nolint: unconvert 269 d.Minor = int64(unix.Minor(uint64(stat.Rdev))) // nolint: unconvert 270 throttleDevices = append(throttleDevices, d) 271 } 272 273 return throttleDevices, nil 274} 275 276// adjustParallelLimit takes a number of objects and a proposed limit and 277// figures out if it's reasonable (and adjusts it accordingly). This is only 278// used for daemon startup, which does a lot of parallel loading of containers 279// (and if we exceed RLIMIT_NOFILE then we're in trouble). 280func adjustParallelLimit(n int, limit int) int { 281 // Rule-of-thumb overhead factor (how many files will each goroutine open 282 // simultaneously). Yes, this is ugly but to be frank this whole thing is 283 // ugly. 284 const overhead = 2 285 286 // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to 287 // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it 288 // and give a warning (since in theory the user should increase their 289 // ulimits to the largest possible value for dockerd). 290 var rlim unix.Rlimit 291 if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil { 292 logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err) 293 return limit 294 } 295 softRlimit := int(rlim.Cur) 296 297 // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything. 298 if softRlimit > overhead*n { 299 return limit 300 } 301 302 // RLIMIT_NOFILE big enough, no need to adjust anything. 303 if softRlimit > overhead*limit { 304 return limit 305 } 306 307 logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit) 308 return softRlimit / overhead 309} 310 311func checkKernel() error { 312 // Check for unsupported kernel versions 313 // FIXME: it would be cleaner to not test for specific versions, but rather 314 // test for specific functionalities. 315 // Unfortunately we can't test for the feature "does not cause a kernel panic" 316 // without actually causing a kernel panic, so we need this workaround until 317 // the circumstances of pre-3.10 crashes are clearer. 318 // For details see https://github.com/docker/docker/issues/407 319 // Docker 1.11 and above doesn't actually run on kernels older than 3.4, 320 // due to containerd-shim usage of PR_SET_CHILD_SUBREAPER (introduced in 3.4). 321 if !kernel.CheckKernelVersion(3, 10, 0) { 322 v, _ := kernel.GetKernelVersion() 323 if os.Getenv("DOCKER_NOWARN_KERNEL_VERSION") == "" { 324 logrus.Fatalf("Your Linux kernel version %s is not supported for running docker. Please upgrade your kernel to 3.10.0 or newer.", v.String()) 325 } 326 } 327 return nil 328} 329 330// adaptContainerSettings is called during container creation to modify any 331// settings necessary in the HostConfig structure. 332func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error { 333 if adjustCPUShares && hostConfig.CPUShares > 0 { 334 // Handle unsupported CPUShares 335 if hostConfig.CPUShares < linuxMinCPUShares { 336 logrus.Warnf("Changing requested CPUShares of %d to minimum allowed of %d", hostConfig.CPUShares, linuxMinCPUShares) 337 hostConfig.CPUShares = linuxMinCPUShares 338 } else if hostConfig.CPUShares > linuxMaxCPUShares { 339 logrus.Warnf("Changing requested CPUShares of %d to maximum allowed of %d", hostConfig.CPUShares, linuxMaxCPUShares) 340 hostConfig.CPUShares = linuxMaxCPUShares 341 } 342 } 343 if hostConfig.Memory > 0 && hostConfig.MemorySwap == 0 { 344 // By default, MemorySwap is set to twice the size of Memory. 345 hostConfig.MemorySwap = hostConfig.Memory * 2 346 } 347 if hostConfig.ShmSize == 0 { 348 hostConfig.ShmSize = config.DefaultShmSize 349 if daemon.configStore != nil { 350 hostConfig.ShmSize = int64(daemon.configStore.ShmSize) 351 } 352 } 353 // Set default IPC mode, if unset for container 354 if hostConfig.IpcMode.IsEmpty() { 355 m := config.DefaultIpcMode 356 if daemon.configStore != nil { 357 m = daemon.configStore.IpcMode 358 } 359 hostConfig.IpcMode = containertypes.IpcMode(m) 360 } 361 362 // Set default cgroup namespace mode, if unset for container 363 if hostConfig.CgroupnsMode.IsEmpty() { 364 // for cgroup v2: unshare cgroupns even for privileged containers 365 // https://github.com/containers/libpod/pull/4374#issuecomment-549776387 366 if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified { 367 hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host") 368 } else { 369 m := "host" 370 if cgroups.Mode() == cgroups.Unified { 371 m = "private" 372 } 373 if daemon.configStore != nil { 374 m = daemon.configStore.CgroupNamespaceMode 375 } 376 hostConfig.CgroupnsMode = containertypes.CgroupnsMode(m) 377 } 378 } 379 380 adaptSharedNamespaceContainer(daemon, hostConfig) 381 382 var err error 383 secOpts, err := daemon.generateSecurityOpt(hostConfig) 384 if err != nil { 385 return err 386 } 387 hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, secOpts...) 388 if hostConfig.OomKillDisable == nil { 389 defaultOomKillDisable := false 390 hostConfig.OomKillDisable = &defaultOomKillDisable 391 } 392 393 return nil 394} 395 396// adaptSharedNamespaceContainer replaces container name with its ID in hostConfig. 397// To be more precisely, it modifies `container:name` to `container:ID` of PidMode, IpcMode 398// and NetworkMode. 399// 400// When a container shares its namespace with another container, use ID can keep the namespace 401// sharing connection between the two containers even the another container is renamed. 402func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) { 403 containerPrefix := "container:" 404 if hostConfig.PidMode.IsContainer() { 405 pidContainer := hostConfig.PidMode.Container() 406 // if there is any error returned here, we just ignore it and leave it to be 407 // handled in the following logic 408 if c, err := daemon.GetContainer(pidContainer); err == nil { 409 hostConfig.PidMode = containertypes.PidMode(containerPrefix + c.ID) 410 } 411 } 412 if hostConfig.IpcMode.IsContainer() { 413 ipcContainer := hostConfig.IpcMode.Container() 414 if c, err := daemon.GetContainer(ipcContainer); err == nil { 415 hostConfig.IpcMode = containertypes.IpcMode(containerPrefix + c.ID) 416 } 417 } 418 if hostConfig.NetworkMode.IsContainer() { 419 netContainer := hostConfig.NetworkMode.ConnectedContainer() 420 if c, err := daemon.GetContainer(netContainer); err == nil { 421 hostConfig.NetworkMode = containertypes.NetworkMode(containerPrefix + c.ID) 422 } 423 } 424} 425 426// verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration 427func verifyPlatformContainerResources(resources *containertypes.Resources, sysInfo *sysinfo.SysInfo, update bool) (warnings []string, err error) { 428 fixMemorySwappiness(resources) 429 430 // memory subsystem checks and adjustments 431 if resources.Memory != 0 && resources.Memory < linuxMinMemory { 432 return warnings, fmt.Errorf("Minimum memory limit allowed is 6MB") 433 } 434 if resources.Memory > 0 && !sysInfo.MemoryLimit { 435 warnings = append(warnings, "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 436 resources.Memory = 0 437 resources.MemorySwap = -1 438 } 439 if resources.Memory > 0 && resources.MemorySwap != -1 && !sysInfo.SwapLimit { 440 warnings = append(warnings, "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap.") 441 resources.MemorySwap = -1 442 } 443 if resources.Memory > 0 && resources.MemorySwap > 0 && resources.MemorySwap < resources.Memory { 444 return warnings, fmt.Errorf("Minimum memoryswap limit should be larger than memory limit, see usage") 445 } 446 if resources.Memory == 0 && resources.MemorySwap > 0 && !update { 447 return warnings, fmt.Errorf("You should always set the Memory limit when using Memoryswap limit, see usage") 448 } 449 if resources.MemorySwappiness != nil && !sysInfo.MemorySwappiness { 450 warnings = append(warnings, "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded.") 451 resources.MemorySwappiness = nil 452 } 453 if resources.MemorySwappiness != nil { 454 swappiness := *resources.MemorySwappiness 455 if swappiness < 0 || swappiness > 100 { 456 return warnings, fmt.Errorf("Invalid value: %v, valid memory swappiness range is 0-100", swappiness) 457 } 458 } 459 if resources.MemoryReservation > 0 && !sysInfo.MemoryReservation { 460 warnings = append(warnings, "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded.") 461 resources.MemoryReservation = 0 462 } 463 if resources.MemoryReservation > 0 && resources.MemoryReservation < linuxMinMemory { 464 return warnings, fmt.Errorf("Minimum memory reservation allowed is 4MB") 465 } 466 if resources.Memory > 0 && resources.MemoryReservation > 0 && resources.Memory < resources.MemoryReservation { 467 return warnings, fmt.Errorf("Minimum memory limit can not be less than memory reservation limit, see usage") 468 } 469 if resources.KernelMemory > 0 { 470 // Kernel memory limit is not supported on cgroup v2. 471 // Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4. 472 // https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7 473 warnings = append(warnings, "Specifying a kernel memory limit is deprecated and will be removed in a future release.") 474 } 475 if resources.KernelMemory > 0 && !sysInfo.KernelMemory { 476 warnings = append(warnings, "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded.") 477 resources.KernelMemory = 0 478 } 479 if resources.KernelMemory > 0 && resources.KernelMemory < linuxMinMemory { 480 return warnings, fmt.Errorf("Minimum kernel memory limit allowed is 4MB") 481 } 482 if resources.KernelMemory > 0 && !kernel.CheckKernelVersion(4, 0, 0) { 483 warnings = append(warnings, "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable.") 484 } 485 if resources.OomKillDisable != nil && !sysInfo.OomKillDisable { 486 // only produce warnings if the setting wasn't to *disable* the OOM Kill; no point 487 // warning the caller if they already wanted the feature to be off 488 if *resources.OomKillDisable { 489 warnings = append(warnings, "Your kernel does not support OomKillDisable. OomKillDisable discarded.") 490 } 491 resources.OomKillDisable = nil 492 } 493 if resources.OomKillDisable != nil && *resources.OomKillDisable && resources.Memory == 0 { 494 warnings = append(warnings, "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources.") 495 } 496 if resources.PidsLimit != nil && !sysInfo.PidsLimit { 497 if *resources.PidsLimit > 0 { 498 warnings = append(warnings, "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded.") 499 } 500 resources.PidsLimit = nil 501 } 502 503 // cpu subsystem checks and adjustments 504 if resources.NanoCPUs > 0 && resources.CPUPeriod > 0 { 505 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Period cannot both be set") 506 } 507 if resources.NanoCPUs > 0 && resources.CPUQuota > 0 { 508 return warnings, fmt.Errorf("Conflicting options: Nano CPUs and CPU Quota cannot both be set") 509 } 510 if resources.NanoCPUs > 0 && !sysInfo.CPUCfs { 511 return warnings, fmt.Errorf("NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted") 512 } 513 // The highest precision we could get on Linux is 0.001, by setting 514 // cpu.cfs_period_us=1000ms 515 // cpu.cfs_quota=1ms 516 // See the following link for details: 517 // https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt 518 // Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error. 519 // The error message is 0.01 so that this is consistent with Windows 520 if resources.NanoCPUs < 0 || resources.NanoCPUs > int64(sysinfo.NumCPU())*1e9 { 521 return warnings, fmt.Errorf("Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available", sysinfo.NumCPU(), sysinfo.NumCPU()) 522 } 523 524 if resources.CPUShares > 0 && !sysInfo.CPUShares { 525 warnings = append(warnings, "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded.") 526 resources.CPUShares = 0 527 } 528 if (resources.CPUPeriod != 0 || resources.CPUQuota != 0) && !sysInfo.CPUCfs { 529 warnings = append(warnings, "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded.") 530 resources.CPUPeriod = 0 531 resources.CPUQuota = 0 532 } 533 if resources.CPUPeriod != 0 && (resources.CPUPeriod < 1000 || resources.CPUPeriod > 1000000) { 534 return warnings, fmt.Errorf("CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)") 535 } 536 if resources.CPUQuota > 0 && resources.CPUQuota < 1000 { 537 return warnings, fmt.Errorf("CPU cfs quota can not be less than 1ms (i.e. 1000)") 538 } 539 if resources.CPUPercent > 0 { 540 warnings = append(warnings, fmt.Sprintf("%s does not support CPU percent. Percent discarded.", runtime.GOOS)) 541 resources.CPUPercent = 0 542 } 543 544 // cpuset subsystem checks and adjustments 545 if (resources.CpusetCpus != "" || resources.CpusetMems != "") && !sysInfo.Cpuset { 546 warnings = append(warnings, "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded.") 547 resources.CpusetCpus = "" 548 resources.CpusetMems = "" 549 } 550 cpusAvailable, err := sysInfo.IsCpusetCpusAvailable(resources.CpusetCpus) 551 if err != nil { 552 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset cpus", resources.CpusetCpus) 553 } 554 if !cpusAvailable { 555 return warnings, fmt.Errorf("Requested CPUs are not available - requested %s, available: %s", resources.CpusetCpus, sysInfo.Cpus) 556 } 557 memsAvailable, err := sysInfo.IsCpusetMemsAvailable(resources.CpusetMems) 558 if err != nil { 559 return warnings, errors.Wrapf(err, "Invalid value %s for cpuset mems", resources.CpusetMems) 560 } 561 if !memsAvailable { 562 return warnings, fmt.Errorf("Requested memory nodes are not available - requested %s, available: %s", resources.CpusetMems, sysInfo.Mems) 563 } 564 565 // blkio subsystem checks and adjustments 566 if resources.BlkioWeight > 0 && !sysInfo.BlkioWeight { 567 warnings = append(warnings, "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded.") 568 resources.BlkioWeight = 0 569 } 570 if resources.BlkioWeight > 0 && (resources.BlkioWeight < 10 || resources.BlkioWeight > 1000) { 571 return warnings, fmt.Errorf("Range of blkio weight is from 10 to 1000") 572 } 573 if resources.IOMaximumBandwidth != 0 || resources.IOMaximumIOps != 0 { 574 return warnings, fmt.Errorf("Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps", runtime.GOOS) 575 } 576 if len(resources.BlkioWeightDevice) > 0 && !sysInfo.BlkioWeightDevice { 577 warnings = append(warnings, "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded.") 578 resources.BlkioWeightDevice = []*pblkiodev.WeightDevice{} 579 } 580 if len(resources.BlkioDeviceReadBps) > 0 && !sysInfo.BlkioReadBpsDevice { 581 warnings = append(warnings, "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded.") 582 resources.BlkioDeviceReadBps = []*pblkiodev.ThrottleDevice{} 583 } 584 if len(resources.BlkioDeviceWriteBps) > 0 && !sysInfo.BlkioWriteBpsDevice { 585 warnings = append(warnings, "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded.") 586 resources.BlkioDeviceWriteBps = []*pblkiodev.ThrottleDevice{} 587 588 } 589 if len(resources.BlkioDeviceReadIOps) > 0 && !sysInfo.BlkioReadIOpsDevice { 590 warnings = append(warnings, "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded.") 591 resources.BlkioDeviceReadIOps = []*pblkiodev.ThrottleDevice{} 592 } 593 if len(resources.BlkioDeviceWriteIOps) > 0 && !sysInfo.BlkioWriteIOpsDevice { 594 warnings = append(warnings, "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded.") 595 resources.BlkioDeviceWriteIOps = []*pblkiodev.ThrottleDevice{} 596 } 597 598 return warnings, nil 599} 600 601func (daemon *Daemon) getCgroupDriver() string { 602 if UsingSystemd(daemon.configStore) { 603 return cgroupSystemdDriver 604 } 605 if daemon.Rootless() { 606 return cgroupNoneDriver 607 } 608 return cgroupFsDriver 609} 610 611// getCD gets the raw value of the native.cgroupdriver option, if set. 612func getCD(config *config.Config) string { 613 for _, option := range config.ExecOptions { 614 key, val, err := parsers.ParseKeyValueOpt(option) 615 if err != nil || !strings.EqualFold(key, "native.cgroupdriver") { 616 continue 617 } 618 return val 619 } 620 return "" 621} 622 623// VerifyCgroupDriver validates native.cgroupdriver 624func VerifyCgroupDriver(config *config.Config) error { 625 cd := getCD(config) 626 if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver { 627 return nil 628 } 629 if cd == cgroupNoneDriver { 630 return fmt.Errorf("native.cgroupdriver option %s is internally used and cannot be specified manually", cd) 631 } 632 return fmt.Errorf("native.cgroupdriver option %s not supported", cd) 633} 634 635// UsingSystemd returns true if cli option includes native.cgroupdriver=systemd 636func UsingSystemd(config *config.Config) bool { 637 if getCD(config) == cgroupSystemdDriver { 638 return true 639 } 640 // On cgroup v2 hosts, default to systemd driver 641 if getCD(config) == "" && cgroups.Mode() == cgroups.Unified && IsRunningSystemd() { 642 return true 643 } 644 return false 645} 646 647// IsRunningSystemd is from https://github.com/opencontainers/runc/blob/46be7b612e2533c494e6a251111de46d8e286ed5/libcontainer/cgroups/systemd/common.go#L27-L33 648func IsRunningSystemd() bool { 649 fi, err := os.Lstat("/run/systemd/system") 650 if err != nil { 651 return false 652 } 653 return fi.IsDir() 654} 655 656// verifyPlatformContainerSettings performs platform-specific validation of the 657// hostconfig and config structures. 658func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.HostConfig, update bool) (warnings []string, err error) { 659 if hostConfig == nil { 660 return nil, nil 661 } 662 sysInfo := daemon.RawSysInfo(true) 663 664 w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) 665 666 // no matter err is nil or not, w could have data in itself. 667 warnings = append(warnings, w...) 668 669 if err != nil { 670 return warnings, err 671 } 672 673 if hostConfig.ShmSize < 0 { 674 return warnings, fmt.Errorf("SHM size can not be less than 0") 675 } 676 677 if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 { 678 return warnings, fmt.Errorf("Invalid value %d, range for oom score adj is [-1000, 1000]", hostConfig.OomScoreAdj) 679 } 680 681 // ip-forwarding does not affect container with '--net=host' (or '--net=none') 682 if sysInfo.IPv4ForwardingDisabled && !(hostConfig.NetworkMode.IsHost() || hostConfig.NetworkMode.IsNone()) { 683 warnings = append(warnings, "IPv4 forwarding is disabled. Networking will not work.") 684 } 685 if hostConfig.NetworkMode.IsHost() && len(hostConfig.PortBindings) > 0 { 686 warnings = append(warnings, "Published ports are discarded when using host network mode") 687 } 688 689 // check for various conflicting options with user namespaces 690 if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() { 691 if hostConfig.Privileged { 692 return warnings, fmt.Errorf("privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode") 693 } 694 if hostConfig.NetworkMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 695 return warnings, fmt.Errorf("cannot share the host's network namespace when user namespaces are enabled") 696 } 697 if hostConfig.PidMode.IsHost() && !hostConfig.UsernsMode.IsHost() { 698 return warnings, fmt.Errorf("cannot share the host PID namespace when user namespaces are enabled") 699 } 700 } 701 if hostConfig.CgroupParent != "" && UsingSystemd(daemon.configStore) { 702 // CgroupParent for systemd cgroup should be named as "xxx.slice" 703 if len(hostConfig.CgroupParent) <= 6 || !strings.HasSuffix(hostConfig.CgroupParent, ".slice") { 704 return warnings, fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 705 } 706 } 707 if hostConfig.Runtime == "" { 708 hostConfig.Runtime = daemon.configStore.GetDefaultRuntimeName() 709 } 710 711 if rt := daemon.configStore.GetRuntime(hostConfig.Runtime); rt == nil { 712 return warnings, fmt.Errorf("Unknown runtime specified %s", hostConfig.Runtime) 713 } 714 715 parser := volumemounts.NewParser(runtime.GOOS) 716 for dest := range hostConfig.Tmpfs { 717 if err := parser.ValidateTmpfsMountDestination(dest); err != nil { 718 return warnings, err 719 } 720 } 721 722 if !hostConfig.CgroupnsMode.Valid() { 723 return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode) 724 } 725 if hostConfig.CgroupnsMode.IsPrivate() { 726 if !sysInfo.CgroupNamespaces { 727 warnings = append(warnings, "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded.") 728 } 729 } 730 731 if hostConfig.Runtime == config.LinuxV1RuntimeName || (hostConfig.Runtime == "" && daemon.configStore.DefaultRuntime == config.LinuxV1RuntimeName) { 732 warnings = append(warnings, fmt.Sprintf("Configured runtime %q is deprecated and will be removed in the next release.", config.LinuxV1RuntimeName)) 733 } 734 735 return warnings, nil 736} 737 738// verifyDaemonSettings performs validation of daemon config struct 739func verifyDaemonSettings(conf *config.Config) error { 740 if conf.ContainerdNamespace == conf.ContainerdPluginNamespace { 741 return errors.New("containers namespace and plugins namespace cannot be the same") 742 } 743 // Check for mutually incompatible config options 744 if conf.BridgeConfig.Iface != "" && conf.BridgeConfig.IP != "" { 745 return fmt.Errorf("You specified -b & --bip, mutually exclusive options. Please specify only one") 746 } 747 if !conf.BridgeConfig.EnableIPTables && !conf.BridgeConfig.InterContainerCommunication { 748 return fmt.Errorf("You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true") 749 } 750 if conf.BridgeConfig.EnableIP6Tables && !conf.Experimental { 751 return fmt.Errorf("ip6tables rules are only available if experimental features are enabled") 752 } 753 if !conf.BridgeConfig.EnableIPTables && conf.BridgeConfig.EnableIPMasq { 754 conf.BridgeConfig.EnableIPMasq = false 755 } 756 if err := VerifyCgroupDriver(conf); err != nil { 757 return err 758 } 759 if conf.CgroupParent != "" && UsingSystemd(conf) { 760 if len(conf.CgroupParent) <= 6 || !strings.HasSuffix(conf.CgroupParent, ".slice") { 761 return fmt.Errorf("cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"") 762 } 763 } 764 765 if conf.Rootless && UsingSystemd(conf) && cgroups.Mode() != cgroups.Unified { 766 return fmt.Errorf("exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode") 767 } 768 769 configureRuntimes(conf) 770 if rtName := conf.GetDefaultRuntimeName(); rtName != "" { 771 if conf.GetRuntime(rtName) == nil { 772 return fmt.Errorf("specified default runtime '%s' does not exist", rtName) 773 } 774 if rtName == config.LinuxV1RuntimeName { 775 logrus.Warnf("Configured default runtime %q is deprecated and will be removed in the next release.", config.LinuxV1RuntimeName) 776 } 777 } 778 return nil 779} 780 781// checkSystem validates platform-specific requirements 782func checkSystem() error { 783 return checkKernel() 784} 785 786// configureMaxThreads sets the Go runtime max threads threshold 787// which is 90% of the kernel setting from /proc/sys/kernel/threads-max 788func configureMaxThreads(config *config.Config) error { 789 mt, err := ioutil.ReadFile("/proc/sys/kernel/threads-max") 790 if err != nil { 791 return err 792 } 793 mtint, err := strconv.Atoi(strings.TrimSpace(string(mt))) 794 if err != nil { 795 return err 796 } 797 maxThreads := (mtint / 100) * 90 798 debug.SetMaxThreads(maxThreads) 799 logrus.Debugf("Golang's threads limit set to %d", maxThreads) 800 return nil 801} 802 803func overlaySupportsSelinux() (bool, error) { 804 f, err := os.Open("/proc/kallsyms") 805 if err != nil { 806 if os.IsNotExist(err) { 807 return false, nil 808 } 809 return false, err 810 } 811 defer f.Close() 812 813 s := bufio.NewScanner(f) 814 for s.Scan() { 815 if strings.HasSuffix(s.Text(), " security_inode_copy_up") { 816 return true, nil 817 } 818 } 819 820 return false, s.Err() 821} 822 823// configureKernelSecuritySupport configures and validates security support for the kernel 824func configureKernelSecuritySupport(config *config.Config, driverName string) error { 825 if config.EnableSelinuxSupport { 826 if !selinux.GetEnabled() { 827 logrus.Warn("Docker could not enable SELinux on the host system") 828 return nil 829 } 830 831 if driverName == "overlay" || driverName == "overlay2" { 832 // If driver is overlay or overlay2, make sure kernel 833 // supports selinux with overlay. 834 supported, err := overlaySupportsSelinux() 835 if err != nil { 836 return err 837 } 838 839 if !supported { 840 logrus.Warnf("SELinux is not supported with the %v graph driver on this kernel", driverName) 841 } 842 } 843 } else { 844 selinux.SetDisabled() 845 } 846 return nil 847} 848 849func (daemon *Daemon) initNetworkController(config *config.Config, activeSandboxes map[string]interface{}) (libnetwork.NetworkController, error) { 850 netOptions, err := daemon.networkOptions(config, daemon.PluginStore, activeSandboxes) 851 if err != nil { 852 return nil, err 853 } 854 855 controller, err := libnetwork.New(netOptions...) 856 if err != nil { 857 return nil, fmt.Errorf("error obtaining controller instance: %v", err) 858 } 859 860 if len(activeSandboxes) > 0 { 861 logrus.Info("There are old running containers, the network config will not take affect") 862 return controller, nil 863 } 864 865 // Initialize default network on "null" 866 if n, _ := controller.NetworkByName("none"); n == nil { 867 if _, err := controller.NewNetwork("null", "none", "", libnetwork.NetworkOptionPersist(true)); err != nil { 868 return nil, fmt.Errorf("Error creating default \"null\" network: %v", err) 869 } 870 } 871 872 // Initialize default network on "host" 873 if n, _ := controller.NetworkByName("host"); n == nil { 874 if _, err := controller.NewNetwork("host", "host", "", libnetwork.NetworkOptionPersist(true)); err != nil { 875 return nil, fmt.Errorf("Error creating default \"host\" network: %v", err) 876 } 877 } 878 879 // Clear stale bridge network 880 if n, err := controller.NetworkByName("bridge"); err == nil { 881 if err = n.Delete(); err != nil { 882 return nil, fmt.Errorf("could not delete the default bridge network: %v", err) 883 } 884 if len(config.NetworkConfig.DefaultAddressPools.Value()) > 0 && !daemon.configStore.LiveRestoreEnabled { 885 removeDefaultBridgeInterface() 886 } 887 } 888 889 if !config.DisableBridge { 890 // Initialize default driver "bridge" 891 if err := initBridgeDriver(controller, config); err != nil { 892 return nil, err 893 } 894 } else { 895 removeDefaultBridgeInterface() 896 } 897 898 // Set HostGatewayIP to the default bridge's IP if it is empty 899 if daemon.configStore.HostGatewayIP == nil && controller != nil { 900 if n, err := controller.NetworkByName("bridge"); err == nil { 901 v4Info, v6Info := n.Info().IpamInfo() 902 var gateway net.IP 903 if len(v4Info) > 0 { 904 gateway = v4Info[0].Gateway.IP 905 } else if len(v6Info) > 0 { 906 gateway = v6Info[0].Gateway.IP 907 } 908 daemon.configStore.HostGatewayIP = gateway 909 } 910 } 911 return controller, nil 912} 913 914func driverOptions(config *config.Config) []nwconfig.Option { 915 bridgeConfig := options.Generic{ 916 "EnableIPForwarding": config.BridgeConfig.EnableIPForward, 917 "EnableIPTables": config.BridgeConfig.EnableIPTables, 918 "EnableIP6Tables": config.BridgeConfig.EnableIP6Tables, 919 "EnableUserlandProxy": config.BridgeConfig.EnableUserlandProxy, 920 "UserlandProxyPath": config.BridgeConfig.UserlandProxyPath} 921 bridgeOption := options.Generic{netlabel.GenericData: bridgeConfig} 922 923 dOptions := []nwconfig.Option{} 924 dOptions = append(dOptions, nwconfig.OptionDriverConfig("bridge", bridgeOption)) 925 return dOptions 926} 927 928func initBridgeDriver(controller libnetwork.NetworkController, config *config.Config) error { 929 bridgeName := bridge.DefaultBridgeName 930 if config.BridgeConfig.Iface != "" { 931 bridgeName = config.BridgeConfig.Iface 932 } 933 netOption := map[string]string{ 934 bridge.BridgeName: bridgeName, 935 bridge.DefaultBridge: strconv.FormatBool(true), 936 netlabel.DriverMTU: strconv.Itoa(config.Mtu), 937 bridge.EnableIPMasquerade: strconv.FormatBool(config.BridgeConfig.EnableIPMasq), 938 bridge.EnableICC: strconv.FormatBool(config.BridgeConfig.InterContainerCommunication), 939 } 940 941 // --ip processing 942 if config.BridgeConfig.DefaultIP != nil { 943 netOption[bridge.DefaultBindingIP] = config.BridgeConfig.DefaultIP.String() 944 } 945 946 ipamV4Conf := &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 947 948 nwList, nw6List, err := netutils.ElectInterfaceAddresses(bridgeName) 949 if err != nil { 950 return errors.Wrap(err, "list bridge addresses failed") 951 } 952 953 nw := nwList[0] 954 if len(nwList) > 1 && config.BridgeConfig.FixedCIDR != "" { 955 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 956 if err != nil { 957 return errors.Wrap(err, "parse CIDR failed") 958 } 959 // Iterate through in case there are multiple addresses for the bridge 960 for _, entry := range nwList { 961 if fCIDR.Contains(entry.IP) { 962 nw = entry 963 break 964 } 965 } 966 } 967 968 ipamV4Conf.PreferredPool = lntypes.GetIPNetCanonical(nw).String() 969 hip, _ := lntypes.GetHostPartIP(nw.IP, nw.Mask) 970 if hip.IsGlobalUnicast() { 971 ipamV4Conf.Gateway = nw.IP.String() 972 } 973 974 if config.BridgeConfig.IP != "" { 975 ip, ipNet, err := net.ParseCIDR(config.BridgeConfig.IP) 976 if err != nil { 977 return err 978 } 979 ipamV4Conf.PreferredPool = ipNet.String() 980 ipamV4Conf.Gateway = ip.String() 981 } else if bridgeName == bridge.DefaultBridgeName && ipamV4Conf.PreferredPool != "" { 982 logrus.Infof("Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address", bridgeName, ipamV4Conf.PreferredPool) 983 } 984 985 if config.BridgeConfig.FixedCIDR != "" { 986 _, fCIDR, err := net.ParseCIDR(config.BridgeConfig.FixedCIDR) 987 if err != nil { 988 return err 989 } 990 991 ipamV4Conf.SubPool = fCIDR.String() 992 } 993 994 if config.BridgeConfig.DefaultGatewayIPv4 != nil { 995 ipamV4Conf.AuxAddresses["DefaultGatewayIPv4"] = config.BridgeConfig.DefaultGatewayIPv4.String() 996 } 997 998 var ( 999 deferIPv6Alloc bool 1000 ipamV6Conf *libnetwork.IpamConf 1001 ) 1002 1003 if config.BridgeConfig.EnableIPv6 && config.BridgeConfig.FixedCIDRv6 == "" { 1004 return errdefs.InvalidParameter(errors.New("IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6")) 1005 } else if config.BridgeConfig.FixedCIDRv6 != "" { 1006 _, fCIDRv6, err := net.ParseCIDR(config.BridgeConfig.FixedCIDRv6) 1007 if err != nil { 1008 return err 1009 } 1010 1011 // In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has 1012 // at least 48 host bits, we need to guarantee the current behavior where the containers' 1013 // IPv6 addresses will be constructed based on the containers' interface MAC address. 1014 // We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints 1015 // on this network until after the driver has created the endpoint and returned the 1016 // constructed address. Libnetwork will then reserve this address with the ipam driver. 1017 ones, _ := fCIDRv6.Mask.Size() 1018 deferIPv6Alloc = ones <= 80 1019 1020 ipamV6Conf = &libnetwork.IpamConf{ 1021 AuxAddresses: make(map[string]string), 1022 PreferredPool: fCIDRv6.String(), 1023 } 1024 1025 // In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6 1026 // address belongs to the same network, we need to inform libnetwork about it, so 1027 // that it can be reserved with IPAM and it will not be given away to somebody else 1028 for _, nw6 := range nw6List { 1029 if fCIDRv6.Contains(nw6.IP) { 1030 ipamV6Conf.Gateway = nw6.IP.String() 1031 break 1032 } 1033 } 1034 } 1035 1036 if config.BridgeConfig.DefaultGatewayIPv6 != nil { 1037 if ipamV6Conf == nil { 1038 ipamV6Conf = &libnetwork.IpamConf{AuxAddresses: make(map[string]string)} 1039 } 1040 ipamV6Conf.AuxAddresses["DefaultGatewayIPv6"] = config.BridgeConfig.DefaultGatewayIPv6.String() 1041 } 1042 1043 v4Conf := []*libnetwork.IpamConf{ipamV4Conf} 1044 v6Conf := []*libnetwork.IpamConf{} 1045 if ipamV6Conf != nil { 1046 v6Conf = append(v6Conf, ipamV6Conf) 1047 } 1048 // Initialize default network on "bridge" with the same name 1049 _, err = controller.NewNetwork("bridge", "bridge", "", 1050 libnetwork.NetworkOptionEnableIPv6(config.BridgeConfig.EnableIPv6), 1051 libnetwork.NetworkOptionDriverOpts(netOption), 1052 libnetwork.NetworkOptionIpam("default", "", v4Conf, v6Conf, nil), 1053 libnetwork.NetworkOptionDeferIPv6Alloc(deferIPv6Alloc)) 1054 if err != nil { 1055 return fmt.Errorf("Error creating default \"bridge\" network: %v", err) 1056 } 1057 return nil 1058} 1059 1060// Remove default bridge interface if present (--bridge=none use case) 1061func removeDefaultBridgeInterface() { 1062 if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil { 1063 if err := netlink.LinkDel(lnk); err != nil { 1064 logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err) 1065 } 1066 } 1067} 1068 1069func setupInitLayer(idMapping *idtools.IdentityMapping) func(containerfs.ContainerFS) error { 1070 return func(initPath containerfs.ContainerFS) error { 1071 return initlayer.Setup(initPath, idMapping.RootPair()) 1072 } 1073} 1074 1075// Parse the remapped root (user namespace) option, which can be one of: 1076// username - valid username from /etc/passwd 1077// username:groupname - valid username; valid groupname from /etc/group 1078// uid - 32-bit unsigned int valid Linux UID value 1079// uid:gid - uid value; 32-bit unsigned int Linux GID value 1080// 1081// If no groupname is specified, and a username is specified, an attempt 1082// will be made to lookup a gid for that username as a groupname 1083// 1084// If names are used, they are verified to exist in passwd/group 1085func parseRemappedRoot(usergrp string) (string, string, error) { 1086 1087 var ( 1088 userID, groupID int 1089 username, groupname string 1090 ) 1091 1092 idparts := strings.Split(usergrp, ":") 1093 if len(idparts) > 2 { 1094 return "", "", fmt.Errorf("Invalid user/group specification in --userns-remap: %q", usergrp) 1095 } 1096 1097 if uid, err := strconv.ParseInt(idparts[0], 10, 32); err == nil { 1098 // must be a uid; take it as valid 1099 userID = int(uid) 1100 luser, err := idtools.LookupUID(userID) 1101 if err != nil { 1102 return "", "", fmt.Errorf("Uid %d has no entry in /etc/passwd: %v", userID, err) 1103 } 1104 username = luser.Name 1105 if len(idparts) == 1 { 1106 // if the uid was numeric and no gid was specified, take the uid as the gid 1107 groupID = userID 1108 lgrp, err := idtools.LookupGID(groupID) 1109 if err != nil { 1110 return "", "", fmt.Errorf("Gid %d has no entry in /etc/group: %v", groupID, err) 1111 } 1112 groupname = lgrp.Name 1113 } 1114 } else { 1115 lookupName := idparts[0] 1116 // special case: if the user specified "default", they want Docker to create or 1117 // use (after creation) the "dockremap" user/group for root remapping 1118 if lookupName == defaultIDSpecifier { 1119 lookupName = defaultRemappedID 1120 } 1121 luser, err := idtools.LookupUser(lookupName) 1122 if err != nil && idparts[0] != defaultIDSpecifier { 1123 // error if the name requested isn't the special "dockremap" ID 1124 return "", "", fmt.Errorf("Error during uid lookup for %q: %v", lookupName, err) 1125 } else if err != nil { 1126 // special case-- if the username == "default", then we have been asked 1127 // to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid} 1128 // ranges will be used for the user and group mappings in user namespaced containers 1129 _, _, err := idtools.AddNamespaceRangesUser(defaultRemappedID) 1130 if err == nil { 1131 return defaultRemappedID, defaultRemappedID, nil 1132 } 1133 return "", "", fmt.Errorf("Error during %q user creation: %v", defaultRemappedID, err) 1134 } 1135 username = luser.Name 1136 if len(idparts) == 1 { 1137 // we only have a string username, and no group specified; look up gid from username as group 1138 group, err := idtools.LookupGroup(lookupName) 1139 if err != nil { 1140 return "", "", fmt.Errorf("Error during gid lookup for %q: %v", lookupName, err) 1141 } 1142 groupname = group.Name 1143 } 1144 } 1145 1146 if len(idparts) == 2 { 1147 // groupname or gid is separately specified and must be resolved 1148 // to an unsigned 32-bit gid 1149 if gid, err := strconv.ParseInt(idparts[1], 10, 32); err == nil { 1150 // must be a gid, take it as valid 1151 groupID = int(gid) 1152 lgrp, err := idtools.LookupGID(groupID) 1153 if err != nil { 1154 return "", "", fmt.Errorf("Gid %d has no entry in /etc/passwd: %v", groupID, err) 1155 } 1156 groupname = lgrp.Name 1157 } else { 1158 // not a number; attempt a lookup 1159 if _, err := idtools.LookupGroup(idparts[1]); err != nil { 1160 return "", "", fmt.Errorf("Error during groupname lookup for %q: %v", idparts[1], err) 1161 } 1162 groupname = idparts[1] 1163 } 1164 } 1165 return username, groupname, nil 1166} 1167 1168func setupRemappedRoot(config *config.Config) (*idtools.IdentityMapping, error) { 1169 if runtime.GOOS != "linux" && config.RemappedRoot != "" { 1170 return nil, fmt.Errorf("User namespaces are only supported on Linux") 1171 } 1172 1173 // if the daemon was started with remapped root option, parse 1174 // the config option to the int uid,gid values 1175 if config.RemappedRoot != "" { 1176 username, groupname, err := parseRemappedRoot(config.RemappedRoot) 1177 if err != nil { 1178 return nil, err 1179 } 1180 if username == "root" { 1181 // Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op 1182 // effectively 1183 logrus.Warn("User namespaces: root cannot be remapped with itself; user namespaces are OFF") 1184 return &idtools.IdentityMapping{}, nil 1185 } 1186 logrus.Infof("User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s", username) 1187 // update remapped root setting now that we have resolved them to actual names 1188 config.RemappedRoot = fmt.Sprintf("%s:%s", username, groupname) 1189 1190 mappings, err := idtools.NewIdentityMapping(username) 1191 if err != nil { 1192 return nil, errors.Wrap(err, "Can't create ID mappings") 1193 } 1194 return mappings, nil 1195 } 1196 return &idtools.IdentityMapping{}, nil 1197} 1198 1199func setupDaemonRoot(config *config.Config, rootDir string, rootIdentity idtools.Identity) error { 1200 config.Root = rootDir 1201 // the docker root metadata directory needs to have execute permissions for all users (g+x,o+x) 1202 // so that syscalls executing as non-root, operating on subdirectories of the graph root 1203 // (e.g. mounted layers of a container) can traverse this path. 1204 // The user namespace support will create subdirectories for the remapped root host uid:gid 1205 // pair owned by that same uid:gid pair for proper write access to those needed metadata and 1206 // layer content subtrees. 1207 if _, err := os.Stat(rootDir); err == nil { 1208 // root current exists; verify the access bits are correct by setting them 1209 if err = os.Chmod(rootDir, 0711); err != nil { 1210 return err 1211 } 1212 } else if os.IsNotExist(err) { 1213 // no root exists yet, create it 0711 with root:root ownership 1214 if err := os.MkdirAll(rootDir, 0711); err != nil { 1215 return err 1216 } 1217 } 1218 1219 // if user namespaces are enabled we will create a subtree underneath the specified root 1220 // with any/all specified remapped root uid/gid options on the daemon creating 1221 // a new subdirectory with ownership set to the remapped uid/gid (so as to allow 1222 // `chdir()` to work for containers namespaced to that uid/gid) 1223 if config.RemappedRoot != "" { 1224 config.Root = filepath.Join(rootDir, fmt.Sprintf("%d.%d", rootIdentity.UID, rootIdentity.GID)) 1225 logrus.Debugf("Creating user namespaced daemon root: %s", config.Root) 1226 // Create the root directory if it doesn't exist 1227 if err := idtools.MkdirAllAndChown(config.Root, 0700, rootIdentity); err != nil { 1228 return fmt.Errorf("Cannot create daemon root: %s: %v", config.Root, err) 1229 } 1230 // we also need to verify that any pre-existing directories in the path to 1231 // the graphroot won't block access to remapped root--if any pre-existing directory 1232 // has strict permissions that don't allow "x", container start will fail, so 1233 // better to warn and fail now 1234 dirPath := config.Root 1235 for { 1236 dirPath = filepath.Dir(dirPath) 1237 if dirPath == "/" { 1238 break 1239 } 1240 if !idtools.CanAccess(dirPath, rootIdentity) { 1241 return fmt.Errorf("a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories", config.Root) 1242 } 1243 } 1244 } 1245 1246 if err := setupDaemonRootPropagation(config); err != nil { 1247 logrus.WithError(err).WithField("dir", config.Root).Warn("Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior") 1248 } 1249 return nil 1250} 1251 1252func setupDaemonRootPropagation(cfg *config.Config) error { 1253 rootParentMount, mountOptions, err := getSourceMount(cfg.Root) 1254 if err != nil { 1255 return errors.Wrap(err, "error getting daemon root's parent mount") 1256 } 1257 1258 var cleanupOldFile bool 1259 cleanupFile := getUnmountOnShutdownPath(cfg) 1260 defer func() { 1261 if !cleanupOldFile { 1262 return 1263 } 1264 if err := os.Remove(cleanupFile); err != nil && !os.IsNotExist(err) { 1265 logrus.WithError(err).WithField("file", cleanupFile).Warn("could not clean up old root propagation unmount file") 1266 } 1267 }() 1268 1269 if hasMountInfoOption(mountOptions, sharedPropagationOption, slavePropagationOption) { 1270 cleanupOldFile = true 1271 return nil 1272 } 1273 1274 if err := mount.MakeShared(cfg.Root); err != nil { 1275 return errors.Wrap(err, "could not setup daemon root propagation to shared") 1276 } 1277 1278 // check the case where this may have already been a mount to itself. 1279 // If so then the daemon only performed a remount and should not try to unmount this later. 1280 if rootParentMount == cfg.Root { 1281 cleanupOldFile = true 1282 return nil 1283 } 1284 1285 if err := os.MkdirAll(filepath.Dir(cleanupFile), 0700); err != nil { 1286 return errors.Wrap(err, "error creating dir to store mount cleanup file") 1287 } 1288 1289 if err := ioutil.WriteFile(cleanupFile, nil, 0600); err != nil { 1290 return errors.Wrap(err, "error writing file to signal mount cleanup on shutdown") 1291 } 1292 return nil 1293} 1294 1295// getUnmountOnShutdownPath generates the path to used when writing the file that signals to the daemon that on shutdown 1296// the daemon root should be unmounted. 1297func getUnmountOnShutdownPath(config *config.Config) string { 1298 return filepath.Join(config.ExecRoot, "unmount-on-shutdown") 1299} 1300 1301// registerLinks writes the links to a file. 1302func (daemon *Daemon) registerLinks(container *container.Container, hostConfig *containertypes.HostConfig) error { 1303 if hostConfig == nil || hostConfig.NetworkMode.IsUserDefined() { 1304 return nil 1305 } 1306 1307 for _, l := range hostConfig.Links { 1308 name, alias, err := opts.ParseLink(l) 1309 if err != nil { 1310 return err 1311 } 1312 child, err := daemon.GetContainer(name) 1313 if err != nil { 1314 if errdefs.IsNotFound(err) { 1315 // Trying to link to a non-existing container is not valid, and 1316 // should return an "invalid parameter" error. Returning a "not 1317 // found" error here would make the client report the container's 1318 // image could not be found (see moby/moby#39823) 1319 err = errdefs.InvalidParameter(err) 1320 } 1321 return errors.Wrapf(err, "could not get container for %s", name) 1322 } 1323 for child.HostConfig.NetworkMode.IsContainer() { 1324 parts := strings.SplitN(string(child.HostConfig.NetworkMode), ":", 2) 1325 child, err = daemon.GetContainer(parts[1]) 1326 if err != nil { 1327 if errdefs.IsNotFound(err) { 1328 // Trying to link to a non-existing container is not valid, and 1329 // should return an "invalid parameter" error. Returning a "not 1330 // found" error here would make the client report the container's 1331 // image could not be found (see moby/moby#39823) 1332 err = errdefs.InvalidParameter(err) 1333 } 1334 return errors.Wrapf(err, "Could not get container for %s", parts[1]) 1335 } 1336 } 1337 if child.HostConfig.NetworkMode.IsHost() { 1338 return runconfig.ErrConflictHostNetworkAndLinks 1339 } 1340 if err := daemon.registerLink(container, child, alias); err != nil { 1341 return err 1342 } 1343 } 1344 1345 // After we load all the links into the daemon 1346 // set them to nil on the hostconfig 1347 _, err := container.WriteHostConfig() 1348 return err 1349} 1350 1351// conditionalMountOnStart is a platform specific helper function during the 1352// container start to call mount. 1353func (daemon *Daemon) conditionalMountOnStart(container *container.Container) error { 1354 return daemon.Mount(container) 1355} 1356 1357// conditionalUnmountOnCleanup is a platform specific helper function called 1358// during the cleanup of a container to unmount. 1359func (daemon *Daemon) conditionalUnmountOnCleanup(container *container.Container) error { 1360 return daemon.Unmount(container) 1361} 1362 1363func copyBlkioEntry(entries []*statsV1.BlkIOEntry) []types.BlkioStatEntry { 1364 out := make([]types.BlkioStatEntry, len(entries)) 1365 for i, re := range entries { 1366 out[i] = types.BlkioStatEntry{ 1367 Major: re.Major, 1368 Minor: re.Minor, 1369 Op: re.Op, 1370 Value: re.Value, 1371 } 1372 } 1373 return out 1374} 1375 1376func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) { 1377 if !c.IsRunning() { 1378 return nil, errNotRunning(c.ID) 1379 } 1380 cs, err := daemon.containerd.Stats(context.Background(), c.ID) 1381 if err != nil { 1382 if strings.Contains(err.Error(), "container not found") { 1383 return nil, containerNotFound(c.ID) 1384 } 1385 return nil, err 1386 } 1387 s := &types.StatsJSON{} 1388 s.Read = cs.Read 1389 stats := cs.Metrics 1390 switch t := stats.(type) { 1391 case *statsV1.Metrics: 1392 return daemon.statsV1(s, t) 1393 case *statsV2.Metrics: 1394 return daemon.statsV2(s, t) 1395 default: 1396 return nil, errors.Errorf("unexpected type of metrics %+v", t) 1397 } 1398} 1399 1400func (daemon *Daemon) statsV1(s *types.StatsJSON, stats *statsV1.Metrics) (*types.StatsJSON, error) { 1401 if stats.Blkio != nil { 1402 s.BlkioStats = types.BlkioStats{ 1403 IoServiceBytesRecursive: copyBlkioEntry(stats.Blkio.IoServiceBytesRecursive), 1404 IoServicedRecursive: copyBlkioEntry(stats.Blkio.IoServicedRecursive), 1405 IoQueuedRecursive: copyBlkioEntry(stats.Blkio.IoQueuedRecursive), 1406 IoServiceTimeRecursive: copyBlkioEntry(stats.Blkio.IoServiceTimeRecursive), 1407 IoWaitTimeRecursive: copyBlkioEntry(stats.Blkio.IoWaitTimeRecursive), 1408 IoMergedRecursive: copyBlkioEntry(stats.Blkio.IoMergedRecursive), 1409 IoTimeRecursive: copyBlkioEntry(stats.Blkio.IoTimeRecursive), 1410 SectorsRecursive: copyBlkioEntry(stats.Blkio.SectorsRecursive), 1411 } 1412 } 1413 if stats.CPU != nil { 1414 s.CPUStats = types.CPUStats{ 1415 CPUUsage: types.CPUUsage{ 1416 TotalUsage: stats.CPU.Usage.Total, 1417 PercpuUsage: stats.CPU.Usage.PerCPU, 1418 UsageInKernelmode: stats.CPU.Usage.Kernel, 1419 UsageInUsermode: stats.CPU.Usage.User, 1420 }, 1421 ThrottlingData: types.ThrottlingData{ 1422 Periods: stats.CPU.Throttling.Periods, 1423 ThrottledPeriods: stats.CPU.Throttling.ThrottledPeriods, 1424 ThrottledTime: stats.CPU.Throttling.ThrottledTime, 1425 }, 1426 } 1427 } 1428 1429 if stats.Memory != nil { 1430 raw := make(map[string]uint64) 1431 raw["cache"] = stats.Memory.Cache 1432 raw["rss"] = stats.Memory.RSS 1433 raw["rss_huge"] = stats.Memory.RSSHuge 1434 raw["mapped_file"] = stats.Memory.MappedFile 1435 raw["dirty"] = stats.Memory.Dirty 1436 raw["writeback"] = stats.Memory.Writeback 1437 raw["pgpgin"] = stats.Memory.PgPgIn 1438 raw["pgpgout"] = stats.Memory.PgPgOut 1439 raw["pgfault"] = stats.Memory.PgFault 1440 raw["pgmajfault"] = stats.Memory.PgMajFault 1441 raw["inactive_anon"] = stats.Memory.InactiveAnon 1442 raw["active_anon"] = stats.Memory.ActiveAnon 1443 raw["inactive_file"] = stats.Memory.InactiveFile 1444 raw["active_file"] = stats.Memory.ActiveFile 1445 raw["unevictable"] = stats.Memory.Unevictable 1446 raw["hierarchical_memory_limit"] = stats.Memory.HierarchicalMemoryLimit 1447 raw["hierarchical_memsw_limit"] = stats.Memory.HierarchicalSwapLimit 1448 raw["total_cache"] = stats.Memory.TotalCache 1449 raw["total_rss"] = stats.Memory.TotalRSS 1450 raw["total_rss_huge"] = stats.Memory.TotalRSSHuge 1451 raw["total_mapped_file"] = stats.Memory.TotalMappedFile 1452 raw["total_dirty"] = stats.Memory.TotalDirty 1453 raw["total_writeback"] = stats.Memory.TotalWriteback 1454 raw["total_pgpgin"] = stats.Memory.TotalPgPgIn 1455 raw["total_pgpgout"] = stats.Memory.TotalPgPgOut 1456 raw["total_pgfault"] = stats.Memory.TotalPgFault 1457 raw["total_pgmajfault"] = stats.Memory.TotalPgMajFault 1458 raw["total_inactive_anon"] = stats.Memory.TotalInactiveAnon 1459 raw["total_active_anon"] = stats.Memory.TotalActiveAnon 1460 raw["total_inactive_file"] = stats.Memory.TotalInactiveFile 1461 raw["total_active_file"] = stats.Memory.TotalActiveFile 1462 raw["total_unevictable"] = stats.Memory.TotalUnevictable 1463 1464 if stats.Memory.Usage != nil { 1465 s.MemoryStats = types.MemoryStats{ 1466 Stats: raw, 1467 Usage: stats.Memory.Usage.Usage, 1468 MaxUsage: stats.Memory.Usage.Max, 1469 Limit: stats.Memory.Usage.Limit, 1470 Failcnt: stats.Memory.Usage.Failcnt, 1471 } 1472 } else { 1473 s.MemoryStats = types.MemoryStats{ 1474 Stats: raw, 1475 } 1476 } 1477 1478 // if the container does not set memory limit, use the machineMemory 1479 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1480 s.MemoryStats.Limit = daemon.machineMemory 1481 } 1482 } 1483 1484 if stats.Pids != nil { 1485 s.PidsStats = types.PidsStats{ 1486 Current: stats.Pids.Current, 1487 Limit: stats.Pids.Limit, 1488 } 1489 } 1490 1491 return s, nil 1492} 1493 1494func (daemon *Daemon) statsV2(s *types.StatsJSON, stats *statsV2.Metrics) (*types.StatsJSON, error) { 1495 if stats.Io != nil { 1496 var isbr []types.BlkioStatEntry 1497 for _, re := range stats.Io.Usage { 1498 isbr = append(isbr, 1499 types.BlkioStatEntry{ 1500 Major: re.Major, 1501 Minor: re.Minor, 1502 Op: "read", 1503 Value: re.Rbytes, 1504 }, 1505 types.BlkioStatEntry{ 1506 Major: re.Major, 1507 Minor: re.Minor, 1508 Op: "write", 1509 Value: re.Wbytes, 1510 }, 1511 ) 1512 } 1513 s.BlkioStats = types.BlkioStats{ 1514 IoServiceBytesRecursive: isbr, 1515 // Other fields are unsupported 1516 } 1517 } 1518 1519 if stats.CPU != nil { 1520 s.CPUStats = types.CPUStats{ 1521 CPUUsage: types.CPUUsage{ 1522 TotalUsage: stats.CPU.UsageUsec * 1000, 1523 // PercpuUsage is not supported 1524 UsageInKernelmode: stats.CPU.SystemUsec * 1000, 1525 UsageInUsermode: stats.CPU.UserUsec * 1000, 1526 }, 1527 ThrottlingData: types.ThrottlingData{ 1528 Periods: stats.CPU.NrPeriods, 1529 ThrottledPeriods: stats.CPU.NrThrottled, 1530 ThrottledTime: stats.CPU.ThrottledUsec * 1000, 1531 }, 1532 } 1533 } 1534 1535 if stats.Memory != nil { 1536 raw := make(map[string]uint64) 1537 raw["anon"] = stats.Memory.Anon 1538 raw["file"] = stats.Memory.File 1539 raw["kernel_stack"] = stats.Memory.KernelStack 1540 raw["slab"] = stats.Memory.Slab 1541 raw["sock"] = stats.Memory.Sock 1542 raw["shmem"] = stats.Memory.Shmem 1543 raw["file_mapped"] = stats.Memory.FileMapped 1544 raw["file_dirty"] = stats.Memory.FileDirty 1545 raw["file_writeback"] = stats.Memory.FileWriteback 1546 raw["anon_thp"] = stats.Memory.AnonThp 1547 raw["inactive_anon"] = stats.Memory.InactiveAnon 1548 raw["active_anon"] = stats.Memory.ActiveAnon 1549 raw["inactive_file"] = stats.Memory.InactiveFile 1550 raw["active_file"] = stats.Memory.ActiveFile 1551 raw["unevictable"] = stats.Memory.Unevictable 1552 raw["slab_reclaimable"] = stats.Memory.SlabReclaimable 1553 raw["slab_unreclaimable"] = stats.Memory.SlabUnreclaimable 1554 raw["pgfault"] = stats.Memory.Pgfault 1555 raw["pgmajfault"] = stats.Memory.Pgmajfault 1556 raw["workingset_refault"] = stats.Memory.WorkingsetRefault 1557 raw["workingset_activate"] = stats.Memory.WorkingsetActivate 1558 raw["workingset_nodereclaim"] = stats.Memory.WorkingsetNodereclaim 1559 raw["pgrefill"] = stats.Memory.Pgrefill 1560 raw["pgscan"] = stats.Memory.Pgscan 1561 raw["pgsteal"] = stats.Memory.Pgsteal 1562 raw["pgactivate"] = stats.Memory.Pgactivate 1563 raw["pgdeactivate"] = stats.Memory.Pgdeactivate 1564 raw["pglazyfree"] = stats.Memory.Pglazyfree 1565 raw["pglazyfreed"] = stats.Memory.Pglazyfreed 1566 raw["thp_fault_alloc"] = stats.Memory.ThpFaultAlloc 1567 raw["thp_collapse_alloc"] = stats.Memory.ThpCollapseAlloc 1568 s.MemoryStats = types.MemoryStats{ 1569 // Stats is not compatible with v1 1570 Stats: raw, 1571 Usage: stats.Memory.Usage, 1572 // MaxUsage is not supported 1573 Limit: stats.Memory.UsageLimit, 1574 } 1575 // if the container does not set memory limit, use the machineMemory 1576 if s.MemoryStats.Limit > daemon.machineMemory && daemon.machineMemory > 0 { 1577 s.MemoryStats.Limit = daemon.machineMemory 1578 } 1579 if stats.MemoryEvents != nil { 1580 // Failcnt is set to the "oom" field of the "memory.events" file. 1581 // See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html 1582 s.MemoryStats.Failcnt = stats.MemoryEvents.Oom 1583 } 1584 } 1585 1586 if stats.Pids != nil { 1587 s.PidsStats = types.PidsStats{ 1588 Current: stats.Pids.Current, 1589 Limit: stats.Pids.Limit, 1590 } 1591 } 1592 1593 return s, nil 1594} 1595 1596// setDefaultIsolation determines the default isolation mode for the 1597// daemon to run in. This is only applicable on Windows 1598func (daemon *Daemon) setDefaultIsolation() error { 1599 return nil 1600} 1601 1602// setupDaemonProcess sets various settings for the daemon's process 1603func setupDaemonProcess(config *config.Config) error { 1604 // setup the daemons oom_score_adj 1605 if err := setupOOMScoreAdj(config.OOMScoreAdjust); err != nil { 1606 return err 1607 } 1608 if err := setMayDetachMounts(); err != nil { 1609 logrus.WithError(err).Warn("Could not set may_detach_mounts kernel parameter") 1610 } 1611 return nil 1612} 1613 1614// This is used to allow removal of mountpoints that may be mounted in other 1615// namespaces on RHEL based kernels starting from RHEL 7.4. 1616// Without this setting, removals on these RHEL based kernels may fail with 1617// "device or resource busy". 1618// This setting is not available in upstream kernels as it is not configurable, 1619// but has been in the upstream kernels since 3.15. 1620func setMayDetachMounts() error { 1621 f, err := os.OpenFile("/proc/sys/fs/may_detach_mounts", os.O_WRONLY, 0) 1622 if err != nil { 1623 if os.IsNotExist(err) { 1624 return nil 1625 } 1626 return errors.Wrap(err, "error opening may_detach_mounts kernel config file") 1627 } 1628 defer f.Close() 1629 1630 _, err = f.WriteString("1") 1631 if os.IsPermission(err) { 1632 // Setting may_detach_mounts does not work in an 1633 // unprivileged container. Ignore the error, but log 1634 // it if we appear not to be in that situation. 1635 if !sys.RunningInUserNS() { 1636 logrus.Debugf("Permission denied writing %q to /proc/sys/fs/may_detach_mounts", "1") 1637 } 1638 return nil 1639 } 1640 return err 1641} 1642 1643func setupOOMScoreAdj(score int) error { 1644 if score == 0 { 1645 return nil 1646 } 1647 f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0) 1648 if err != nil { 1649 return err 1650 } 1651 defer f.Close() 1652 stringScore := strconv.Itoa(score) 1653 _, err = f.WriteString(stringScore) 1654 if os.IsPermission(err) { 1655 // Setting oom_score_adj does not work in an 1656 // unprivileged container. Ignore the error, but log 1657 // it if we appear not to be in that situation. 1658 if !sys.RunningInUserNS() { 1659 logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore) 1660 } 1661 return nil 1662 } 1663 1664 return err 1665} 1666 1667func (daemon *Daemon) initCPURtController(mnt, path string) error { 1668 if path == "/" || path == "." { 1669 return nil 1670 } 1671 1672 // Recursively create cgroup to ensure that the system and all parent cgroups have values set 1673 // for the period and runtime as this limits what the children can be set to. 1674 if err := daemon.initCPURtController(mnt, filepath.Dir(path)); err != nil { 1675 return err 1676 } 1677 1678 path = filepath.Join(mnt, path) 1679 if err := os.MkdirAll(path, 0755); err != nil { 1680 return err 1681 } 1682 if err := maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { 1683 return err 1684 } 1685 return maybeCreateCPURealTimeFile(daemon.configStore.CPURealtimeRuntime, "cpu.rt_runtime_us", path) 1686} 1687 1688func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error { 1689 if configValue == 0 { 1690 return nil 1691 } 1692 return ioutil.WriteFile(filepath.Join(path, file), []byte(strconv.FormatInt(configValue, 10)), 0700) 1693} 1694 1695func (daemon *Daemon) setupSeccompProfile() error { 1696 if daemon.configStore.SeccompProfile != "" { 1697 daemon.seccompProfilePath = daemon.configStore.SeccompProfile 1698 b, err := ioutil.ReadFile(daemon.configStore.SeccompProfile) 1699 if err != nil { 1700 return fmt.Errorf("opening seccomp profile (%s) failed: %v", daemon.configStore.SeccompProfile, err) 1701 } 1702 daemon.seccompProfile = b 1703 } 1704 return nil 1705} 1706 1707// RawSysInfo returns *sysinfo.SysInfo . 1708func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { 1709 var opts []sysinfo.Opt 1710 if daemon.getCgroupDriver() == cgroupSystemdDriver { 1711 rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") 1712 if rootlesskitParentEUID != "" { 1713 groupPath := fmt.Sprintf("/user.slice/user-%s.slice", rootlesskitParentEUID) 1714 opts = append(opts, sysinfo.WithCgroup2GroupPath(groupPath)) 1715 } 1716 } 1717 return sysinfo.New(quiet, opts...) 1718} 1719 1720func recursiveUnmount(target string) error { 1721 return mount.RecursiveUnmount(target) 1722} 1723