// +build !dockerless

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dockershim

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"path"
	"path/filepath"
	"runtime"
	"sync"
	"time"

	"github.com/blang/semver"
	dockertypes "github.com/docker/docker/api/types"
	"k8s.io/klog/v2"

	v1 "k8s.io/api/core/v1"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/cri/streaming"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/network"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/cni"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/hostport"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/kubenet"
	"k8s.io/kubernetes/pkg/kubelet/legacy"
	"k8s.io/kubernetes/pkg/kubelet/util/cache"

	"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
	"k8s.io/kubernetes/pkg/kubelet/dockershim/metrics"
)

const (
	// dockerRuntimeName is reported as RuntimeName in the CRI Version response.
	dockerRuntimeName = "docker"
	// kubeAPIVersion is reported as Version in the CRI Version response.
	kubeAPIVersion = "0.1.0"

	// String used to detect docker host mode for various namespaces (e.g.
	// networking). Must match the value returned by docker inspect -f
	// '{{.HostConfig.NetworkMode}}'.
	namespaceModeHost = "host"

	// dockerNetNSFmt is a format string for a network namespace path under
	// /proc. NOTE(review): the verb is presumably filled with a process ID;
	// confirm at the call site.
	dockerNetNSFmt = "/proc/%v/ns/net"

	// Internal docker labels used to identify whether a container is a sandbox
	// or a regular container.
	// TODO: This is not backward compatible with older containers. We will
	// need to add filtering based on names.
	containerTypeLabelKey       = "io.kubernetes.docker.type"
	containerTypeLabelSandbox   = "podsandbox"
	containerTypeLabelContainer = "container"
	containerLogPathLabelKey    = "io.kubernetes.container.logpath"
	sandboxIDLabelKey           = "io.kubernetes.sandbox.id"

	// The expiration time of version cache.
	versionCacheTTL = 60 * time.Second

	// defaultCgroupDriver is the driver NewDockerService falls back to when
	// the Docker Info() call fails or reports no cgroup driver.
	defaultCgroupDriver = "cgroupfs"

	// TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental
	// defaulting of host user namespace that may be enabled when the docker daemon
	// is using remapped UIDs.
	// Dockershim should provide detection support for a remapping environment.
	// This should be included in the feature proposal. Defaulting may still occur according
	// to kubelet behavior and system settings in addition to any API flags that may be introduced.
)

// CRIService includes all methods necessary for a CRI server: the CRI runtime
// and image gRPC service interfaces plus a Start method.
type CRIService interface {
	runtimeapi.RuntimeServiceServer
	runtimeapi.ImageServiceServer
	Start() error
}

// DockerService is an interface that embeds the new RuntimeService and
// ImageService interfaces (through CRIService), an http.Handler and the
// legacy docker service.
type DockerService interface {
	CRIService

	// For serving streaming calls.
	http.Handler

	// For supporting legacy features.
	legacy.DockerLegacyService
}

// NetworkPluginSettings is the subset of kubelet runtime args we pass
// to the container runtime shim so it can probe for network plugins.
// In the future we will feed these directly to a standalone container
// runtime process.
type NetworkPluginSettings struct {
	// HairpinMode is best described by comments surrounding the kubelet arg.
	// NewDockerService may rewrite it (see effectiveHairpinMode).
	HairpinMode kubeletconfig.HairpinMode
	// NonMasqueradeCIDR is the range of ips which should *not* be included
	// in any MASQUERADE rules applied by the plugin
	NonMasqueradeCIDR string
	// PluginName is the name of the plugin, runtime shim probes for
	PluginName string
	// PluginBinDirString is a list of directories delimited by commas, in
	// which the binaries for the plugin with PluginName may be found.
	PluginBinDirString string
	// PluginBinDirs is an array of directories in which the binaries for
	// the plugin with PluginName may be found. The admin is responsible for
	// provisioning these binaries before-hand.
	// NewDockerService overwrites this field with the result of splitting
	// PluginBinDirString.
	PluginBinDirs []string
	// PluginConfDir is the directory in which the admin places a CNI conf.
	// Depending on the plugin, this may be an optional field, eg: kubenet
	// generates its own plugin conf.
	PluginConfDir string
	// PluginCacheDir is the directory in which CNI should store cache files.
	PluginCacheDir string
	// MTU is the desired MTU for network devices created by the plugin.
	MTU int
}

// namespaceGetter is a wrapper around the dockerService that implements
// the network.NamespaceGetter interface.
type namespaceGetter struct {
	ds *dockerService
}

// GetNetNS returns the network namespace for containerID by delegating to
// the wrapped dockerService.
func (n *namespaceGetter) GetNetNS(containerID string) (string, error) {
	return n.ds.GetNetNS(containerID)
}

// portMappingGetter is a wrapper around the dockerService that implements
// the network.PortMappingGetter interface.
149type portMappingGetter struct { 150 ds *dockerService 151} 152 153func (p *portMappingGetter) GetPodPortMappings(containerID string) ([]*hostport.PortMapping, error) { 154 return p.ds.GetPodPortMappings(containerID) 155} 156 157// dockerNetworkHost implements network.Host by wrapping the legacy host passed in by the kubelet 158// and dockerServices which implements the rest of the network host interfaces. 159// The legacy host methods are slated for deletion. 160type dockerNetworkHost struct { 161 *namespaceGetter 162 *portMappingGetter 163} 164 165var internalLabelKeys = []string{containerTypeLabelKey, containerLogPathLabelKey, sandboxIDLabelKey} 166 167// ClientConfig is parameters used to initialize docker client 168type ClientConfig struct { 169 DockerEndpoint string 170 RuntimeRequestTimeout time.Duration 171 ImagePullProgressDeadline time.Duration 172 173 // Configuration for fake docker client 174 EnableSleep bool 175 WithTraceDisabled bool 176} 177 178// NewDockerClientFromConfig create a docker client from given configure 179// return nil if nil configure is given. 180func NewDockerClientFromConfig(config *ClientConfig) libdocker.Interface { 181 if config != nil { 182 // Create docker client. 183 client := libdocker.ConnectToDockerOrDie( 184 config.DockerEndpoint, 185 config.RuntimeRequestTimeout, 186 config.ImagePullProgressDeadline, 187 ) 188 return client 189 } 190 191 return nil 192} 193 194// NewDockerService creates a new `DockerService` struct. 195// NOTE: Anything passed to DockerService should be eventually handled in another way when we switch to running the shim as a different process. 
196func NewDockerService(config *ClientConfig, podSandboxImage string, streamingConfig *streaming.Config, pluginSettings *NetworkPluginSettings, 197 cgroupsName string, kubeCgroupDriver string, dockershimRootDir string) (DockerService, error) { 198 199 client := NewDockerClientFromConfig(config) 200 201 c := libdocker.NewInstrumentedInterface(client) 202 203 checkpointManager, err := checkpointmanager.NewCheckpointManager(filepath.Join(dockershimRootDir, sandboxCheckpointDir)) 204 if err != nil { 205 return nil, err 206 } 207 208 ds := &dockerService{ 209 client: c, 210 os: kubecontainer.RealOS{}, 211 podSandboxImage: podSandboxImage, 212 streamingRuntime: &streamingRuntime{ 213 client: client, 214 execHandler: &NativeExecHandler{}, 215 }, 216 containerManager: cm.NewContainerManager(cgroupsName, client), 217 checkpointManager: checkpointManager, 218 networkReady: make(map[string]bool), 219 containerCleanupInfos: make(map[string]*containerCleanupInfo), 220 } 221 222 // check docker version compatibility. 223 if err = ds.checkVersionCompatibility(); err != nil { 224 return nil, err 225 } 226 227 // create streaming server if configured. 228 if streamingConfig != nil { 229 var err error 230 ds.streamingServer, err = streaming.NewServer(*streamingConfig, ds.streamingRuntime) 231 if err != nil { 232 return nil, err 233 } 234 } 235 236 // Determine the hairpin mode. 237 if err := effectiveHairpinMode(pluginSettings); err != nil { 238 // This is a non-recoverable error. Returning it up the callstack will just 239 // lead to retries of the same failure, so just fail hard. 240 return nil, err 241 } 242 klog.InfoS("Hairpin mode is set", "hairpinMode", pluginSettings.HairpinMode) 243 244 // dockershim currently only supports CNI plugins. 
245 pluginSettings.PluginBinDirs = cni.SplitDirs(pluginSettings.PluginBinDirString) 246 cniPlugins := cni.ProbeNetworkPlugins(pluginSettings.PluginConfDir, pluginSettings.PluginCacheDir, pluginSettings.PluginBinDirs) 247 cniPlugins = append(cniPlugins, kubenet.NewPlugin(pluginSettings.PluginBinDirs, pluginSettings.PluginCacheDir)) 248 netHost := &dockerNetworkHost{ 249 &namespaceGetter{ds}, 250 &portMappingGetter{ds}, 251 } 252 plug, err := network.InitNetworkPlugin(cniPlugins, pluginSettings.PluginName, netHost, pluginSettings.HairpinMode, pluginSettings.NonMasqueradeCIDR, pluginSettings.MTU) 253 if err != nil { 254 return nil, fmt.Errorf("didn't find compatible CNI plugin with given settings %+v: %v", pluginSettings, err) 255 } 256 ds.network = network.NewPluginManager(plug) 257 klog.InfoS("Docker cri networking managed by the network plugin", "networkPluginName", plug.Name()) 258 259 // skipping cgroup driver checks for Windows 260 if runtime.GOOS == "linux" { 261 // NOTE: cgroup driver is only detectable in docker 1.11+ 262 cgroupDriver := defaultCgroupDriver 263 dockerInfo, err := ds.client.Info() 264 klog.InfoS("Docker Info", "dockerInfo", dockerInfo) 265 if err != nil { 266 klog.ErrorS(err, "Failed to execute Info() call to the Docker client") 267 klog.InfoS("Falling back to use the default driver", "cgroupDriver", cgroupDriver) 268 } else if len(dockerInfo.CgroupDriver) == 0 { 269 klog.InfoS("No cgroup driver is set in Docker") 270 klog.InfoS("Falling back to use the default driver", "cgroupDriver", cgroupDriver) 271 } else { 272 cgroupDriver = dockerInfo.CgroupDriver 273 } 274 if len(kubeCgroupDriver) != 0 && kubeCgroupDriver != cgroupDriver { 275 return nil, fmt.Errorf("misconfiguration: kubelet cgroup driver: %q is different from docker cgroup driver: %q", kubeCgroupDriver, cgroupDriver) 276 } 277 klog.InfoS("Setting cgroupDriver", "cgroupDriver", cgroupDriver) 278 ds.cgroupDriver = cgroupDriver 279 } 280 281 ds.versionCache = cache.NewObjectCache( 282 
func() (interface{}, error) { 283 return ds.getDockerVersion() 284 }, 285 versionCacheTTL, 286 ) 287 288 // Register prometheus metrics. 289 metrics.Register() 290 291 return ds, nil 292} 293 294type dockerService struct { 295 client libdocker.Interface 296 os kubecontainer.OSInterface 297 podSandboxImage string 298 streamingRuntime *streamingRuntime 299 streamingServer streaming.Server 300 301 network *network.PluginManager 302 // Map of podSandboxID :: network-is-ready 303 networkReady map[string]bool 304 networkReadyLock sync.Mutex 305 306 containerManager cm.ContainerManager 307 // cgroup driver used by Docker runtime. 308 cgroupDriver string 309 checkpointManager checkpointmanager.CheckpointManager 310 // caches the version of the runtime. 311 // To be compatible with multiple docker versions, we need to perform 312 // version checking for some operations. Use this cache to avoid querying 313 // the docker daemon every time we need to do such checks. 314 versionCache *cache.ObjectCache 315 316 // containerCleanupInfos maps container IDs to the `containerCleanupInfo` structs 317 // needed to clean up after containers have been removed. 318 // (see `applyPlatformSpecificDockerConfig` and `performPlatformSpecificContainerCleanup` 319 // methods for more info). 320 containerCleanupInfos map[string]*containerCleanupInfo 321 cleanupInfosLock sync.RWMutex 322} 323 324// TODO: handle context. 325 326// Version returns the runtime name, runtime version and runtime API version 327func (ds *dockerService) Version(_ context.Context, r *runtimeapi.VersionRequest) (*runtimeapi.VersionResponse, error) { 328 v, err := ds.getDockerVersion() 329 if err != nil { 330 return nil, err 331 } 332 return &runtimeapi.VersionResponse{ 333 Version: kubeAPIVersion, 334 RuntimeName: dockerRuntimeName, 335 RuntimeVersion: v.Version, 336 RuntimeApiVersion: v.APIVersion, 337 }, nil 338} 339 340// getDockerVersion gets the version information from docker. 
341func (ds *dockerService) getDockerVersion() (*dockertypes.Version, error) { 342 v, err := ds.client.Version() 343 if err != nil { 344 return nil, fmt.Errorf("failed to get docker version: %v", err) 345 } 346 // Docker API version (e.g., 1.23) is not semver compatible. Add a ".0" 347 // suffix to remedy this. 348 v.APIVersion = fmt.Sprintf("%s.0", v.APIVersion) 349 return v, nil 350} 351 352// UpdateRuntimeConfig updates the runtime config. Currently only handles podCIDR updates. 353func (ds *dockerService) UpdateRuntimeConfig(_ context.Context, r *runtimeapi.UpdateRuntimeConfigRequest) (*runtimeapi.UpdateRuntimeConfigResponse, error) { 354 runtimeConfig := r.GetRuntimeConfig() 355 if runtimeConfig == nil { 356 return &runtimeapi.UpdateRuntimeConfigResponse{}, nil 357 } 358 359 klog.InfoS("Docker cri received runtime config", "runtimeConfig", runtimeConfig) 360 if ds.network != nil && runtimeConfig.NetworkConfig.PodCidr != "" { 361 event := make(map[string]interface{}) 362 event[network.NET_PLUGIN_EVENT_POD_CIDR_CHANGE_DETAIL_CIDR] = runtimeConfig.NetworkConfig.PodCidr 363 ds.network.Event(network.NET_PLUGIN_EVENT_POD_CIDR_CHANGE, event) 364 } 365 366 return &runtimeapi.UpdateRuntimeConfigResponse{}, nil 367} 368 369// GetNetNS returns the network namespace of the given containerID. The ID 370// supplied is typically the ID of a pod sandbox. This getter doesn't try 371// to map non-sandbox IDs to their respective sandboxes. 372func (ds *dockerService) GetNetNS(podSandboxID string) (string, error) { 373 r, err := ds.client.InspectContainer(podSandboxID) 374 if err != nil { 375 return "", err 376 } 377 return getNetworkNamespace(r) 378} 379 380// GetPodPortMappings returns the port mappings of the given podSandbox ID. 
381func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.PortMapping, error) { 382 // TODO: get portmappings from docker labels for backward compatibility 383 checkpoint := NewPodSandboxCheckpoint("", "", &CheckpointData{}) 384 err := ds.checkpointManager.GetCheckpoint(podSandboxID, checkpoint) 385 // Return empty portMappings if checkpoint is not found 386 if err != nil { 387 if err == errors.ErrCheckpointNotFound { 388 return nil, nil 389 } 390 errRem := ds.checkpointManager.RemoveCheckpoint(podSandboxID) 391 if errRem != nil { 392 klog.ErrorS(errRem, "Failed to delete corrupt checkpoint for sandbox", "podSandboxID", podSandboxID) 393 } 394 return nil, err 395 } 396 _, _, _, checkpointedPortMappings, _ := checkpoint.GetData() 397 portMappings := make([]*hostport.PortMapping, 0, len(checkpointedPortMappings)) 398 for _, pm := range checkpointedPortMappings { 399 proto := toAPIProtocol(*pm.Protocol) 400 portMappings = append(portMappings, &hostport.PortMapping{ 401 HostPort: *pm.HostPort, 402 ContainerPort: *pm.ContainerPort, 403 Protocol: proto, 404 HostIP: pm.HostIP, 405 }) 406 } 407 return portMappings, nil 408} 409 410// Start initializes and starts components in dockerService. 411func (ds *dockerService) Start() error { 412 ds.initCleanup() 413 414 go func() { 415 if err := ds.streamingServer.Start(true); err != nil { 416 klog.ErrorS(err, "Streaming server stopped unexpectedly") 417 os.Exit(1) 418 } 419 }() 420 421 return ds.containerManager.Start() 422} 423 424// initCleanup is responsible for cleaning up any crufts left by previous 425// runs. If there are any errors, it simply logs them. 426func (ds *dockerService) initCleanup() { 427 errors := ds.platformSpecificContainerInitCleanup() 428 429 for _, err := range errors { 430 klog.InfoS("Initialization error", "err", err) 431 } 432} 433 434// Status returns the status of the runtime. 
435func (ds *dockerService) Status(_ context.Context, r *runtimeapi.StatusRequest) (*runtimeapi.StatusResponse, error) { 436 runtimeReady := &runtimeapi.RuntimeCondition{ 437 Type: runtimeapi.RuntimeReady, 438 Status: true, 439 } 440 networkReady := &runtimeapi.RuntimeCondition{ 441 Type: runtimeapi.NetworkReady, 442 Status: true, 443 } 444 conditions := []*runtimeapi.RuntimeCondition{runtimeReady, networkReady} 445 if _, err := ds.client.Version(); err != nil { 446 runtimeReady.Status = false 447 runtimeReady.Reason = "DockerDaemonNotReady" 448 runtimeReady.Message = fmt.Sprintf("docker: failed to get docker version: %v", err) 449 } 450 if err := ds.network.Status(); err != nil { 451 networkReady.Status = false 452 networkReady.Reason = "NetworkPluginNotReady" 453 networkReady.Message = fmt.Sprintf("docker: network plugin is not ready: %v", err) 454 } 455 status := &runtimeapi.RuntimeStatus{Conditions: conditions} 456 return &runtimeapi.StatusResponse{Status: status}, nil 457} 458 459func (ds *dockerService) ServeHTTP(w http.ResponseWriter, r *http.Request) { 460 if ds.streamingServer != nil { 461 ds.streamingServer.ServeHTTP(w, r) 462 } else { 463 http.NotFound(w, r) 464 } 465} 466 467// GenerateExpectedCgroupParent returns cgroup parent in syntax expected by cgroup driver 468func (ds *dockerService) GenerateExpectedCgroupParent(cgroupParent string) (string, error) { 469 if cgroupParent != "" { 470 // if docker uses the systemd cgroup driver, it expects *.slice style names for cgroup parent. 471 // if we configured kubelet to use --cgroup-driver=cgroupfs, and docker is configured to use systemd driver 472 // docker will fail to launch the container because the name we provide will not be a valid slice. 473 // this is a very good thing. 474 if ds.cgroupDriver == "systemd" { 475 // Pass only the last component of the cgroup path to systemd. 
476 cgroupParent = path.Base(cgroupParent) 477 } 478 } 479 klog.V(3).InfoS("Setting cgroup parent", "cgroupParent", cgroupParent) 480 return cgroupParent, nil 481} 482 483// checkVersionCompatibility verifies whether docker is in a compatible version. 484func (ds *dockerService) checkVersionCompatibility() error { 485 apiVersion, err := ds.getDockerAPIVersion() 486 if err != nil { 487 return err 488 } 489 490 minAPIVersion, err := semver.Parse(libdocker.MinimumDockerAPIVersion) 491 if err != nil { 492 return err 493 } 494 495 // Verify the docker version. 496 result := apiVersion.Compare(minAPIVersion) 497 if result < 0 { 498 return fmt.Errorf("docker API version is older than %s", libdocker.MinimumDockerAPIVersion) 499 } 500 501 return nil 502} 503 504// getDockerAPIVersion gets the semver-compatible docker api version. 505func (ds *dockerService) getDockerAPIVersion() (*semver.Version, error) { 506 var dv *dockertypes.Version 507 var err error 508 if ds.versionCache != nil { 509 dv, err = ds.getDockerVersionFromCache() 510 } else { 511 dv, err = ds.getDockerVersion() 512 } 513 if err != nil { 514 return nil, err 515 } 516 517 apiVersion, err := semver.Parse(dv.APIVersion) 518 if err != nil { 519 return nil, err 520 } 521 return &apiVersion, nil 522} 523 524func (ds *dockerService) getDockerVersionFromCache() (*dockertypes.Version, error) { 525 // We only store on key in the cache. 
526 const dummyKey = "version" 527 value, err := ds.versionCache.Get(dummyKey) 528 if err != nil { 529 return nil, err 530 } 531 dv, ok := value.(*dockertypes.Version) 532 if !ok { 533 return nil, fmt.Errorf("converted to *dockertype.Version error") 534 } 535 return dv, nil 536} 537 538func toAPIProtocol(protocol Protocol) v1.Protocol { 539 switch protocol { 540 case protocolTCP: 541 return v1.ProtocolTCP 542 case protocolUDP: 543 return v1.ProtocolUDP 544 case protocolSCTP: 545 return v1.ProtocolSCTP 546 } 547 klog.InfoS("Unknown protocol, defaulting to TCP", "protocol", protocol) 548 return v1.ProtocolTCP 549} 550 551// effectiveHairpinMode determines the effective hairpin mode given the 552// configured mode, and whether cbr0 should be configured. 553func effectiveHairpinMode(s *NetworkPluginSettings) error { 554 // The hairpin mode setting doesn't matter if: 555 // - We're not using a bridge network. This is hard to check because we might 556 // be using a plugin. 557 // - It's set to hairpin-veth for a container runtime that doesn't know how 558 // to set the hairpin flag on the veth's of containers. Currently the 559 // docker runtime is the only one that understands this. 560 // - It's set to "none". 561 if s.HairpinMode == kubeletconfig.PromiscuousBridge || s.HairpinMode == kubeletconfig.HairpinVeth { 562 if s.HairpinMode == kubeletconfig.PromiscuousBridge && s.PluginName != "kubenet" { 563 // This is not a valid combination, since promiscuous-bridge only works on kubenet. Users might be using the 564 // default values (from before the hairpin-mode flag existed) and we 565 // should keep the old behavior. 566 klog.InfoS("Hairpin mode is set but kubenet is not enabled, falling back to HairpinVeth", "hairpinMode", s.HairpinMode) 567 s.HairpinMode = kubeletconfig.HairpinVeth 568 return nil 569 } 570 } else if s.HairpinMode != kubeletconfig.HairpinNone { 571 return fmt.Errorf("unknown value: %q", s.HairpinMode) 572 } 573 return nil 574} 575