1// +build !dockerless
2
3/*
4Copyright 2016 The Kubernetes Authors.
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10    http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17*/
18
19package dockershim
20
21import (
22	"context"
23	"fmt"
24	"net/http"
25	"os"
26	"path"
27	"path/filepath"
28	"runtime"
29	"sync"
30	"time"
31
32	"github.com/blang/semver"
33	dockertypes "github.com/docker/docker/api/types"
34	"k8s.io/klog/v2"
35
36	v1 "k8s.io/api/core/v1"
37	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
38	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
39	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
40	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
41	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
42	"k8s.io/kubernetes/pkg/kubelet/cri/streaming"
43	"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
44	"k8s.io/kubernetes/pkg/kubelet/dockershim/network"
45	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/cni"
46	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/hostport"
47	"k8s.io/kubernetes/pkg/kubelet/dockershim/network/kubenet"
48	"k8s.io/kubernetes/pkg/kubelet/legacy"
49	"k8s.io/kubernetes/pkg/kubelet/util/cache"
50
51	"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
52	"k8s.io/kubernetes/pkg/kubelet/dockershim/metrics"
53)
54
const (
	// dockerRuntimeName and kubeAPIVersion are reported by Version as the
	// RuntimeName and Version fields of the VersionResponse, respectively.
	dockerRuntimeName = "docker"
	kubeAPIVersion    = "0.1.0"

	// String used to detect docker host mode for various namespaces (e.g.
	// networking). Must match the value returned by docker inspect -f
	// '{{.HostConfig.NetworkMode}}'.
	namespaceModeHost = "host"

	// dockerNetNSFmt is a format string for a network namespace path under
	// /proc; the single verb is presumably filled with a container's PID.
	dockerNetNSFmt = "/proc/%v/ns/net"

	// Internal docker labels used to identify whether a container is a sandbox
	// or a regular container. The three *Key constants below are also collected
	// in internalLabelKeys.
	// TODO: This is not backward compatible with older containers. We will
	// need to add filtering based on names.
	containerTypeLabelKey       = "io.kubernetes.docker.type"
	containerTypeLabelSandbox   = "podsandbox"
	containerTypeLabelContainer = "container"
	containerLogPathLabelKey    = "io.kubernetes.container.logpath"
	sandboxIDLabelKey           = "io.kubernetes.sandbox.id"

	// versionCacheTTL is the expiration time passed to the docker version
	// cache when it is created.
	versionCacheTTL = 60 * time.Second

	// defaultCgroupDriver is the cgroup driver assumed when the Docker Info
	// call fails or reports no cgroup driver.
	defaultCgroupDriver = "cgroupfs"

	// TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental
	// defaulting of the host user namespace that may be enabled when the docker daemon
	// is using remapped UIDs.
	// Dockershim should provide detection support for a remapping environment.
	// This should be included in the feature proposal. Defaulting may still occur according
	// to kubelet behavior and system settings in addition to any API flags that may be introduced.
)
88
// CRIService includes all methods necessary for a CRI server: the CRI runtime
// and image service server interfaces plus a Start method.
type CRIService interface {
	runtimeapi.RuntimeServiceServer
	runtimeapi.ImageServiceServer
	// Start starts the service and reports any startup error.
	Start() error
}
95
// DockerService is an interface that embeds the new RuntimeService and
// ImageService interfaces (via CRIService), and additionally acts as an
// http.Handler and exposes the legacy docker service methods.
type DockerService interface {
	CRIService

	// For serving streaming calls.
	http.Handler

	// For supporting legacy features.
	legacy.DockerLegacyService
}
107
// NetworkPluginSettings is the subset of kubelet runtime args we pass
// to the container runtime shim so it can probe for network plugins.
// In the future we will feed these directly to a standalone container
// runtime process.
type NetworkPluginSettings struct {
	// HairpinMode is best described by comments surrounding the kubelet arg.
	// NewDockerService may rewrite it (see effectiveHairpinMode).
	HairpinMode kubeletconfig.HairpinMode
	// NonMasqueradeCIDR is the range of IPs which should *not* be included
	// in any MASQUERADE rules applied by the plugin.
	NonMasqueradeCIDR string
	// PluginName is the name of the plugin the runtime shim probes for.
	PluginName string
	// PluginBinDirString is a list of directories delimited by commas, in
	// which the binaries for the plugin with PluginName may be found.
	PluginBinDirString string
	// PluginBinDirs is an array of directories in which the binaries for
	// the plugin with PluginName may be found. The admin is responsible for
	// provisioning these binaries before-hand. NewDockerService overwrites
	// this field with the result of splitting PluginBinDirString.
	PluginBinDirs []string
	// PluginConfDir is the directory in which the admin places a CNI conf.
	// Depending on the plugin, this may be an optional field, e.g. kubenet
	// generates its own plugin conf.
	PluginConfDir string
	// PluginCacheDir is the directory in which CNI should store cache files.
	PluginCacheDir string
	// MTU is the desired MTU for network devices created by the plugin.
	MTU int
}
136
137// namespaceGetter is a wrapper around the dockerService that implements
138// the network.NamespaceGetter interface.
139type namespaceGetter struct {
140	ds *dockerService
141}
142
143func (n *namespaceGetter) GetNetNS(containerID string) (string, error) {
144	return n.ds.GetNetNS(containerID)
145}
146
147// portMappingGetter is a wrapper around the dockerService that implements
148// the network.PortMappingGetter interface.
149type portMappingGetter struct {
150	ds *dockerService
151}
152
153func (p *portMappingGetter) GetPodPortMappings(containerID string) ([]*hostport.PortMapping, error) {
154	return p.ds.GetPodPortMappings(containerID)
155}
156
// dockerNetworkHost is the host object handed to network.InitNetworkPlugin.
// It is composed purely of the namespaceGetter and portMappingGetter wrappers
// around dockerService, whose methods it exposes through embedding.
type dockerNetworkHost struct {
	*namespaceGetter
	*portMappingGetter
}
164
// internalLabelKeys lists the keys of the internal docker labels declared in
// the constant block above (container type, log path and sandbox ID).
var internalLabelKeys = []string{containerTypeLabelKey, containerLogPathLabelKey, sandboxIDLabelKey}
166
// ClientConfig holds the parameters used to initialize the docker client.
type ClientConfig struct {
	// DockerEndpoint, RuntimeRequestTimeout and ImagePullProgressDeadline are
	// passed, in this order, to libdocker.ConnectToDockerOrDie by
	// NewDockerClientFromConfig.
	DockerEndpoint            string
	RuntimeRequestTimeout     time.Duration
	ImagePullProgressDeadline time.Duration

	// Configuration for fake docker client
	EnableSleep       bool
	WithTraceDisabled bool
}
177
178// NewDockerClientFromConfig create a docker client from given configure
179// return nil if nil configure is given.
180func NewDockerClientFromConfig(config *ClientConfig) libdocker.Interface {
181	if config != nil {
182		// Create docker client.
183		client := libdocker.ConnectToDockerOrDie(
184			config.DockerEndpoint,
185			config.RuntimeRequestTimeout,
186			config.ImagePullProgressDeadline,
187		)
188		return client
189	}
190
191	return nil
192}
193
194// NewDockerService creates a new `DockerService` struct.
195// NOTE: Anything passed to DockerService should be eventually handled in another way when we switch to running the shim as a different process.
196func NewDockerService(config *ClientConfig, podSandboxImage string, streamingConfig *streaming.Config, pluginSettings *NetworkPluginSettings,
197	cgroupsName string, kubeCgroupDriver string, dockershimRootDir string) (DockerService, error) {
198
199	client := NewDockerClientFromConfig(config)
200
201	c := libdocker.NewInstrumentedInterface(client)
202
203	checkpointManager, err := checkpointmanager.NewCheckpointManager(filepath.Join(dockershimRootDir, sandboxCheckpointDir))
204	if err != nil {
205		return nil, err
206	}
207
208	ds := &dockerService{
209		client:          c,
210		os:              kubecontainer.RealOS{},
211		podSandboxImage: podSandboxImage,
212		streamingRuntime: &streamingRuntime{
213			client:      client,
214			execHandler: &NativeExecHandler{},
215		},
216		containerManager:      cm.NewContainerManager(cgroupsName, client),
217		checkpointManager:     checkpointManager,
218		networkReady:          make(map[string]bool),
219		containerCleanupInfos: make(map[string]*containerCleanupInfo),
220	}
221
222	// check docker version compatibility.
223	if err = ds.checkVersionCompatibility(); err != nil {
224		return nil, err
225	}
226
227	// create streaming server if configured.
228	if streamingConfig != nil {
229		var err error
230		ds.streamingServer, err = streaming.NewServer(*streamingConfig, ds.streamingRuntime)
231		if err != nil {
232			return nil, err
233		}
234	}
235
236	// Determine the hairpin mode.
237	if err := effectiveHairpinMode(pluginSettings); err != nil {
238		// This is a non-recoverable error. Returning it up the callstack will just
239		// lead to retries of the same failure, so just fail hard.
240		return nil, err
241	}
242	klog.InfoS("Hairpin mode is set", "hairpinMode", pluginSettings.HairpinMode)
243
244	// dockershim currently only supports CNI plugins.
245	pluginSettings.PluginBinDirs = cni.SplitDirs(pluginSettings.PluginBinDirString)
246	cniPlugins := cni.ProbeNetworkPlugins(pluginSettings.PluginConfDir, pluginSettings.PluginCacheDir, pluginSettings.PluginBinDirs)
247	cniPlugins = append(cniPlugins, kubenet.NewPlugin(pluginSettings.PluginBinDirs, pluginSettings.PluginCacheDir))
248	netHost := &dockerNetworkHost{
249		&namespaceGetter{ds},
250		&portMappingGetter{ds},
251	}
252	plug, err := network.InitNetworkPlugin(cniPlugins, pluginSettings.PluginName, netHost, pluginSettings.HairpinMode, pluginSettings.NonMasqueradeCIDR, pluginSettings.MTU)
253	if err != nil {
254		return nil, fmt.Errorf("didn't find compatible CNI plugin with given settings %+v: %v", pluginSettings, err)
255	}
256	ds.network = network.NewPluginManager(plug)
257	klog.InfoS("Docker cri networking managed by the network plugin", "networkPluginName", plug.Name())
258
259	// skipping cgroup driver checks for Windows
260	if runtime.GOOS == "linux" {
261		// NOTE: cgroup driver is only detectable in docker 1.11+
262		cgroupDriver := defaultCgroupDriver
263		dockerInfo, err := ds.client.Info()
264		klog.InfoS("Docker Info", "dockerInfo", dockerInfo)
265		if err != nil {
266			klog.ErrorS(err, "Failed to execute Info() call to the Docker client")
267			klog.InfoS("Falling back to use the default driver", "cgroupDriver", cgroupDriver)
268		} else if len(dockerInfo.CgroupDriver) == 0 {
269			klog.InfoS("No cgroup driver is set in Docker")
270			klog.InfoS("Falling back to use the default driver", "cgroupDriver", cgroupDriver)
271		} else {
272			cgroupDriver = dockerInfo.CgroupDriver
273		}
274		if len(kubeCgroupDriver) != 0 && kubeCgroupDriver != cgroupDriver {
275			return nil, fmt.Errorf("misconfiguration: kubelet cgroup driver: %q is different from docker cgroup driver: %q", kubeCgroupDriver, cgroupDriver)
276		}
277		klog.InfoS("Setting cgroupDriver", "cgroupDriver", cgroupDriver)
278		ds.cgroupDriver = cgroupDriver
279	}
280
281	ds.versionCache = cache.NewObjectCache(
282		func() (interface{}, error) {
283			return ds.getDockerVersion()
284		},
285		versionCacheTTL,
286	)
287
288	// Register prometheus metrics.
289	metrics.Register()
290
291	return ds, nil
292}
293
// dockerService is the concrete service built by NewDockerService. It holds
// the docker client together with the streaming, networking, cgroup,
// checkpoint, version-cache and cleanup state used by its methods.
type dockerService struct {
	// client is the docker client used for daemon calls. streamingServer is
	// only set when a streaming config is supplied to NewDockerService and may
	// therefore be nil.
	client           libdocker.Interface
	os               kubecontainer.OSInterface
	podSandboxImage  string
	streamingRuntime *streamingRuntime
	streamingServer  streaming.Server

	// network manages the network plugin selected at construction time.
	network *network.PluginManager
	// Map of podSandboxID :: network-is-ready
	// NOTE(review): networkReadyLock presumably guards this map; confirm
	// against the code that reads and writes it.
	networkReady     map[string]bool
	networkReadyLock sync.Mutex

	containerManager cm.ContainerManager
	// cgroup driver used by Docker runtime.
	cgroupDriver      string
	checkpointManager checkpointmanager.CheckpointManager
	// caches the version of the runtime.
	// To be compatible with multiple docker versions, we need to perform
	// version checking for some operations. Use this cache to avoid querying
	// the docker daemon every time we need to do such checks.
	versionCache *cache.ObjectCache

	// containerCleanupInfos maps container IDs to the `containerCleanupInfo` structs
	// needed to clean up after containers have been removed.
	// (see `applyPlatformSpecificDockerConfig` and `performPlatformSpecificContainerCleanup`
	// methods for more info).
	// NOTE(review): cleanupInfosLock presumably guards this map; confirm
	// against the code that reads and writes it.
	containerCleanupInfos map[string]*containerCleanupInfo
	cleanupInfosLock      sync.RWMutex
}
323
324// TODO: handle context.
325
326// Version returns the runtime name, runtime version and runtime API version
327func (ds *dockerService) Version(_ context.Context, r *runtimeapi.VersionRequest) (*runtimeapi.VersionResponse, error) {
328	v, err := ds.getDockerVersion()
329	if err != nil {
330		return nil, err
331	}
332	return &runtimeapi.VersionResponse{
333		Version:           kubeAPIVersion,
334		RuntimeName:       dockerRuntimeName,
335		RuntimeVersion:    v.Version,
336		RuntimeApiVersion: v.APIVersion,
337	}, nil
338}
339
340// getDockerVersion gets the version information from docker.
341func (ds *dockerService) getDockerVersion() (*dockertypes.Version, error) {
342	v, err := ds.client.Version()
343	if err != nil {
344		return nil, fmt.Errorf("failed to get docker version: %v", err)
345	}
346	// Docker API version (e.g., 1.23) is not semver compatible. Add a ".0"
347	// suffix to remedy this.
348	v.APIVersion = fmt.Sprintf("%s.0", v.APIVersion)
349	return v, nil
350}
351
352// UpdateRuntimeConfig updates the runtime config. Currently only handles podCIDR updates.
353func (ds *dockerService) UpdateRuntimeConfig(_ context.Context, r *runtimeapi.UpdateRuntimeConfigRequest) (*runtimeapi.UpdateRuntimeConfigResponse, error) {
354	runtimeConfig := r.GetRuntimeConfig()
355	if runtimeConfig == nil {
356		return &runtimeapi.UpdateRuntimeConfigResponse{}, nil
357	}
358
359	klog.InfoS("Docker cri received runtime config", "runtimeConfig", runtimeConfig)
360	if ds.network != nil && runtimeConfig.NetworkConfig.PodCidr != "" {
361		event := make(map[string]interface{})
362		event[network.NET_PLUGIN_EVENT_POD_CIDR_CHANGE_DETAIL_CIDR] = runtimeConfig.NetworkConfig.PodCidr
363		ds.network.Event(network.NET_PLUGIN_EVENT_POD_CIDR_CHANGE, event)
364	}
365
366	return &runtimeapi.UpdateRuntimeConfigResponse{}, nil
367}
368
369// GetNetNS returns the network namespace of the given containerID. The ID
370// supplied is typically the ID of a pod sandbox. This getter doesn't try
371// to map non-sandbox IDs to their respective sandboxes.
372func (ds *dockerService) GetNetNS(podSandboxID string) (string, error) {
373	r, err := ds.client.InspectContainer(podSandboxID)
374	if err != nil {
375		return "", err
376	}
377	return getNetworkNamespace(r)
378}
379
380// GetPodPortMappings returns the port mappings of the given podSandbox ID.
381func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.PortMapping, error) {
382	// TODO: get portmappings from docker labels for backward compatibility
383	checkpoint := NewPodSandboxCheckpoint("", "", &CheckpointData{})
384	err := ds.checkpointManager.GetCheckpoint(podSandboxID, checkpoint)
385	// Return empty portMappings if checkpoint is not found
386	if err != nil {
387		if err == errors.ErrCheckpointNotFound {
388			return nil, nil
389		}
390		errRem := ds.checkpointManager.RemoveCheckpoint(podSandboxID)
391		if errRem != nil {
392			klog.ErrorS(errRem, "Failed to delete corrupt checkpoint for sandbox", "podSandboxID", podSandboxID)
393		}
394		return nil, err
395	}
396	_, _, _, checkpointedPortMappings, _ := checkpoint.GetData()
397	portMappings := make([]*hostport.PortMapping, 0, len(checkpointedPortMappings))
398	for _, pm := range checkpointedPortMappings {
399		proto := toAPIProtocol(*pm.Protocol)
400		portMappings = append(portMappings, &hostport.PortMapping{
401			HostPort:      *pm.HostPort,
402			ContainerPort: *pm.ContainerPort,
403			Protocol:      proto,
404			HostIP:        pm.HostIP,
405		})
406	}
407	return portMappings, nil
408}
409
410// Start initializes and starts components in dockerService.
411func (ds *dockerService) Start() error {
412	ds.initCleanup()
413
414	go func() {
415		if err := ds.streamingServer.Start(true); err != nil {
416			klog.ErrorS(err, "Streaming server stopped unexpectedly")
417			os.Exit(1)
418		}
419	}()
420
421	return ds.containerManager.Start()
422}
423
424// initCleanup is responsible for cleaning up any crufts left by previous
425// runs. If there are any errors, it simply logs them.
426func (ds *dockerService) initCleanup() {
427	errors := ds.platformSpecificContainerInitCleanup()
428
429	for _, err := range errors {
430		klog.InfoS("Initialization error", "err", err)
431	}
432}
433
434// Status returns the status of the runtime.
435func (ds *dockerService) Status(_ context.Context, r *runtimeapi.StatusRequest) (*runtimeapi.StatusResponse, error) {
436	runtimeReady := &runtimeapi.RuntimeCondition{
437		Type:   runtimeapi.RuntimeReady,
438		Status: true,
439	}
440	networkReady := &runtimeapi.RuntimeCondition{
441		Type:   runtimeapi.NetworkReady,
442		Status: true,
443	}
444	conditions := []*runtimeapi.RuntimeCondition{runtimeReady, networkReady}
445	if _, err := ds.client.Version(); err != nil {
446		runtimeReady.Status = false
447		runtimeReady.Reason = "DockerDaemonNotReady"
448		runtimeReady.Message = fmt.Sprintf("docker: failed to get docker version: %v", err)
449	}
450	if err := ds.network.Status(); err != nil {
451		networkReady.Status = false
452		networkReady.Reason = "NetworkPluginNotReady"
453		networkReady.Message = fmt.Sprintf("docker: network plugin is not ready: %v", err)
454	}
455	status := &runtimeapi.RuntimeStatus{Conditions: conditions}
456	return &runtimeapi.StatusResponse{Status: status}, nil
457}
458
459func (ds *dockerService) ServeHTTP(w http.ResponseWriter, r *http.Request) {
460	if ds.streamingServer != nil {
461		ds.streamingServer.ServeHTTP(w, r)
462	} else {
463		http.NotFound(w, r)
464	}
465}
466
467// GenerateExpectedCgroupParent returns cgroup parent in syntax expected by cgroup driver
468func (ds *dockerService) GenerateExpectedCgroupParent(cgroupParent string) (string, error) {
469	if cgroupParent != "" {
470		// if docker uses the systemd cgroup driver, it expects *.slice style names for cgroup parent.
471		// if we configured kubelet to use --cgroup-driver=cgroupfs, and docker is configured to use systemd driver
472		// docker will fail to launch the container because the name we provide will not be a valid slice.
473		// this is a very good thing.
474		if ds.cgroupDriver == "systemd" {
475			// Pass only the last component of the cgroup path to systemd.
476			cgroupParent = path.Base(cgroupParent)
477		}
478	}
479	klog.V(3).InfoS("Setting cgroup parent", "cgroupParent", cgroupParent)
480	return cgroupParent, nil
481}
482
483// checkVersionCompatibility verifies whether docker is in a compatible version.
484func (ds *dockerService) checkVersionCompatibility() error {
485	apiVersion, err := ds.getDockerAPIVersion()
486	if err != nil {
487		return err
488	}
489
490	minAPIVersion, err := semver.Parse(libdocker.MinimumDockerAPIVersion)
491	if err != nil {
492		return err
493	}
494
495	// Verify the docker version.
496	result := apiVersion.Compare(minAPIVersion)
497	if result < 0 {
498		return fmt.Errorf("docker API version is older than %s", libdocker.MinimumDockerAPIVersion)
499	}
500
501	return nil
502}
503
504// getDockerAPIVersion gets the semver-compatible docker api version.
505func (ds *dockerService) getDockerAPIVersion() (*semver.Version, error) {
506	var dv *dockertypes.Version
507	var err error
508	if ds.versionCache != nil {
509		dv, err = ds.getDockerVersionFromCache()
510	} else {
511		dv, err = ds.getDockerVersion()
512	}
513	if err != nil {
514		return nil, err
515	}
516
517	apiVersion, err := semver.Parse(dv.APIVersion)
518	if err != nil {
519		return nil, err
520	}
521	return &apiVersion, nil
522}
523
524func (ds *dockerService) getDockerVersionFromCache() (*dockertypes.Version, error) {
525	// We only store on key in the cache.
526	const dummyKey = "version"
527	value, err := ds.versionCache.Get(dummyKey)
528	if err != nil {
529		return nil, err
530	}
531	dv, ok := value.(*dockertypes.Version)
532	if !ok {
533		return nil, fmt.Errorf("converted to *dockertype.Version error")
534	}
535	return dv, nil
536}
537
538func toAPIProtocol(protocol Protocol) v1.Protocol {
539	switch protocol {
540	case protocolTCP:
541		return v1.ProtocolTCP
542	case protocolUDP:
543		return v1.ProtocolUDP
544	case protocolSCTP:
545		return v1.ProtocolSCTP
546	}
547	klog.InfoS("Unknown protocol, defaulting to TCP", "protocol", protocol)
548	return v1.ProtocolTCP
549}
550
551// effectiveHairpinMode determines the effective hairpin mode given the
552// configured mode, and whether cbr0 should be configured.
553func effectiveHairpinMode(s *NetworkPluginSettings) error {
554	// The hairpin mode setting doesn't matter if:
555	// - We're not using a bridge network. This is hard to check because we might
556	//   be using a plugin.
557	// - It's set to hairpin-veth for a container runtime that doesn't know how
558	//   to set the hairpin flag on the veth's of containers. Currently the
559	//   docker runtime is the only one that understands this.
560	// - It's set to "none".
561	if s.HairpinMode == kubeletconfig.PromiscuousBridge || s.HairpinMode == kubeletconfig.HairpinVeth {
562		if s.HairpinMode == kubeletconfig.PromiscuousBridge && s.PluginName != "kubenet" {
563			// This is not a valid combination, since promiscuous-bridge only works on kubenet. Users might be using the
564			// default values (from before the hairpin-mode flag existed) and we
565			// should keep the old behavior.
566			klog.InfoS("Hairpin mode is set but kubenet is not enabled, falling back to HairpinVeth", "hairpinMode", s.HairpinMode)
567			s.HairpinMode = kubeletconfig.HairpinVeth
568			return nil
569		}
570	} else if s.HairpinMode != kubeletconfig.HairpinNone {
571		return fmt.Errorf("unknown value: %q", s.HairpinMode)
572	}
573	return nil
574}
575