/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	"k8s.io/apimachinery/pkg/util/sets"
	// TODO: Migrate kubelet to either use its own internal objects or client library.
	v1 "k8s.io/api/core/v1"
	internalapi "k8s.io/cri-api/pkg/apis"
	podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
	"k8s.io/kubernetes/pkg/kubelet/config"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	"k8s.io/kubernetes/pkg/kubelet/status"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// ActivePodsFunc returns the list of active pods on the node.
type ActivePodsFunc func() []*v1.Pod

// Manages the containers running on a machine.
type ContainerManager interface {
	// Runs the container manager's housekeeping.
	// - Ensures that the Docker daemon is in a container.
	// - Creates the system container where all non-containerized processes run.
	Start(*v1.Node, ActivePodsFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService) error

	// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
	// These cgroups include the system and Kubernetes services.
	SystemCgroupsLimit() v1.ResourceList

	// GetNodeConfig returns a NodeConfig that is being used by the container manager.
	GetNodeConfig() NodeConfig

	// Status returns internal Status.
	Status() Status

	// NewPodContainerManager is a factory method that returns a PodContainerManager object.
	// It returns a no-op implementation if the QoS cgroup hierarchy is not enabled.
	NewPodContainerManager() PodContainerManager

	// GetMountedSubsystems returns the mounted cgroup subsystems on the node.
	GetMountedSubsystems() *CgroupSubsystems

	// GetQOSContainersInfo returns the names of the top-level QoS containers.
	GetQOSContainersInfo() QOSContainersInfo

	// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
	GetNodeAllocatableReservation() v1.ResourceList

	// GetCapacity returns the amount of compute resources tracked by the container manager available on the node.
	GetCapacity() v1.ResourceList

	// GetDevicePluginResourceCapacity returns the node capacity (amount of total device plugin resources),
	// node allocatable (amount of total healthy resources reported by device plugins),
	// and inactive device plugin resources previously registered on the node.
	GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string)

	// UpdateQOSCgroups performs housekeeping updates to ensure that the top
	// level QoS containers have their desired state in a thread-safe way.
	UpdateQOSCgroups() error

	// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
	// extended resources required by the container.
	GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)

	// UpdatePluginResources calls Allocate on the device plugin handler for potential
	// requests for device plugin resources, and returns an error if that fails.
	// Otherwise, it updates allocatableResource in nodeInfo if necessary
	// to make sure it is at least equal to the pod's requested capacity for
	// any registered device plugin resource.
	UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error

	// InternalContainerLifecycle returns the hooks invoked over a container's
	// internal lifecycle (before creation and start, and after stop).
	InternalContainerLifecycle() InternalContainerLifecycle

	// GetPodCgroupRoot returns the cgroup which contains all pods.
	GetPodCgroupRoot() string

	// GetPluginRegistrationHandler returns a plugin registration handler.
	// The pluginwatcher's Handlers allow a single module to handle all
	// plugin registrations.
	GetPluginRegistrationHandler() cache.PluginHandler

	// ShouldResetExtendedResourceCapacity returns whether or not the extended resources should be zeroed,
	// due to node recreation.
	ShouldResetExtendedResourceCapacity() bool

	// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
	GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler

	// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
	GetNodeAllocatableAbsolute() v1.ResourceList

	// Implements the podresources Provider API for CPUs, Memory and Devices.
	podresources.CPUsProvider
	podresources.DevicesProvider
	podresources.MemoryProvider
}
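
// A minimal usage sketch (hypothetical caller; cm is any ContainerManager
// implementation): once Start has succeeded, the manager can be queried for
// node-level resource accounting, e.g.
//
//	capacity := cm.GetCapacity()                      // resources tracked by the container manager
//	reservation := cm.GetNodeAllocatableReservation() // resources withheld from scheduling
//	allocatable := cm.GetNodeAllocatableAbsolute()    // absolute Node Allocatable, used for enforcement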

type NodeConfig struct {
	RuntimeCgroupsName    string
	SystemCgroupsName     string
	KubeletCgroupsName    string
	ContainerRuntime      string
	CgroupsPerQOS         bool
	CgroupRoot            string
	CgroupDriver          string
	KubeletRootDir        string
	ProtectKernelDefaults bool
	NodeAllocatableConfig
	QOSReserved                             map[v1.ResourceName]int64
	ExperimentalCPUManagerPolicy            string
	ExperimentalCPUManagerPolicyOptions     map[string]string
	ExperimentalTopologyManagerScope        string
	ExperimentalCPUManagerReconcilePeriod   time.Duration
	ExperimentalMemoryManagerPolicy         string
	ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
	ExperimentalPodPidsLimit                int64
	EnforceCPULimits                        bool
	CPUCFSQuotaPeriod                       time.Duration
	ExperimentalTopologyManagerPolicy       string
}
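
// A hypothetical NodeConfig for a node that runs pods under per-QoS cgroups
// with the systemd cgroup driver (all values are illustrative, not defaults):
//
//	cfg := NodeConfig{
//		CgroupsPerQOS:  true,
//		CgroupRoot:     "/",
//		CgroupDriver:   "systemd",
//		KubeletRootDir: "/var/lib/kubelet",
//	}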

type NodeAllocatableConfig struct {
	KubeReservedCgroupName   string
	SystemReservedCgroupName string
	ReservedSystemCPUs       cpuset.CPUSet
	EnforceNodeAllocatable   sets.String
	KubeReserved             v1.ResourceList
	SystemReserved           v1.ResourceList
	HardEvictionThresholds   []evictionapi.Threshold
}

type Status struct {
	// Any soft requirements that were unsatisfied.
	SoftRequirements error
}

// parsePercentage parses a percentage string into a numeric value.
func parsePercentage(v string) (int64, error) {
	if !strings.HasSuffix(v, "%") {
		return 0, fmt.Errorf("percentage expected, got '%s'", v)
	}
	percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
	if err != nil {
		return 0, fmt.Errorf("invalid number in percentage '%s'", v)
	}
	if percentage < 0 || percentage > 100 {
		return 0, fmt.Errorf("percentage must be between 0 and 100")
	}
	return percentage, nil
}
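
// For illustration, some inputs and the behavior of parsePercentage above:
//
//	parsePercentage("50%")  // 50, nil
//	parsePercentage("50")   // 0, error: percentage expected
//	parsePercentage("150%") // 0, error: percentage must be between 0 and 100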

// ParseQOSReserved parses the --qos-reserved option
func ParseQOSReserved(m map[string]string) (*map[v1.ResourceName]int64, error) {
	reservations := make(map[v1.ResourceName]int64)
	for k, v := range m {
		switch v1.ResourceName(k) {
		// Only memory resources are supported.
		case v1.ResourceMemory:
			q, err := parsePercentage(v)
			if err != nil {
				return nil, err
			}
			reservations[v1.ResourceName(k)] = q
		default:
			return nil, fmt.Errorf("cannot reserve %q resource", k)
		}
	}
	return &reservations, nil
}
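
// For illustration (hypothetical input), reserving 50% of memory parses as:
//
//	reserved, err := ParseQOSReserved(map[string]string{"memory": "50%"})
//	// *reserved == map[v1.ResourceName]int64{v1.ResourceMemory: 50}, err == nil
//
// Any resource other than memory is rejected:
//
//	_, err = ParseQOSReserved(map[string]string{"cpu": "50%"}) // err: cannot reserve "cpu" resource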

func containerDevicesFromResourceDeviceInstances(devs devicemanager.ResourceDeviceInstances) []*podresourcesapi.ContainerDevices {
	var respDevs []*podresourcesapi.ContainerDevices

	for resourceName, resourceDevs := range devs {
		for devID, dev := range resourceDevs {
			topo := dev.GetTopology()
			if topo == nil {
				// Some device plugins do not report topology information.
				// This is legal, so we report the devices anyway and
				// let the client decide what to do.
				respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
					ResourceName: resourceName,
					DeviceIds:    []string{devID},
				})
				continue
			}

			for _, node := range topo.GetNodes() {
				respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
					ResourceName: resourceName,
					DeviceIds:    []string{devID},
					Topology: &podresourcesapi.TopologyInfo{
						Nodes: []*podresourcesapi.NUMANode{
							{
								ID: node.GetID(),
							},
						},
					},
				})
			}
		}
	}

	return respDevs
}
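
// For illustration (hypothetical data): a device "dev0" of resource
// "vendor.example/gpu" whose reported topology spans NUMA nodes 0 and 1 is
// fanned out into two ContainerDevices entries, one per node:
//
//	{ResourceName: "vendor.example/gpu", DeviceIds: ["dev0"], Topology: {Nodes: [{ID: 0}]}}
//	{ResourceName: "vendor.example/gpu", DeviceIds: ["dev0"], Topology: {Nodes: [{ID: 1}]}}
//
// A device with no topology information is reported once, with a nil Topology.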