/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	"k8s.io/apimachinery/pkg/util/sets"
	// TODO: Migrate kubelet to either use its own internal objects or client library.
	v1 "k8s.io/api/core/v1"
	internalapi "k8s.io/cri-api/pkg/apis"
	podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
	"k8s.io/kubernetes/pkg/kubelet/config"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	"k8s.io/kubernetes/pkg/kubelet/status"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// ActivePodsFunc is a function that returns the slice of pods currently
// considered active by the kubelet.
type ActivePodsFunc func() []*v1.Pod

// ContainerManager manages the containers running on a machine.
type ContainerManager interface {
	// Start runs the container manager's housekeeping.
	// - Ensures that the Docker daemon is in a container.
	// - Creates the system container where all non-containerized processes run.
	Start(*v1.Node, ActivePodsFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService) error

	// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
	// These cgroups include the system and Kubernetes services.
	SystemCgroupsLimit() v1.ResourceList

	// GetNodeConfig returns a NodeConfig that is being used by the container manager.
	GetNodeConfig() NodeConfig

	// Status returns internal Status.
	Status() Status

	// NewPodContainerManager is a factory method which returns a podContainerManager object
	// Returns a noop implementation if qos cgroup hierarchy is not enabled
	NewPodContainerManager() PodContainerManager

	// GetMountedSubsystems returns the mounted cgroup subsystems on the node
	GetMountedSubsystems() *CgroupSubsystems

	// GetQOSContainersInfo returns the names of top level QoS containers
	GetQOSContainersInfo() QOSContainersInfo

	// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
	GetNodeAllocatableReservation() v1.ResourceList

	// GetCapacity returns the amount of compute resources tracked by container manager available on the node.
	GetCapacity() v1.ResourceList

	// GetDevicePluginResourceCapacity returns the node capacity (amount of total device plugin resources),
	// node allocatable (amount of total healthy resources reported by device plugin),
	// and inactive device plugin resources previously registered on the node.
	GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string)

	// UpdateQOSCgroups performs housekeeping updates to ensure that the top
	// level QoS containers have their desired state in a thread-safe way
	UpdateQOSCgroups() error

	// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
	// extended resources required by container.
	GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)

	// UpdatePluginResources calls Allocate of device plugin handler for potential
	// requests for device plugin resources, and returns an error if fails.
	// Otherwise, it updates allocatableResource in nodeInfo if necessary,
	// to make sure it is at least equal to the pod's requested capacity for
	// any registered device plugin resource
	UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error

	// InternalContainerLifecycle returns the InternalContainerLifecycle used by
	// the container manager.
	InternalContainerLifecycle() InternalContainerLifecycle

	// GetPodCgroupRoot returns the cgroup which contains all pods.
	GetPodCgroupRoot() string

	// GetPluginRegistrationHandler returns a plugin registration handler
	// The pluginwatcher's Handlers allow to have a single module for handling
	// registration.
	GetPluginRegistrationHandler() cache.PluginHandler

	// ShouldResetExtendedResourceCapacity returns whether or not the extended resources should be zeroed,
	// due to node recreation.
	ShouldResetExtendedResourceCapacity() bool

	// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
	GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler

	// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
	GetNodeAllocatableAbsolute() v1.ResourceList

	// Implements the podresources Provider API for CPUs, Memory and Devices
	podresources.CPUsProvider
	podresources.DevicesProvider
	podresources.MemoryProvider
}

// NodeConfig holds the node-level configuration consumed by the container
// manager, assembled from kubelet flags and configuration.
type NodeConfig struct {
	RuntimeCgroupsName    string
	SystemCgroupsName     string
	KubeletCgroupsName    string
	ContainerRuntime      string
	CgroupsPerQOS         bool
	CgroupRoot            string
	CgroupDriver          string
	KubeletRootDir        string
	ProtectKernelDefaults bool
	NodeAllocatableConfig
	QOSReserved map[v1.ResourceName]int64
	// Experimental* fields configure resource-manager features that have not
	// yet graduated to stable.
	ExperimentalCPUManagerPolicy            string
	ExperimentalCPUManagerPolicyOptions     map[string]string
	ExperimentalTopologyManagerScope        string
	ExperimentalCPUManagerReconcilePeriod   time.Duration
	ExperimentalMemoryManagerPolicy         string
	ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
	ExperimentalPodPidsLimit                int64
	EnforceCPULimits                        bool
	CPUCFSQuotaPeriod                       time.Duration
	ExperimentalTopologyManagerPolicy       string
}

// NodeAllocatableConfig holds the settings that determine how much of the
// node's resources are reserved (for the system and for Kubernetes) rather
// than made allocatable to pods.
type NodeAllocatableConfig struct {
	KubeReservedCgroupName   string
	SystemReservedCgroupName string
	ReservedSystemCPUs       cpuset.CPUSet
	EnforceNodeAllocatable   sets.String
	KubeReserved             v1.ResourceList
	SystemReserved           v1.ResourceList
	HardEvictionThresholds   []evictionapi.Threshold
}

// Status holds the container manager's internal status.
type Status struct {
	// Any soft requirements that were unsatisfied.
	SoftRequirements error
}

// parsePercentage parses the percentage string to numeric value.
164func parsePercentage(v string) (int64, error) { 165 if !strings.HasSuffix(v, "%") { 166 return 0, fmt.Errorf("percentage expected, got '%s'", v) 167 } 168 percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0) 169 if err != nil { 170 return 0, fmt.Errorf("invalid number in percentage '%s'", v) 171 } 172 if percentage < 0 || percentage > 100 { 173 return 0, fmt.Errorf("percentage must be between 0 and 100") 174 } 175 return percentage, nil 176} 177 178// ParseQOSReserved parses the --qos-reserve-requests option 179func ParseQOSReserved(m map[string]string) (*map[v1.ResourceName]int64, error) { 180 reservations := make(map[v1.ResourceName]int64) 181 for k, v := range m { 182 switch v1.ResourceName(k) { 183 // Only memory resources are supported. 184 case v1.ResourceMemory: 185 q, err := parsePercentage(v) 186 if err != nil { 187 return nil, err 188 } 189 reservations[v1.ResourceName(k)] = q 190 default: 191 return nil, fmt.Errorf("cannot reserve %q resource", k) 192 } 193 } 194 return &reservations, nil 195} 196 197func containerDevicesFromResourceDeviceInstances(devs devicemanager.ResourceDeviceInstances) []*podresourcesapi.ContainerDevices { 198 var respDevs []*podresourcesapi.ContainerDevices 199 200 for resourceName, resourceDevs := range devs { 201 for devID, dev := range resourceDevs { 202 topo := dev.GetTopology() 203 if topo == nil { 204 // Some device plugin do not report the topology information. 205 // This is legal, so we report the devices anyway, 206 // let the client decide what to do. 
207 respDevs = append(respDevs, &podresourcesapi.ContainerDevices{ 208 ResourceName: resourceName, 209 DeviceIds: []string{devID}, 210 }) 211 continue 212 } 213 214 for _, node := range topo.GetNodes() { 215 respDevs = append(respDevs, &podresourcesapi.ContainerDevices{ 216 ResourceName: resourceName, 217 DeviceIds: []string{devID}, 218 Topology: &podresourcesapi.TopologyInfo{ 219 Nodes: []*podresourcesapi.NUMANode{ 220 { 221 ID: node.GetID(), 222 }, 223 }, 224 }, 225 }) 226 } 227 } 228 } 229 230 return respDevs 231} 232