1/* 2Copyright 2016 The Kubernetes Authors. 3 4Licensed under the Apache License, Version 2.0 (the "License"); 5you may not use this file except in compliance with the License. 6You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10Unless required by applicable law or agreed to in writing, software 11distributed under the License is distributed on an "AS IS" BASIS, 12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13See the License for the specific language governing permissions and 14limitations under the License. 15*/ 16 17package kuberuntime 18 19import ( 20 "fmt" 21 "os" 22 "path/filepath" 23 "sort" 24 "time" 25 26 "k8s.io/apimachinery/pkg/types" 27 utilerrors "k8s.io/apimachinery/pkg/util/errors" 28 "k8s.io/apimachinery/pkg/util/sets" 29 internalapi "k8s.io/cri-api/pkg/apis" 30 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" 31 "k8s.io/klog/v2" 32 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 33) 34 35// containerGC is the manager of garbage collection. 36type containerGC struct { 37 client internalapi.RuntimeService 38 manager *kubeGenericRuntimeManager 39 podStateProvider podStateProvider 40} 41 42// NewContainerGC creates a new containerGC. 43func newContainerGC(client internalapi.RuntimeService, podStateProvider podStateProvider, manager *kubeGenericRuntimeManager) *containerGC { 44 return &containerGC{ 45 client: client, 46 manager: manager, 47 podStateProvider: podStateProvider, 48 } 49} 50 51// containerGCInfo is the internal information kept for containers being considered for GC. 52type containerGCInfo struct { 53 // The ID of the container. 54 id string 55 // The name of the container. 56 name string 57 // Creation time for the container. 58 createTime time.Time 59 // If true, the container is in unknown state. Garbage collector should try 60 // to stop containers before removal. 61 unknown bool 62} 63 64// sandboxGCInfo is the internal information kept for sandboxes being considered for GC. 65type sandboxGCInfo struct { 66 // The ID of the sandbox. 67 id string 68 // Creation time for the sandbox. 69 createTime time.Time 70 // If true, the sandbox is ready or still has containers. 71 active bool 72} 73 74// evictUnit is considered for eviction as units of (UID, container name) pair. 75type evictUnit struct { 76 // UID of the pod. 77 uid types.UID 78 // Name of the container in the pod. 79 name string 80} 81 82type containersByEvictUnit map[evictUnit][]containerGCInfo 83type sandboxesByPodUID map[types.UID][]sandboxGCInfo 84 85// NumContainers returns the number of containers in this map. 86func (cu containersByEvictUnit) NumContainers() int { 87 num := 0 88 for key := range cu { 89 num += len(cu[key]) 90 } 91 return num 92} 93 94// NumEvictUnits returns the number of pod in this map. 95func (cu containersByEvictUnit) NumEvictUnits() int { 96 return len(cu) 97} 98 99// Newest first. 100type byCreated []containerGCInfo 101 102func (a byCreated) Len() int { return len(a) } 103func (a byCreated) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 104func (a byCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) } 105 106// Newest first. 107type sandboxByCreated []sandboxGCInfo 108 109func (a sandboxByCreated) Len() int { return len(a) } 110func (a sandboxByCreated) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 111func (a sandboxByCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) } 112 113// enforceMaxContainersPerEvictUnit enforces MaxPerPodContainer for each evictUnit. 114func (cgc *containerGC) enforceMaxContainersPerEvictUnit(evictUnits containersByEvictUnit, MaxContainers int) { 115 for key := range evictUnits { 116 toRemove := len(evictUnits[key]) - MaxContainers 117 118 if toRemove > 0 { 119 evictUnits[key] = cgc.removeOldestN(evictUnits[key], toRemove) 120 } 121 } 122} 123 124// removeOldestN removes the oldest toRemove containers and returns the resulting slice. 125func (cgc *containerGC) removeOldestN(containers []containerGCInfo, toRemove int) []containerGCInfo { 126 // Remove from oldest to newest (last to first). 127 numToKeep := len(containers) - toRemove 128 if numToKeep > 0 { 129 sort.Sort(byCreated(containers)) 130 } 131 for i := len(containers) - 1; i >= numToKeep; i-- { 132 if containers[i].unknown { 133 // Containers in known state could be running, we should try 134 // to stop it before removal. 135 id := kubecontainer.ContainerID{ 136 Type: cgc.manager.runtimeName, 137 ID: containers[i].id, 138 } 139 message := "Container is in unknown state, try killing it before removal" 140 if err := cgc.manager.killContainer(nil, id, containers[i].name, message, reasonUnknown, nil); err != nil { 141 klog.ErrorS(err, "Failed to stop container", "containerID", containers[i].id) 142 continue 143 } 144 } 145 if err := cgc.manager.removeContainer(containers[i].id); err != nil { 146 klog.ErrorS(err, "Failed to remove container", "containerID", containers[i].id) 147 } 148 } 149 150 // Assume we removed the containers so that we're not too aggressive. 151 return containers[:numToKeep] 152} 153 154// removeOldestNSandboxes removes the oldest inactive toRemove sandboxes and 155// returns the resulting slice. 156func (cgc *containerGC) removeOldestNSandboxes(sandboxes []sandboxGCInfo, toRemove int) { 157 numToKeep := len(sandboxes) - toRemove 158 if numToKeep > 0 { 159 sort.Sort(sandboxByCreated(sandboxes)) 160 } 161 // Remove from oldest to newest (last to first). 162 for i := len(sandboxes) - 1; i >= numToKeep; i-- { 163 if !sandboxes[i].active { 164 cgc.removeSandbox(sandboxes[i].id) 165 } 166 } 167} 168 169// removeSandbox removes the sandbox by sandboxID. 170func (cgc *containerGC) removeSandbox(sandboxID string) { 171 klog.V(4).InfoS("Removing sandbox", "sandboxID", sandboxID) 172 // In normal cases, kubelet should've already called StopPodSandbox before 173 // GC kicks in. To guard against the rare cases where this is not true, try 174 // stopping the sandbox before removing it. 175 if err := cgc.client.StopPodSandbox(sandboxID); err != nil { 176 klog.ErrorS(err, "Failed to stop sandbox before removing", "sandboxID", sandboxID) 177 return 178 } 179 if err := cgc.client.RemovePodSandbox(sandboxID); err != nil { 180 klog.ErrorS(err, "Failed to remove sandbox", "sandboxID", sandboxID) 181 } 182} 183 184// evictableContainers gets all containers that are evictable. Evictable containers are: not running 185// and created more than MinAge ago. 186func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByEvictUnit, error) { 187 containers, err := cgc.manager.getKubeletContainers(true) 188 if err != nil { 189 return containersByEvictUnit{}, err 190 } 191 192 evictUnits := make(containersByEvictUnit) 193 newestGCTime := time.Now().Add(-minAge) 194 for _, container := range containers { 195 // Prune out running containers. 196 if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING { 197 continue 198 } 199 200 createdAt := time.Unix(0, container.CreatedAt) 201 if newestGCTime.Before(createdAt) { 202 continue 203 } 204 205 labeledInfo := getContainerInfoFromLabels(container.Labels) 206 containerInfo := containerGCInfo{ 207 id: container.Id, 208 name: container.Metadata.Name, 209 createTime: createdAt, 210 unknown: container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN, 211 } 212 key := evictUnit{ 213 uid: labeledInfo.PodUID, 214 name: containerInfo.name, 215 } 216 evictUnits[key] = append(evictUnits[key], containerInfo) 217 } 218 219 return evictUnits, nil 220} 221 222// evict all containers that are evictable 223func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { 224 // Separate containers by evict units. 225 evictUnits, err := cgc.evictableContainers(gcPolicy.MinAge) 226 if err != nil { 227 return err 228 } 229 230 // Remove deleted pod containers if all sources are ready. 231 if allSourcesReady { 232 for key, unit := range evictUnits { 233 if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) { 234 cgc.removeOldestN(unit, len(unit)) // Remove all. 235 delete(evictUnits, key) 236 } 237 } 238 } 239 240 // Enforce max containers per evict unit. 241 if gcPolicy.MaxPerPodContainer >= 0 { 242 cgc.enforceMaxContainersPerEvictUnit(evictUnits, gcPolicy.MaxPerPodContainer) 243 } 244 245 // Enforce max total number of containers. 246 if gcPolicy.MaxContainers >= 0 && evictUnits.NumContainers() > gcPolicy.MaxContainers { 247 // Leave an equal number of containers per evict unit (min: 1). 248 numContainersPerEvictUnit := gcPolicy.MaxContainers / evictUnits.NumEvictUnits() 249 if numContainersPerEvictUnit < 1 { 250 numContainersPerEvictUnit = 1 251 } 252 cgc.enforceMaxContainersPerEvictUnit(evictUnits, numContainersPerEvictUnit) 253 254 // If we still need to evict, evict oldest first. 255 numContainers := evictUnits.NumContainers() 256 if numContainers > gcPolicy.MaxContainers { 257 flattened := make([]containerGCInfo, 0, numContainers) 258 for key := range evictUnits { 259 flattened = append(flattened, evictUnits[key]...) 260 } 261 sort.Sort(byCreated(flattened)) 262 263 cgc.removeOldestN(flattened, numContainers-gcPolicy.MaxContainers) 264 } 265 } 266 return nil 267} 268 269// evictSandboxes remove all evictable sandboxes. An evictable sandbox must 270// meet the following requirements: 271// 1. not in ready state 272// 2. contains no containers. 273// 3. belong to a non-existent (i.e., already removed) pod, or is not the 274// most recently created sandbox for the pod. 275func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error { 276 containers, err := cgc.manager.getKubeletContainers(true) 277 if err != nil { 278 return err 279 } 280 281 sandboxes, err := cgc.manager.getKubeletSandboxes(true) 282 if err != nil { 283 return err 284 } 285 286 // collect all the PodSandboxId of container 287 sandboxIDs := sets.NewString() 288 for _, container := range containers { 289 sandboxIDs.Insert(container.PodSandboxId) 290 } 291 292 sandboxesByPod := make(sandboxesByPodUID) 293 for _, sandbox := range sandboxes { 294 podUID := types.UID(sandbox.Metadata.Uid) 295 sandboxInfo := sandboxGCInfo{ 296 id: sandbox.Id, 297 createTime: time.Unix(0, sandbox.CreatedAt), 298 } 299 300 // Set ready sandboxes to be active. 301 if sandbox.State == runtimeapi.PodSandboxState_SANDBOX_READY { 302 sandboxInfo.active = true 303 } 304 305 // Set sandboxes that still have containers to be active. 306 if sandboxIDs.Has(sandbox.Id) { 307 sandboxInfo.active = true 308 } 309 310 sandboxesByPod[podUID] = append(sandboxesByPod[podUID], sandboxInfo) 311 } 312 313 for podUID, sandboxes := range sandboxesByPod { 314 if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) { 315 // Remove all evictable sandboxes if the pod has been removed. 316 // Note that the latest dead sandbox is also removed if there is 317 // already an active one. 318 cgc.removeOldestNSandboxes(sandboxes, len(sandboxes)) 319 } else { 320 // Keep latest one if the pod still exists. 321 cgc.removeOldestNSandboxes(sandboxes, len(sandboxes)-1) 322 } 323 } 324 return nil 325} 326 327// evictPodLogsDirectories evicts all evictable pod logs directories. Pod logs directories 328// are evictable if there are no corresponding pods. 329func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error { 330 osInterface := cgc.manager.osInterface 331 if allSourcesReady { 332 // Only remove pod logs directories when all sources are ready. 333 dirs, err := osInterface.ReadDir(podLogsRootDirectory) 334 if err != nil { 335 return fmt.Errorf("failed to read podLogsRootDirectory %q: %v", podLogsRootDirectory, err) 336 } 337 for _, dir := range dirs { 338 name := dir.Name() 339 podUID := parsePodUIDFromLogsDirectory(name) 340 if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) { 341 continue 342 } 343 klog.V(4).InfoS("Removing pod logs", "podUID", podUID) 344 err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name)) 345 if err != nil { 346 klog.ErrorS(err, "Failed to remove pod logs directory", "path", name) 347 } 348 } 349 } 350 351 // Remove dead container log symlinks. 352 // TODO(random-liu): Remove this after cluster logging supports CRI container log path. 353 logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix))) 354 for _, logSymlink := range logSymlinks { 355 if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) { 356 if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil { 357 status, err := cgc.manager.runtimeService.ContainerStatus(containerID) 358 if err != nil { 359 // TODO: we should handle container not found (i.e. container was deleted) case differently 360 // once https://github.com/kubernetes/kubernetes/issues/63336 is resolved 361 klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", err) 362 } else if status.State != runtimeapi.ContainerState_CONTAINER_EXITED { 363 // Here is how container log rotation works (see containerLogManager#rotateLatestLog): 364 // 365 // 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp)) 366 // 2. reopen the container log 367 // 3. if #2 fails, rename rotated log file back to container log 368 // 369 // There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3). 370 // Hence the symlink may be deemed unhealthy during that period. 371 // See https://github.com/kubernetes/kubernetes/issues/52172 372 // 373 // We only remove unhealthy symlink for dead containers 374 klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink) 375 continue 376 } 377 } else { 378 klog.V(4).InfoS("Unable to obtain container ID", "err", err) 379 } 380 err := osInterface.Remove(logSymlink) 381 if err != nil { 382 klog.ErrorS(err, "Failed to remove container log dead symlink", "path", logSymlink) 383 } else { 384 klog.V(4).InfoS("Removed symlink", "path", logSymlink) 385 } 386 } 387 } 388 return nil 389} 390 391// GarbageCollect removes dead containers using the specified container gc policy. 392// Note that gc policy is not applied to sandboxes. Sandboxes are only removed when they are 393// not ready and containing no containers. 394// 395// GarbageCollect consists of the following steps: 396// * gets evictable containers which are not active and created more than gcPolicy.MinAge ago. 397// * removes oldest dead containers for each pod by enforcing gcPolicy.MaxPerPodContainer. 398// * removes oldest dead containers by enforcing gcPolicy.MaxContainers. 399// * gets evictable sandboxes which are not ready and contains no containers. 400// * removes evictable sandboxes. 401func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { 402 errors := []error{} 403 // Remove evictable containers 404 if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil { 405 errors = append(errors, err) 406 } 407 408 // Remove sandboxes with zero containers 409 if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil { 410 errors = append(errors, err) 411 } 412 413 // Remove pod sandbox log directory 414 if err := cgc.evictPodLogsDirectories(allSourcesReady); err != nil { 415 errors = append(errors, err) 416 } 417 return utilerrors.NewAggregate(errors) 418} 419