1/*
2Copyright 2016 The Kubernetes Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17package kuberuntime
18
19import (
20	"fmt"
21	"os"
22	"path/filepath"
23	"sort"
24	"time"
25
26	"k8s.io/apimachinery/pkg/types"
27	utilerrors "k8s.io/apimachinery/pkg/util/errors"
28	"k8s.io/apimachinery/pkg/util/sets"
29	internalapi "k8s.io/cri-api/pkg/apis"
30	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
31	"k8s.io/klog/v2"
32	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
33)
34
35// containerGC is the manager of garbage collection.
36type containerGC struct {
37	client           internalapi.RuntimeService
38	manager          *kubeGenericRuntimeManager
39	podStateProvider podStateProvider
40}
41
42// NewContainerGC creates a new containerGC.
43func newContainerGC(client internalapi.RuntimeService, podStateProvider podStateProvider, manager *kubeGenericRuntimeManager) *containerGC {
44	return &containerGC{
45		client:           client,
46		manager:          manager,
47		podStateProvider: podStateProvider,
48	}
49}
50
// containerGCInfo is the per-container record the garbage collector keeps
// while deciding which containers to evict.
type containerGCInfo struct {
	// id is the runtime ID of the container.
	id string
	// name is the container's name.
	name string
	// createTime is when the container was created; it drives the
	// newest-first ordering used during eviction.
	createTime time.Time
	// unknown is true when the container is in the unknown state. Such a
	// container is killed before the collector attempts to remove it.
	unknown bool
}
63
// sandboxGCInfo is the per-sandbox record the garbage collector keeps while
// deciding which sandboxes to evict.
type sandboxGCInfo struct {
	// id is the runtime ID of the sandbox.
	id string
	// createTime is when the sandbox was created; it drives the
	// newest-first ordering used during eviction.
	createTime time.Time
	// active is true when the sandbox is ready or still has containers.
	// Active sandboxes are never removed.
	active bool
}
73
74// evictUnit is considered for eviction as units of (UID, container name) pair.
75type evictUnit struct {
76	// UID of the pod.
77	uid types.UID
78	// Name of the container in the pod.
79	name string
80}
81
82type containersByEvictUnit map[evictUnit][]containerGCInfo
83type sandboxesByPodUID map[types.UID][]sandboxGCInfo
84
85// NumContainers returns the number of containers in this map.
86func (cu containersByEvictUnit) NumContainers() int {
87	num := 0
88	for key := range cu {
89		num += len(cu[key])
90	}
91	return num
92}
93
94// NumEvictUnits returns the number of pod in this map.
95func (cu containersByEvictUnit) NumEvictUnits() int {
96	return len(cu)
97}
98
99// Newest first.
100type byCreated []containerGCInfo
101
102func (a byCreated) Len() int           { return len(a) }
103func (a byCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
104func (a byCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
105
106// Newest first.
107type sandboxByCreated []sandboxGCInfo
108
109func (a sandboxByCreated) Len() int           { return len(a) }
110func (a sandboxByCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
111func (a sandboxByCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
112
113// enforceMaxContainersPerEvictUnit enforces MaxPerPodContainer for each evictUnit.
114func (cgc *containerGC) enforceMaxContainersPerEvictUnit(evictUnits containersByEvictUnit, MaxContainers int) {
115	for key := range evictUnits {
116		toRemove := len(evictUnits[key]) - MaxContainers
117
118		if toRemove > 0 {
119			evictUnits[key] = cgc.removeOldestN(evictUnits[key], toRemove)
120		}
121	}
122}
123
124// removeOldestN removes the oldest toRemove containers and returns the resulting slice.
125func (cgc *containerGC) removeOldestN(containers []containerGCInfo, toRemove int) []containerGCInfo {
126	// Remove from oldest to newest (last to first).
127	numToKeep := len(containers) - toRemove
128	if numToKeep > 0 {
129		sort.Sort(byCreated(containers))
130	}
131	for i := len(containers) - 1; i >= numToKeep; i-- {
132		if containers[i].unknown {
133			// Containers in known state could be running, we should try
134			// to stop it before removal.
135			id := kubecontainer.ContainerID{
136				Type: cgc.manager.runtimeName,
137				ID:   containers[i].id,
138			}
139			message := "Container is in unknown state, try killing it before removal"
140			if err := cgc.manager.killContainer(nil, id, containers[i].name, message, reasonUnknown, nil); err != nil {
141				klog.ErrorS(err, "Failed to stop container", "containerID", containers[i].id)
142				continue
143			}
144		}
145		if err := cgc.manager.removeContainer(containers[i].id); err != nil {
146			klog.ErrorS(err, "Failed to remove container", "containerID", containers[i].id)
147		}
148	}
149
150	// Assume we removed the containers so that we're not too aggressive.
151	return containers[:numToKeep]
152}
153
154// removeOldestNSandboxes removes the oldest inactive toRemove sandboxes and
155// returns the resulting slice.
156func (cgc *containerGC) removeOldestNSandboxes(sandboxes []sandboxGCInfo, toRemove int) {
157	numToKeep := len(sandboxes) - toRemove
158	if numToKeep > 0 {
159		sort.Sort(sandboxByCreated(sandboxes))
160	}
161	// Remove from oldest to newest (last to first).
162	for i := len(sandboxes) - 1; i >= numToKeep; i-- {
163		if !sandboxes[i].active {
164			cgc.removeSandbox(sandboxes[i].id)
165		}
166	}
167}
168
169// removeSandbox removes the sandbox by sandboxID.
170func (cgc *containerGC) removeSandbox(sandboxID string) {
171	klog.V(4).InfoS("Removing sandbox", "sandboxID", sandboxID)
172	// In normal cases, kubelet should've already called StopPodSandbox before
173	// GC kicks in. To guard against the rare cases where this is not true, try
174	// stopping the sandbox before removing it.
175	if err := cgc.client.StopPodSandbox(sandboxID); err != nil {
176		klog.ErrorS(err, "Failed to stop sandbox before removing", "sandboxID", sandboxID)
177		return
178	}
179	if err := cgc.client.RemovePodSandbox(sandboxID); err != nil {
180		klog.ErrorS(err, "Failed to remove sandbox", "sandboxID", sandboxID)
181	}
182}
183
184// evictableContainers gets all containers that are evictable. Evictable containers are: not running
185// and created more than MinAge ago.
186func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByEvictUnit, error) {
187	containers, err := cgc.manager.getKubeletContainers(true)
188	if err != nil {
189		return containersByEvictUnit{}, err
190	}
191
192	evictUnits := make(containersByEvictUnit)
193	newestGCTime := time.Now().Add(-minAge)
194	for _, container := range containers {
195		// Prune out running containers.
196		if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
197			continue
198		}
199
200		createdAt := time.Unix(0, container.CreatedAt)
201		if newestGCTime.Before(createdAt) {
202			continue
203		}
204
205		labeledInfo := getContainerInfoFromLabels(container.Labels)
206		containerInfo := containerGCInfo{
207			id:         container.Id,
208			name:       container.Metadata.Name,
209			createTime: createdAt,
210			unknown:    container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN,
211		}
212		key := evictUnit{
213			uid:  labeledInfo.PodUID,
214			name: containerInfo.name,
215		}
216		evictUnits[key] = append(evictUnits[key], containerInfo)
217	}
218
219	return evictUnits, nil
220}
221
222// evict all containers that are evictable
223func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
224	// Separate containers by evict units.
225	evictUnits, err := cgc.evictableContainers(gcPolicy.MinAge)
226	if err != nil {
227		return err
228	}
229
230	// Remove deleted pod containers if all sources are ready.
231	if allSourcesReady {
232		for key, unit := range evictUnits {
233			if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) {
234				cgc.removeOldestN(unit, len(unit)) // Remove all.
235				delete(evictUnits, key)
236			}
237		}
238	}
239
240	// Enforce max containers per evict unit.
241	if gcPolicy.MaxPerPodContainer >= 0 {
242		cgc.enforceMaxContainersPerEvictUnit(evictUnits, gcPolicy.MaxPerPodContainer)
243	}
244
245	// Enforce max total number of containers.
246	if gcPolicy.MaxContainers >= 0 && evictUnits.NumContainers() > gcPolicy.MaxContainers {
247		// Leave an equal number of containers per evict unit (min: 1).
248		numContainersPerEvictUnit := gcPolicy.MaxContainers / evictUnits.NumEvictUnits()
249		if numContainersPerEvictUnit < 1 {
250			numContainersPerEvictUnit = 1
251		}
252		cgc.enforceMaxContainersPerEvictUnit(evictUnits, numContainersPerEvictUnit)
253
254		// If we still need to evict, evict oldest first.
255		numContainers := evictUnits.NumContainers()
256		if numContainers > gcPolicy.MaxContainers {
257			flattened := make([]containerGCInfo, 0, numContainers)
258			for key := range evictUnits {
259				flattened = append(flattened, evictUnits[key]...)
260			}
261			sort.Sort(byCreated(flattened))
262
263			cgc.removeOldestN(flattened, numContainers-gcPolicy.MaxContainers)
264		}
265	}
266	return nil
267}
268
269// evictSandboxes remove all evictable sandboxes. An evictable sandbox must
270// meet the following requirements:
271//   1. not in ready state
272//   2. contains no containers.
273//   3. belong to a non-existent (i.e., already removed) pod, or is not the
274//      most recently created sandbox for the pod.
275func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error {
276	containers, err := cgc.manager.getKubeletContainers(true)
277	if err != nil {
278		return err
279	}
280
281	sandboxes, err := cgc.manager.getKubeletSandboxes(true)
282	if err != nil {
283		return err
284	}
285
286	// collect all the PodSandboxId of container
287	sandboxIDs := sets.NewString()
288	for _, container := range containers {
289		sandboxIDs.Insert(container.PodSandboxId)
290	}
291
292	sandboxesByPod := make(sandboxesByPodUID)
293	for _, sandbox := range sandboxes {
294		podUID := types.UID(sandbox.Metadata.Uid)
295		sandboxInfo := sandboxGCInfo{
296			id:         sandbox.Id,
297			createTime: time.Unix(0, sandbox.CreatedAt),
298		}
299
300		// Set ready sandboxes to be active.
301		if sandbox.State == runtimeapi.PodSandboxState_SANDBOX_READY {
302			sandboxInfo.active = true
303		}
304
305		// Set sandboxes that still have containers to be active.
306		if sandboxIDs.Has(sandbox.Id) {
307			sandboxInfo.active = true
308		}
309
310		sandboxesByPod[podUID] = append(sandboxesByPod[podUID], sandboxInfo)
311	}
312
313	for podUID, sandboxes := range sandboxesByPod {
314		if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) {
315			// Remove all evictable sandboxes if the pod has been removed.
316			// Note that the latest dead sandbox is also removed if there is
317			// already an active one.
318			cgc.removeOldestNSandboxes(sandboxes, len(sandboxes))
319		} else {
320			// Keep latest one if the pod still exists.
321			cgc.removeOldestNSandboxes(sandboxes, len(sandboxes)-1)
322		}
323	}
324	return nil
325}
326
327// evictPodLogsDirectories evicts all evictable pod logs directories. Pod logs directories
328// are evictable if there are no corresponding pods.
329func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error {
330	osInterface := cgc.manager.osInterface
331	if allSourcesReady {
332		// Only remove pod logs directories when all sources are ready.
333		dirs, err := osInterface.ReadDir(podLogsRootDirectory)
334		if err != nil {
335			return fmt.Errorf("failed to read podLogsRootDirectory %q: %v", podLogsRootDirectory, err)
336		}
337		for _, dir := range dirs {
338			name := dir.Name()
339			podUID := parsePodUIDFromLogsDirectory(name)
340			if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) {
341				continue
342			}
343			klog.V(4).InfoS("Removing pod logs", "podUID", podUID)
344			err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name))
345			if err != nil {
346				klog.ErrorS(err, "Failed to remove pod logs directory", "path", name)
347			}
348		}
349	}
350
351	// Remove dead container log symlinks.
352	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
353	logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix)))
354	for _, logSymlink := range logSymlinks {
355		if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) {
356			if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil {
357				status, err := cgc.manager.runtimeService.ContainerStatus(containerID)
358				if err != nil {
359					// TODO: we should handle container not found (i.e. container was deleted) case differently
360					// once https://github.com/kubernetes/kubernetes/issues/63336 is resolved
361					klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", err)
362				} else if status.State != runtimeapi.ContainerState_CONTAINER_EXITED {
363					// Here is how container log rotation works (see containerLogManager#rotateLatestLog):
364					//
365					// 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp))
366					// 2. reopen the container log
367					// 3. if #2 fails, rename rotated log file back to container log
368					//
369					// There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3).
370					// Hence the symlink may be deemed unhealthy during that period.
371					// See https://github.com/kubernetes/kubernetes/issues/52172
372					//
373					// We only remove unhealthy symlink for dead containers
374					klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink)
375					continue
376				}
377			} else {
378				klog.V(4).InfoS("Unable to obtain container ID", "err", err)
379			}
380			err := osInterface.Remove(logSymlink)
381			if err != nil {
382				klog.ErrorS(err, "Failed to remove container log dead symlink", "path", logSymlink)
383			} else {
384				klog.V(4).InfoS("Removed symlink", "path", logSymlink)
385			}
386		}
387	}
388	return nil
389}
390
391// GarbageCollect removes dead containers using the specified container gc policy.
392// Note that gc policy is not applied to sandboxes. Sandboxes are only removed when they are
393// not ready and containing no containers.
394//
395// GarbageCollect consists of the following steps:
396// * gets evictable containers which are not active and created more than gcPolicy.MinAge ago.
397// * removes oldest dead containers for each pod by enforcing gcPolicy.MaxPerPodContainer.
398// * removes oldest dead containers by enforcing gcPolicy.MaxContainers.
399// * gets evictable sandboxes which are not ready and contains no containers.
400// * removes evictable sandboxes.
401func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
402	errors := []error{}
403	// Remove evictable containers
404	if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
405		errors = append(errors, err)
406	}
407
408	// Remove sandboxes with zero containers
409	if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil {
410		errors = append(errors, err)
411	}
412
413	// Remove pod sandbox log directory
414	if err := cgc.evictPodLogsDirectories(allSourcesReady); err != nil {
415		errors = append(errors, err)
416	}
417	return utilerrors.NewAggregate(errors)
418}
419