1/* 2Copyright 2015 The Kubernetes Authors. 3 4Licensed under the Apache License, Version 2.0 (the "License"); 5you may not use this file except in compliance with the License. 6You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10Unless required by applicable law or agreed to in writing, software 11distributed under the License is distributed on an "AS IS" BASIS, 12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13See the License for the specific language governing permissions and 14limitations under the License. 15*/ 16 17package metrics 18 19import ( 20 "fmt" 21 "sync" 22 "time" 23 24 "k8s.io/component-base/metrics" 25 "k8s.io/component-base/metrics/legacyregistry" 26 27 corev1 "k8s.io/api/core/v1" 28 "k8s.io/apimachinery/pkg/types" 29 utilfeature "k8s.io/apiserver/pkg/util/feature" 30 "k8s.io/klog/v2" 31 "k8s.io/kubernetes/pkg/features" 32) 33 34// This const block defines the metric names for the kubelet metrics. 35const ( 36 KubeletSubsystem = "kubelet" 37 NodeNameKey = "node_name" 38 NodeLabelKey = "node" 39 PodWorkerDurationKey = "pod_worker_duration_seconds" 40 PodStartDurationKey = "pod_start_duration_seconds" 41 CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" 42 PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" 43 PLEGRelistDurationKey = "pleg_relist_duration_seconds" 44 PLEGDiscardEventsKey = "pleg_discard_events" 45 PLEGRelistIntervalKey = "pleg_relist_interval_seconds" 46 PLEGLastSeenKey = "pleg_last_seen_seconds" 47 EvictionsKey = "evictions" 48 EvictionStatsAgeKey = "eviction_stats_age_seconds" 49 PreemptionsKey = "preemptions" 50 VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" 51 VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" 52 VolumeStatsUsedBytesKey = "volume_stats_used_bytes" 53 VolumeStatsInodesKey = "volume_stats_inodes" 54 VolumeStatsInodesFreeKey = "volume_stats_inodes_free" 55 VolumeStatsInodesUsedKey = "volume_stats_inodes_used" 56 RunningPodsKey = "running_pods" 57 RunningContainersKey = "running_containers" 58 // Metrics keys of remote runtime operations 59 RuntimeOperationsKey = "runtime_operations_total" 60 RuntimeOperationsDurationKey = "runtime_operations_duration_seconds" 61 RuntimeOperationsErrorsKey = "runtime_operations_errors_total" 62 // Metrics keys of device plugin operations 63 DevicePluginRegistrationCountKey = "device_plugin_registration_total" 64 DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds" 65 // Metrics keys of pod resources operations 66 PodResourcesEndpointRequestsTotalKey = "pod_resources_endpoint_requests_total" 67 PodResourcesEndpointRequestsListKey = "pod_resources_endpoint_requests_list" 68 PodResourcesEndpointRequestsGetAllocatableKey = "pod_resources_endpoint_requests_get_allocatable" 69 PodResourcesEndpointErrorsListKey = "pod_resources_endpoint_errors_list" 70 PodResourcesEndpointErrorsGetAllocatableKey = "pod_resources_endpoint_errors_get_allocatable" 71 72 // Metric keys for node config 73 AssignedConfigKey = "node_config_assigned" 74 ActiveConfigKey = "node_config_active" 75 LastKnownGoodConfigKey = "node_config_last_known_good" 76 ConfigErrorKey = "node_config_error" 77 ConfigSourceLabelKey = "node_config_source" 78 ConfigSourceLabelValueLocal = "local" 79 ConfigUIDLabelKey = "node_config_uid" 80 ConfigResourceVersionLabelKey = "node_config_resource_version" 81 KubeletConfigKeyLabelKey = "node_config_kubelet_key" 82 83 // Metrics keys for RuntimeClass 84 RunPodSandboxDurationKey = "run_podsandbox_duration_seconds" 85 RunPodSandboxErrorsKey = "run_podsandbox_errors_total" 86 87 // Metrics to keep track of total number of Pods and Containers started 88 StartedPodsTotalKey = "started_pods_total" 89 StartedPodsErrorsTotalKey = "started_pods_errors_total" 90 StartedContainersTotalKey = "started_containers_total" 91 StartedContainersErrorsTotalKey = "started_containers_errors_total" 92 93 // Metrics to track ephemeral container usage by this kubelet 94 ManagedEphemeralContainersKey = "managed_ephemeral_containers" 95 96 // Values used in metric labels 97 Container = "container" 98 InitContainer = "init_container" 99 EphemeralContainer = "ephemeral_container" 100) 101 102var ( 103 // NodeName is a Gauge that tracks the ode's name. The count is always 1. 104 NodeName = metrics.NewGaugeVec( 105 &metrics.GaugeOpts{ 106 Subsystem: KubeletSubsystem, 107 Name: NodeNameKey, 108 Help: "The node's name. The count is always 1.", 109 StabilityLevel: metrics.ALPHA, 110 }, 111 []string{NodeLabelKey}, 112 ) 113 // ContainersPerPodCount is a Histogram that tracks the number of containers per pod. 114 ContainersPerPodCount = metrics.NewHistogram( 115 &metrics.HistogramOpts{ 116 Subsystem: KubeletSubsystem, 117 Name: "containers_per_pod_count", 118 Help: "The number of containers per pod.", 119 Buckets: metrics.ExponentialBuckets(1, 2, 5), 120 StabilityLevel: metrics.ALPHA, 121 }, 122 ) 123 // PodWorkerDuration is a Histogram that tracks the duration (in seconds) in takes to sync a single pod. 124 // Broken down by the operation type. 125 PodWorkerDuration = metrics.NewHistogramVec( 126 &metrics.HistogramOpts{ 127 Subsystem: KubeletSubsystem, 128 Name: PodWorkerDurationKey, 129 Help: "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync", 130 Buckets: metrics.DefBuckets, 131 StabilityLevel: metrics.ALPHA, 132 }, 133 []string{"operation_type"}, 134 ) 135 // PodStartDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to go from pending to running. 136 PodStartDuration = metrics.NewHistogram( 137 &metrics.HistogramOpts{ 138 Subsystem: KubeletSubsystem, 139 Name: PodStartDurationKey, 140 Help: "Duration in seconds for a single pod to go from pending to running.", 141 Buckets: metrics.DefBuckets, 142 StabilityLevel: metrics.ALPHA, 143 }, 144 ) 145 // CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete. 146 // Broken down by method. 147 CgroupManagerDuration = metrics.NewHistogramVec( 148 &metrics.HistogramOpts{ 149 Subsystem: KubeletSubsystem, 150 Name: CgroupManagerOperationsKey, 151 Help: "Duration in seconds for cgroup manager operations. Broken down by method.", 152 Buckets: metrics.DefBuckets, 153 StabilityLevel: metrics.ALPHA, 154 }, 155 []string{"operation_type"}, 156 ) 157 // PodWorkerStartDuration is a Histogram that tracks the duration (in seconds) it takes from seeing a pod to starting a worker. 158 PodWorkerStartDuration = metrics.NewHistogram( 159 &metrics.HistogramOpts{ 160 Subsystem: KubeletSubsystem, 161 Name: PodWorkerStartDurationKey, 162 Help: "Duration in seconds from seeing a pod to starting a worker.", 163 Buckets: metrics.DefBuckets, 164 StabilityLevel: metrics.ALPHA, 165 }, 166 ) 167 // PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's 168 // Pod Lifecycle Event Generator (PLEG). 169 PLEGRelistDuration = metrics.NewHistogram( 170 &metrics.HistogramOpts{ 171 Subsystem: KubeletSubsystem, 172 Name: PLEGRelistDurationKey, 173 Help: "Duration in seconds for relisting pods in PLEG.", 174 Buckets: metrics.DefBuckets, 175 StabilityLevel: metrics.ALPHA, 176 }, 177 ) 178 // PLEGDiscardEvents is a Counter that tracks the number of discarding events in the Kubelet's Pod Lifecycle Event Generator (PLEG). 179 PLEGDiscardEvents = metrics.NewCounter( 180 &metrics.CounterOpts{ 181 Subsystem: KubeletSubsystem, 182 Name: PLEGDiscardEventsKey, 183 Help: "The number of discard events in PLEG.", 184 StabilityLevel: metrics.ALPHA, 185 }, 186 ) 187 188 // PLEGRelistInterval is a Histogram that tracks the intervals (in seconds) between relisting in the Kubelet's 189 // Pod Lifecycle Event Generator (PLEG). 190 PLEGRelistInterval = metrics.NewHistogram( 191 &metrics.HistogramOpts{ 192 Subsystem: KubeletSubsystem, 193 Name: PLEGRelistIntervalKey, 194 Help: "Interval in seconds between relisting in PLEG.", 195 Buckets: metrics.DefBuckets, 196 StabilityLevel: metrics.ALPHA, 197 }, 198 ) 199 // PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's 200 // Pod Lifecycle Event Generator (PLEG) was last seen active. 201 PLEGLastSeen = metrics.NewGauge( 202 &metrics.GaugeOpts{ 203 Subsystem: KubeletSubsystem, 204 Name: PLEGLastSeenKey, 205 Help: "Timestamp in seconds when PLEG was last seen active.", 206 StabilityLevel: metrics.ALPHA, 207 }, 208 ) 209 // RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations. 210 // Broken down by operation type. 211 RuntimeOperations = metrics.NewCounterVec( 212 &metrics.CounterOpts{ 213 Subsystem: KubeletSubsystem, 214 Name: RuntimeOperationsKey, 215 Help: "Cumulative number of runtime operations by operation type.", 216 StabilityLevel: metrics.ALPHA, 217 }, 218 []string{"operation_type"}, 219 ) 220 // RuntimeOperationsDuration is a Histogram that tracks the duration (in seconds) for remote runtime operations to complete. 221 // Broken down by operation type. 222 RuntimeOperationsDuration = metrics.NewHistogramVec( 223 &metrics.HistogramOpts{ 224 Subsystem: KubeletSubsystem, 225 Name: RuntimeOperationsDurationKey, 226 Help: "Duration in seconds of runtime operations. Broken down by operation type.", 227 Buckets: metrics.ExponentialBuckets(.005, 2.5, 14), 228 StabilityLevel: metrics.ALPHA, 229 }, 230 []string{"operation_type"}, 231 ) 232 // RuntimeOperationsErrors is a Counter that tracks the cumulative number of remote runtime operations errors. 233 // Broken down by operation type. 234 RuntimeOperationsErrors = metrics.NewCounterVec( 235 &metrics.CounterOpts{ 236 Subsystem: KubeletSubsystem, 237 Name: RuntimeOperationsErrorsKey, 238 Help: "Cumulative number of runtime operation errors by operation type.", 239 StabilityLevel: metrics.ALPHA, 240 }, 241 []string{"operation_type"}, 242 ) 243 // Evictions is a Counter that tracks the cumulative number of pod evictions initiated by the kubelet. 244 // Broken down by eviction signal. 245 Evictions = metrics.NewCounterVec( 246 &metrics.CounterOpts{ 247 Subsystem: KubeletSubsystem, 248 Name: EvictionsKey, 249 Help: "Cumulative number of pod evictions by eviction signal", 250 StabilityLevel: metrics.ALPHA, 251 }, 252 []string{"eviction_signal"}, 253 ) 254 // EvictionStatsAge is a Histogram that tracks the time (in seconds) between when stats are collected and when a pod is evicted 255 // based on those stats. Broken down by eviction signal. 256 EvictionStatsAge = metrics.NewHistogramVec( 257 &metrics.HistogramOpts{ 258 Subsystem: KubeletSubsystem, 259 Name: EvictionStatsAgeKey, 260 Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", 261 Buckets: metrics.DefBuckets, 262 StabilityLevel: metrics.ALPHA, 263 }, 264 []string{"eviction_signal"}, 265 ) 266 // Preemptions is a Counter that tracks the cumulative number of pod preemptions initiated by the kubelet. 267 // Broken down by preemption signal. A preemption is only recorded for one resource, the sum of all signals 268 // is the number of preemptions on the given node. 269 Preemptions = metrics.NewCounterVec( 270 &metrics.CounterOpts{ 271 Subsystem: KubeletSubsystem, 272 Name: PreemptionsKey, 273 Help: "Cumulative number of pod preemptions by preemption resource", 274 StabilityLevel: metrics.ALPHA, 275 }, 276 []string{"preemption_signal"}, 277 ) 278 // DevicePluginRegistrationCount is a Counter that tracks the cumulative number of device plugin registrations. 279 // Broken down by resource name. 280 DevicePluginRegistrationCount = metrics.NewCounterVec( 281 &metrics.CounterOpts{ 282 Subsystem: KubeletSubsystem, 283 Name: DevicePluginRegistrationCountKey, 284 Help: "Cumulative number of device plugin registrations. Broken down by resource name.", 285 StabilityLevel: metrics.ALPHA, 286 }, 287 []string{"resource_name"}, 288 ) 289 // DevicePluginAllocationDuration is a Histogram that tracks the duration (in seconds) to serve a device plugin allocation request. 290 // Broken down by resource name. 291 DevicePluginAllocationDuration = metrics.NewHistogramVec( 292 &metrics.HistogramOpts{ 293 Subsystem: KubeletSubsystem, 294 Name: DevicePluginAllocationDurationKey, 295 Help: "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name.", 296 Buckets: metrics.DefBuckets, 297 StabilityLevel: metrics.ALPHA, 298 }, 299 []string{"resource_name"}, 300 ) 301 302 // PodResourcesEndpointRequestsTotalCount is a Counter that tracks the cumulative number of requests to the PodResource endpoints. 303 // Broken down by server API version. 304 PodResourcesEndpointRequestsTotalCount = metrics.NewCounterVec( 305 &metrics.CounterOpts{ 306 Subsystem: KubeletSubsystem, 307 Name: PodResourcesEndpointRequestsTotalKey, 308 Help: "Cumulative number of requests to the PodResource endpoint. Broken down by server api version.", 309 StabilityLevel: metrics.ALPHA, 310 }, 311 []string{"server_api_version"}, 312 ) 313 314 // PodResourcesEndpointRequestsListCount is a Counter that tracks the number of requests to the PodResource List() endpoint. 315 // Broken down by server API version. 316 PodResourcesEndpointRequestsListCount = metrics.NewCounterVec( 317 &metrics.CounterOpts{ 318 Subsystem: KubeletSubsystem, 319 Name: PodResourcesEndpointRequestsListKey, 320 Help: "Number of requests to the PodResource List endpoint. Broken down by server api version.", 321 StabilityLevel: metrics.ALPHA, 322 }, 323 []string{"server_api_version"}, 324 ) 325 326 // PodResourcesEndpointRequestsGetAllocatableCount is a Counter that tracks the number of requests to the PodResource GetAllocatableResources() endpoint. 327 // Broken down by server API version. 328 PodResourcesEndpointRequestsGetAllocatableCount = metrics.NewCounterVec( 329 &metrics.CounterOpts{ 330 Subsystem: KubeletSubsystem, 331 Name: PodResourcesEndpointRequestsGetAllocatableKey, 332 Help: "Number of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.", 333 StabilityLevel: metrics.ALPHA, 334 }, 335 []string{"server_api_version"}, 336 ) 337 338 // PodResourcesEndpointErrorsListCount is a Counter that tracks the number of errors returned by he PodResource List() endpoint. 339 // Broken down by server API version. 340 PodResourcesEndpointErrorsListCount = metrics.NewCounterVec( 341 &metrics.CounterOpts{ 342 Subsystem: KubeletSubsystem, 343 Name: PodResourcesEndpointErrorsListKey, 344 Help: "Number of requests to the PodResource List endpoint which returned error. Broken down by server api version.", 345 StabilityLevel: metrics.ALPHA, 346 }, 347 []string{"server_api_version"}, 348 ) 349 350 // PodResourcesEndpointErrorsGetAllocatableCount is a Counter that tracks the number of errors returned by the PodResource GetAllocatableResources() endpoint. 351 // Broken down by server API version. 352 PodResourcesEndpointErrorsGetAllocatableCount = metrics.NewCounterVec( 353 &metrics.CounterOpts{ 354 Subsystem: KubeletSubsystem, 355 Name: PodResourcesEndpointErrorsGetAllocatableKey, 356 Help: "Number of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.", 357 StabilityLevel: metrics.ALPHA, 358 }, 359 []string{"server_api_version"}, 360 ) 361 362 // Metrics for node config 363 364 // AssignedConfig is a Gauge that is set 1 if the Kubelet has a NodeConfig assigned. 365 AssignedConfig = metrics.NewGaugeVec( 366 &metrics.GaugeOpts{ 367 Subsystem: KubeletSubsystem, 368 Name: AssignedConfigKey, 369 Help: "The node's understanding of intended config. The count is always 1.", 370 DeprecatedVersion: "1.22.0", 371 StabilityLevel: metrics.ALPHA, 372 }, 373 []string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey}, 374 ) 375 // ActiveConfig is a Gauge that is set to 1 if the Kubelet has an active NodeConfig. 376 ActiveConfig = metrics.NewGaugeVec( 377 &metrics.GaugeOpts{ 378 Subsystem: KubeletSubsystem, 379 Name: ActiveConfigKey, 380 Help: "The config source the node is actively using. The count is always 1.", 381 DeprecatedVersion: "1.22.0", 382 StabilityLevel: metrics.ALPHA, 383 }, 384 []string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey}, 385 ) 386 // LastKnownGoodConfig is a Gauge that is set to 1 if the Kubelet has a NodeConfig it can fall back to if there 387 // are certain errors. 388 LastKnownGoodConfig = metrics.NewGaugeVec( 389 &metrics.GaugeOpts{ 390 Subsystem: KubeletSubsystem, 391 Name: LastKnownGoodConfigKey, 392 Help: "The config source the node will fall back to when it encounters certain errors. The count is always 1.", 393 DeprecatedVersion: "1.22.0", 394 StabilityLevel: metrics.ALPHA, 395 }, 396 []string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey}, 397 ) 398 // ConfigError is a Gauge that is set to 1 if the node is experiencing a configuration-related error. 399 ConfigError = metrics.NewGauge( 400 &metrics.GaugeOpts{ 401 Subsystem: KubeletSubsystem, 402 Name: ConfigErrorKey, 403 Help: "This metric is true (1) if the node is experiencing a configuration-related error, false (0) otherwise.", 404 DeprecatedVersion: "1.22.0", 405 StabilityLevel: metrics.ALPHA, 406 }, 407 ) 408 // RunPodSandboxDuration is a Histogram that tracks the duration (in seconds) it takes to run Pod Sandbox operations. 409 // Broken down by RuntimeClass.Handler. 410 RunPodSandboxDuration = metrics.NewHistogramVec( 411 &metrics.HistogramOpts{ 412 Subsystem: KubeletSubsystem, 413 Name: RunPodSandboxDurationKey, 414 Help: "Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.", 415 // Use DefBuckets for now, will customize the buckets if necessary. 416 Buckets: metrics.DefBuckets, 417 StabilityLevel: metrics.ALPHA, 418 }, 419 []string{"runtime_handler"}, 420 ) 421 // RunPodSandboxErrors is a Counter that tracks the cumulative number of Pod Sandbox operations errors. 422 // Broken down by RuntimeClass.Handler. 423 RunPodSandboxErrors = metrics.NewCounterVec( 424 &metrics.CounterOpts{ 425 Subsystem: KubeletSubsystem, 426 Name: RunPodSandboxErrorsKey, 427 Help: "Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.", 428 StabilityLevel: metrics.ALPHA, 429 }, 430 []string{"runtime_handler"}, 431 ) 432 433 // RunningPodCount is a gauge that tracks the number of Pods currently with a running sandbox 434 // It is used to expose the kubelet internal state: how many pods have running containers in the container runtime, and mainly for debugging purpose. 435 RunningPodCount = metrics.NewGauge( 436 &metrics.GaugeOpts{ 437 Subsystem: KubeletSubsystem, 438 Name: RunningPodsKey, 439 Help: "Number of pods that have a running pod sandbox", 440 StabilityLevel: metrics.ALPHA, 441 }, 442 ) 443 // RunningContainerCount is a gauge that tracks the number of containers currently running 444 RunningContainerCount = metrics.NewGaugeVec( 445 &metrics.GaugeOpts{ 446 Subsystem: KubeletSubsystem, 447 Name: RunningContainersKey, 448 Help: "Number of containers currently running", 449 StabilityLevel: metrics.ALPHA, 450 }, 451 []string{"container_state"}, 452 ) 453 // StartedPodsTotal is a counter that tracks pod sandbox creation operations 454 StartedPodsTotal = metrics.NewCounter( 455 &metrics.CounterOpts{ 456 Subsystem: KubeletSubsystem, 457 Name: StartedPodsTotalKey, 458 Help: "Cumulative number of pods started", 459 StabilityLevel: metrics.ALPHA, 460 }, 461 ) 462 // StartedPodsErrorsTotal is a counter that tracks the number of errors creating pod sandboxes 463 StartedPodsErrorsTotal = metrics.NewCounterVec( 464 &metrics.CounterOpts{ 465 Subsystem: KubeletSubsystem, 466 Name: StartedPodsErrorsTotalKey, 467 Help: "Cumulative number of errors when starting pods", 468 StabilityLevel: metrics.ALPHA, 469 }, 470 []string{"message"}, 471 ) 472 // StartedContainersTotal is a counter that tracks the number of container creation operations 473 StartedContainersTotal = metrics.NewCounterVec( 474 &metrics.CounterOpts{ 475 Subsystem: KubeletSubsystem, 476 Name: StartedContainersTotalKey, 477 Help: "Cumulative number of containers started", 478 StabilityLevel: metrics.ALPHA, 479 }, 480 []string{"container_type"}, 481 ) 482 // StartedContainersTotal is a counter that tracks the number of errors creating containers 483 StartedContainersErrorsTotal = metrics.NewCounterVec( 484 &metrics.CounterOpts{ 485 Subsystem: KubeletSubsystem, 486 Name: StartedContainersErrorsTotalKey, 487 Help: "Cumulative number of errors when starting containers", 488 StabilityLevel: metrics.ALPHA, 489 }, 490 []string{"container_type", "code"}, 491 ) 492 // ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet. 493 ManagedEphemeralContainers = metrics.NewGauge( 494 &metrics.GaugeOpts{ 495 Subsystem: KubeletSubsystem, 496 Name: ManagedEphemeralContainersKey, 497 Help: "Current number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.", 498 StabilityLevel: metrics.ALPHA, 499 }, 500 ) 501) 502 503var registerMetrics sync.Once 504 505// Register registers all metrics. 506func Register(collectors ...metrics.StableCollector) { 507 // Register the metrics. 508 registerMetrics.Do(func() { 509 legacyregistry.MustRegister(NodeName) 510 legacyregistry.MustRegister(PodWorkerDuration) 511 legacyregistry.MustRegister(PodStartDuration) 512 legacyregistry.MustRegister(CgroupManagerDuration) 513 legacyregistry.MustRegister(PodWorkerStartDuration) 514 legacyregistry.MustRegister(ContainersPerPodCount) 515 legacyregistry.MustRegister(PLEGRelistDuration) 516 legacyregistry.MustRegister(PLEGDiscardEvents) 517 legacyregistry.MustRegister(PLEGRelistInterval) 518 legacyregistry.MustRegister(PLEGLastSeen) 519 legacyregistry.MustRegister(RuntimeOperations) 520 legacyregistry.MustRegister(RuntimeOperationsDuration) 521 legacyregistry.MustRegister(RuntimeOperationsErrors) 522 legacyregistry.MustRegister(Evictions) 523 legacyregistry.MustRegister(EvictionStatsAge) 524 legacyregistry.MustRegister(Preemptions) 525 legacyregistry.MustRegister(DevicePluginRegistrationCount) 526 legacyregistry.MustRegister(DevicePluginAllocationDuration) 527 legacyregistry.MustRegister(RunningContainerCount) 528 legacyregistry.MustRegister(RunningPodCount) 529 legacyregistry.MustRegister(ManagedEphemeralContainers) 530 legacyregistry.MustRegister(StartedPodsTotal) 531 legacyregistry.MustRegister(StartedPodsErrorsTotal) 532 legacyregistry.MustRegister(StartedContainersTotal) 533 legacyregistry.MustRegister(StartedContainersErrorsTotal) 534 legacyregistry.MustRegister(RunPodSandboxDuration) 535 legacyregistry.MustRegister(RunPodSandboxErrors) 536 if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { 537 legacyregistry.MustRegister(AssignedConfig) 538 legacyregistry.MustRegister(ActiveConfig) 539 legacyregistry.MustRegister(LastKnownGoodConfig) 540 legacyregistry.MustRegister(ConfigError) 541 } 542 for _, collector := range collectors { 543 legacyregistry.CustomMustRegister(collector) 544 } 545 }) 546} 547 548// GetGather returns the gatherer. It used by test case outside current package. 549func GetGather() metrics.Gatherer { 550 return legacyregistry.DefaultGatherer 551} 552 553// SinceInSeconds gets the time since the specified start in seconds. 554func SinceInSeconds(start time.Time) float64 { 555 return time.Since(start).Seconds() 556} 557 558const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s" 559 560func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) { 561 if source == nil { 562 return map[string]string{ 563 // prometheus requires all of the labels that can be set on the metric 564 ConfigSourceLabelKey: "local", 565 ConfigUIDLabelKey: "", 566 ConfigResourceVersionLabelKey: "", 567 KubeletConfigKeyLabelKey: "", 568 }, nil 569 } 570 if source.ConfigMap != nil { 571 return map[string]string{ 572 ConfigSourceLabelKey: fmt.Sprintf(configMapAPIPathFmt, source.ConfigMap.Namespace, source.ConfigMap.Name), 573 ConfigUIDLabelKey: string(source.ConfigMap.UID), 574 ConfigResourceVersionLabelKey: source.ConfigMap.ResourceVersion, 575 KubeletConfigKeyLabelKey: source.ConfigMap.KubeletConfigKey, 576 }, nil 577 } 578 return nil, fmt.Errorf("unrecognized config source type, all source subfields were nil") 579} 580 581// track labels across metric updates, so we can delete old label sets and prevent leaks 582var assignedConfigLabels map[string]string 583 584// SetAssignedConfig tracks labels according to the assigned NodeConfig. It also tracks labels 585// across metric updates so old labels can be safely deleted. 586func SetAssignedConfig(source *corev1.NodeConfigSource) error { 587 // compute the timeseries labels from the source 588 labels, err := configLabels(source) 589 if err != nil { 590 return err 591 } 592 // clean up the old timeseries (WithLabelValues creates a new one for each distinct label set) 593 if !AssignedConfig.Delete(assignedConfigLabels) { 594 klog.InfoS("Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different assigned configs.", "labels", assignedConfigLabels) 595 } 596 // record the new timeseries 597 assignedConfigLabels = labels 598 // expose the new timeseries with a constant count of 1 599 AssignedConfig.With(assignedConfigLabels).Set(1) 600 return nil 601} 602 603// track labels across metric updates, so we can delete old label sets and prevent leaks 604var activeConfigLabels map[string]string 605 606// SetActiveConfig tracks labels according to the NodeConfig that is currently used by the Kubelet. 607// It also tracks labels across metric updates so old labels can be safely deleted. 608func SetActiveConfig(source *corev1.NodeConfigSource) error { 609 // compute the timeseries labels from the source 610 labels, err := configLabels(source) 611 if err != nil { 612 return err 613 } 614 // clean up the old timeseries (WithLabelValues creates a new one for each distinct label set) 615 if !ActiveConfig.Delete(activeConfigLabels) { 616 klog.InfoS("Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different active configs.", "labels", activeConfigLabels) 617 } 618 // record the new timeseries 619 activeConfigLabels = labels 620 // expose the new timeseries with a constant count of 1 621 ActiveConfig.With(activeConfigLabels).Set(1) 622 return nil 623} 624 625// track labels across metric updates, so we can delete old label sets and prevent leaks 626var lastKnownGoodConfigLabels map[string]string 627 628// SetLastKnownGoodConfig tracks labels according to the NodeConfig that was successfully applied last. 629// It also tracks labels across metric updates so old labels can be safely deleted. 630func SetLastKnownGoodConfig(source *corev1.NodeConfigSource) error { 631 // compute the timeseries labels from the source 632 labels, err := configLabels(source) 633 if err != nil { 634 return err 635 } 636 // clean up the old timeseries (WithLabelValues creates a new one for each distinct label set) 637 if !LastKnownGoodConfig.Delete(lastKnownGoodConfigLabels) { 638 klog.InfoS("Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different last known good configs.", "labels", lastKnownGoodConfigLabels) 639 } 640 // record the new timeseries 641 lastKnownGoodConfigLabels = labels 642 // expose the new timeseries with a constant count of 1 643 LastKnownGoodConfig.With(lastKnownGoodConfigLabels).Set(1) 644 return nil 645} 646 647// SetConfigError sets a the ConfigError metric to 1 in case any errors were encountered. 648func SetConfigError(err bool) { 649 if err { 650 ConfigError.Set(1) 651 } else { 652 ConfigError.Set(0) 653 } 654} 655 656// SetNodeName sets the NodeName Gauge to 1. 657func SetNodeName(name types.NodeName) { 658 NodeName.WithLabelValues(string(name)).Set(1) 659} 660