/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package certificate

import (
	"context"
	"crypto/ecdsa"
	"crypto/elliptic"
	cryptorand "crypto/rand"
	"crypto/tls"
	"crypto/x509"
	"encoding/pem"
	"fmt"
	"reflect"
	"sync"
	"time"

	"k8s.io/klog/v2"

	certificates "k8s.io/api/certificates/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/cert"
	"k8s.io/client-go/util/certificate/csr"
	"k8s.io/client-go/util/keyutil"
)

// certificateWaitTimeout controls the amount of time we wait for certificate
// approval in one iteration. It bounds the context passed to
// csr.WaitForCertificate in rotateCerts.
var certificateWaitTimeout = 15 * time.Minute

// Manager maintains and updates the certificates in use by this certificate
// manager. In the background it communicates with the API server to get new
// certificates for certificates about to expire.
type Manager interface {
	// Start begins the background work of rotating the certificates.
	Start()
	// Stop the cert manager loop.
	Stop()
	// Current returns the currently selected certificate from the
	// certificate manager. The result may be nil, for example when no
	// certificate is available yet or the current one has expired.
	Current() *tls.Certificate
	// ServerHealthy returns true if the manager is able to communicate with
	// the server. This allows a caller to determine whether the cert manager
	// thinks it can potentially talk to the API server. The cert manager may
	// be very conservative and only return true if recent communication has
	// occurred with the server.
	ServerHealthy() bool
}

// Config is the set of configuration parameters available for a new Manager.
type Config struct {
	// ClientsetFn will be used to create a clientset for
	// creating/fetching new certificate requests generated when a key rotation occurs.
	// The function will never be invoked in parallel.
	// It is passed the current client certificate if one exists.
	// If nil, Start logs that rotation is not enabled and does nothing.
	ClientsetFn ClientsetFunc
	// Template is the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	Template *x509.CertificateRequest
	// GetTemplate returns the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	// If no template is available, nil may be returned, and no certificate will be requested.
	// If specified, takes precedence over Template.
	GetTemplate func() *x509.CertificateRequest
	// SignerName is the name of the certificate signer that should sign certificates
	// generated by the manager.
	SignerName string
	// Usages is the types of usages that certificates generated by the manager
	// can be used for.
	Usages []certificates.KeyUsage
	// CertificateStore is a persistent store where the current cert/key is
	// kept and future cert/key pairs will be persisted after they are
	// generated.
	CertificateStore Store
	// BootstrapCertificatePEM is the certificate data that will be returned
	// from the Manager if the CertificateStore doesn't have any cert/key pairs
	// currently available and has not yet had a chance to get a new cert/key
	// pair from the API. If the CertificateStore does have a cert/key pair,
	// this will be ignored. If there is no cert/key pair available in the
	// CertificateStore, as soon as Start is called, it will request a new
	// cert/key pair from the CertificateSigningRequestClient. This is intended
	// to allow the first boot of a component to be initialized using a
	// generic, multi-use cert/key pair which will be quickly replaced with a
	// unique cert/key pair.
	BootstrapCertificatePEM []byte
	// BootstrapKeyPEM is the key data that will be returned from the Manager
	// if the CertificateStore doesn't have any cert/key pairs currently
	// available. If the CertificateStore does have a cert/key pair, this will
	// be ignored. If the bootstrap cert/key pair are used, they will be
	// rotated at the first opportunity, possibly well in advance of expiring.
	// This is intended to allow the first boot of a component to be
	// initialized using a generic, multi-use cert/key pair which will be
	// quickly replaced with a unique cert/key pair.
	BootstrapKeyPEM []byte `datapolicy:"security-key"`
	// CertificateRotation will record a metric showing the time in seconds
	// that certificates lived before being rotated. This metric is a histogram
	// because there is value in keeping a history of rotation cadences. It
	// allows one to set up monitoring and alerting of unexpected rotation
	// behavior and track trends in rotation frequency. May be nil, in which
	// case no observation is recorded.
	CertificateRotation Histogram
	// CertificateRenewFailure will record a metric that keeps track of
	// certificate renewal failures. May be nil, in which case failures are
	// not counted.
	CertificateRenewFailure Counter
}

// Store is responsible for getting and updating the current certificate.
// Depending on the concrete implementation, the backing store for this
// behavior may vary.
type Store interface {
	// Current returns the currently selected certificate, as well as the
	// associated certificate and key data in PEM format. If the Store doesn't
	// have a cert/key pair currently, it should return a NoCertKeyError so
	// that the Manager can recover by using bootstrap certificates to request
	// a new cert/key pair.
	Current() (*tls.Certificate, error)
	// Update accepts the PEM data for the cert/key pair and makes the new
	// cert/key pair the 'current' pair, that will be returned by future calls
	// to Current().
	Update(cert, key []byte) (*tls.Certificate, error)
}

// Gauge will record the remaining lifetime of the certificate each time it is
// updated.
type Gauge interface {
	Set(float64)
}

// Histogram will record the time a rotated certificate was used before being
// rotated.
type Histogram interface {
	Observe(float64)
}

// Counter will wrap a counter with labels
type Counter interface {
	Inc()
}

// NoCertKeyError indicates there is no cert/key currently available.
//
// The Error method is declared on the pointer type, and the manager detects
// this condition with a type assertion to *NoCertKeyError, so a Store must
// return a *NoCertKeyError (not a NoCertKeyError value) to be recognized.
type NoCertKeyError string

// ClientsetFunc returns a new clientset for discovering CSR API availability and requesting CSRs.
// It is passed the current certificate if one is available and valid.
type ClientsetFunc func(current *tls.Certificate) (clientset.Interface, error)

// Error implements the error interface by returning the underlying string.
// It has a pointer receiver, so only *NoCertKeyError satisfies error.
func (e *NoCertKeyError) Error() string { return string(*e) }

// manager is the Manager implementation returned by NewManager.
type manager struct {
	// getTemplate returns the CSR template to use for the next request; it
	// may return nil when no template is available.
	getTemplate func() *x509.CertificateRequest

	// lastRequestLock guards lastRequestCancel and lastRequest
	lastRequestLock   sync.Mutex
	lastRequestCancel context.CancelFunc
	lastRequest       *x509.CertificateRequest

	// dynamicTemplate is true when the template comes from Config.GetTemplate;
	// forceRotation makes the next rotation deadline "now" and is cleared
	// once consumed by nextRotationDeadline.
	dynamicTemplate bool
	signerName      string
	usages          []certificates.KeyUsage
	forceRotation   bool

	certStore Store

	// Optional metrics; each is nil-checked before use.
	certificateRotation     Histogram
	certificateRenewFailure Counter

	// the following variables must only be accessed under certAccessLock
	certAccessLock sync.RWMutex
	cert           *tls.Certificate
	serverHealth   bool

	// the clientFn must only be accessed under the clientAccessLock.
	// Stop also holds clientAccessLock while closing stopCh and setting stopped.
	clientAccessLock sync.Mutex
	clientsetFn      ClientsetFunc
	stopCh           chan struct{}
	stopped          bool

	// Set to time.Now but can be stubbed out for testing
	now func() time.Time
}

// NewManager returns a new certificate manager. A certificate manager is
// responsible for being the authoritative source of certificates in the
// Kubelet and handling updates due to rotation.
207func NewManager(config *Config) (Manager, error) { 208 cert, forceRotation, err := getCurrentCertificateOrBootstrap( 209 config.CertificateStore, 210 config.BootstrapCertificatePEM, 211 config.BootstrapKeyPEM) 212 if err != nil { 213 return nil, err 214 } 215 216 getTemplate := config.GetTemplate 217 if getTemplate == nil { 218 getTemplate = func() *x509.CertificateRequest { return config.Template } 219 } 220 221 m := manager{ 222 stopCh: make(chan struct{}), 223 clientsetFn: config.ClientsetFn, 224 getTemplate: getTemplate, 225 dynamicTemplate: config.GetTemplate != nil, 226 signerName: config.SignerName, 227 usages: config.Usages, 228 certStore: config.CertificateStore, 229 cert: cert, 230 forceRotation: forceRotation, 231 certificateRotation: config.CertificateRotation, 232 certificateRenewFailure: config.CertificateRenewFailure, 233 now: time.Now, 234 } 235 236 return &m, nil 237} 238 239// Current returns the currently selected certificate from the certificate 240// manager. This can be nil if the manager was initialized without a 241// certificate and has not yet received one from the 242// CertificateSigningRequestClient, or if the current cert has expired. 243func (m *manager) Current() *tls.Certificate { 244 m.certAccessLock.RLock() 245 defer m.certAccessLock.RUnlock() 246 if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) { 247 klog.V(2).Infof("Current certificate is expired.") 248 return nil 249 } 250 return m.cert 251} 252 253// ServerHealthy returns true if the cert manager believes the server 254// is currently alive. 255func (m *manager) ServerHealthy() bool { 256 m.certAccessLock.RLock() 257 defer m.certAccessLock.RUnlock() 258 return m.serverHealth 259} 260 261// Stop terminates the manager. 
262func (m *manager) Stop() { 263 m.clientAccessLock.Lock() 264 defer m.clientAccessLock.Unlock() 265 if m.stopped { 266 return 267 } 268 close(m.stopCh) 269 m.stopped = true 270} 271 272// Start will start the background work of rotating the certificates. 273func (m *manager) Start() { 274 // Certificate rotation depends on access to the API server certificate 275 // signing API, so don't start the certificate manager if we don't have a 276 // client. 277 if m.clientsetFn == nil { 278 klog.V(2).Infof("Certificate rotation is not enabled, no connection to the apiserver.") 279 return 280 } 281 282 klog.V(2).Infof("Certificate rotation is enabled.") 283 284 templateChanged := make(chan struct{}) 285 go wait.Until(func() { 286 deadline := m.nextRotationDeadline() 287 if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 { 288 klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval) 289 290 timer := time.NewTimer(sleepInterval) 291 defer timer.Stop() 292 293 select { 294 case <-timer.C: 295 // unblock when deadline expires 296 case <-templateChanged: 297 _, lastRequestTemplate := m.getLastRequest() 298 if reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) { 299 // if the template now matches what we last requested, restart the rotation deadline loop 300 return 301 } 302 klog.V(2).Infof("Certificate template changed, rotating") 303 } 304 } 305 306 // Don't enter rotateCerts and trigger backoff if we don't even have a template to request yet 307 if m.getTemplate() == nil { 308 return 309 } 310 311 backoff := wait.Backoff{ 312 Duration: 2 * time.Second, 313 Factor: 2, 314 Jitter: 0.1, 315 Steps: 5, 316 } 317 if err := wait.ExponentialBackoff(backoff, m.rotateCerts); err != nil { 318 utilruntime.HandleError(fmt.Errorf("Reached backoff limit, still unable to rotate certs: %v", err)) 319 wait.PollInfinite(32*time.Second, m.rotateCerts) 320 } 321 }, time.Second, m.stopCh) 322 323 if m.dynamicTemplate { 324 go wait.Until(func() { 325 // check if 
the current template matches what we last requested 326 lastRequestCancel, lastRequestTemplate := m.getLastRequest() 327 328 if !m.certSatisfiesTemplate() && !reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) { 329 // if the template is different, queue up an interrupt of the rotation deadline loop. 330 // if we've requested a CSR that matches the new template by the time the interrupt is handled, the interrupt is disregarded. 331 if lastRequestCancel != nil { 332 // if we're currently waiting on a submitted request that no longer matches what we want, stop waiting 333 lastRequestCancel() 334 } 335 select { 336 case templateChanged <- struct{}{}: 337 case <-m.stopCh: 338 } 339 } 340 }, time.Second, m.stopCh) 341 } 342} 343 344func getCurrentCertificateOrBootstrap( 345 store Store, 346 bootstrapCertificatePEM []byte, 347 bootstrapKeyPEM []byte) (cert *tls.Certificate, shouldRotate bool, errResult error) { 348 349 currentCert, err := store.Current() 350 if err == nil { 351 // if the current cert is expired, fall back to the bootstrap cert 352 if currentCert.Leaf != nil && time.Now().Before(currentCert.Leaf.NotAfter) { 353 return currentCert, false, nil 354 } 355 } else { 356 if _, ok := err.(*NoCertKeyError); !ok { 357 return nil, false, err 358 } 359 } 360 361 if bootstrapCertificatePEM == nil || bootstrapKeyPEM == nil { 362 return nil, true, nil 363 } 364 365 bootstrapCert, err := tls.X509KeyPair(bootstrapCertificatePEM, bootstrapKeyPEM) 366 if err != nil { 367 return nil, false, err 368 } 369 if len(bootstrapCert.Certificate) < 1 { 370 return nil, false, fmt.Errorf("no cert/key data found") 371 } 372 373 certs, err := x509.ParseCertificates(bootstrapCert.Certificate[0]) 374 if err != nil { 375 return nil, false, fmt.Errorf("unable to parse certificate data: %v", err) 376 } 377 if len(certs) < 1 { 378 return nil, false, fmt.Errorf("no cert data found") 379 } 380 bootstrapCert.Leaf = certs[0] 381 382 if _, err := store.Update(bootstrapCertificatePEM, 
bootstrapKeyPEM); err != nil { 383 utilruntime.HandleError(fmt.Errorf("Unable to set the cert/key pair to the bootstrap certificate: %v", err)) 384 } else { 385 klog.V(4).Infof("Updated the store to contain the initial bootstrap certificate") 386 } 387 388 return &bootstrapCert, true, nil 389} 390 391func (m *manager) getClientset() (clientset.Interface, error) { 392 current := m.Current() 393 m.clientAccessLock.Lock() 394 defer m.clientAccessLock.Unlock() 395 return m.clientsetFn(current) 396} 397 398// RotateCerts is exposed for testing only and is not a part of the public interface. 399// Returns true if it changed the cert, false otherwise. Error is only returned in 400// exceptional cases. 401func (m *manager) RotateCerts() (bool, error) { 402 return m.rotateCerts() 403} 404 405// rotateCerts attempts to request a client cert from the server, wait a reasonable 406// period of time for it to be signed, and then update the cert on disk. If it cannot 407// retrieve a cert, it will return false. It will only return error in exceptional cases. 408// This method also keeps track of "server health" by interpreting the responses it gets 409// from the server on the various calls it makes. 
410// TODO: return errors, have callers handle and log them correctly 411func (m *manager) rotateCerts() (bool, error) { 412 klog.V(2).Infof("Rotating certificates") 413 414 template, csrPEM, keyPEM, privateKey, err := m.generateCSR() 415 if err != nil { 416 utilruntime.HandleError(fmt.Errorf("Unable to generate a certificate signing request: %v", err)) 417 if m.certificateRenewFailure != nil { 418 m.certificateRenewFailure.Inc() 419 } 420 return false, nil 421 } 422 423 // request the client each time 424 clientSet, err := m.getClientset() 425 if err != nil { 426 utilruntime.HandleError(fmt.Errorf("Unable to load a client to request certificates: %v", err)) 427 if m.certificateRenewFailure != nil { 428 m.certificateRenewFailure.Inc() 429 } 430 return false, nil 431 } 432 433 // Call the Certificate Signing Request API to get a certificate for the 434 // new private key. 435 reqName, reqUID, err := csr.RequestCertificate(clientSet, csrPEM, "", m.signerName, m.usages, privateKey) 436 if err != nil { 437 utilruntime.HandleError(fmt.Errorf("Failed while requesting a signed certificate from the master: %v", err)) 438 if m.certificateRenewFailure != nil { 439 m.certificateRenewFailure.Inc() 440 } 441 return false, m.updateServerError(err) 442 } 443 444 ctx, cancel := context.WithTimeout(context.Background(), certificateWaitTimeout) 445 defer cancel() 446 447 // Once we've successfully submitted a CSR for this template, record that we did so 448 m.setLastRequest(cancel, template) 449 450 // Wait for the certificate to be signed. This interface and internal timout 451 // is a remainder after the old design using raw watch wrapped with backoff. 
452 crtPEM, err := csr.WaitForCertificate(ctx, clientSet, reqName, reqUID) 453 if err != nil { 454 utilruntime.HandleError(fmt.Errorf("certificate request was not signed: %v", err)) 455 if m.certificateRenewFailure != nil { 456 m.certificateRenewFailure.Inc() 457 } 458 return false, nil 459 } 460 461 cert, err := m.certStore.Update(crtPEM, keyPEM) 462 if err != nil { 463 utilruntime.HandleError(fmt.Errorf("Unable to store the new cert/key pair: %v", err)) 464 if m.certificateRenewFailure != nil { 465 m.certificateRenewFailure.Inc() 466 } 467 return false, nil 468 } 469 470 if old := m.updateCached(cert); old != nil && m.certificateRotation != nil { 471 m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds()) 472 } 473 474 return true, nil 475} 476 477// Check that the current certificate on disk satisfies the requests from the 478// current template. 479// 480// Note that extra items in the certificate's SAN or orgs that don't exist in 481// the template will not trigger a renewal. 482// 483// Requires certAccessLock to be locked. 484func (m *manager) certSatisfiesTemplateLocked() bool { 485 if m.cert == nil { 486 return false 487 } 488 489 if template := m.getTemplate(); template != nil { 490 if template.Subject.CommonName != m.cert.Leaf.Subject.CommonName { 491 klog.V(2).Infof("Current certificate CN (%s) does not match requested CN (%s)", m.cert.Leaf.Subject.CommonName, template.Subject.CommonName) 492 return false 493 } 494 495 currentDNSNames := sets.NewString(m.cert.Leaf.DNSNames...) 496 desiredDNSNames := sets.NewString(template.DNSNames...) 
497 missingDNSNames := desiredDNSNames.Difference(currentDNSNames) 498 if len(missingDNSNames) > 0 { 499 klog.V(2).Infof("Current certificate is missing requested DNS names %v", missingDNSNames.List()) 500 return false 501 } 502 503 currentIPs := sets.NewString() 504 for _, ip := range m.cert.Leaf.IPAddresses { 505 currentIPs.Insert(ip.String()) 506 } 507 desiredIPs := sets.NewString() 508 for _, ip := range template.IPAddresses { 509 desiredIPs.Insert(ip.String()) 510 } 511 missingIPs := desiredIPs.Difference(currentIPs) 512 if len(missingIPs) > 0 { 513 klog.V(2).Infof("Current certificate is missing requested IP addresses %v", missingIPs.List()) 514 return false 515 } 516 517 currentOrgs := sets.NewString(m.cert.Leaf.Subject.Organization...) 518 desiredOrgs := sets.NewString(template.Subject.Organization...) 519 missingOrgs := desiredOrgs.Difference(currentOrgs) 520 if len(missingOrgs) > 0 { 521 klog.V(2).Infof("Current certificate is missing requested orgs %v", missingOrgs.List()) 522 return false 523 } 524 } 525 526 return true 527} 528 529func (m *manager) certSatisfiesTemplate() bool { 530 m.certAccessLock.RLock() 531 defer m.certAccessLock.RUnlock() 532 return m.certSatisfiesTemplateLocked() 533} 534 535// nextRotationDeadline returns a value for the threshold at which the 536// current certificate should be rotated, 80%+/-10% of the expiration of the 537// certificate. 
538func (m *manager) nextRotationDeadline() time.Time { 539 // forceRotation is not protected by locks 540 if m.forceRotation { 541 m.forceRotation = false 542 return m.now() 543 } 544 545 m.certAccessLock.RLock() 546 defer m.certAccessLock.RUnlock() 547 548 if !m.certSatisfiesTemplateLocked() { 549 return m.now() 550 } 551 552 notAfter := m.cert.Leaf.NotAfter 553 totalDuration := float64(notAfter.Sub(m.cert.Leaf.NotBefore)) 554 deadline := m.cert.Leaf.NotBefore.Add(jitteryDuration(totalDuration)) 555 556 klog.V(2).Infof("Certificate expiration is %v, rotation deadline is %v", notAfter, deadline) 557 return deadline 558} 559 560// jitteryDuration uses some jitter to set the rotation threshold so each node 561// will rotate at approximately 70-90% of the total lifetime of the 562// certificate. With jitter, if a number of nodes are added to a cluster at 563// approximately the same time (such as cluster creation time), they won't all 564// try to rotate certificates at the same time for the rest of the life of the 565// cluster. 566// 567// This function is represented as a variable to allow replacement during testing. 568var jitteryDuration = func(totalDuration float64) time.Duration { 569 return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3) 570} 571 572// updateCached sets the most recent retrieved cert and returns the old cert. 573// It also sets the server as assumed healthy. 574func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate { 575 m.certAccessLock.Lock() 576 defer m.certAccessLock.Unlock() 577 m.serverHealth = true 578 old := m.cert 579 m.cert = cert 580 return old 581} 582 583// updateServerError takes an error returned by the server and infers 584// the health of the server based on the error. It will return nil if 585// the error does not require immediate termination of any wait loops, 586// and otherwise it will return the error. 
587func (m *manager) updateServerError(err error) error { 588 m.certAccessLock.Lock() 589 defer m.certAccessLock.Unlock() 590 switch { 591 case errors.IsUnauthorized(err): 592 // SSL terminating proxies may report this error instead of the master 593 m.serverHealth = true 594 case errors.IsUnexpectedServerError(err): 595 // generally indicates a proxy or other load balancer problem, rather than a problem coming 596 // from the master 597 m.serverHealth = false 598 default: 599 // Identify known errors that could be expected for a cert request that 600 // indicate everything is working normally 601 m.serverHealth = errors.IsNotFound(err) || errors.IsForbidden(err) 602 } 603 return nil 604} 605 606func (m *manager) generateCSR() (template *x509.CertificateRequest, csrPEM []byte, keyPEM []byte, key interface{}, err error) { 607 // Generate a new private key. 608 privateKey, err := ecdsa.GenerateKey(elliptic.P256(), cryptorand.Reader) 609 if err != nil { 610 return nil, nil, nil, nil, fmt.Errorf("unable to generate a new private key: %v", err) 611 } 612 der, err := x509.MarshalECPrivateKey(privateKey) 613 if err != nil { 614 return nil, nil, nil, nil, fmt.Errorf("unable to marshal the new key to DER: %v", err) 615 } 616 617 keyPEM = pem.EncodeToMemory(&pem.Block{Type: keyutil.ECPrivateKeyBlockType, Bytes: der}) 618 619 template = m.getTemplate() 620 if template == nil { 621 return nil, nil, nil, nil, fmt.Errorf("unable to create a csr, no template available") 622 } 623 csrPEM, err = cert.MakeCSRFromTemplate(privateKey, template) 624 if err != nil { 625 return nil, nil, nil, nil, fmt.Errorf("unable to create a csr from the private key: %v", err) 626 } 627 return template, csrPEM, keyPEM, privateKey, nil 628} 629 630func (m *manager) getLastRequest() (context.CancelFunc, *x509.CertificateRequest) { 631 m.lastRequestLock.Lock() 632 defer m.lastRequestLock.Unlock() 633 return m.lastRequestCancel, m.lastRequest 634} 635 636func (m *manager) setLastRequest(cancel 
context.CancelFunc, r *x509.CertificateRequest) { 637 m.lastRequestLock.Lock() 638 defer m.lastRequestLock.Unlock() 639 m.lastRequestCancel = cancel 640 m.lastRequest = r 641} 642