1/* 2Copyright 2017 The Kubernetes Authors. 3 4Licensed under the Apache License, Version 2.0 (the "License"); 5you may not use this file except in compliance with the License. 6You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10Unless required by applicable law or agreed to in writing, software 11distributed under the License is distributed on an "AS IS" BASIS, 12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13See the License for the specific language governing permissions and 14limitations under the License. 15*/ 16 17package certificate 18 19import ( 20 "context" 21 "crypto/ecdsa" 22 "crypto/elliptic" 23 cryptorand "crypto/rand" 24 "crypto/tls" 25 "crypto/x509" 26 "encoding/pem" 27 "fmt" 28 "reflect" 29 "sync" 30 "time" 31 32 "k8s.io/klog" 33 34 certificates "k8s.io/api/certificates/v1beta1" 35 "k8s.io/apimachinery/pkg/api/errors" 36 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 37 "k8s.io/apimachinery/pkg/util/sets" 38 "k8s.io/apimachinery/pkg/util/wait" 39 certificatesclient "k8s.io/client-go/kubernetes/typed/certificates/v1beta1" 40 "k8s.io/client-go/util/cert" 41 "k8s.io/client-go/util/certificate/csr" 42 "k8s.io/client-go/util/keyutil" 43) 44 45// certificateWaitTimeout controls the amount of time we wait for certificate 46// approval in one iteration. 47var certificateWaitTimeout = 15 * time.Minute 48 49// Manager maintains and updates the certificates in use by this certificate 50// manager. In the background it communicates with the API server to get new 51// certificates for certificates about to expire. 52type Manager interface { 53 // Start the API server status sync loop. 54 Start() 55 // Stop the cert manager loop. 56 Stop() 57 // Current returns the currently selected certificate from the 58 // certificate manager, as well as the associated certificate and key data 59 // in PEM format. 60 Current() *tls.Certificate 61 // ServerHealthy returns true if the manager is able to communicate with 62 // the server. This allows a caller to determine whether the cert manager 63 // thinks it can potentially talk to the API server. The cert manager may 64 // be very conservative and only return true if recent communication has 65 // occurred with the server. 66 ServerHealthy() bool 67} 68 69// Config is the set of configuration parameters available for a new Manager. 70type Config struct { 71 // ClientFn will be used to create a client for 72 // signing new certificate requests generated when a key rotation occurs. 73 // It must be set at initialization. The function will never be invoked 74 // in parallel. It is passed the current client certificate if one exists. 75 ClientFn CSRClientFunc 76 // Template is the CertificateRequest that will be used as a template for 77 // generating certificate signing requests for all new keys generated as 78 // part of rotation. It follows the same rules as the template parameter of 79 // crypto.x509.CreateCertificateRequest in the Go standard libraries. 80 Template *x509.CertificateRequest 81 // GetTemplate returns the CertificateRequest that will be used as a template for 82 // generating certificate signing requests for all new keys generated as 83 // part of rotation. It follows the same rules as the template parameter of 84 // crypto.x509.CreateCertificateRequest in the Go standard libraries. 85 // If no template is available, nil may be returned, and no certificate will be requested. 86 // If specified, takes precedence over Template. 87 GetTemplate func() *x509.CertificateRequest 88 // SignerName is the name of the certificate signer that should sign certificates 89 // generated by the manager. 90 SignerName string 91 // Usages is the types of usages that certificates generated by the manager 92 // can be used for. 93 Usages []certificates.KeyUsage 94 // CertificateStore is a persistent store where the current cert/key is 95 // kept and future cert/key pairs will be persisted after they are 96 // generated. 97 CertificateStore Store 98 // BootstrapCertificatePEM is the certificate data that will be returned 99 // from the Manager if the CertificateStore doesn't have any cert/key pairs 100 // currently available and has not yet had a chance to get a new cert/key 101 // pair from the API. If the CertificateStore does have a cert/key pair, 102 // this will be ignored. If there is no cert/key pair available in the 103 // CertificateStore, as soon as Start is called, it will request a new 104 // cert/key pair from the CertificateSigningRequestClient. This is intended 105 // to allow the first boot of a component to be initialized using a 106 // generic, multi-use cert/key pair which will be quickly replaced with a 107 // unique cert/key pair. 108 BootstrapCertificatePEM []byte 109 // BootstrapKeyPEM is the key data that will be returned from the Manager 110 // if the CertificateStore doesn't have any cert/key pairs currently 111 // available. If the CertificateStore does have a cert/key pair, this will 112 // be ignored. If the bootstrap cert/key pair are used, they will be 113 // rotated at the first opportunity, possibly well in advance of expiring. 114 // This is intended to allow the first boot of a component to be 115 // initialized using a generic, multi-use cert/key pair which will be 116 // quickly replaced with a unique cert/key pair. 117 BootstrapKeyPEM []byte 118 // CertificateRotation will record a metric showing the time in seconds 119 // that certificates lived before being rotated. This metric is a histogram 120 // because there is value in keeping a history of rotation cadences. It 121 // allows one to setup monitoring and alerting of unexpected rotation 122 // behavior and track trends in rotation frequency. 123 CertificateRotation Histogram 124 // CertifcateRenewFailure will record a metric that keeps track of 125 // certificate renewal failures. 126 CertificateRenewFailure Counter 127} 128 129// Store is responsible for getting and updating the current certificate. 130// Depending on the concrete implementation, the backing store for this 131// behavior may vary. 132type Store interface { 133 // Current returns the currently selected certificate, as well as the 134 // associated certificate and key data in PEM format. If the Store doesn't 135 // have a cert/key pair currently, it should return a NoCertKeyError so 136 // that the Manager can recover by using bootstrap certificates to request 137 // a new cert/key pair. 138 Current() (*tls.Certificate, error) 139 // Update accepts the PEM data for the cert/key pair and makes the new 140 // cert/key pair the 'current' pair, that will be returned by future calls 141 // to Current(). 142 Update(cert, key []byte) (*tls.Certificate, error) 143} 144 145// Gauge will record the remaining lifetime of the certificate each time it is 146// updated. 147type Gauge interface { 148 Set(float64) 149} 150 151// Histogram will record the time a rotated certificate was used before being 152// rotated. 153type Histogram interface { 154 Observe(float64) 155} 156 157// Counter will wrap a counter with labels 158type Counter interface { 159 Inc() 160} 161 162// NoCertKeyError indicates there is no cert/key currently available. 163type NoCertKeyError string 164 165// CSRClientFunc returns a new client for requesting CSRs. It passes the 166// current certificate if one is available and valid. 167type CSRClientFunc func(current *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) 168 169func (e *NoCertKeyError) Error() string { return string(*e) } 170 171type manager struct { 172 getTemplate func() *x509.CertificateRequest 173 174 // lastRequestLock guards lastRequestCancel and lastRequest 175 lastRequestLock sync.Mutex 176 lastRequestCancel context.CancelFunc 177 lastRequest *x509.CertificateRequest 178 179 dynamicTemplate bool 180 signerName string 181 usages []certificates.KeyUsage 182 forceRotation bool 183 184 certStore Store 185 186 certificateRotation Histogram 187 certificateRenewFailure Counter 188 189 // the following variables must only be accessed under certAccessLock 190 certAccessLock sync.RWMutex 191 cert *tls.Certificate 192 serverHealth bool 193 194 // the clientFn must only be accessed under the clientAccessLock 195 clientAccessLock sync.Mutex 196 clientFn CSRClientFunc 197 stopCh chan struct{} 198 stopped bool 199 200 // Set to time.Now but can be stubbed out for testing 201 now func() time.Time 202} 203 204// NewManager returns a new certificate manager. A certificate manager is 205// responsible for being the authoritative source of certificates in the 206// Kubelet and handling updates due to rotation. 207func NewManager(config *Config) (Manager, error) { 208 cert, forceRotation, err := getCurrentCertificateOrBootstrap( 209 config.CertificateStore, 210 config.BootstrapCertificatePEM, 211 config.BootstrapKeyPEM) 212 if err != nil { 213 return nil, err 214 } 215 216 getTemplate := config.GetTemplate 217 if getTemplate == nil { 218 getTemplate = func() *x509.CertificateRequest { return config.Template } 219 } 220 221 m := manager{ 222 stopCh: make(chan struct{}), 223 clientFn: config.ClientFn, 224 getTemplate: getTemplate, 225 dynamicTemplate: config.GetTemplate != nil, 226 signerName: config.SignerName, 227 usages: config.Usages, 228 certStore: config.CertificateStore, 229 cert: cert, 230 forceRotation: forceRotation, 231 certificateRotation: config.CertificateRotation, 232 certificateRenewFailure: config.CertificateRenewFailure, 233 now: time.Now, 234 } 235 236 return &m, nil 237} 238 239// Current returns the currently selected certificate from the certificate 240// manager. This can be nil if the manager was initialized without a 241// certificate and has not yet received one from the 242// CertificateSigningRequestClient, or if the current cert has expired. 243func (m *manager) Current() *tls.Certificate { 244 m.certAccessLock.RLock() 245 defer m.certAccessLock.RUnlock() 246 if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) { 247 klog.V(2).Infof("Current certificate is expired.") 248 return nil 249 } 250 return m.cert 251} 252 253// ServerHealthy returns true if the cert manager believes the server 254// is currently alive. 255func (m *manager) ServerHealthy() bool { 256 m.certAccessLock.RLock() 257 defer m.certAccessLock.RUnlock() 258 return m.serverHealth 259} 260 261// Stop terminates the manager. 262func (m *manager) Stop() { 263 m.clientAccessLock.Lock() 264 defer m.clientAccessLock.Unlock() 265 if m.stopped { 266 return 267 } 268 close(m.stopCh) 269 m.stopped = true 270} 271 272// Start will start the background work of rotating the certificates. 273func (m *manager) Start() { 274 // Certificate rotation depends on access to the API server certificate 275 // signing API, so don't start the certificate manager if we don't have a 276 // client. 277 if m.clientFn == nil { 278 klog.V(2).Infof("Certificate rotation is not enabled, no connection to the apiserver.") 279 return 280 } 281 282 klog.V(2).Infof("Certificate rotation is enabled.") 283 284 templateChanged := make(chan struct{}) 285 go wait.Until(func() { 286 deadline := m.nextRotationDeadline() 287 if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 { 288 klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval) 289 290 timer := time.NewTimer(sleepInterval) 291 defer timer.Stop() 292 293 select { 294 case <-timer.C: 295 // unblock when deadline expires 296 case <-templateChanged: 297 _, lastRequestTemplate := m.getLastRequest() 298 if reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) { 299 // if the template now matches what we last requested, restart the rotation deadline loop 300 return 301 } 302 klog.V(2).Infof("Certificate template changed, rotating") 303 } 304 } 305 306 // Don't enter rotateCerts and trigger backoff if we don't even have a template to request yet 307 if m.getTemplate() == nil { 308 return 309 } 310 311 backoff := wait.Backoff{ 312 Duration: 2 * time.Second, 313 Factor: 2, 314 Jitter: 0.1, 315 Steps: 5, 316 } 317 if err := wait.ExponentialBackoff(backoff, m.rotateCerts); err != nil { 318 utilruntime.HandleError(fmt.Errorf("Reached backoff limit, still unable to rotate certs: %v", err)) 319 wait.PollInfinite(32*time.Second, m.rotateCerts) 320 } 321 }, time.Second, m.stopCh) 322 323 if m.dynamicTemplate { 324 go wait.Until(func() { 325 // check if the current template matches what we last requested 326 lastRequestCancel, lastRequestTemplate := m.getLastRequest() 327 328 if !m.certSatisfiesTemplate() && !reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) { 329 // if the template is different, queue up an interrupt of the rotation deadline loop. 330 // if we've requested a CSR that matches the new template by the time the interrupt is handled, the interrupt is disregarded. 331 if lastRequestCancel != nil { 332 // if we're currently waiting on a submitted request that no longer matches what we want, stop waiting 333 lastRequestCancel() 334 } 335 select { 336 case templateChanged <- struct{}{}: 337 case <-m.stopCh: 338 } 339 } 340 }, time.Second, m.stopCh) 341 } 342} 343 344func getCurrentCertificateOrBootstrap( 345 store Store, 346 bootstrapCertificatePEM []byte, 347 bootstrapKeyPEM []byte) (cert *tls.Certificate, shouldRotate bool, errResult error) { 348 349 currentCert, err := store.Current() 350 if err == nil { 351 // if the current cert is expired, fall back to the bootstrap cert 352 if currentCert.Leaf != nil && time.Now().Before(currentCert.Leaf.NotAfter) { 353 return currentCert, false, nil 354 } 355 } else { 356 if _, ok := err.(*NoCertKeyError); !ok { 357 return nil, false, err 358 } 359 } 360 361 if bootstrapCertificatePEM == nil || bootstrapKeyPEM == nil { 362 return nil, true, nil 363 } 364 365 bootstrapCert, err := tls.X509KeyPair(bootstrapCertificatePEM, bootstrapKeyPEM) 366 if err != nil { 367 return nil, false, err 368 } 369 if len(bootstrapCert.Certificate) < 1 { 370 return nil, false, fmt.Errorf("no cert/key data found") 371 } 372 373 certs, err := x509.ParseCertificates(bootstrapCert.Certificate[0]) 374 if err != nil { 375 return nil, false, fmt.Errorf("unable to parse certificate data: %v", err) 376 } 377 bootstrapCert.Leaf = certs[0] 378 379 if _, err := store.Update(bootstrapCertificatePEM, bootstrapKeyPEM); err != nil { 380 utilruntime.HandleError(fmt.Errorf("Unable to set the cert/key pair to the bootstrap certificate: %v", err)) 381 } else { 382 klog.V(4).Infof("Updated the store to contain the initial bootstrap certificate") 383 } 384 385 return &bootstrapCert, true, nil 386} 387 388func (m *manager) getClient() (certificatesclient.CertificateSigningRequestInterface, error) { 389 current := m.Current() 390 m.clientAccessLock.Lock() 391 defer m.clientAccessLock.Unlock() 392 return m.clientFn(current) 393} 394 395// RotateCerts is exposed for testing only and is not a part of the public interface. 396// Returns true if it changed the cert, false otherwise. Error is only returned in 397// exceptional cases. 398func (m *manager) RotateCerts() (bool, error) { 399 return m.rotateCerts() 400} 401 402// rotateCerts attempts to request a client cert from the server, wait a reasonable 403// period of time for it to be signed, and then update the cert on disk. If it cannot 404// retrieve a cert, it will return false. It will only return error in exceptional cases. 405// This method also keeps track of "server health" by interpreting the responses it gets 406// from the server on the various calls it makes. 407// TODO: return errors, have callers handle and log them correctly 408func (m *manager) rotateCerts() (bool, error) { 409 klog.V(2).Infof("Rotating certificates") 410 411 template, csrPEM, keyPEM, privateKey, err := m.generateCSR() 412 if err != nil { 413 utilruntime.HandleError(fmt.Errorf("Unable to generate a certificate signing request: %v", err)) 414 if m.certificateRenewFailure != nil { 415 m.certificateRenewFailure.Inc() 416 } 417 return false, nil 418 } 419 420 // request the client each time 421 client, err := m.getClient() 422 if err != nil { 423 utilruntime.HandleError(fmt.Errorf("Unable to load a client to request certificates: %v", err)) 424 if m.certificateRenewFailure != nil { 425 m.certificateRenewFailure.Inc() 426 } 427 return false, nil 428 } 429 430 // Call the Certificate Signing Request API to get a certificate for the 431 // new private key. 432 req, err := csr.RequestCertificate(client, csrPEM, "", m.signerName, m.usages, privateKey) 433 if err != nil { 434 utilruntime.HandleError(fmt.Errorf("Failed while requesting a signed certificate from the master: %v", err)) 435 if m.certificateRenewFailure != nil { 436 m.certificateRenewFailure.Inc() 437 } 438 return false, m.updateServerError(err) 439 } 440 441 ctx, cancel := context.WithTimeout(context.Background(), certificateWaitTimeout) 442 defer cancel() 443 444 // Once we've successfully submitted a CSR for this template, record that we did so 445 m.setLastRequest(cancel, template) 446 447 // Wait for the certificate to be signed. This interface and internal timout 448 // is a remainder after the old design using raw watch wrapped with backoff. 449 crtPEM, err := csr.WaitForCertificate(ctx, client, req) 450 if err != nil { 451 utilruntime.HandleError(fmt.Errorf("certificate request was not signed: %v", err)) 452 if m.certificateRenewFailure != nil { 453 m.certificateRenewFailure.Inc() 454 } 455 return false, nil 456 } 457 458 cert, err := m.certStore.Update(crtPEM, keyPEM) 459 if err != nil { 460 utilruntime.HandleError(fmt.Errorf("Unable to store the new cert/key pair: %v", err)) 461 if m.certificateRenewFailure != nil { 462 m.certificateRenewFailure.Inc() 463 } 464 return false, nil 465 } 466 467 if old := m.updateCached(cert); old != nil && m.certificateRotation != nil { 468 m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds()) 469 } 470 471 return true, nil 472} 473 474// Check that the current certificate on disk satisfies the requests from the 475// current template. 476// 477// Note that extra items in the certificate's SAN or orgs that don't exist in 478// the template will not trigger a renewal. 479// 480// Requires certAccessLock to be locked. 481func (m *manager) certSatisfiesTemplateLocked() bool { 482 if m.cert == nil { 483 return false 484 } 485 486 if template := m.getTemplate(); template != nil { 487 if template.Subject.CommonName != m.cert.Leaf.Subject.CommonName { 488 klog.V(2).Infof("Current certificate CN (%s) does not match requested CN (%s)", m.cert.Leaf.Subject.CommonName, template.Subject.CommonName) 489 return false 490 } 491 492 currentDNSNames := sets.NewString(m.cert.Leaf.DNSNames...) 493 desiredDNSNames := sets.NewString(template.DNSNames...) 494 missingDNSNames := desiredDNSNames.Difference(currentDNSNames) 495 if len(missingDNSNames) > 0 { 496 klog.V(2).Infof("Current certificate is missing requested DNS names %v", missingDNSNames.List()) 497 return false 498 } 499 500 currentIPs := sets.NewString() 501 for _, ip := range m.cert.Leaf.IPAddresses { 502 currentIPs.Insert(ip.String()) 503 } 504 desiredIPs := sets.NewString() 505 for _, ip := range template.IPAddresses { 506 desiredIPs.Insert(ip.String()) 507 } 508 missingIPs := desiredIPs.Difference(currentIPs) 509 if len(missingIPs) > 0 { 510 klog.V(2).Infof("Current certificate is missing requested IP addresses %v", missingIPs.List()) 511 return false 512 } 513 514 currentOrgs := sets.NewString(m.cert.Leaf.Subject.Organization...) 515 desiredOrgs := sets.NewString(template.Subject.Organization...) 516 missingOrgs := desiredOrgs.Difference(currentOrgs) 517 if len(missingOrgs) > 0 { 518 klog.V(2).Infof("Current certificate is missing requested orgs %v", missingOrgs.List()) 519 return false 520 } 521 } 522 523 return true 524} 525 526func (m *manager) certSatisfiesTemplate() bool { 527 m.certAccessLock.RLock() 528 defer m.certAccessLock.RUnlock() 529 return m.certSatisfiesTemplateLocked() 530} 531 532// nextRotationDeadline returns a value for the threshold at which the 533// current certificate should be rotated, 80%+/-10% of the expiration of the 534// certificate. 535func (m *manager) nextRotationDeadline() time.Time { 536 // forceRotation is not protected by locks 537 if m.forceRotation { 538 m.forceRotation = false 539 return m.now() 540 } 541 542 m.certAccessLock.RLock() 543 defer m.certAccessLock.RUnlock() 544 545 if !m.certSatisfiesTemplateLocked() { 546 return m.now() 547 } 548 549 notAfter := m.cert.Leaf.NotAfter 550 totalDuration := float64(notAfter.Sub(m.cert.Leaf.NotBefore)) 551 deadline := m.cert.Leaf.NotBefore.Add(jitteryDuration(totalDuration)) 552 553 klog.V(2).Infof("Certificate expiration is %v, rotation deadline is %v", notAfter, deadline) 554 return deadline 555} 556 557// jitteryDuration uses some jitter to set the rotation threshold so each node 558// will rotate at approximately 70-90% of the total lifetime of the 559// certificate. With jitter, if a number of nodes are added to a cluster at 560// approximately the same time (such as cluster creation time), they won't all 561// try to rotate certificates at the same time for the rest of the life of the 562// cluster. 563// 564// This function is represented as a variable to allow replacement during testing. 565var jitteryDuration = func(totalDuration float64) time.Duration { 566 return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3) 567} 568 569// updateCached sets the most recent retrieved cert and returns the old cert. 570// It also sets the server as assumed healthy. 571func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate { 572 m.certAccessLock.Lock() 573 defer m.certAccessLock.Unlock() 574 m.serverHealth = true 575 old := m.cert 576 m.cert = cert 577 return old 578} 579 580// updateServerError takes an error returned by the server and infers 581// the health of the server based on the error. It will return nil if 582// the error does not require immediate termination of any wait loops, 583// and otherwise it will return the error. 584func (m *manager) updateServerError(err error) error { 585 m.certAccessLock.Lock() 586 defer m.certAccessLock.Unlock() 587 switch { 588 case errors.IsUnauthorized(err): 589 // SSL terminating proxies may report this error instead of the master 590 m.serverHealth = true 591 case errors.IsUnexpectedServerError(err): 592 // generally indicates a proxy or other load balancer problem, rather than a problem coming 593 // from the master 594 m.serverHealth = false 595 default: 596 // Identify known errors that could be expected for a cert request that 597 // indicate everything is working normally 598 m.serverHealth = errors.IsNotFound(err) || errors.IsForbidden(err) 599 } 600 return nil 601} 602 603func (m *manager) generateCSR() (template *x509.CertificateRequest, csrPEM []byte, keyPEM []byte, key interface{}, err error) { 604 // Generate a new private key. 605 privateKey, err := ecdsa.GenerateKey(elliptic.P256(), cryptorand.Reader) 606 if err != nil { 607 return nil, nil, nil, nil, fmt.Errorf("unable to generate a new private key: %v", err) 608 } 609 der, err := x509.MarshalECPrivateKey(privateKey) 610 if err != nil { 611 return nil, nil, nil, nil, fmt.Errorf("unable to marshal the new key to DER: %v", err) 612 } 613 614 keyPEM = pem.EncodeToMemory(&pem.Block{Type: keyutil.ECPrivateKeyBlockType, Bytes: der}) 615 616 template = m.getTemplate() 617 if template == nil { 618 return nil, nil, nil, nil, fmt.Errorf("unable to create a csr, no template available") 619 } 620 csrPEM, err = cert.MakeCSRFromTemplate(privateKey, template) 621 if err != nil { 622 return nil, nil, nil, nil, fmt.Errorf("unable to create a csr from the private key: %v", err) 623 } 624 return template, csrPEM, keyPEM, privateKey, nil 625} 626 627func (m *manager) getLastRequest() (context.CancelFunc, *x509.CertificateRequest) { 628 m.lastRequestLock.Lock() 629 defer m.lastRequestLock.Unlock() 630 return m.lastRequestCancel, m.lastRequest 631} 632 633func (m *manager) setLastRequest(cancel context.CancelFunc, r *x509.CertificateRequest) { 634 m.lastRequestLock.Lock() 635 defer m.lastRequestLock.Unlock() 636 m.lastRequestCancel = cancel 637 m.lastRequest = r 638} 639