1/*
2Copyright 2017 The Kubernetes Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17package certificate
18
19import (
20	"context"
21	"crypto/ecdsa"
22	"crypto/elliptic"
23	cryptorand "crypto/rand"
24	"crypto/tls"
25	"crypto/x509"
26	"encoding/pem"
27	"fmt"
28	"reflect"
29	"sync"
30	"time"
31
32	"k8s.io/klog"
33
34	certificates "k8s.io/api/certificates/v1beta1"
35	"k8s.io/apimachinery/pkg/api/errors"
36	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
37	"k8s.io/apimachinery/pkg/util/sets"
38	"k8s.io/apimachinery/pkg/util/wait"
39	certificatesclient "k8s.io/client-go/kubernetes/typed/certificates/v1beta1"
40	"k8s.io/client-go/util/cert"
41	"k8s.io/client-go/util/certificate/csr"
42	"k8s.io/client-go/util/keyutil"
43)
44
// certificateWaitTimeout bounds how long a single rotation attempt waits for
// a submitted certificate signing request to be signed. rotateCerts uses it
// as the timeout of the context it hands to csr.WaitForCertificate.
var certificateWaitTimeout = 15 * time.Minute
48
// Manager maintains and updates the certificates in use by this certificate
// manager. In the background it communicates with the API server to get new
// certificates for certificates about to expire.
type Manager interface {
	// Start begins the background certificate rotation work. The
	// implementation in this file returns without starting anything when no
	// client function was configured.
	Start()
	// Stop signals the background certificate rotation work to terminate.
	Stop()
	// Current returns the currently selected certificate from the
	// certificate manager. The implementation in this file returns nil when
	// no certificate is held or when the held certificate has expired.
	Current() *tls.Certificate
	// ServerHealthy returns true if the manager is able to communicate with
	// the server. This allows a caller to determine whether the cert manager
	// thinks it can potentially talk to the API server. The cert manager may
	// be very conservative and only return true if recent communication has
	// occurred with the server.
	ServerHealthy() bool
}
68
// Config is the set of configuration parameters available for a new Manager.
type Config struct {
	// ClientFn will be used to create a client for
	// signing new certificate requests generated when a key rotation occurs.
	// It should be set at initialization; when it is nil, Start logs that
	// rotation is not enabled and returns without rotating. The function will
	// never be invoked in parallel. It is passed the current client
	// certificate if one exists and has not expired.
	ClientFn CSRClientFunc
	// Template is the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	// It is only consulted when GetTemplate is nil.
	Template *x509.CertificateRequest
	// GetTemplate returns the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	// If no template is available, nil may be returned, and no certificate will be requested.
	// If specified, takes precedence over Template.
	GetTemplate func() *x509.CertificateRequest
	// SignerName is the name of the certificate signer that should sign certificates
	// generated by the manager.
	SignerName string
	// Usages is the types of usages that certificates generated by the manager
	// can be used for.
	Usages []certificates.KeyUsage
	// CertificateStore is a persistent store where the current cert/key is
	// kept and future cert/key pairs will be persisted after they are
	// generated.
	CertificateStore Store
	// BootstrapCertificatePEM is the certificate data that will be returned
	// from the Manager if the CertificateStore doesn't have any usable
	// cert/key pair currently available and has not yet had a chance to get a
	// new cert/key pair from the API. If the CertificateStore does have an
	// unexpired cert/key pair, this will be ignored. If there is no usable
	// cert/key pair in the CertificateStore, as soon as Start is called, it
	// will request a new cert/key pair using the client returned by ClientFn.
	// This is intended to allow the first boot of a component to be
	// initialized using a generic, multi-use cert/key pair which will be
	// quickly replaced with a unique cert/key pair.
	BootstrapCertificatePEM []byte
	// BootstrapKeyPEM is the key data that will be returned from the Manager
	// if the CertificateStore doesn't have any cert/key pairs currently
	// available. If the CertificateStore does have a cert/key pair, this will
	// be ignored. If the bootstrap cert/key pair are used, they will be
	// rotated at the first opportunity, possibly well in advance of expiring.
	// This is intended to allow the first boot of a component to be
	// initialized using a generic, multi-use cert/key pair which will be
	// quickly replaced with a unique cert/key pair.
	BootstrapKeyPEM []byte
	// CertificateRotation will record a metric showing the time in seconds
	// that certificates lived before being rotated. This metric is a histogram
	// because there is value in keeping a history of rotation cadences. It
	// allows one to setup monitoring and alerting of unexpected rotation
	// behavior and track trends in rotation frequency. It may be left nil, in
	// which case nothing is recorded.
	CertificateRotation Histogram
	// CertificateRenewFailure will record a metric that keeps track of
	// certificate renewal failures. It may be left nil, in which case nothing
	// is recorded.
	CertificateRenewFailure Counter
}
128
// Store is responsible for getting and updating the current certificate.
// Depending on the concrete implementation, the backing store for this
// behavior may vary.
type Store interface {
	// Current returns the currently selected certificate. If the Store
	// doesn't have a cert/key pair currently, it should return a
	// *NoCertKeyError so that the Manager can recover by using bootstrap
	// certificates to request a new cert/key pair. Any other error aborts
	// construction of the Manager.
	Current() (*tls.Certificate, error)
	// Update accepts the PEM data for the cert/key pair and makes the new
	// cert/key pair the 'current' pair, that will be returned by future calls
	// to Current(). The returned certificate is what the Manager caches and
	// serves after a successful rotation.
	Update(cert, key []byte) (*tls.Certificate, error)
}
144
// Gauge is a metric sink intended to record the remaining lifetime of the
// certificate each time it is updated.
// NOTE(review): nothing in the visible part of this file calls Set; confirm
// where this interface is consumed.
type Gauge interface {
	Set(float64)
}
150
// Histogram is a metric sink that records how long a certificate was in use
// before being rotated. rotateCerts calls Observe with the number of seconds
// between the replaced certificate's NotBefore and the time of rotation.
type Histogram interface {
	Observe(float64)
}
156
// Counter is the minimal counter metric the manager needs. rotateCerts calls
// Inc once for every rotation attempt that fails.
type Counter interface {
	Inc()
}
161
// NoCertKeyError indicates there is no cert/key currently available. The
// string value is the error message. Error is declared on the pointer
// receiver, so it is *NoCertKeyError that satisfies the error interface and
// that the manager type-asserts for.
type NoCertKeyError string
164
// CSRClientFunc returns a new client for requesting CSRs. It is passed the
// manager's current certificate, which is nil when no certificate is held or
// when the held certificate has expired.
type CSRClientFunc func(current *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error)
168
169func (e *NoCertKeyError) Error() string { return string(*e) }
170
// manager is the Manager implementation returned by NewManager.
type manager struct {
	// getTemplate returns the CSR template to use for the next request. It may
	// return nil, in which case no certificate is requested.
	getTemplate func() *x509.CertificateRequest

	// lastRequestLock guards lastRequestCancel and lastRequest.
	// lastRequestCancel cancels the wait on the most recently submitted CSR
	// and lastRequest is the template that CSR was generated from; rotateCerts
	// records both through setLastRequest.
	lastRequestLock   sync.Mutex
	lastRequestCancel context.CancelFunc
	lastRequest       *x509.CertificateRequest

	// dynamicTemplate is true when Config.GetTemplate was supplied, which
	// makes Start also watch for template changes. signerName and usages are
	// passed through to every CSR. forceRotation makes the next call to
	// nextRotationDeadline return "now"; it is not protected by any lock.
	dynamicTemplate bool
	signerName      string
	usages          []certificates.KeyUsage
	forceRotation   bool

	// certStore persists each newly issued cert/key pair.
	certStore Store

	// Optional metrics; either may be nil.
	certificateRotation     Histogram
	certificateRenewFailure Counter

	// the following variables must only be accessed under certAccessLock
	certAccessLock sync.RWMutex
	cert           *tls.Certificate
	serverHealth   bool

	// clientAccessLock is held while clientFn is invoked (getClient) and while
	// Stop closes stopCh and sets stopped.
	clientAccessLock sync.Mutex
	clientFn         CSRClientFunc
	stopCh           chan struct{}
	stopped          bool

	// Set to time.Now but can be stubbed out for testing
	now func() time.Time
}
203
204// NewManager returns a new certificate manager. A certificate manager is
205// responsible for being the authoritative source of certificates in the
206// Kubelet and handling updates due to rotation.
207func NewManager(config *Config) (Manager, error) {
208	cert, forceRotation, err := getCurrentCertificateOrBootstrap(
209		config.CertificateStore,
210		config.BootstrapCertificatePEM,
211		config.BootstrapKeyPEM)
212	if err != nil {
213		return nil, err
214	}
215
216	getTemplate := config.GetTemplate
217	if getTemplate == nil {
218		getTemplate = func() *x509.CertificateRequest { return config.Template }
219	}
220
221	m := manager{
222		stopCh:                  make(chan struct{}),
223		clientFn:                config.ClientFn,
224		getTemplate:             getTemplate,
225		dynamicTemplate:         config.GetTemplate != nil,
226		signerName:              config.SignerName,
227		usages:                  config.Usages,
228		certStore:               config.CertificateStore,
229		cert:                    cert,
230		forceRotation:           forceRotation,
231		certificateRotation:     config.CertificateRotation,
232		certificateRenewFailure: config.CertificateRenewFailure,
233		now:                     time.Now,
234	}
235
236	return &m, nil
237}
238
239// Current returns the currently selected certificate from the certificate
240// manager. This can be nil if the manager was initialized without a
241// certificate and has not yet received one from the
242// CertificateSigningRequestClient, or if the current cert has expired.
243func (m *manager) Current() *tls.Certificate {
244	m.certAccessLock.RLock()
245	defer m.certAccessLock.RUnlock()
246	if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) {
247		klog.V(2).Infof("Current certificate is expired.")
248		return nil
249	}
250	return m.cert
251}
252
253// ServerHealthy returns true if the cert manager believes the server
254// is currently alive.
255func (m *manager) ServerHealthy() bool {
256	m.certAccessLock.RLock()
257	defer m.certAccessLock.RUnlock()
258	return m.serverHealth
259}
260
261// Stop terminates the manager.
262func (m *manager) Stop() {
263	m.clientAccessLock.Lock()
264	defer m.clientAccessLock.Unlock()
265	if m.stopped {
266		return
267	}
268	close(m.stopCh)
269	m.stopped = true
270}
271
272// Start will start the background work of rotating the certificates.
273func (m *manager) Start() {
274	// Certificate rotation depends on access to the API server certificate
275	// signing API, so don't start the certificate manager if we don't have a
276	// client.
277	if m.clientFn == nil {
278		klog.V(2).Infof("Certificate rotation is not enabled, no connection to the apiserver.")
279		return
280	}
281
282	klog.V(2).Infof("Certificate rotation is enabled.")
283
284	templateChanged := make(chan struct{})
285	go wait.Until(func() {
286		deadline := m.nextRotationDeadline()
287		if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 {
288			klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval)
289
290			timer := time.NewTimer(sleepInterval)
291			defer timer.Stop()
292
293			select {
294			case <-timer.C:
295				// unblock when deadline expires
296			case <-templateChanged:
297				_, lastRequestTemplate := m.getLastRequest()
298				if reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) {
299					// if the template now matches what we last requested, restart the rotation deadline loop
300					return
301				}
302				klog.V(2).Infof("Certificate template changed, rotating")
303			}
304		}
305
306		// Don't enter rotateCerts and trigger backoff if we don't even have a template to request yet
307		if m.getTemplate() == nil {
308			return
309		}
310
311		backoff := wait.Backoff{
312			Duration: 2 * time.Second,
313			Factor:   2,
314			Jitter:   0.1,
315			Steps:    5,
316		}
317		if err := wait.ExponentialBackoff(backoff, m.rotateCerts); err != nil {
318			utilruntime.HandleError(fmt.Errorf("Reached backoff limit, still unable to rotate certs: %v", err))
319			wait.PollInfinite(32*time.Second, m.rotateCerts)
320		}
321	}, time.Second, m.stopCh)
322
323	if m.dynamicTemplate {
324		go wait.Until(func() {
325			// check if the current template matches what we last requested
326			lastRequestCancel, lastRequestTemplate := m.getLastRequest()
327
328			if !m.certSatisfiesTemplate() && !reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) {
329				// if the template is different, queue up an interrupt of the rotation deadline loop.
330				// if we've requested a CSR that matches the new template by the time the interrupt is handled, the interrupt is disregarded.
331				if lastRequestCancel != nil {
332					// if we're currently waiting on a submitted request that no longer matches what we want, stop waiting
333					lastRequestCancel()
334				}
335				select {
336				case templateChanged <- struct{}{}:
337				case <-m.stopCh:
338				}
339			}
340		}, time.Second, m.stopCh)
341	}
342}
343
344func getCurrentCertificateOrBootstrap(
345	store Store,
346	bootstrapCertificatePEM []byte,
347	bootstrapKeyPEM []byte) (cert *tls.Certificate, shouldRotate bool, errResult error) {
348
349	currentCert, err := store.Current()
350	if err == nil {
351		// if the current cert is expired, fall back to the bootstrap cert
352		if currentCert.Leaf != nil && time.Now().Before(currentCert.Leaf.NotAfter) {
353			return currentCert, false, nil
354		}
355	} else {
356		if _, ok := err.(*NoCertKeyError); !ok {
357			return nil, false, err
358		}
359	}
360
361	if bootstrapCertificatePEM == nil || bootstrapKeyPEM == nil {
362		return nil, true, nil
363	}
364
365	bootstrapCert, err := tls.X509KeyPair(bootstrapCertificatePEM, bootstrapKeyPEM)
366	if err != nil {
367		return nil, false, err
368	}
369	if len(bootstrapCert.Certificate) < 1 {
370		return nil, false, fmt.Errorf("no cert/key data found")
371	}
372
373	certs, err := x509.ParseCertificates(bootstrapCert.Certificate[0])
374	if err != nil {
375		return nil, false, fmt.Errorf("unable to parse certificate data: %v", err)
376	}
377	bootstrapCert.Leaf = certs[0]
378
379	if _, err := store.Update(bootstrapCertificatePEM, bootstrapKeyPEM); err != nil {
380		utilruntime.HandleError(fmt.Errorf("Unable to set the cert/key pair to the bootstrap certificate: %v", err))
381	} else {
382		klog.V(4).Infof("Updated the store to contain the initial bootstrap certificate")
383	}
384
385	return &bootstrapCert, true, nil
386}
387
388func (m *manager) getClient() (certificatesclient.CertificateSigningRequestInterface, error) {
389	current := m.Current()
390	m.clientAccessLock.Lock()
391	defer m.clientAccessLock.Unlock()
392	return m.clientFn(current)
393}
394
395// RotateCerts is exposed for testing only and is not a part of the public interface.
396// Returns true if it changed the cert, false otherwise. Error is only returned in
397// exceptional cases.
398func (m *manager) RotateCerts() (bool, error) {
399	return m.rotateCerts()
400}
401
402// rotateCerts attempts to request a client cert from the server, wait a reasonable
403// period of time for it to be signed, and then update the cert on disk. If it cannot
404// retrieve a cert, it will return false. It will only return error in exceptional cases.
405// This method also keeps track of "server health" by interpreting the responses it gets
406// from the server on the various calls it makes.
407// TODO: return errors, have callers handle and log them correctly
408func (m *manager) rotateCerts() (bool, error) {
409	klog.V(2).Infof("Rotating certificates")
410
411	template, csrPEM, keyPEM, privateKey, err := m.generateCSR()
412	if err != nil {
413		utilruntime.HandleError(fmt.Errorf("Unable to generate a certificate signing request: %v", err))
414		if m.certificateRenewFailure != nil {
415			m.certificateRenewFailure.Inc()
416		}
417		return false, nil
418	}
419
420	// request the client each time
421	client, err := m.getClient()
422	if err != nil {
423		utilruntime.HandleError(fmt.Errorf("Unable to load a client to request certificates: %v", err))
424		if m.certificateRenewFailure != nil {
425			m.certificateRenewFailure.Inc()
426		}
427		return false, nil
428	}
429
430	// Call the Certificate Signing Request API to get a certificate for the
431	// new private key.
432	req, err := csr.RequestCertificate(client, csrPEM, "", m.signerName, m.usages, privateKey)
433	if err != nil {
434		utilruntime.HandleError(fmt.Errorf("Failed while requesting a signed certificate from the master: %v", err))
435		if m.certificateRenewFailure != nil {
436			m.certificateRenewFailure.Inc()
437		}
438		return false, m.updateServerError(err)
439	}
440
441	ctx, cancel := context.WithTimeout(context.Background(), certificateWaitTimeout)
442	defer cancel()
443
444	// Once we've successfully submitted a CSR for this template, record that we did so
445	m.setLastRequest(cancel, template)
446
447	// Wait for the certificate to be signed. This interface and internal timout
448	// is a remainder after the old design using raw watch wrapped with backoff.
449	crtPEM, err := csr.WaitForCertificate(ctx, client, req)
450	if err != nil {
451		utilruntime.HandleError(fmt.Errorf("certificate request was not signed: %v", err))
452		if m.certificateRenewFailure != nil {
453			m.certificateRenewFailure.Inc()
454		}
455		return false, nil
456	}
457
458	cert, err := m.certStore.Update(crtPEM, keyPEM)
459	if err != nil {
460		utilruntime.HandleError(fmt.Errorf("Unable to store the new cert/key pair: %v", err))
461		if m.certificateRenewFailure != nil {
462			m.certificateRenewFailure.Inc()
463		}
464		return false, nil
465	}
466
467	if old := m.updateCached(cert); old != nil && m.certificateRotation != nil {
468		m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds())
469	}
470
471	return true, nil
472}
473
474// Check that the current certificate on disk satisfies the requests from the
475// current template.
476//
477// Note that extra items in the certificate's SAN or orgs that don't exist in
478// the template will not trigger a renewal.
479//
480// Requires certAccessLock to be locked.
481func (m *manager) certSatisfiesTemplateLocked() bool {
482	if m.cert == nil {
483		return false
484	}
485
486	if template := m.getTemplate(); template != nil {
487		if template.Subject.CommonName != m.cert.Leaf.Subject.CommonName {
488			klog.V(2).Infof("Current certificate CN (%s) does not match requested CN (%s)", m.cert.Leaf.Subject.CommonName, template.Subject.CommonName)
489			return false
490		}
491
492		currentDNSNames := sets.NewString(m.cert.Leaf.DNSNames...)
493		desiredDNSNames := sets.NewString(template.DNSNames...)
494		missingDNSNames := desiredDNSNames.Difference(currentDNSNames)
495		if len(missingDNSNames) > 0 {
496			klog.V(2).Infof("Current certificate is missing requested DNS names %v", missingDNSNames.List())
497			return false
498		}
499
500		currentIPs := sets.NewString()
501		for _, ip := range m.cert.Leaf.IPAddresses {
502			currentIPs.Insert(ip.String())
503		}
504		desiredIPs := sets.NewString()
505		for _, ip := range template.IPAddresses {
506			desiredIPs.Insert(ip.String())
507		}
508		missingIPs := desiredIPs.Difference(currentIPs)
509		if len(missingIPs) > 0 {
510			klog.V(2).Infof("Current certificate is missing requested IP addresses %v", missingIPs.List())
511			return false
512		}
513
514		currentOrgs := sets.NewString(m.cert.Leaf.Subject.Organization...)
515		desiredOrgs := sets.NewString(template.Subject.Organization...)
516		missingOrgs := desiredOrgs.Difference(currentOrgs)
517		if len(missingOrgs) > 0 {
518			klog.V(2).Infof("Current certificate is missing requested orgs %v", missingOrgs.List())
519			return false
520		}
521	}
522
523	return true
524}
525
526func (m *manager) certSatisfiesTemplate() bool {
527	m.certAccessLock.RLock()
528	defer m.certAccessLock.RUnlock()
529	return m.certSatisfiesTemplateLocked()
530}
531
532// nextRotationDeadline returns a value for the threshold at which the
533// current certificate should be rotated, 80%+/-10% of the expiration of the
534// certificate.
535func (m *manager) nextRotationDeadline() time.Time {
536	// forceRotation is not protected by locks
537	if m.forceRotation {
538		m.forceRotation = false
539		return m.now()
540	}
541
542	m.certAccessLock.RLock()
543	defer m.certAccessLock.RUnlock()
544
545	if !m.certSatisfiesTemplateLocked() {
546		return m.now()
547	}
548
549	notAfter := m.cert.Leaf.NotAfter
550	totalDuration := float64(notAfter.Sub(m.cert.Leaf.NotBefore))
551	deadline := m.cert.Leaf.NotBefore.Add(jitteryDuration(totalDuration))
552
553	klog.V(2).Infof("Certificate expiration is %v, rotation deadline is %v", notAfter, deadline)
554	return deadline
555}
556
557// jitteryDuration uses some jitter to set the rotation threshold so each node
558// will rotate at approximately 70-90% of the total lifetime of the
559// certificate.  With jitter, if a number of nodes are added to a cluster at
560// approximately the same time (such as cluster creation time), they won't all
561// try to rotate certificates at the same time for the rest of the life of the
562// cluster.
563//
564// This function is represented as a variable to allow replacement during testing.
565var jitteryDuration = func(totalDuration float64) time.Duration {
566	return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3)
567}
568
569// updateCached sets the most recent retrieved cert and returns the old cert.
570// It also sets the server as assumed healthy.
571func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate {
572	m.certAccessLock.Lock()
573	defer m.certAccessLock.Unlock()
574	m.serverHealth = true
575	old := m.cert
576	m.cert = cert
577	return old
578}
579
580// updateServerError takes an error returned by the server and infers
581// the health of the server based on the error. It will return nil if
582// the error does not require immediate termination of any wait loops,
583// and otherwise it will return the error.
584func (m *manager) updateServerError(err error) error {
585	m.certAccessLock.Lock()
586	defer m.certAccessLock.Unlock()
587	switch {
588	case errors.IsUnauthorized(err):
589		// SSL terminating proxies may report this error instead of the master
590		m.serverHealth = true
591	case errors.IsUnexpectedServerError(err):
592		// generally indicates a proxy or other load balancer problem, rather than a problem coming
593		// from the master
594		m.serverHealth = false
595	default:
596		// Identify known errors that could be expected for a cert request that
597		// indicate everything is working normally
598		m.serverHealth = errors.IsNotFound(err) || errors.IsForbidden(err)
599	}
600	return nil
601}
602
603func (m *manager) generateCSR() (template *x509.CertificateRequest, csrPEM []byte, keyPEM []byte, key interface{}, err error) {
604	// Generate a new private key.
605	privateKey, err := ecdsa.GenerateKey(elliptic.P256(), cryptorand.Reader)
606	if err != nil {
607		return nil, nil, nil, nil, fmt.Errorf("unable to generate a new private key: %v", err)
608	}
609	der, err := x509.MarshalECPrivateKey(privateKey)
610	if err != nil {
611		return nil, nil, nil, nil, fmt.Errorf("unable to marshal the new key to DER: %v", err)
612	}
613
614	keyPEM = pem.EncodeToMemory(&pem.Block{Type: keyutil.ECPrivateKeyBlockType, Bytes: der})
615
616	template = m.getTemplate()
617	if template == nil {
618		return nil, nil, nil, nil, fmt.Errorf("unable to create a csr, no template available")
619	}
620	csrPEM, err = cert.MakeCSRFromTemplate(privateKey, template)
621	if err != nil {
622		return nil, nil, nil, nil, fmt.Errorf("unable to create a csr from the private key: %v", err)
623	}
624	return template, csrPEM, keyPEM, privateKey, nil
625}
626
627func (m *manager) getLastRequest() (context.CancelFunc, *x509.CertificateRequest) {
628	m.lastRequestLock.Lock()
629	defer m.lastRequestLock.Unlock()
630	return m.lastRequestCancel, m.lastRequest
631}
632
633func (m *manager) setLastRequest(cancel context.CancelFunc, r *x509.CertificateRequest) {
634	m.lastRequestLock.Lock()
635	defer m.lastRequestLock.Unlock()
636	m.lastRequestCancel = cancel
637	m.lastRequest = r
638}
639