1/*
2Copyright 2017 The Kubernetes Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17package certificate
18
19import (
20	"context"
21	"crypto/ecdsa"
22	"crypto/elliptic"
23	cryptorand "crypto/rand"
24	"crypto/tls"
25	"crypto/x509"
26	"encoding/pem"
27	"fmt"
28	"reflect"
29	"sync"
30	"time"
31
32	"k8s.io/klog/v2"
33
34	certificates "k8s.io/api/certificates/v1"
35	"k8s.io/apimachinery/pkg/api/errors"
36	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
37	"k8s.io/apimachinery/pkg/util/sets"
38	"k8s.io/apimachinery/pkg/util/wait"
39	clientset "k8s.io/client-go/kubernetes"
40	"k8s.io/client-go/util/cert"
41	"k8s.io/client-go/util/certificate/csr"
42	"k8s.io/client-go/util/keyutil"
43)
44
// certificateWaitTimeout controls the amount of time we wait for certificate
// approval in one iteration. rotateCerts uses it as the timeout of the context
// it passes to csr.WaitForCertificate.
var certificateWaitTimeout = 15 * time.Minute
48
// Manager maintains and updates the certificates in use by this certificate
// manager. In the background it communicates with the API server to get new
// certificates for certificates about to expire.
type Manager interface {
	// Start the API server status sync loop.
	Start()
	// Stop the cert manager loop.
	Stop()
	// Current returns the currently selected certificate from the
	// certificate manager. The result may be nil: the implementation in this
	// file returns nil when it holds no certificate or when the certificate
	// it holds has expired.
	Current() *tls.Certificate
	// ServerHealthy returns true if the manager is able to communicate with
	// the server. This allows a caller to determine whether the cert manager
	// thinks it can potentially talk to the API server. The cert manager may
	// be very conservative and only return true if recent communication has
	// occurred with the server.
	ServerHealthy() bool
}
68
// Config is the set of configuration parameters available for a new Manager.
type Config struct {
	// ClientsetFn will be used to create a clientset for
	// creating/fetching new certificate requests generated when a key rotation occurs.
	// The function will never be invoked in parallel.
	// It is passed the current client certificate if one exists.
	// If ClientsetFn is nil, Start logs that rotation is not enabled and
	// returns without starting any background work.
	ClientsetFn ClientsetFunc
	// Template is the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	// Template is only consulted when GetTemplate is nil.
	Template *x509.CertificateRequest
	// GetTemplate returns the CertificateRequest that will be used as a template for
	// generating certificate signing requests for all new keys generated as
	// part of rotation. It follows the same rules as the template parameter of
	// crypto.x509.CreateCertificateRequest in the Go standard libraries.
	// If no template is available, nil may be returned, and no certificate will be requested.
	// If specified, takes precedence over Template.
	GetTemplate func() *x509.CertificateRequest
	// SignerName is the name of the certificate signer that should sign certificates
	// generated by the manager.
	SignerName string
	// Usages is the types of usages that certificates generated by the manager
	// can be used for.
	Usages []certificates.KeyUsage
	// CertificateStore is a persistent store where the current cert/key is
	// kept and future cert/key pairs will be persisted after they are
	// generated.
	CertificateStore Store
	// BootstrapCertificatePEM is the certificate data that will be returned
	// from the Manager if the CertificateStore doesn't have any cert/key pairs
	// currently available and has not yet had a chance to get a new cert/key
	// pair from the API. If the CertificateStore does have a cert/key pair,
	// this will be ignored. If there is no cert/key pair available in the
	// CertificateStore, as soon as Start is called, it will request a new
	// cert/key pair from the CertificateSigningRequestClient. This is intended
	// to allow the first boot of a component to be initialized using a
	// generic, multi-use cert/key pair which will be quickly replaced with a
	// unique cert/key pair.
	BootstrapCertificatePEM []byte
	// BootstrapKeyPEM is the key data that will be returned from the Manager
	// if the CertificateStore doesn't have any cert/key pairs currently
	// available. If the CertificateStore does have a cert/key pair, this will
	// be ignored. If the bootstrap cert/key pair are used, they will be
	// rotated at the first opportunity, possibly well in advance of expiring.
	// This is intended to allow the first boot of a component to be
	// initialized using a generic, multi-use cert/key pair which will be
	// quickly replaced with a unique cert/key pair.
	BootstrapKeyPEM []byte `datapolicy:"security-key"`
	// CertificateRotation will record a metric showing the time in seconds
	// that certificates lived before being rotated. This metric is a histogram
	// because there is value in keeping a history of rotation cadences. It
	// allows one to setup monitoring and alerting of unexpected rotation
	// behavior and track trends in rotation frequency.
	// It may be left nil, in which case no observation is recorded.
	CertificateRotation Histogram
	// CertificateRenewFailure will record a metric that keeps track of
	// certificate renewal failures. It may be left nil, in which case
	// failures are not counted.
	CertificateRenewFailure Counter
}
128
// Store is responsible for getting and updating the current certificate.
// Depending on the concrete implementation, the backing store for this
// behavior may vary.
type Store interface {
	// Current returns the currently selected certificate. If the Store
	// doesn't have a cert/key pair currently, it should return a
	// *NoCertKeyError so that the Manager can recover by using bootstrap
	// certificates to request a new cert/key pair. Any other error aborts
	// NewManager.
	Current() (*tls.Certificate, error)
	// Update accepts the PEM data for the cert/key pair and makes the new
	// cert/key pair the 'current' pair, that will be returned by future calls
	// to Current().
	Update(cert, key []byte) (*tls.Certificate, error)
}
144
// Gauge will record the remaining lifetime of the certificate each time it is
// updated.
//
// NOTE(review): nothing in the visible part of this file calls Set; confirm
// the intended usage against callers elsewhere in the package.
type Gauge interface {
	Set(float64)
}
150
// Histogram will record the time a rotated certificate was used before being
// rotated. rotateCerts passes Observe the number of seconds between the old
// certificate's NotBefore and the time of rotation.
type Histogram interface {
	Observe(float64)
}
156
// Counter will wrap a counter with labels. rotateCerts calls Inc once for
// every failed rotation attempt.
type Counter interface {
	Inc()
}
161
// NoCertKeyError indicates there is no cert/key currently available.
// The error interface is implemented on the pointer receiver, so code in this
// file detects it with a type assertion to *NoCertKeyError.
type NoCertKeyError string
164
// ClientsetFunc returns a new clientset for discovering CSR API availability and requesting CSRs.
// It is passed the current certificate if one is available and valid; the
// argument is nil otherwise.
type ClientsetFunc func(current *tls.Certificate) (clientset.Interface, error)
168
169func (e *NoCertKeyError) Error() string { return string(*e) }
170
// manager is the Manager implementation returned by NewManager.
type manager struct {
	// getTemplate returns the CSR template to use for the next request. It may
	// return nil, in which case no certificate is requested.
	getTemplate func() *x509.CertificateRequest

	// lastRequestLock guards lastRequestCancel and lastRequest
	lastRequestLock   sync.Mutex
	lastRequestCancel context.CancelFunc
	lastRequest       *x509.CertificateRequest

	// dynamicTemplate is true when Config.GetTemplate was supplied; Start then
	// also runs a loop that watches for template changes.
	// signerName and usages are passed through to csr.RequestCertificate.
	// forceRotation makes the next nextRotationDeadline call return the
	// current time; it is read and cleared there without any lock.
	dynamicTemplate bool
	signerName      string
	usages          []certificates.KeyUsage
	forceRotation   bool

	certStore Store

	// Optional metrics; both may be nil.
	certificateRotation     Histogram
	certificateRenewFailure Counter

	// the following variables must only be accessed under certAccessLock
	certAccessLock sync.RWMutex
	cert           *tls.Certificate
	serverHealth   bool

	// the clientFn must only be accessed under the clientAccessLock
	// Stop also holds clientAccessLock while it closes stopCh and sets stopped.
	clientAccessLock sync.Mutex
	clientsetFn      ClientsetFunc
	stopCh           chan struct{}
	stopped          bool

	// Set to time.Now but can be stubbed out for testing
	now func() time.Time
}
203
204// NewManager returns a new certificate manager. A certificate manager is
205// responsible for being the authoritative source of certificates in the
206// Kubelet and handling updates due to rotation.
207func NewManager(config *Config) (Manager, error) {
208	cert, forceRotation, err := getCurrentCertificateOrBootstrap(
209		config.CertificateStore,
210		config.BootstrapCertificatePEM,
211		config.BootstrapKeyPEM)
212	if err != nil {
213		return nil, err
214	}
215
216	getTemplate := config.GetTemplate
217	if getTemplate == nil {
218		getTemplate = func() *x509.CertificateRequest { return config.Template }
219	}
220
221	m := manager{
222		stopCh:                  make(chan struct{}),
223		clientsetFn:             config.ClientsetFn,
224		getTemplate:             getTemplate,
225		dynamicTemplate:         config.GetTemplate != nil,
226		signerName:              config.SignerName,
227		usages:                  config.Usages,
228		certStore:               config.CertificateStore,
229		cert:                    cert,
230		forceRotation:           forceRotation,
231		certificateRotation:     config.CertificateRotation,
232		certificateRenewFailure: config.CertificateRenewFailure,
233		now:                     time.Now,
234	}
235
236	return &m, nil
237}
238
239// Current returns the currently selected certificate from the certificate
240// manager. This can be nil if the manager was initialized without a
241// certificate and has not yet received one from the
242// CertificateSigningRequestClient, or if the current cert has expired.
243func (m *manager) Current() *tls.Certificate {
244	m.certAccessLock.RLock()
245	defer m.certAccessLock.RUnlock()
246	if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) {
247		klog.V(2).Infof("Current certificate is expired.")
248		return nil
249	}
250	return m.cert
251}
252
253// ServerHealthy returns true if the cert manager believes the server
254// is currently alive.
255func (m *manager) ServerHealthy() bool {
256	m.certAccessLock.RLock()
257	defer m.certAccessLock.RUnlock()
258	return m.serverHealth
259}
260
261// Stop terminates the manager.
262func (m *manager) Stop() {
263	m.clientAccessLock.Lock()
264	defer m.clientAccessLock.Unlock()
265	if m.stopped {
266		return
267	}
268	close(m.stopCh)
269	m.stopped = true
270}
271
272// Start will start the background work of rotating the certificates.
273func (m *manager) Start() {
274	// Certificate rotation depends on access to the API server certificate
275	// signing API, so don't start the certificate manager if we don't have a
276	// client.
277	if m.clientsetFn == nil {
278		klog.V(2).Infof("Certificate rotation is not enabled, no connection to the apiserver.")
279		return
280	}
281
282	klog.V(2).Infof("Certificate rotation is enabled.")
283
284	templateChanged := make(chan struct{})
285	go wait.Until(func() {
286		deadline := m.nextRotationDeadline()
287		if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 {
288			klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval)
289
290			timer := time.NewTimer(sleepInterval)
291			defer timer.Stop()
292
293			select {
294			case <-timer.C:
295				// unblock when deadline expires
296			case <-templateChanged:
297				_, lastRequestTemplate := m.getLastRequest()
298				if reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) {
299					// if the template now matches what we last requested, restart the rotation deadline loop
300					return
301				}
302				klog.V(2).Infof("Certificate template changed, rotating")
303			}
304		}
305
306		// Don't enter rotateCerts and trigger backoff if we don't even have a template to request yet
307		if m.getTemplate() == nil {
308			return
309		}
310
311		backoff := wait.Backoff{
312			Duration: 2 * time.Second,
313			Factor:   2,
314			Jitter:   0.1,
315			Steps:    5,
316		}
317		if err := wait.ExponentialBackoff(backoff, m.rotateCerts); err != nil {
318			utilruntime.HandleError(fmt.Errorf("Reached backoff limit, still unable to rotate certs: %v", err))
319			wait.PollInfinite(32*time.Second, m.rotateCerts)
320		}
321	}, time.Second, m.stopCh)
322
323	if m.dynamicTemplate {
324		go wait.Until(func() {
325			// check if the current template matches what we last requested
326			lastRequestCancel, lastRequestTemplate := m.getLastRequest()
327
328			if !m.certSatisfiesTemplate() && !reflect.DeepEqual(lastRequestTemplate, m.getTemplate()) {
329				// if the template is different, queue up an interrupt of the rotation deadline loop.
330				// if we've requested a CSR that matches the new template by the time the interrupt is handled, the interrupt is disregarded.
331				if lastRequestCancel != nil {
332					// if we're currently waiting on a submitted request that no longer matches what we want, stop waiting
333					lastRequestCancel()
334				}
335				select {
336				case templateChanged <- struct{}{}:
337				case <-m.stopCh:
338				}
339			}
340		}, time.Second, m.stopCh)
341	}
342}
343
344func getCurrentCertificateOrBootstrap(
345	store Store,
346	bootstrapCertificatePEM []byte,
347	bootstrapKeyPEM []byte) (cert *tls.Certificate, shouldRotate bool, errResult error) {
348
349	currentCert, err := store.Current()
350	if err == nil {
351		// if the current cert is expired, fall back to the bootstrap cert
352		if currentCert.Leaf != nil && time.Now().Before(currentCert.Leaf.NotAfter) {
353			return currentCert, false, nil
354		}
355	} else {
356		if _, ok := err.(*NoCertKeyError); !ok {
357			return nil, false, err
358		}
359	}
360
361	if bootstrapCertificatePEM == nil || bootstrapKeyPEM == nil {
362		return nil, true, nil
363	}
364
365	bootstrapCert, err := tls.X509KeyPair(bootstrapCertificatePEM, bootstrapKeyPEM)
366	if err != nil {
367		return nil, false, err
368	}
369	if len(bootstrapCert.Certificate) < 1 {
370		return nil, false, fmt.Errorf("no cert/key data found")
371	}
372
373	certs, err := x509.ParseCertificates(bootstrapCert.Certificate[0])
374	if err != nil {
375		return nil, false, fmt.Errorf("unable to parse certificate data: %v", err)
376	}
377	if len(certs) < 1 {
378		return nil, false, fmt.Errorf("no cert data found")
379	}
380	bootstrapCert.Leaf = certs[0]
381
382	if _, err := store.Update(bootstrapCertificatePEM, bootstrapKeyPEM); err != nil {
383		utilruntime.HandleError(fmt.Errorf("Unable to set the cert/key pair to the bootstrap certificate: %v", err))
384	} else {
385		klog.V(4).Infof("Updated the store to contain the initial bootstrap certificate")
386	}
387
388	return &bootstrapCert, true, nil
389}
390
391func (m *manager) getClientset() (clientset.Interface, error) {
392	current := m.Current()
393	m.clientAccessLock.Lock()
394	defer m.clientAccessLock.Unlock()
395	return m.clientsetFn(current)
396}
397
398// RotateCerts is exposed for testing only and is not a part of the public interface.
399// Returns true if it changed the cert, false otherwise. Error is only returned in
400// exceptional cases.
401func (m *manager) RotateCerts() (bool, error) {
402	return m.rotateCerts()
403}
404
405// rotateCerts attempts to request a client cert from the server, wait a reasonable
406// period of time for it to be signed, and then update the cert on disk. If it cannot
407// retrieve a cert, it will return false. It will only return error in exceptional cases.
408// This method also keeps track of "server health" by interpreting the responses it gets
409// from the server on the various calls it makes.
410// TODO: return errors, have callers handle and log them correctly
411func (m *manager) rotateCerts() (bool, error) {
412	klog.V(2).Infof("Rotating certificates")
413
414	template, csrPEM, keyPEM, privateKey, err := m.generateCSR()
415	if err != nil {
416		utilruntime.HandleError(fmt.Errorf("Unable to generate a certificate signing request: %v", err))
417		if m.certificateRenewFailure != nil {
418			m.certificateRenewFailure.Inc()
419		}
420		return false, nil
421	}
422
423	// request the client each time
424	clientSet, err := m.getClientset()
425	if err != nil {
426		utilruntime.HandleError(fmt.Errorf("Unable to load a client to request certificates: %v", err))
427		if m.certificateRenewFailure != nil {
428			m.certificateRenewFailure.Inc()
429		}
430		return false, nil
431	}
432
433	// Call the Certificate Signing Request API to get a certificate for the
434	// new private key.
435	reqName, reqUID, err := csr.RequestCertificate(clientSet, csrPEM, "", m.signerName, m.usages, privateKey)
436	if err != nil {
437		utilruntime.HandleError(fmt.Errorf("Failed while requesting a signed certificate from the master: %v", err))
438		if m.certificateRenewFailure != nil {
439			m.certificateRenewFailure.Inc()
440		}
441		return false, m.updateServerError(err)
442	}
443
444	ctx, cancel := context.WithTimeout(context.Background(), certificateWaitTimeout)
445	defer cancel()
446
447	// Once we've successfully submitted a CSR for this template, record that we did so
448	m.setLastRequest(cancel, template)
449
450	// Wait for the certificate to be signed. This interface and internal timout
451	// is a remainder after the old design using raw watch wrapped with backoff.
452	crtPEM, err := csr.WaitForCertificate(ctx, clientSet, reqName, reqUID)
453	if err != nil {
454		utilruntime.HandleError(fmt.Errorf("certificate request was not signed: %v", err))
455		if m.certificateRenewFailure != nil {
456			m.certificateRenewFailure.Inc()
457		}
458		return false, nil
459	}
460
461	cert, err := m.certStore.Update(crtPEM, keyPEM)
462	if err != nil {
463		utilruntime.HandleError(fmt.Errorf("Unable to store the new cert/key pair: %v", err))
464		if m.certificateRenewFailure != nil {
465			m.certificateRenewFailure.Inc()
466		}
467		return false, nil
468	}
469
470	if old := m.updateCached(cert); old != nil && m.certificateRotation != nil {
471		m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds())
472	}
473
474	return true, nil
475}
476
477// Check that the current certificate on disk satisfies the requests from the
478// current template.
479//
480// Note that extra items in the certificate's SAN or orgs that don't exist in
481// the template will not trigger a renewal.
482//
483// Requires certAccessLock to be locked.
484func (m *manager) certSatisfiesTemplateLocked() bool {
485	if m.cert == nil {
486		return false
487	}
488
489	if template := m.getTemplate(); template != nil {
490		if template.Subject.CommonName != m.cert.Leaf.Subject.CommonName {
491			klog.V(2).Infof("Current certificate CN (%s) does not match requested CN (%s)", m.cert.Leaf.Subject.CommonName, template.Subject.CommonName)
492			return false
493		}
494
495		currentDNSNames := sets.NewString(m.cert.Leaf.DNSNames...)
496		desiredDNSNames := sets.NewString(template.DNSNames...)
497		missingDNSNames := desiredDNSNames.Difference(currentDNSNames)
498		if len(missingDNSNames) > 0 {
499			klog.V(2).Infof("Current certificate is missing requested DNS names %v", missingDNSNames.List())
500			return false
501		}
502
503		currentIPs := sets.NewString()
504		for _, ip := range m.cert.Leaf.IPAddresses {
505			currentIPs.Insert(ip.String())
506		}
507		desiredIPs := sets.NewString()
508		for _, ip := range template.IPAddresses {
509			desiredIPs.Insert(ip.String())
510		}
511		missingIPs := desiredIPs.Difference(currentIPs)
512		if len(missingIPs) > 0 {
513			klog.V(2).Infof("Current certificate is missing requested IP addresses %v", missingIPs.List())
514			return false
515		}
516
517		currentOrgs := sets.NewString(m.cert.Leaf.Subject.Organization...)
518		desiredOrgs := sets.NewString(template.Subject.Organization...)
519		missingOrgs := desiredOrgs.Difference(currentOrgs)
520		if len(missingOrgs) > 0 {
521			klog.V(2).Infof("Current certificate is missing requested orgs %v", missingOrgs.List())
522			return false
523		}
524	}
525
526	return true
527}
528
529func (m *manager) certSatisfiesTemplate() bool {
530	m.certAccessLock.RLock()
531	defer m.certAccessLock.RUnlock()
532	return m.certSatisfiesTemplateLocked()
533}
534
535// nextRotationDeadline returns a value for the threshold at which the
536// current certificate should be rotated, 80%+/-10% of the expiration of the
537// certificate.
538func (m *manager) nextRotationDeadline() time.Time {
539	// forceRotation is not protected by locks
540	if m.forceRotation {
541		m.forceRotation = false
542		return m.now()
543	}
544
545	m.certAccessLock.RLock()
546	defer m.certAccessLock.RUnlock()
547
548	if !m.certSatisfiesTemplateLocked() {
549		return m.now()
550	}
551
552	notAfter := m.cert.Leaf.NotAfter
553	totalDuration := float64(notAfter.Sub(m.cert.Leaf.NotBefore))
554	deadline := m.cert.Leaf.NotBefore.Add(jitteryDuration(totalDuration))
555
556	klog.V(2).Infof("Certificate expiration is %v, rotation deadline is %v", notAfter, deadline)
557	return deadline
558}
559
560// jitteryDuration uses some jitter to set the rotation threshold so each node
561// will rotate at approximately 70-90% of the total lifetime of the
562// certificate.  With jitter, if a number of nodes are added to a cluster at
563// approximately the same time (such as cluster creation time), they won't all
564// try to rotate certificates at the same time for the rest of the life of the
565// cluster.
566//
567// This function is represented as a variable to allow replacement during testing.
568var jitteryDuration = func(totalDuration float64) time.Duration {
569	return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3)
570}
571
572// updateCached sets the most recent retrieved cert and returns the old cert.
573// It also sets the server as assumed healthy.
574func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate {
575	m.certAccessLock.Lock()
576	defer m.certAccessLock.Unlock()
577	m.serverHealth = true
578	old := m.cert
579	m.cert = cert
580	return old
581}
582
583// updateServerError takes an error returned by the server and infers
584// the health of the server based on the error. It will return nil if
585// the error does not require immediate termination of any wait loops,
586// and otherwise it will return the error.
587func (m *manager) updateServerError(err error) error {
588	m.certAccessLock.Lock()
589	defer m.certAccessLock.Unlock()
590	switch {
591	case errors.IsUnauthorized(err):
592		// SSL terminating proxies may report this error instead of the master
593		m.serverHealth = true
594	case errors.IsUnexpectedServerError(err):
595		// generally indicates a proxy or other load balancer problem, rather than a problem coming
596		// from the master
597		m.serverHealth = false
598	default:
599		// Identify known errors that could be expected for a cert request that
600		// indicate everything is working normally
601		m.serverHealth = errors.IsNotFound(err) || errors.IsForbidden(err)
602	}
603	return nil
604}
605
606func (m *manager) generateCSR() (template *x509.CertificateRequest, csrPEM []byte, keyPEM []byte, key interface{}, err error) {
607	// Generate a new private key.
608	privateKey, err := ecdsa.GenerateKey(elliptic.P256(), cryptorand.Reader)
609	if err != nil {
610		return nil, nil, nil, nil, fmt.Errorf("unable to generate a new private key: %v", err)
611	}
612	der, err := x509.MarshalECPrivateKey(privateKey)
613	if err != nil {
614		return nil, nil, nil, nil, fmt.Errorf("unable to marshal the new key to DER: %v", err)
615	}
616
617	keyPEM = pem.EncodeToMemory(&pem.Block{Type: keyutil.ECPrivateKeyBlockType, Bytes: der})
618
619	template = m.getTemplate()
620	if template == nil {
621		return nil, nil, nil, nil, fmt.Errorf("unable to create a csr, no template available")
622	}
623	csrPEM, err = cert.MakeCSRFromTemplate(privateKey, template)
624	if err != nil {
625		return nil, nil, nil, nil, fmt.Errorf("unable to create a csr from the private key: %v", err)
626	}
627	return template, csrPEM, keyPEM, privateKey, nil
628}
629
630func (m *manager) getLastRequest() (context.CancelFunc, *x509.CertificateRequest) {
631	m.lastRequestLock.Lock()
632	defer m.lastRequestLock.Unlock()
633	return m.lastRequestCancel, m.lastRequest
634}
635
636func (m *manager) setLastRequest(cancel context.CancelFunc, r *x509.CertificateRequest) {
637	m.lastRequestLock.Lock()
638	defer m.lastRequestLock.Unlock()
639	m.lastRequestCancel = cancel
640	m.lastRequest = r
641}
642