1package autoconf
2
3import (
4	"context"
5	"fmt"
6	"time"
7
8	"github.com/hashicorp/consul/agent/cache"
9	"github.com/hashicorp/consul/agent/structs"
10)
11
12// handleCacheEvent is used to handle event notifications from the cache for the roots
13// or leaf cert watches.
14func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error {
15	switch u.CorrelationID {
16	case rootsWatchID:
17		ac.logger.Debug("roots watch fired - updating CA certificates")
18		if u.Err != nil {
19			return fmt.Errorf("root watch returned an error: %w", u.Err)
20		}
21
22		roots, ok := u.Result.(*structs.IndexedCARoots)
23		if !ok {
24			return fmt.Errorf("invalid type for roots watch response: %T", u.Result)
25		}
26
27		return ac.updateCARoots(roots)
28	case leafWatchID:
29		ac.logger.Debug("leaf certificate watch fired - updating TLS certificate")
30		if u.Err != nil {
31			return fmt.Errorf("leaf watch returned an error: %w", u.Err)
32		}
33
34		leaf, ok := u.Result.(*structs.IssuedCert)
35		if !ok {
36			return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result)
37		}
38
39		return ac.updateLeafCert(leaf)
40	}
41
42	return nil
43}
44
45// handleTokenUpdate is used when a notification about the agent token being updated
46// is received and various watches need cancelling/restarting to use the new token.
47func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error {
48	ac.logger.Debug("Agent token updated - resetting watches")
49
50	// TODO (autoencrypt) Prepopulate the cache with the new token with
51	// the existing cache entry with the old token. The certificate doesn't
52	// need to change just because the token has. However there isn't a
53	// good way to make that happen and this behavior is benign enough
54	// that I am going to push off implementing it.
55
56	// the agent token has been updated so we must update our leaf cert watch.
57	// this cancels the current watches before setting up new ones
58	ac.cancelWatches()
59
60	// recreate the chan for cache updates. This is a precautionary measure to ensure
61	// that we don't accidentally get notified for the new watches being setup before
62	// a blocking query in the cache returns and sends data to the old chan. In theory
63	// the code in agent/cache/watch.go should prevent this where we specifically check
64	// for context cancellation prior to sending the event. However we could cancel
65	// it after that check and finish setting up the new watches before getting the old
66	// events. Both the go routine scheduler and the OS thread scheduler would have to
67	// be acting up for this to happen. Regardless the way to ensure we don't get events
68	// for the old watches is to simply replace the chan we are expecting them from.
69	close(ac.cacheUpdates)
70	ac.cacheUpdates = make(chan cache.UpdateEvent, 10)
71
72	// restart watches - this will be done with the correct token
73	cancelWatches, err := ac.setupCertificateCacheWatches(ctx)
74	if err != nil {
75		return fmt.Errorf("failed to restart watches after agent token update: %w", err)
76	}
77	ac.cancelWatches = cancelWatches
78	return nil
79}
80
81// handleFallback is used when the current TLS certificate has expired and the normal
82// updating mechanisms have failed to renew it quickly enough. This function will
83// use the configured fallback mechanism to retrieve a new cert and start monitoring
84// that one.
85func (ac *AutoConfig) handleFallback(ctx context.Context) error {
86	ac.logger.Warn("agent's client certificate has expired")
87	// Background because the context is mainly useful when the agent is first starting up.
88	switch {
89	case ac.config.AutoConfig.Enabled:
90		resp, err := ac.getInitialConfiguration(ctx)
91		if err != nil {
92			return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err)
93		}
94
95		return ac.recordInitialConfiguration(resp)
96	case ac.config.AutoEncryptTLS:
97		reply, err := ac.autoEncryptInitialCerts(ctx)
98		if err != nil {
99			return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err)
100		}
101		return ac.setInitialTLSCertificates(reply)
102	default:
103		return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled")
104	}
105}
106
107// run is the private method to be spawn by the Start method for
108// executing the main monitoring loop.
109func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) {
110	// The fallbackTimer is used to notify AFTER the agents
111	// leaf certificate has expired and where we need
112	// to fall back to the less secure RPC endpoint just like
113	// if the agent was starting up new.
114	//
115	// Check 10sec (fallback leeway duration) after cert
116	// expires. The agent cache should be handling the expiration
117	// and renew it before then.
118	//
119	// If there is no cert, AutoEncryptCertNotAfter returns
120	// a value in the past which immediately triggers the
121	// renew, but this case shouldn't happen because at
122	// this point, auto_encrypt was just being setup
123	// successfully.
124	calcFallbackInterval := func() time.Duration {
125		certExpiry := ac.acConfig.TLSConfigurator.AutoEncryptCertNotAfter()
126		return certExpiry.Add(ac.acConfig.FallbackLeeway).Sub(time.Now())
127	}
128	fallbackTimer := time.NewTimer(calcFallbackInterval())
129
130	// cleanup for once we are stopped
131	defer func() {
132		// cancel the go routines performing the cache watches
133		ac.cancelWatches()
134		// ensure we don't leak the timers go routine
135		fallbackTimer.Stop()
136		// stop receiving notifications for token updates
137		ac.acConfig.Tokens.StopNotify(ac.tokenUpdates)
138
139		ac.logger.Debug("auto-config has been stopped")
140
141		ac.Lock()
142		ac.cancel = nil
143		ac.running = false
144		// this should be the final cleanup task as its what notifies
145		// the rest of the world that this go routine has exited.
146		close(exit)
147		ac.Unlock()
148	}()
149
150	for {
151		select {
152		case <-ctx.Done():
153			ac.logger.Debug("stopping auto-config")
154			return
155		case <-ac.tokenUpdates.Ch:
156			ac.logger.Debug("handling a token update event")
157
158			if err := ac.handleTokenUpdate(ctx); err != nil {
159				ac.logger.Error("error in handling token update event", "error", err)
160			}
161		case u := <-ac.cacheUpdates:
162			ac.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID)
163
164			if err := ac.handleCacheEvent(u); err != nil {
165				ac.logger.Error("error in handling cache update event", "error", err)
166			}
167
168			// reset the fallback timer as the certificate may have been updated
169			fallbackTimer.Stop()
170			fallbackTimer = time.NewTimer(calcFallbackInterval())
171		case <-fallbackTimer.C:
172			// This is a safety net in case the cert doesn't get renewed
173			// in time. The agent would be stuck in that case because the watches
174			// never use the AutoEncrypt.Sign endpoint.
175
176			// check auto encrypt client cert expiration
177			if ac.acConfig.TLSConfigurator.AutoEncryptCertExpired() {
178				if err := ac.handleFallback(ctx); err != nil {
179					ac.logger.Error("error when handling a certificate expiry event", "error", err)
180					fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry)
181				} else {
182					fallbackTimer = time.NewTimer(calcFallbackInterval())
183				}
184			} else {
185				// this shouldn't be possible. We calculate the timer duration to be the certificate
186				// expiration time + some leeway (10s default). So whenever we get here the certificate
187				// should be expired. Regardless its probably worth resetting the timer.
188				fallbackTimer = time.NewTimer(calcFallbackInterval())
189			}
190		}
191	}
192}
193