1package autoconf 2 3import ( 4 "context" 5 "fmt" 6 "time" 7 8 "github.com/hashicorp/consul/agent/cache" 9 "github.com/hashicorp/consul/agent/structs" 10) 11 12// handleCacheEvent is used to handle event notifications from the cache for the roots 13// or leaf cert watches. 14func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error { 15 switch u.CorrelationID { 16 case rootsWatchID: 17 ac.logger.Debug("roots watch fired - updating CA certificates") 18 if u.Err != nil { 19 return fmt.Errorf("root watch returned an error: %w", u.Err) 20 } 21 22 roots, ok := u.Result.(*structs.IndexedCARoots) 23 if !ok { 24 return fmt.Errorf("invalid type for roots watch response: %T", u.Result) 25 } 26 27 return ac.updateCARoots(roots) 28 case leafWatchID: 29 ac.logger.Debug("leaf certificate watch fired - updating TLS certificate") 30 if u.Err != nil { 31 return fmt.Errorf("leaf watch returned an error: %w", u.Err) 32 } 33 34 leaf, ok := u.Result.(*structs.IssuedCert) 35 if !ok { 36 return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result) 37 } 38 39 return ac.updateLeafCert(leaf) 40 } 41 42 return nil 43} 44 45// handleTokenUpdate is used when a notification about the agent token being updated 46// is received and various watches need cancelling/restarting to use the new token. 47func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error { 48 ac.logger.Debug("Agent token updated - resetting watches") 49 50 // TODO (autoencrypt) Prepopulate the cache with the new token with 51 // the existing cache entry with the old token. The certificate doesn't 52 // need to change just because the token has. However there isn't a 53 // good way to make that happen and this behavior is benign enough 54 // that I am going to push off implementing it. 55 56 // the agent token has been updated so we must update our leaf cert watch. 57 // this cancels the current watches before setting up new ones 58 ac.cancelWatches() 59 60 // recreate the chan for cache updates. This is a precautionary measure to ensure 61 // that we don't accidentally get notified for the new watches being setup before 62 // a blocking query in the cache returns and sends data to the old chan. In theory 63 // the code in agent/cache/watch.go should prevent this where we specifically check 64 // for context cancellation prior to sending the event. However we could cancel 65 // it after that check and finish setting up the new watches before getting the old 66 // events. Both the go routine scheduler and the OS thread scheduler would have to 67 // be acting up for this to happen. Regardless the way to ensure we don't get events 68 // for the old watches is to simply replace the chan we are expecting them from. 69 close(ac.cacheUpdates) 70 ac.cacheUpdates = make(chan cache.UpdateEvent, 10) 71 72 // restart watches - this will be done with the correct token 73 cancelWatches, err := ac.setupCertificateCacheWatches(ctx) 74 if err != nil { 75 return fmt.Errorf("failed to restart watches after agent token update: %w", err) 76 } 77 ac.cancelWatches = cancelWatches 78 return nil 79} 80 81// handleFallback is used when the current TLS certificate has expired and the normal 82// updating mechanisms have failed to renew it quickly enough. This function will 83// use the configured fallback mechanism to retrieve a new cert and start monitoring 84// that one. 85func (ac *AutoConfig) handleFallback(ctx context.Context) error { 86 ac.logger.Warn("agent's client certificate has expired") 87 // Background because the context is mainly useful when the agent is first starting up. 88 switch { 89 case ac.config.AutoConfig.Enabled: 90 resp, err := ac.getInitialConfiguration(ctx) 91 if err != nil { 92 return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err) 93 } 94 95 return ac.recordInitialConfiguration(resp) 96 case ac.config.AutoEncryptTLS: 97 reply, err := ac.autoEncryptInitialCerts(ctx) 98 if err != nil { 99 return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err) 100 } 101 return ac.setInitialTLSCertificates(reply) 102 default: 103 return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled") 104 } 105} 106 107// run is the private method to be spawn by the Start method for 108// executing the main monitoring loop. 109func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) { 110 // The fallbackTimer is used to notify AFTER the agents 111 // leaf certificate has expired and where we need 112 // to fall back to the less secure RPC endpoint just like 113 // if the agent was starting up new. 114 // 115 // Check 10sec (fallback leeway duration) after cert 116 // expires. The agent cache should be handling the expiration 117 // and renew it before then. 118 // 119 // If there is no cert, AutoEncryptCertNotAfter returns 120 // a value in the past which immediately triggers the 121 // renew, but this case shouldn't happen because at 122 // this point, auto_encrypt was just being setup 123 // successfully. 124 calcFallbackInterval := func() time.Duration { 125 certExpiry := ac.acConfig.TLSConfigurator.AutoEncryptCertNotAfter() 126 return certExpiry.Add(ac.acConfig.FallbackLeeway).Sub(time.Now()) 127 } 128 fallbackTimer := time.NewTimer(calcFallbackInterval()) 129 130 // cleanup for once we are stopped 131 defer func() { 132 // cancel the go routines performing the cache watches 133 ac.cancelWatches() 134 // ensure we don't leak the timers go routine 135 fallbackTimer.Stop() 136 // stop receiving notifications for token updates 137 ac.acConfig.Tokens.StopNotify(ac.tokenUpdates) 138 139 ac.logger.Debug("auto-config has been stopped") 140 141 ac.Lock() 142 ac.cancel = nil 143 ac.running = false 144 // this should be the final cleanup task as its what notifies 145 // the rest of the world that this go routine has exited. 146 close(exit) 147 ac.Unlock() 148 }() 149 150 for { 151 select { 152 case <-ctx.Done(): 153 ac.logger.Debug("stopping auto-config") 154 return 155 case <-ac.tokenUpdates.Ch: 156 ac.logger.Debug("handling a token update event") 157 158 if err := ac.handleTokenUpdate(ctx); err != nil { 159 ac.logger.Error("error in handling token update event", "error", err) 160 } 161 case u := <-ac.cacheUpdates: 162 ac.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID) 163 164 if err := ac.handleCacheEvent(u); err != nil { 165 ac.logger.Error("error in handling cache update event", "error", err) 166 } 167 168 // reset the fallback timer as the certificate may have been updated 169 fallbackTimer.Stop() 170 fallbackTimer = time.NewTimer(calcFallbackInterval()) 171 case <-fallbackTimer.C: 172 // This is a safety net in case the cert doesn't get renewed 173 // in time. The agent would be stuck in that case because the watches 174 // never use the AutoEncrypt.Sign endpoint. 175 176 // check auto encrypt client cert expiration 177 if ac.acConfig.TLSConfigurator.AutoEncryptCertExpired() { 178 if err := ac.handleFallback(ctx); err != nil { 179 ac.logger.Error("error when handling a certificate expiry event", "error", err) 180 fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry) 181 } else { 182 fallbackTimer = time.NewTimer(calcFallbackInterval()) 183 } 184 } else { 185 // this shouldn't be possible. We calculate the timer duration to be the certificate 186 // expiration time + some leeway (10s default). So whenever we get here the certificate 187 // should be expired. Regardless its probably worth resetting the timer. 188 fallbackTimer = time.NewTimer(calcFallbackInterval()) 189 } 190 } 191 } 192} 193