1// Copyright 2016 The Prometheus Authors 2// Licensed under the Apache License, Version 2.0 (the "License"); 3// you may not use this file except in compliance with the License. 4// You may obtain a copy of the License at 5// 6// http://www.apache.org/licenses/LICENSE-2.0 7// 8// Unless required by applicable law or agreed to in writing, software 9// distributed under the License is distributed on an "AS IS" BASIS, 10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11// See the License for the specific language governing permissions and 12// limitations under the License. 13 14package dns 15 16import ( 17 "context" 18 "fmt" 19 "net" 20 "strings" 21 "sync" 22 "time" 23 24 "github.com/go-kit/kit/log" 25 "github.com/go-kit/kit/log/level" 26 "github.com/miekg/dns" 27 "github.com/prometheus/client_golang/prometheus" 28 "github.com/prometheus/common/model" 29 "github.com/prometheus/prometheus/discovery/targetgroup" 30) 31 32const ( 33 resolvConf = "/etc/resolv.conf" 34 35 dnsNameLabel = model.MetaLabelPrefix + "dns_name" 36 37 // Constants for instrumentation. 38 namespace = "prometheus" 39) 40 41var ( 42 dnsSDLookupsCount = prometheus.NewCounter( 43 prometheus.CounterOpts{ 44 Namespace: namespace, 45 Name: "sd_dns_lookups_total", 46 Help: "The number of DNS-SD lookups.", 47 }) 48 dnsSDLookupFailuresCount = prometheus.NewCounter( 49 prometheus.CounterOpts{ 50 Namespace: namespace, 51 Name: "sd_dns_lookup_failures_total", 52 Help: "The number of DNS-SD lookup failures.", 53 }) 54 55 // DefaultSDConfig is the default DNS SD configuration. 56 DefaultSDConfig = SDConfig{ 57 RefreshInterval: model.Duration(30 * time.Second), 58 Type: "SRV", 59 } 60) 61 62// SDConfig is the configuration for DNS based service discovery. 63type SDConfig struct { 64 Names []string `yaml:"names"` 65 RefreshInterval model.Duration `yaml:"refresh_interval,omitempty"` 66 Type string `yaml:"type"` 67 Port int `yaml:"port"` // Ignored for SRV records 68} 69 70// UnmarshalYAML implements the yaml.Unmarshaler interface. 71func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { 72 *c = DefaultSDConfig 73 type plain SDConfig 74 err := unmarshal((*plain)(c)) 75 if err != nil { 76 return err 77 } 78 if len(c.Names) == 0 { 79 return fmt.Errorf("DNS-SD config must contain at least one SRV record name") 80 } 81 switch strings.ToUpper(c.Type) { 82 case "SRV": 83 case "A", "AAAA": 84 if c.Port == 0 { 85 return fmt.Errorf("a port is required in DNS-SD configs for all record types except SRV") 86 } 87 default: 88 return fmt.Errorf("invalid DNS-SD records type %s", c.Type) 89 } 90 return nil 91} 92 93func init() { 94 prometheus.MustRegister(dnsSDLookupFailuresCount) 95 prometheus.MustRegister(dnsSDLookupsCount) 96} 97 98// Discovery periodically performs DNS-SD requests. It implements 99// the Discoverer interface. 100type Discovery struct { 101 names []string 102 103 interval time.Duration 104 port int 105 qtype uint16 106 logger log.Logger 107} 108 109// NewDiscovery returns a new Discovery which periodically refreshes its targets. 110func NewDiscovery(conf SDConfig, logger log.Logger) *Discovery { 111 if logger == nil { 112 logger = log.NewNopLogger() 113 } 114 115 qtype := dns.TypeSRV 116 switch strings.ToUpper(conf.Type) { 117 case "A": 118 qtype = dns.TypeA 119 case "AAAA": 120 qtype = dns.TypeAAAA 121 case "SRV": 122 qtype = dns.TypeSRV 123 } 124 return &Discovery{ 125 names: conf.Names, 126 interval: time.Duration(conf.RefreshInterval), 127 qtype: qtype, 128 port: conf.Port, 129 logger: logger, 130 } 131} 132 133// Run implements the Discoverer interface. 134func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) { 135 ticker := time.NewTicker(d.interval) 136 defer ticker.Stop() 137 138 // Get an initial set right away. 139 d.refreshAll(ctx, ch) 140 141 for { 142 select { 143 case <-ticker.C: 144 d.refreshAll(ctx, ch) 145 case <-ctx.Done(): 146 return 147 } 148 } 149} 150 151func (d *Discovery) refreshAll(ctx context.Context, ch chan<- []*targetgroup.Group) { 152 var wg sync.WaitGroup 153 154 wg.Add(len(d.names)) 155 for _, name := range d.names { 156 go func(n string) { 157 if err := d.refresh(ctx, n, ch); err != nil { 158 level.Error(d.logger).Log("msg", "Error refreshing DNS targets", "err", err) 159 } 160 wg.Done() 161 }(name) 162 } 163 164 wg.Wait() 165} 166 167func (d *Discovery) refresh(ctx context.Context, name string, ch chan<- []*targetgroup.Group) error { 168 response, err := lookupWithSearchPath(name, d.qtype, d.logger) 169 dnsSDLookupsCount.Inc() 170 if err != nil { 171 dnsSDLookupFailuresCount.Inc() 172 return err 173 } 174 175 tg := &targetgroup.Group{} 176 hostPort := func(a string, p int) model.LabelValue { 177 return model.LabelValue(net.JoinHostPort(a, fmt.Sprintf("%d", p))) 178 } 179 180 for _, record := range response.Answer { 181 target := model.LabelValue("") 182 switch addr := record.(type) { 183 case *dns.SRV: 184 // Remove the final dot from rooted DNS names to make them look more usual. 185 addr.Target = strings.TrimRight(addr.Target, ".") 186 187 target = hostPort(addr.Target, int(addr.Port)) 188 case *dns.A: 189 target = hostPort(addr.A.String(), d.port) 190 case *dns.AAAA: 191 target = hostPort(addr.AAAA.String(), d.port) 192 default: 193 level.Warn(d.logger).Log("msg", "Invalid SRV record", "record", record) 194 continue 195 } 196 tg.Targets = append(tg.Targets, model.LabelSet{ 197 model.AddressLabel: target, 198 dnsNameLabel: model.LabelValue(name), 199 }) 200 } 201 202 tg.Source = name 203 select { 204 case <-ctx.Done(): 205 return ctx.Err() 206 case ch <- []*targetgroup.Group{tg}: 207 } 208 209 return nil 210} 211 212// lookupWithSearchPath tries to get an answer for various permutations of 213// the given name, appending the system-configured search path as necessary. 214// 215// There are three possible outcomes: 216// 217// 1. One of the permutations of the given name is recognized as 218// "valid" by the DNS, in which case we consider ourselves "done" 219// and that answer is returned. Note that, due to the way the DNS 220// handles "name has resource records, but none of the specified type", 221// the answer received may have an empty set of results. 222// 223// 2. All of the permutations of the given name are responded to by one of 224// the servers in the "nameservers" list with the answer "that name does 225// not exist" (NXDOMAIN). In that case, it can be considered 226// pseudo-authoritative that there are no records for that name. 227// 228// 3. One or more of the names was responded to by all servers with some 229// sort of error indication. In that case, we can't know if, in fact, 230// there are records for the name or not, so whatever state the 231// configuration is in, we should keep it that way until we know for 232// sure (by, presumably, all the names getting answers in the future). 233// 234// Outcomes 1 and 2 are indicated by a valid response message (possibly an 235// empty one) and no error. Outcome 3 is indicated by an error return. The 236// error will be generic-looking, because trying to return all the errors 237// returned by the combination of all name permutations and servers is a 238// nightmare. 239func lookupWithSearchPath(name string, qtype uint16, logger log.Logger) (*dns.Msg, error) { 240 conf, err := dns.ClientConfigFromFile(resolvConf) 241 if err != nil { 242 return nil, fmt.Errorf("could not load resolv.conf: %s", err) 243 } 244 245 allResponsesValid := true 246 247 for _, lname := range conf.NameList(name) { 248 response, err := lookupFromAnyServer(lname, qtype, conf, logger) 249 250 if err != nil { 251 // We can't go home yet, because a later name 252 // may give us a valid, successful answer. However 253 // we can no longer say "this name definitely doesn't 254 // exist", because we did not get that answer for 255 // at least one name. 256 allResponsesValid = false 257 } else if response.Rcode == dns.RcodeSuccess { 258 // Outcome 1: GOLD! 259 return response, nil 260 } 261 } 262 263 if allResponsesValid { 264 // Outcome 2: everyone says NXDOMAIN, that's good enough for me 265 return &dns.Msg{}, nil 266 } 267 // Outcome 3: boned. 268 return nil, fmt.Errorf("could not resolve %q: all servers responded with errors to at least one search domain", name) 269} 270 271// lookupFromAnyServer uses all configured servers to try and resolve a specific 272// name. If a viable answer is received from a server, then it is 273// immediately returned, otherwise the other servers in the config are 274// tried, and if none of them return a viable answer, an error is returned. 275// 276// A "viable answer" is one which indicates either: 277// 278// 1. "yes, I know that name, and here are its records of the requested type" 279// (RCODE==SUCCESS, ANCOUNT > 0); 280// 2. "yes, I know that name, but it has no records of the requested type" 281// (RCODE==SUCCESS, ANCOUNT==0); or 282// 3. "I know that name doesn't exist" (RCODE==NXDOMAIN). 283// 284// A non-viable answer is "anything else", which encompasses both various 285// system-level problems (like network timeouts) and also 286// valid-but-unexpected DNS responses (SERVFAIL, REFUSED, etc). 287func lookupFromAnyServer(name string, qtype uint16, conf *dns.ClientConfig, logger log.Logger) (*dns.Msg, error) { 288 client := &dns.Client{} 289 290 for _, server := range conf.Servers { 291 servAddr := net.JoinHostPort(server, conf.Port) 292 msg, err := askServerForName(name, qtype, client, servAddr, true) 293 if err != nil { 294 level.Warn(logger).Log("msg", "DNS resolution failed", "server", server, "name", name, "err", err) 295 continue 296 } 297 298 if msg.Rcode == dns.RcodeSuccess || msg.Rcode == dns.RcodeNameError { 299 // We have our answer. Time to go home. 300 return msg, nil 301 } 302 } 303 304 return nil, fmt.Errorf("could not resolve %s: no servers returned a viable answer", name) 305} 306 307// askServerForName makes a request to a specific DNS server for a specific 308// name (and qtype). Retries with TCP in the event of response truncation, 309// but otherwise just sends back whatever the server gave, whether that be a 310// valid-looking response, or an error. 311func askServerForName(name string, queryType uint16, client *dns.Client, servAddr string, edns bool) (*dns.Msg, error) { 312 msg := &dns.Msg{} 313 314 msg.SetQuestion(dns.Fqdn(name), queryType) 315 if edns { 316 msg.SetEdns0(dns.DefaultMsgSize, false) 317 } 318 319 response, _, err := client.Exchange(msg, servAddr) 320 if err == dns.ErrTruncated { 321 if client.Net == "tcp" { 322 return nil, fmt.Errorf("got truncated message on TCP (64kiB limit exceeded?)") 323 } 324 325 client.Net = "tcp" 326 return askServerForName(name, queryType, client, servAddr, false) 327 } 328 if err != nil { 329 return nil, err 330 } 331 if msg.Id != response.Id { 332 return nil, fmt.Errorf("DNS ID mismatch, request: %d, response: %d", msg.Id, response.Id) 333 } 334 return response, nil 335} 336