1// Copyright 2016 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14package dns
15
16import (
17	"context"
18	"fmt"
19	"net"
20	"strings"
21	"sync"
22	"time"
23
24	"github.com/go-kit/kit/log"
25	"github.com/go-kit/kit/log/level"
26	"github.com/miekg/dns"
27	"github.com/prometheus/client_golang/prometheus"
28	"github.com/prometheus/common/model"
29	"github.com/prometheus/prometheus/discovery/targetgroup"
30)
31
const (
	// resolvConf is the path of the resolver configuration file that is
	// loaded on every lookup to obtain the name servers, port and search
	// domains.
	resolvConf = "/etc/resolv.conf"

	// dnsNameLabel is the meta label attached to every discovered target;
	// its value is the configured name whose lookup produced the target.
	dnsNameLabel = model.MetaLabelPrefix + "dns_name"

	// Constants for instrumentation.
	// namespace is the metric namespace shared by the DNS-SD counters.
	namespace = "prometheus"
)
40
var (
	// dnsSDLookupsCount is incremented once per name lookup performed by
	// Discovery.refresh, whether or not the lookup succeeded.
	dnsSDLookupsCount = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "sd_dns_lookups_total",
			Help:      "The number of DNS-SD lookups.",
		})
	// dnsSDLookupFailuresCount is incremented each time a name lookup in
	// Discovery.refresh returns an error.
	dnsSDLookupFailuresCount = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "sd_dns_lookup_failures_total",
			Help:      "The number of DNS-SD lookup failures.",
		})

	// DefaultSDConfig is the default DNS SD configuration: SRV lookups
	// refreshed every 30 seconds. SDConfig.UnmarshalYAML starts from this
	// value before decoding user-supplied settings.
	DefaultSDConfig = SDConfig{
		RefreshInterval: model.Duration(30 * time.Second),
		Type:            "SRV",
	}
)
61
// SDConfig is the configuration for DNS based service discovery.
type SDConfig struct {
	// Names lists the DNS names to look up; UnmarshalYAML rejects an empty list.
	Names           []string       `yaml:"names"`
	// RefreshInterval is how often all names are looked up again.
	RefreshInterval model.Duration `yaml:"refresh_interval,omitempty"`
	// Type is the record type to query: "SRV", "A" or "AAAA" (matched
	// case-insensitively).
	Type            string         `yaml:"type"`
	// Port is the port used for targets built from A/AAAA records, where it
	// must be non-zero.
	Port            int            `yaml:"port"` // Ignored for SRV records
}
69
70// UnmarshalYAML implements the yaml.Unmarshaler interface.
71func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
72	*c = DefaultSDConfig
73	type plain SDConfig
74	err := unmarshal((*plain)(c))
75	if err != nil {
76		return err
77	}
78	if len(c.Names) == 0 {
79		return fmt.Errorf("DNS-SD config must contain at least one SRV record name")
80	}
81	switch strings.ToUpper(c.Type) {
82	case "SRV":
83	case "A", "AAAA":
84		if c.Port == 0 {
85			return fmt.Errorf("a port is required in DNS-SD configs for all record types except SRV")
86		}
87	default:
88		return fmt.Errorf("invalid DNS-SD records type %s", c.Type)
89	}
90	return nil
91}
92
93func init() {
94	prometheus.MustRegister(dnsSDLookupFailuresCount)
95	prometheus.MustRegister(dnsSDLookupsCount)
96}
97
// Discovery periodically performs DNS-SD requests. It implements
// the Discoverer interface.
type Discovery struct {
	// names are the DNS names looked up on every refresh.
	names []string

	// interval is the period between refreshes of all names.
	interval time.Duration
	// port is the port used for targets built from A and AAAA records.
	port     int
	// qtype is the DNS record type sent in each query (e.g. dns.TypeSRV).
	qtype    uint16
	// logger receives lookup warnings and refresh errors.
	logger   log.Logger
}
108
109// NewDiscovery returns a new Discovery which periodically refreshes its targets.
110func NewDiscovery(conf SDConfig, logger log.Logger) *Discovery {
111	if logger == nil {
112		logger = log.NewNopLogger()
113	}
114
115	qtype := dns.TypeSRV
116	switch strings.ToUpper(conf.Type) {
117	case "A":
118		qtype = dns.TypeA
119	case "AAAA":
120		qtype = dns.TypeAAAA
121	case "SRV":
122		qtype = dns.TypeSRV
123	}
124	return &Discovery{
125		names:    conf.Names,
126		interval: time.Duration(conf.RefreshInterval),
127		qtype:    qtype,
128		port:     conf.Port,
129		logger:   logger,
130	}
131}
132
133// Run implements the Discoverer interface.
134func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
135	ticker := time.NewTicker(d.interval)
136	defer ticker.Stop()
137
138	// Get an initial set right away.
139	d.refreshAll(ctx, ch)
140
141	for {
142		select {
143		case <-ticker.C:
144			d.refreshAll(ctx, ch)
145		case <-ctx.Done():
146			return
147		}
148	}
149}
150
151func (d *Discovery) refreshAll(ctx context.Context, ch chan<- []*targetgroup.Group) {
152	var wg sync.WaitGroup
153
154	wg.Add(len(d.names))
155	for _, name := range d.names {
156		go func(n string) {
157			if err := d.refresh(ctx, n, ch); err != nil {
158				level.Error(d.logger).Log("msg", "Error refreshing DNS targets", "err", err)
159			}
160			wg.Done()
161		}(name)
162	}
163
164	wg.Wait()
165}
166
167func (d *Discovery) refresh(ctx context.Context, name string, ch chan<- []*targetgroup.Group) error {
168	response, err := lookupWithSearchPath(name, d.qtype, d.logger)
169	dnsSDLookupsCount.Inc()
170	if err != nil {
171		dnsSDLookupFailuresCount.Inc()
172		return err
173	}
174
175	tg := &targetgroup.Group{}
176	hostPort := func(a string, p int) model.LabelValue {
177		return model.LabelValue(net.JoinHostPort(a, fmt.Sprintf("%d", p)))
178	}
179
180	for _, record := range response.Answer {
181		target := model.LabelValue("")
182		switch addr := record.(type) {
183		case *dns.SRV:
184			// Remove the final dot from rooted DNS names to make them look more usual.
185			addr.Target = strings.TrimRight(addr.Target, ".")
186
187			target = hostPort(addr.Target, int(addr.Port))
188		case *dns.A:
189			target = hostPort(addr.A.String(), d.port)
190		case *dns.AAAA:
191			target = hostPort(addr.AAAA.String(), d.port)
192		default:
193			level.Warn(d.logger).Log("msg", "Invalid SRV record", "record", record)
194			continue
195		}
196		tg.Targets = append(tg.Targets, model.LabelSet{
197			model.AddressLabel: target,
198			dnsNameLabel:       model.LabelValue(name),
199		})
200	}
201
202	tg.Source = name
203	select {
204	case <-ctx.Done():
205		return ctx.Err()
206	case ch <- []*targetgroup.Group{tg}:
207	}
208
209	return nil
210}
211
212// lookupWithSearchPath tries to get an answer for various permutations of
213// the given name, appending the system-configured search path as necessary.
214//
215// There are three possible outcomes:
216//
217// 1. One of the permutations of the given name is recognized as
218//    "valid" by the DNS, in which case we consider ourselves "done"
219//    and that answer is returned.  Note that, due to the way the DNS
220//    handles "name has resource records, but none of the specified type",
221//    the answer received may have an empty set of results.
222//
223// 2.  All of the permutations of the given name are responded to by one of
224//    the servers in the "nameservers" list with the answer "that name does
225//    not exist" (NXDOMAIN).  In that case, it can be considered
226//    pseudo-authoritative that there are no records for that name.
227//
228// 3.  One or more of the names was responded to by all servers with some
229//    sort of error indication.  In that case, we can't know if, in fact,
230//    there are records for the name or not, so whatever state the
231//    configuration is in, we should keep it that way until we know for
232//    sure (by, presumably, all the names getting answers in the future).
233//
234// Outcomes 1 and 2 are indicated by a valid response message (possibly an
235// empty one) and no error.  Outcome 3 is indicated by an error return.  The
236// error will be generic-looking, because trying to return all the errors
237// returned by the combination of all name permutations and servers is a
238// nightmare.
239func lookupWithSearchPath(name string, qtype uint16, logger log.Logger) (*dns.Msg, error) {
240	conf, err := dns.ClientConfigFromFile(resolvConf)
241	if err != nil {
242		return nil, fmt.Errorf("could not load resolv.conf: %s", err)
243	}
244
245	allResponsesValid := true
246
247	for _, lname := range conf.NameList(name) {
248		response, err := lookupFromAnyServer(lname, qtype, conf, logger)
249
250		if err != nil {
251			// We can't go home yet, because a later name
252			// may give us a valid, successful answer.  However
253			// we can no longer say "this name definitely doesn't
254			// exist", because we did not get that answer for
255			// at least one name.
256			allResponsesValid = false
257		} else if response.Rcode == dns.RcodeSuccess {
258			// Outcome 1: GOLD!
259			return response, nil
260		}
261	}
262
263	if allResponsesValid {
264		// Outcome 2: everyone says NXDOMAIN, that's good enough for me
265		return &dns.Msg{}, nil
266	}
267	// Outcome 3: boned.
268	return nil, fmt.Errorf("could not resolve %q: all servers responded with errors to at least one search domain", name)
269}
270
271// lookupFromAnyServer uses all configured servers to try and resolve a specific
272// name.  If a viable answer is received from a server, then it is
273// immediately returned, otherwise the other servers in the config are
274// tried, and if none of them return a viable answer, an error is returned.
275//
276// A "viable answer" is one which indicates either:
277//
278// 1. "yes, I know that name, and here are its records of the requested type"
279//    (RCODE==SUCCESS, ANCOUNT > 0);
280// 2. "yes, I know that name, but it has no records of the requested type"
281//    (RCODE==SUCCESS, ANCOUNT==0); or
282// 3. "I know that name doesn't exist" (RCODE==NXDOMAIN).
283//
284// A non-viable answer is "anything else", which encompasses both various
285// system-level problems (like network timeouts) and also
286// valid-but-unexpected DNS responses (SERVFAIL, REFUSED, etc).
287func lookupFromAnyServer(name string, qtype uint16, conf *dns.ClientConfig, logger log.Logger) (*dns.Msg, error) {
288	client := &dns.Client{}
289
290	for _, server := range conf.Servers {
291		servAddr := net.JoinHostPort(server, conf.Port)
292		msg, err := askServerForName(name, qtype, client, servAddr, true)
293		if err != nil {
294			level.Warn(logger).Log("msg", "DNS resolution failed", "server", server, "name", name, "err", err)
295			continue
296		}
297
298		if msg.Rcode == dns.RcodeSuccess || msg.Rcode == dns.RcodeNameError {
299			// We have our answer.  Time to go home.
300			return msg, nil
301		}
302	}
303
304	return nil, fmt.Errorf("could not resolve %s: no servers returned a viable answer", name)
305}
306
307// askServerForName makes a request to a specific DNS server for a specific
308// name (and qtype).  Retries with TCP in the event of response truncation,
309// but otherwise just sends back whatever the server gave, whether that be a
310// valid-looking response, or an error.
311func askServerForName(name string, queryType uint16, client *dns.Client, servAddr string, edns bool) (*dns.Msg, error) {
312	msg := &dns.Msg{}
313
314	msg.SetQuestion(dns.Fqdn(name), queryType)
315	if edns {
316		msg.SetEdns0(dns.DefaultMsgSize, false)
317	}
318
319	response, _, err := client.Exchange(msg, servAddr)
320	if err == dns.ErrTruncated {
321		if client.Net == "tcp" {
322			return nil, fmt.Errorf("got truncated message on TCP (64kiB limit exceeded?)")
323		}
324
325		client.Net = "tcp"
326		return askServerForName(name, queryType, client, servAddr, false)
327	}
328	if err != nil {
329		return nil, err
330	}
331	if msg.Id != response.Id {
332		return nil, fmt.Errorf("DNS ID mismatch, request: %d, response: %d", msg.Id, response.Id)
333	}
334	return response, nil
335}
336