1// Copyright 2015 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14package azure
15
16import (
17	"fmt"
18	"net"
19	"strings"
20	"time"
21
22	"github.com/Azure/azure-sdk-for-go/arm/compute"
23	"github.com/Azure/azure-sdk-for-go/arm/network"
24	"github.com/Azure/go-autorest/autorest/azure"
25
26	"github.com/prometheus/client_golang/prometheus"
27	"github.com/prometheus/common/log"
28	"github.com/prometheus/common/model"
29	"golang.org/x/net/context"
30
31	"github.com/prometheus/prometheus/config"
32	"github.com/prometheus/prometheus/util/strutil"
33)
34
// Names of the meta labels attached to targets discovered from Azure.
const (
	// azureLabel is the prefix shared by every Azure meta label.
	azureLabel                     = model.MetaLabelPrefix + "azure_"
	// azureLabelMachineID carries the virtual machine's resource ID.
	azureLabelMachineID            = azureLabel + "machine_id"
	// azureLabelMachineResourceGroup carries the lower-cased resource group
	// parsed out of the virtual machine's resource ID.
	azureLabelMachineResourceGroup = azureLabel + "machine_resource_group"
	// azureLabelMachineName carries the virtual machine's name.
	azureLabelMachineName          = azureLabel + "machine_name"
	// azureLabelMachineLocation carries the virtual machine's location.
	azureLabelMachineLocation      = azureLabel + "machine_location"
	// azureLabelMachinePrivateIP carries the private IP address taken from
	// the machine's primary network interface.
	azureLabelMachinePrivateIP     = azureLabel + "machine_private_ip"
	// azureLabelMachineTag is a label name prefix; the sanitized tag name is
	// appended to it, yielding one label per tag on the machine.
	azureLabelMachineTag           = azureLabel + "machine_tag_"
)
44
// Self-monitoring metrics for Azure service discovery. Both are registered
// with the default Prometheus registry in init.
var (
	// azureSDRefreshFailuresCount is incremented each time a refresh
	// returns an error.
	azureSDRefreshFailuresCount = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "prometheus_sd_azure_refresh_failures_total",
			Help: "Number of Azure-SD refresh failures.",
		})
	// azureSDRefreshDuration observes how long each refresh took, in
	// seconds, regardless of whether it succeeded.
	azureSDRefreshDuration = prometheus.NewSummary(
		prometheus.SummaryOpts{
			Name: "prometheus_sd_azure_refresh_duration_seconds",
			Help: "The duration of a Azure-SD refresh in seconds.",
		})
)
57
58func init() {
59	prometheus.MustRegister(azureSDRefreshDuration)
60	prometheus.MustRegister(azureSDRefreshFailuresCount)
61}
62
// Discovery periodically performs Azure-SD requests. It implements
// the TargetProvider interface.
type Discovery struct {
	// cfg is the Azure SD configuration; a fresh API client is built from
	// it on every refresh.
	cfg      *config.AzureSDConfig
	// interval is the time between two consecutive refreshes.
	interval time.Duration
	// port is joined with each machine's private IP to form the target address.
	port     int
	// logger receives debug and error output from discovery.
	logger   log.Logger
}
71
72// NewDiscovery returns a new AzureDiscovery which periodically refreshes its targets.
73func NewDiscovery(cfg *config.AzureSDConfig, logger log.Logger) *Discovery {
74	return &Discovery{
75		cfg:      cfg,
76		interval: time.Duration(cfg.RefreshInterval),
77		port:     cfg.Port,
78		logger:   logger,
79	}
80}
81
82// Run implements the TargetProvider interface.
83func (d *Discovery) Run(ctx context.Context, ch chan<- []*config.TargetGroup) {
84	ticker := time.NewTicker(d.interval)
85	defer ticker.Stop()
86
87	for {
88		select {
89		case <-ctx.Done():
90			return
91		default:
92		}
93
94		tg, err := d.refresh()
95		if err != nil {
96			d.logger.Errorf("unable to refresh during Azure discovery: %s", err)
97		} else {
98			select {
99			case <-ctx.Done():
100			case ch <- []*config.TargetGroup{tg}:
101			}
102		}
103
104		select {
105		case <-ticker.C:
106		case <-ctx.Done():
107			return
108		}
109	}
110}
111
// azureClient represents multiple Azure Resource Manager providers.
type azureClient struct {
	// nic fetches network interfaces, which carry a machine's IP configuration.
	nic network.InterfacesClient
	// vm lists the virtual machines of the subscription.
	vm  compute.VirtualMachinesClient
}
117
118// createAzureClient is a helper function for creating an Azure compute client to ARM.
119func createAzureClient(cfg config.AzureSDConfig) (azureClient, error) {
120	var c azureClient
121	oauthConfig, err := azure.PublicCloud.OAuthConfigForTenant(cfg.TenantID)
122	if err != nil {
123		return azureClient{}, err
124	}
125	spt, err := azure.NewServicePrincipalToken(*oauthConfig, cfg.ClientID, string(cfg.ClientSecret), azure.PublicCloud.ResourceManagerEndpoint)
126	if err != nil {
127		return azureClient{}, err
128	}
129
130	c.vm = compute.NewVirtualMachinesClient(cfg.SubscriptionID)
131	c.vm.Authorizer = spt
132
133	c.nic = network.NewInterfacesClient(cfg.SubscriptionID)
134	c.nic.Authorizer = spt
135
136	return c, nil
137}
138
// azureResource represents a resource identifier in Azure.
type azureResource struct {
	// Name is the lower-cased resource name, i.e. the last segment of the ID.
	Name          string
	// ResourceGroup is the lower-cased resource group segment of the ID.
	ResourceGroup string
}
144
145// Create a new azureResource object from an ID string.
146func newAzureResourceFromID(id string, logger log.Logger) (azureResource, error) {
147	// Resource IDs have the following format.
148	// /subscriptions/SUBSCRIPTION_ID/resourceGroups/RESOURCE_GROUP/providers/PROVIDER/TYPE/NAME
149	s := strings.Split(id, "/")
150	if len(s) != 9 {
151		err := fmt.Errorf("invalid ID '%s'. Refusing to create azureResource", id)
152		logger.Error(err)
153		return azureResource{}, err
154	}
155	return azureResource{
156		Name:          strings.ToLower(s[8]),
157		ResourceGroup: strings.ToLower(s[4]),
158	}, nil
159}
160
161func (d *Discovery) refresh() (tg *config.TargetGroup, err error) {
162	t0 := time.Now()
163	defer func() {
164		azureSDRefreshDuration.Observe(time.Since(t0).Seconds())
165		if err != nil {
166			azureSDRefreshFailuresCount.Inc()
167		}
168	}()
169	tg = &config.TargetGroup{}
170	client, err := createAzureClient(*d.cfg)
171	if err != nil {
172		return tg, fmt.Errorf("could not create Azure client: %s", err)
173	}
174
175	var machines []compute.VirtualMachine
176	result, err := client.vm.ListAll()
177	if err != nil {
178		return tg, fmt.Errorf("could not list virtual machines: %s", err)
179	}
180	machines = append(machines, *result.Value...)
181
182	// If we still have results, keep going until we have no more.
183	for result.NextLink != nil {
184		result, err = client.vm.ListAllNextResults(result)
185		if err != nil {
186			return tg, fmt.Errorf("could not list virtual machines: %s", err)
187		}
188		machines = append(machines, *result.Value...)
189	}
190	d.logger.Debugf("Found %d virtual machines during Azure discovery.", len(machines))
191
192	// We have the slice of machines. Now turn them into targets.
193	// Doing them in go routines because the network interface calls are slow.
194	type target struct {
195		labelSet model.LabelSet
196		err      error
197	}
198
199	ch := make(chan target, len(machines))
200	for i, vm := range machines {
201		go func(i int, vm compute.VirtualMachine) {
202			r, err := newAzureResourceFromID(*vm.ID, d.logger)
203			if err != nil {
204				ch <- target{labelSet: nil, err: err}
205				return
206			}
207
208			labels := model.LabelSet{
209				azureLabelMachineID:            model.LabelValue(*vm.ID),
210				azureLabelMachineName:          model.LabelValue(*vm.Name),
211				azureLabelMachineLocation:      model.LabelValue(*vm.Location),
212				azureLabelMachineResourceGroup: model.LabelValue(r.ResourceGroup),
213			}
214
215			if vm.Tags != nil {
216				for k, v := range *vm.Tags {
217					name := strutil.SanitizeLabelName(k)
218					labels[azureLabelMachineTag+model.LabelName(name)] = model.LabelValue(*v)
219				}
220			}
221
222			// Get the IP address information via separate call to the network provider.
223			for _, nic := range *vm.Properties.NetworkProfile.NetworkInterfaces {
224				r, err := newAzureResourceFromID(*nic.ID, d.logger)
225				if err != nil {
226					ch <- target{labelSet: nil, err: err}
227					return
228				}
229				networkInterface, err := client.nic.Get(r.ResourceGroup, r.Name, "")
230				if err != nil {
231					d.logger.Errorf("Unable to get network interface %s: %s", r.Name, err)
232					ch <- target{labelSet: nil, err: err}
233					// Get out of this routine because we cannot continue without a network interface.
234					return
235				}
236
237				// Unfortunately Azure does not return information on whether a VM is deallocated.
238				// This information is available via another API call however the Go SDK does not
239				// yet support this. On deallocated machines, this value happens to be nil so it
240				// is a cheap and easy way to determine if a machine is allocated or not.
241				if networkInterface.Properties.Primary == nil {
242					d.logger.Debugf("Virtual machine %s is deallocated. Skipping during Azure SD.", *vm.Name)
243					ch <- target{}
244					return
245				}
246
247				if *networkInterface.Properties.Primary {
248					for _, ip := range *networkInterface.Properties.IPConfigurations {
249						if ip.Properties.PrivateIPAddress != nil {
250							labels[azureLabelMachinePrivateIP] = model.LabelValue(*ip.Properties.PrivateIPAddress)
251							address := net.JoinHostPort(*ip.Properties.PrivateIPAddress, fmt.Sprintf("%d", d.port))
252							labels[model.AddressLabel] = model.LabelValue(address)
253							ch <- target{labelSet: labels, err: nil}
254							return
255						}
256						// If we made it here, we don't have a private IP which should be impossible.
257						// Return an empty target and error to ensure an all or nothing situation.
258						err = fmt.Errorf("unable to find a private IP for VM %s", *vm.Name)
259						ch <- target{labelSet: nil, err: err}
260						return
261					}
262				}
263			}
264		}(i, vm)
265	}
266
267	for range machines {
268		tgt := <-ch
269		if tgt.err != nil {
270			return nil, fmt.Errorf("unable to complete Azure service discovery: %s", err)
271		}
272		if tgt.labelSet != nil {
273			tg.Targets = append(tg.Targets, tgt.labelSet)
274		}
275	}
276
277	d.logger.Debugf("Azure discovery completed.")
278	return tg, nil
279}
280