1package ruler
2
3import (
4	"bytes"
5	"testing"
6
7	"github.com/prometheus/client_golang/prometheus"
8	"github.com/prometheus/client_golang/prometheus/promauto"
9	"github.com/prometheus/client_golang/prometheus/testutil"
10	dto "github.com/prometheus/client_model/go"
11	"github.com/stretchr/testify/assert"
12	"github.com/stretchr/testify/require"
13)
14
15func TestManagerMetrics(t *testing.T) {
16	mainReg := prometheus.NewPedanticRegistry()
17
18	managerMetrics := NewManagerMetrics()
19	mainReg.MustRegister(managerMetrics)
20	managerMetrics.AddUserRegistry("user1", populateManager(1))
21	managerMetrics.AddUserRegistry("user2", populateManager(10))
22	managerMetrics.AddUserRegistry("user3", populateManager(100))
23
24	managerMetrics.AddUserRegistry("user4", populateManager(1000))
25	managerMetrics.RemoveUserRegistry("user4")
26
27	//noinspection ALL
28	err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
29# HELP cortex_prometheus_last_evaluation_samples The number of samples returned during the last rule group evaluation.
30# TYPE cortex_prometheus_last_evaluation_samples gauge
31cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user1"} 1000
32cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user2"} 10000
33cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 100000
34cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
35cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
36cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
37# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
38# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
39cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
40cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.9"} 1
41cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.99"} 1
42cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user1"} 1
43cortex_prometheus_rule_evaluation_duration_seconds_count{user="user1"} 1
44cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.5"} 10
45cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.9"} 10
46cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.99"} 10
47cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user2"} 10
48cortex_prometheus_rule_evaluation_duration_seconds_count{user="user2"} 1
49cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.5"} 100
50cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.9"} 100
51cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.99"} 100
52cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user3"} 100
53cortex_prometheus_rule_evaluation_duration_seconds_count{user="user3"} 1
54# HELP cortex_prometheus_rule_evaluation_failures_total The total number of rule evaluation failures.
55# TYPE cortex_prometheus_rule_evaluation_failures_total counter
56cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user1"} 1
57cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user2"} 10
58cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user3"} 100
59cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user1"} 1
60cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user2"} 10
61cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user3"} 100
62# HELP cortex_prometheus_rule_evaluations_total The total number of rule evaluations.
63# TYPE cortex_prometheus_rule_evaluations_total counter
64cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user1"} 1
65cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user2"} 10
66cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user3"} 100
67cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user1"} 1
68cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user2"} 10
69cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user3"} 100
70# HELP cortex_prometheus_rule_group_duration_seconds The duration of rule group evaluations.
71# TYPE cortex_prometheus_rule_group_duration_seconds summary
72cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.01"} 1
73cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.05"} 1
74cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.5"} 1
75cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.9"} 1
76cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.99"} 1
77cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 1
78cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 1
79cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.01"} 10
80cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.05"} 10
81cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.5"} 10
82cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.9"} 10
83cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.99"} 10
84cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 10
85cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 1
86cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.01"} 100
87cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.05"} 100
88cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.5"} 100
89cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.9"} 100
90cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.99"} 100
91cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 100
92cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 1
93# HELP cortex_prometheus_rule_group_iterations_missed_total The total number of rule group evaluations missed due to slow rule group evaluation.
94# TYPE cortex_prometheus_rule_group_iterations_missed_total counter
95cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user1"} 1
96cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user2"} 10
97cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user3"} 100
98cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user1"} 1
99cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user2"} 10
100cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user3"} 100
101# HELP cortex_prometheus_rule_group_iterations_total The total number of scheduled rule group evaluations, whether executed or missed.
102# TYPE cortex_prometheus_rule_group_iterations_total counter
103cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user1"} 1
104cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user2"} 10
105cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user3"} 100
106cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user1"} 1
107cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user2"} 10
108cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user3"} 100
109# HELP cortex_prometheus_rule_group_last_duration_seconds The duration of the last rule group evaluation.
110# TYPE cortex_prometheus_rule_group_last_duration_seconds gauge
111cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user1"} 1000
112cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user2"} 10000
113cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user3"} 100000
114cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user1"} 1000
115cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user2"} 10000
116cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user3"} 100000
117# HELP cortex_prometheus_rule_group_last_evaluation_timestamp_seconds The timestamp of the last rule group evaluation in seconds.
118# TYPE cortex_prometheus_rule_group_last_evaluation_timestamp_seconds gauge
119cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user1"} 1000
120cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user2"} 10000
121cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user3"} 100000
122cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user1"} 1000
123cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user2"} 10000
124cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user3"} 100000
125# HELP cortex_prometheus_rule_group_rules The number of rules.
126# TYPE cortex_prometheus_rule_group_rules gauge
127cortex_prometheus_rule_group_rules{rule_group="group_one",user="user1"} 1000
128cortex_prometheus_rule_group_rules{rule_group="group_one",user="user2"} 10000
129cortex_prometheus_rule_group_rules{rule_group="group_one",user="user3"} 100000
130cortex_prometheus_rule_group_rules{rule_group="group_two",user="user1"} 1000
131cortex_prometheus_rule_group_rules{rule_group="group_two",user="user2"} 10000
132cortex_prometheus_rule_group_rules{rule_group="group_two",user="user3"} 100000
133`))
134	require.NoError(t, err)
135}
136
137func populateManager(base float64) *prometheus.Registry {
138	r := prometheus.NewRegistry()
139
140	metrics := newGroupMetrics(r)
141
142	metrics.evalDuration.Observe(base)
143	metrics.iterationDuration.Observe(base)
144
145	metrics.iterationsScheduled.WithLabelValues("group_one").Add(base)
146	metrics.iterationsScheduled.WithLabelValues("group_two").Add(base)
147	metrics.iterationsMissed.WithLabelValues("group_one").Add(base)
148	metrics.iterationsMissed.WithLabelValues("group_two").Add(base)
149	metrics.evalTotal.WithLabelValues("group_one").Add(base)
150	metrics.evalTotal.WithLabelValues("group_two").Add(base)
151	metrics.evalFailures.WithLabelValues("group_one").Add(base)
152	metrics.evalFailures.WithLabelValues("group_two").Add(base)
153
154	metrics.groupLastEvalTime.WithLabelValues("group_one").Add(base * 1000)
155	metrics.groupLastEvalTime.WithLabelValues("group_two").Add(base * 1000)
156
157	metrics.groupLastDuration.WithLabelValues("group_one").Add(base * 1000)
158	metrics.groupLastDuration.WithLabelValues("group_two").Add(base * 1000)
159
160	metrics.groupRules.WithLabelValues("group_one").Add(base * 1000)
161	metrics.groupRules.WithLabelValues("group_two").Add(base * 1000)
162
163	metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000)
164	metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000)
165
166	return r
167}
168
// groupMetrics bundles the rule-group metrics that newGroupMetrics registers
// and populateManager fills in for the tests in this file. The trailing
// comment on each field gives the metric name it is registered under.
//
// Copied from github.com/prometheus/prometheus/rules/manager.go
type groupMetrics struct {
	evalDuration         prometheus.Summary     // prometheus_rule_evaluation_duration_seconds
	iterationDuration    prometheus.Summary     // prometheus_rule_group_duration_seconds
	iterationsMissed     *prometheus.CounterVec // prometheus_rule_group_iterations_missed_total
	iterationsScheduled  *prometheus.CounterVec // prometheus_rule_group_iterations_total
	evalTotal            *prometheus.CounterVec // prometheus_rule_evaluations_total
	evalFailures         *prometheus.CounterVec // prometheus_rule_evaluation_failures_total
	groupInterval        *prometheus.GaugeVec   // prometheus_rule_group_interval_seconds
	groupLastEvalTime    *prometheus.GaugeVec   // prometheus_rule_group_last_evaluation_timestamp_seconds
	groupLastDuration    *prometheus.GaugeVec   // prometheus_rule_group_last_duration_seconds
	groupRules           *prometheus.GaugeVec   // prometheus_rule_group_rules
	groupLastEvalSamples *prometheus.GaugeVec   // prometheus_rule_group_last_evaluation_samples
}
183
184func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
185	m := &groupMetrics{
186		evalDuration: promauto.With(r).NewSummary(
187			prometheus.SummaryOpts{
188				Name:       "prometheus_rule_evaluation_duration_seconds",
189				Help:       "The duration for a rule to execute.",
190				Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
191			}),
192		iterationDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
193			Name:       "prometheus_rule_group_duration_seconds",
194			Help:       "The duration of rule group evaluations.",
195			Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
196		}),
197		iterationsMissed: promauto.With(r).NewCounterVec(
198			prometheus.CounterOpts{
199				Name: "prometheus_rule_group_iterations_missed_total",
200				Help: "The total number of rule group evaluations missed due to slow rule group evaluation.",
201			},
202			[]string{"rule_group"},
203		),
204		iterationsScheduled: promauto.With(r).NewCounterVec(
205			prometheus.CounterOpts{
206				Name: "prometheus_rule_group_iterations_total",
207				Help: "The total number of scheduled rule group evaluations, whether executed or missed.",
208			},
209			[]string{"rule_group"},
210		),
211		evalTotal: promauto.With(r).NewCounterVec(
212			prometheus.CounterOpts{
213				Name: "prometheus_rule_evaluations_total",
214				Help: "The total number of rule evaluations.",
215			},
216			[]string{"rule_group"},
217		),
218		evalFailures: promauto.With(r).NewCounterVec(
219			prometheus.CounterOpts{
220				Name: "prometheus_rule_evaluation_failures_total",
221				Help: "The total number of rule evaluation failures.",
222			},
223			[]string{"rule_group"},
224		),
225		groupInterval: promauto.With(r).NewGaugeVec(
226			prometheus.GaugeOpts{
227				Name: "prometheus_rule_group_interval_seconds",
228				Help: "The interval of a rule group.",
229			},
230			[]string{"rule_group"},
231		),
232		groupLastEvalTime: promauto.With(r).NewGaugeVec(
233			prometheus.GaugeOpts{
234				Name: "prometheus_rule_group_last_evaluation_timestamp_seconds",
235				Help: "The timestamp of the last rule group evaluation in seconds.",
236			},
237			[]string{"rule_group"},
238		),
239		groupLastDuration: promauto.With(r).NewGaugeVec(
240			prometheus.GaugeOpts{
241				Name: "prometheus_rule_group_last_duration_seconds",
242				Help: "The duration of the last rule group evaluation.",
243			},
244			[]string{"rule_group"},
245		),
246		groupRules: promauto.With(r).NewGaugeVec(
247			prometheus.GaugeOpts{
248				Name: "prometheus_rule_group_rules",
249				Help: "The number of rules.",
250			},
251			[]string{"rule_group"},
252		),
253		groupLastEvalSamples: promauto.With(r).NewGaugeVec(
254			prometheus.GaugeOpts{
255				Name: "prometheus_rule_group_last_evaluation_samples",
256				Help: "The number of samples returned during the last rule group evaluation.",
257			},
258			[]string{"rule_group"},
259		),
260	}
261
262	return m
263}
264
265func TestMetricsArePerUser(t *testing.T) {
266	mainReg := prometheus.NewPedanticRegistry()
267
268	managerMetrics := NewManagerMetrics()
269	mainReg.MustRegister(managerMetrics)
270	managerMetrics.AddUserRegistry("user1", populateManager(1))
271	managerMetrics.AddUserRegistry("user2", populateManager(10))
272	managerMetrics.AddUserRegistry("user3", populateManager(100))
273
274	ch := make(chan prometheus.Metric)
275
276	defer func() {
277		// drain the channel, so that collecting gouroutine can stop.
278		// This is useful if test fails.
279		for range ch {
280		}
281	}()
282
283	go func() {
284		managerMetrics.Collect(ch)
285		close(ch)
286	}()
287
288	for m := range ch {
289		desc := m.Desc()
290
291		dtoM := &dto.Metric{}
292		err := m.Write(dtoM)
293
294		require.NoError(t, err)
295
296		foundUserLabel := false
297		for _, l := range dtoM.Label {
298			if l.GetName() == "user" {
299				foundUserLabel = true
300				break
301			}
302		}
303
304		assert.True(t, foundUserLabel, "user label not found for metric %s", desc.String())
305	}
306}
307