1// Package eval executes the condition for an alert definition, evaluates the condition results, and
2// returns the alert instance states.
3package eval
4
5import (
6	"context"
7	"fmt"
8	"runtime/debug"
9	"sort"
10	"strconv"
11	"strings"
12	"time"
13
14	"github.com/grafana/grafana/pkg/expr/classic"
15	"github.com/grafana/grafana/pkg/infra/log"
16	"github.com/grafana/grafana/pkg/services/ngalert/models"
17
18	"github.com/grafana/grafana/pkg/setting"
19
20	"github.com/grafana/grafana-plugin-sdk-go/backend"
21	"github.com/grafana/grafana-plugin-sdk-go/data"
22	"github.com/grafana/grafana/pkg/expr"
23)
24
25type Evaluator struct {
26	Cfg *setting.Cfg
27	Log log.Logger
28}
29
// invalidEvalResultFormatError reports that the frames produced by evaluating
// an alert definition do not have the shape the evaluator expects.
type invalidEvalResultFormatError struct {
	// refID identifies the offending result; it is interpolated into the message.
	refID string
	// reason is a human readable description of what is wrong with the result.
	reason string
	// err is an optional underlying cause; it may be nil.
	err error
}

// Error implements the error interface. The message always contains refID and
// reason; when an underlying cause is present its text is appended after ": ".
func (e *invalidEvalResultFormatError) Error() string {
	msg := fmt.Sprintf("invalid format of evaluation results for the alert definition %s: %s", e.refID, e.reason)
	if e.err == nil {
		return msg
	}
	return msg + ": " + e.err.Error()
}

// Unwrap exposes the underlying cause (possibly nil) so that errors.Is and
// errors.As can inspect it.
func (e *invalidEvalResultFormatError) Unwrap() error {
	return e.err
}
48
49// ExecutionResults contains the unevaluated results from executing
50// a condition.
51type ExecutionResults struct {
52	Error error
53
54	// NoData contains the DatasourceUID for RefIDs that returned no data.
55	NoData map[string]string
56
57	Results data.Frames
58}
59
60// Results is a slice of evaluated alert instances states.
61type Results []Result
62
63// Result contains the evaluated State of an alert instance
64// identified by its labels.
65type Result struct {
66	Instance data.Labels
67	State    State // Enum
68	// Error message for Error state. should be nil if State != Error.
69	Error              error
70	EvaluatedAt        time.Time
71	EvaluationDuration time.Duration
72
73	// EvaluationString is a string representation of evaluation data such
74	// as EvalMatches (from "classic condition"), and in the future from operations
75	// like SSE "math".
76	EvaluationString string
77
78	// Values contains the RefID and value of reduce and math expressions.
79	// It does not contain values for classic conditions as the values
80	// in classic conditions do not have a RefID.
81	Values map[string]NumberValueCapture
82}
83
// State is an enum of the evaluation State for an alert instance.
type State int

const (
	// Normal is the eval state for an alert instance condition
	// that evaluated to false.
	Normal State = iota

	// Alerting is the eval state for an alert instance condition
	// that evaluated to true (Alerting).
	Alerting

	// Pending is the eval state for an alert instance condition
	// that evaluated to true (Alerting) but has not yet met
	// the For duration defined in AlertRule.
	Pending

	// NoData is the eval state for an alert rule condition
	// that evaluated to NoData.
	NoData

	// Error is the eval state for an alert rule condition
	// that evaluated to Error.
	Error
)

// String returns the name of the state ("Normal", "Alerting", "Pending",
// "NoData" or "Error").
//
// Values outside the declared enum range (negative, or produced by a bad
// conversion or deserialization) are rendered as "State(N)" instead of
// panicking with an index-out-of-range error, so that formatting or logging
// a corrupted state can never crash the caller.
func (s State) String() string {
	switch s {
	case Normal:
		return "Normal"
	case Alerting:
		return "Alerting"
	case Pending:
		return "Pending"
	case NoData:
		return "NoData"
	case Error:
		return "Error"
	default:
		return "State(" + strconv.Itoa(int(s)) + ")"
	}
}
113
114// AlertExecCtx is the context provided for executing an alert condition.
115type AlertExecCtx struct {
116	OrgID              int64
117	ExpressionsEnabled bool
118	Log                log.Logger
119
120	Ctx context.Context
121}
122
123// GetExprRequest validates the condition and creates a expr.Request from it.
124func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time) (*expr.Request, error) {
125	req := &expr.Request{
126		OrgId: ctx.OrgID,
127		Headers: map[string]string{
128			// Some data sources check this in query method as sometimes alerting needs special considerations.
129			"FromAlert":    "true",
130			"X-Cache-Skip": "true",
131		},
132	}
133
134	for i := range data {
135		q := data[i]
136		model, err := q.GetModel()
137		if err != nil {
138			return nil, fmt.Errorf("failed to get query model: %w", err)
139		}
140		interval, err := q.GetIntervalDuration()
141		if err != nil {
142			return nil, fmt.Errorf("failed to retrieve intervalMs from the model: %w", err)
143		}
144
145		maxDatapoints, err := q.GetMaxDatapoints()
146		if err != nil {
147			return nil, fmt.Errorf("failed to retrieve maxDatapoints from the model: %w", err)
148		}
149
150		req.Queries = append(req.Queries, expr.Query{
151			TimeRange: expr.TimeRange{
152				From: q.RelativeTimeRange.ToTimeRange(now).From,
153				To:   q.RelativeTimeRange.ToTimeRange(now).To,
154			},
155			DatasourceUID: q.DatasourceUID,
156			JSON:          model,
157			Interval:      interval,
158			RefID:         q.RefID,
159			MaxDataPoints: maxDatapoints,
160			QueryType:     q.QueryType,
161		})
162	}
163	return req, nil
164}
165
166type NumberValueCapture struct {
167	Var    string // RefID
168	Labels data.Labels
169	Value  *float64
170}
171
172func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, exprService *expr.Service) ExecutionResults {
173	execResp, err := executeQueriesAndExpressions(ctx, c.Data, now, exprService)
174	if err != nil {
175		return ExecutionResults{Error: err}
176	}
177
178	// eval captures for the '__value_string__' annotation and the Value property of the API response.
179	captures := make([]NumberValueCapture, 0, len(execResp.Responses))
180	captureVal := func(refID string, labels data.Labels, value *float64) {
181		captures = append(captures, NumberValueCapture{
182			Var:    refID,
183			Value:  value,
184			Labels: labels.Copy(),
185		})
186	}
187
188	// datasourceUIDsForRefIDs is a short-lived lookup table of RefID to DatasourceUID
189	// for efficient lookups of the DatasourceUID when a RefID returns no data
190	datasourceUIDsForRefIDs := make(map[string]string)
191	for _, next := range c.Data {
192		datasourceUIDsForRefIDs[next.RefID] = next.DatasourceUID
193	}
194	// datasourceExprUID is a special DatasourceUID for expressions
195	datasourceExprUID := strconv.FormatInt(expr.DatasourceID, 10)
196
197	var result ExecutionResults
198	for refID, res := range execResp.Responses {
199		if len(res.Frames) == 0 {
200			// to ensure that NoData is consistent with Results we do not initialize NoData
201			// unless there is at least one RefID that returned no data
202			if result.NoData == nil {
203				result.NoData = make(map[string]string)
204			}
205			if s, ok := datasourceUIDsForRefIDs[refID]; ok && s != datasourceExprUID {
206				result.NoData[refID] = s
207			}
208		}
209
210		// for each frame within each response, the response can contain several data types including time-series data.
211		// For now, we favour simplicity and only care about single scalar values.
212		for _, frame := range res.Frames {
213			if len(frame.Fields) != 1 || frame.Fields[0].Type() != data.FieldTypeNullableFloat64 {
214				continue
215			}
216			var v *float64
217			if frame.Fields[0].Len() == 1 {
218				v = frame.At(0, 0).(*float64) // type checked above
219			}
220			captureVal(frame.RefID, frame.Fields[0].Labels, v)
221		}
222
223		if refID == c.Condition {
224			result.Results = res.Frames
225		}
226	}
227
228	// add capture values as data frame metadata to each result (frame) that has matching labels.
229	for _, frame := range result.Results {
230		// classic conditions already have metadata set and only have one value, there's no need to add anything in this case.
231		if frame.Meta != nil && frame.Meta.Custom != nil {
232			if _, ok := frame.Meta.Custom.([]classic.EvalMatch); ok {
233				continue // do not overwrite EvalMatch from classic condition.
234			}
235		}
236
237		frame.SetMeta(&data.FrameMeta{}) // overwrite metadata
238
239		if len(frame.Fields) == 1 {
240			theseLabels := frame.Fields[0].Labels
241			for _, cap := range captures {
242				// matching labels are equal labels, or when one set of labels includes the labels of the other.
243				if theseLabels.Equals(cap.Labels) || theseLabels.Contains(cap.Labels) || cap.Labels.Contains(theseLabels) {
244					if frame.Meta.Custom == nil {
245						frame.Meta.Custom = []NumberValueCapture{}
246					}
247					frame.Meta.Custom = append(frame.Meta.Custom.([]NumberValueCapture), cap)
248				}
249			}
250		}
251	}
252
253	return result
254}
255
256func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, exprService *expr.Service) (resp *backend.QueryDataResponse, err error) {
257	defer func() {
258		if e := recover(); e != nil {
259			ctx.Log.Error("alert rule panic", "error", e, "stack", string(debug.Stack()))
260			panicErr := fmt.Errorf("alert rule panic; please check the logs for the full stack")
261			if err != nil {
262				err = fmt.Errorf("queries and expressions execution failed: %w; %v", err, panicErr.Error())
263			} else {
264				err = panicErr
265			}
266		}
267	}()
268
269	queryDataReq, err := GetExprRequest(ctx, data, now)
270	if err != nil {
271		return nil, err
272	}
273
274	return exprService.TransformData(ctx.Ctx, queryDataReq)
275}
276
// datasourceUIDsToRefIDs inverts a RefID -> Datasource UID mapping into a
// Datasource UID -> RefIDs mapping, where every slice of RefIDs is sorted in
// ascending order.
//
// A nil input yields nil; an empty (non-nil) input yields an empty map.
//
// For example:
//
//	map[string]string{
//		"ref1": "datasource1",
//		"ref2": "datasource1",
//		"ref3": "datasource2",
//	}
//
// becomes:
//
//	map[string][]string{
//		"datasource1": []string{"ref1", "ref2"},
//		"datasource2": []string{"ref3"},
//	}
func datasourceUIDsToRefIDs(refIDsToDatasourceUIDs map[string]string) map[string][]string {
	if refIDsToDatasourceUIDs == nil {
		return nil
	}

	// Group the RefIDs by Datasource UID in whatever order the map yields
	// them ...
	grouped := make(map[string][]string)
	for refID, datasourceUID := range refIDsToDatasourceUIDs {
		grouped[datasourceUID] = append(grouped[datasourceUID], refID)
	}

	// ... and then order every group. Sorting happens in place, so the
	// slices stored in the map end up sorted.
	for _, refIDs := range grouped {
		sort.Strings(refIDs)
	}

	return grouped
}
318
319// evaluateExecutionResult takes the ExecutionResult which includes data.Frames returned
320// from SSE (Server Side Expressions). It will create Results (slice of Result) with a State
321// extracted from each Frame.
322//
323// If the ExecutionResults error property is not nil, a single Error result will be returned.
324// If there is no error and no results then a single NoData state Result will be returned.
325//
326// Each non-empty Frame must be a single Field of type []*float64 and of length 1.
327// Also, each Frame must be uniquely identified by its Field.Labels or a single Error result will be returned.
328//
329// Per Frame, data becomes a State based on the following rules:
330//  - Empty or zero length Frames result in NoData.
331//  - If a value:
332//    - 0 results in Normal.
333//    - Nonzero (e.g 1.2, NaN) results in Alerting.
334//    - nil results in noData.
335//    - unsupported Frame schemas results in Error.
336func evaluateExecutionResult(execResults ExecutionResults, ts time.Time) Results {
337	evalResults := make([]Result, 0)
338
339	appendErrRes := func(e error) {
340		evalResults = append(evalResults, Result{
341			State:              Error,
342			Error:              e,
343			EvaluatedAt:        ts,
344			EvaluationDuration: time.Since(ts),
345		})
346	}
347
348	appendNoData := func(labels data.Labels) {
349		evalResults = append(evalResults, Result{
350			State:              NoData,
351			Instance:           labels,
352			EvaluatedAt:        ts,
353			EvaluationDuration: time.Since(ts),
354		})
355	}
356
357	if execResults.Error != nil {
358		appendErrRes(execResults.Error)
359		return evalResults
360	}
361
362	if len(execResults.NoData) > 0 {
363		noData := datasourceUIDsToRefIDs(execResults.NoData)
364		for datasourceUID, refIDs := range noData {
365			appendNoData(data.Labels{
366				"datasource_uid": datasourceUID,
367				"ref_id":         strings.Join(refIDs, ","),
368			})
369		}
370		return evalResults
371	}
372
373	if len(execResults.Results) == 0 {
374		appendNoData(nil)
375		return evalResults
376	}
377
378	for _, f := range execResults.Results {
379		rowLen, err := f.RowLen()
380		if err != nil {
381			appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: "unable to get frame row length", err: err})
382			continue
383		}
384
385		if len(f.TypeIndices(data.FieldTypeTime, data.FieldTypeNullableTime)) > 0 {
386			appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: "looks like time series data, only reduced data can be alerted on."})
387			continue
388		}
389
390		if rowLen == 0 {
391			if len(f.Fields) == 0 {
392				appendNoData(nil)
393				continue
394			}
395			if len(f.Fields) == 1 {
396				appendNoData(f.Fields[0].Labels)
397				continue
398			}
399		}
400
401		if rowLen > 1 {
402			appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("unexpected row length: %d instead of 0 or 1", rowLen)})
403			continue
404		}
405
406		if len(f.Fields) > 1 {
407			appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("unexpected field length: %d instead of 1", len(f.Fields))})
408			continue
409		}
410
411		if f.Fields[0].Type() != data.FieldTypeNullableFloat64 {
412			appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("invalid field type: %s", f.Fields[0].Type())})
413			continue
414		}
415
416		val := f.Fields[0].At(0).(*float64) // type checked by data.FieldTypeNullableFloat64 above
417
418		r := Result{
419			Instance:           f.Fields[0].Labels,
420			EvaluatedAt:        ts,
421			EvaluationDuration: time.Since(ts),
422			EvaluationString:   extractEvalString(f),
423			Values:             extractValues(f),
424		}
425
426		switch {
427		case val == nil:
428			r.State = NoData
429		case *val == 0:
430			r.State = Normal
431		default:
432			r.State = Alerting
433		}
434
435		evalResults = append(evalResults, r)
436	}
437
438	seenLabels := make(map[string]bool)
439	for _, res := range evalResults {
440		labelsStr := res.Instance.String()
441		_, ok := seenLabels[labelsStr]
442		if ok {
443			return Results{
444				Result{
445					State:              Error,
446					Instance:           res.Instance,
447					EvaluatedAt:        ts,
448					EvaluationDuration: time.Since(ts),
449					Error:              &invalidEvalResultFormatError{reason: fmt.Sprintf("frame cannot uniquely be identified by its labels: has duplicate results with labels {%s}", labelsStr)},
450				},
451			}
452		}
453		seenLabels[labelsStr] = true
454	}
455
456	return evalResults
457}
458
459// AsDataFrame forms the EvalResults in Frame suitable for displaying in the table panel of the front end.
460// It displays one row per alert instance, with a column for each label and one for the alerting state.
461func (evalResults Results) AsDataFrame() data.Frame {
462	fieldLen := len(evalResults)
463
464	uniqueLabelKeys := make(map[string]struct{})
465
466	for _, evalResult := range evalResults {
467		for k := range evalResult.Instance {
468			uniqueLabelKeys[k] = struct{}{}
469		}
470	}
471
472	labelColumns := make([]string, 0, len(uniqueLabelKeys))
473	for k := range uniqueLabelKeys {
474		labelColumns = append(labelColumns, k)
475	}
476
477	labelColumns = sort.StringSlice(labelColumns)
478
479	frame := data.NewFrame("evaluation results")
480	for _, lKey := range labelColumns {
481		frame.Fields = append(frame.Fields, data.NewField(lKey, nil, make([]string, fieldLen)))
482	}
483	frame.Fields = append(frame.Fields, data.NewField("State", nil, make([]string, fieldLen)))
484	frame.Fields = append(frame.Fields, data.NewField("Info", nil, make([]string, fieldLen)))
485
486	for evalIdx, evalResult := range evalResults {
487		for lIdx, v := range labelColumns {
488			frame.Set(lIdx, evalIdx, evalResult.Instance[v])
489		}
490
491		frame.Set(len(labelColumns), evalIdx, evalResult.State.String())
492
493		switch {
494		case evalResult.Error != nil:
495			frame.Set(len(labelColumns)+1, evalIdx, evalResult.Error.Error())
496		case evalResult.EvaluationString != "":
497			frame.Set(len(labelColumns)+1, evalIdx, evalResult.EvaluationString)
498		}
499	}
500	return *frame
501}
502
503// ConditionEval executes conditions and evaluates the result.
504func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, expressionService *expr.Service) (Results, error) {
505	alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
506	defer cancelFn()
507
508	alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
509
510	execResult := executeCondition(alertExecCtx, condition, now, expressionService)
511
512	evalResults := evaluateExecutionResult(execResult, now)
513	return evalResults, nil
514}
515
516// QueriesAndExpressionsEval executes queries and expressions and returns the result.
517func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time, expressionService *expr.Service) (*backend.QueryDataResponse, error) {
518	alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
519	defer cancelFn()
520
521	alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
522
523	execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, expressionService)
524	if err != nil {
525		return nil, fmt.Errorf("failed to execute conditions: %w", err)
526	}
527
528	return execResult, nil
529}
530