1// Package eval executes the condition for an alert definition, evaluates the condition results, and 2// returns the alert instance states. 3package eval 4 5import ( 6 "context" 7 "fmt" 8 "runtime/debug" 9 "sort" 10 "strconv" 11 "strings" 12 "time" 13 14 "github.com/grafana/grafana/pkg/expr/classic" 15 "github.com/grafana/grafana/pkg/infra/log" 16 "github.com/grafana/grafana/pkg/services/ngalert/models" 17 18 "github.com/grafana/grafana/pkg/setting" 19 20 "github.com/grafana/grafana-plugin-sdk-go/backend" 21 "github.com/grafana/grafana-plugin-sdk-go/data" 22 "github.com/grafana/grafana/pkg/expr" 23) 24 25type Evaluator struct { 26 Cfg *setting.Cfg 27 Log log.Logger 28} 29 30// invalidEvalResultFormatError is an error for invalid format of the alert definition evaluation results. 31type invalidEvalResultFormatError struct { 32 refID string 33 reason string 34 err error 35} 36 37func (e *invalidEvalResultFormatError) Error() string { 38 s := fmt.Sprintf("invalid format of evaluation results for the alert definition %s: %s", e.refID, e.reason) 39 if e.err != nil { 40 s = fmt.Sprintf("%s: %s", s, e.err.Error()) 41 } 42 return s 43} 44 45func (e *invalidEvalResultFormatError) Unwrap() error { 46 return e.err 47} 48 49// ExecutionResults contains the unevaluated results from executing 50// a condition. 51type ExecutionResults struct { 52 Error error 53 54 // NoData contains the DatasourceUID for RefIDs that returned no data. 55 NoData map[string]string 56 57 Results data.Frames 58} 59 60// Results is a slice of evaluated alert instances states. 61type Results []Result 62 63// Result contains the evaluated State of an alert instance 64// identified by its labels. 65type Result struct { 66 Instance data.Labels 67 State State // Enum 68 // Error message for Error state. should be nil if State != Error. 
69 Error error 70 EvaluatedAt time.Time 71 EvaluationDuration time.Duration 72 73 // EvaluationString is a string representation of evaluation data such 74 // as EvalMatches (from "classic condition"), and in the future from operations 75 // like SSE "math". 76 EvaluationString string 77 78 // Values contains the RefID and value of reduce and math expressions. 79 // It does not contain values for classic conditions as the values 80 // in classic conditions do not have a RefID. 81 Values map[string]NumberValueCapture 82} 83 84// State is an enum of the evaluation State for an alert instance. 85type State int 86 87const ( 88 // Normal is the eval state for an alert instance condition 89 // that evaluated to false. 90 Normal State = iota 91 92 // Alerting is the eval state for an alert instance condition 93 // that evaluated to true (Alerting). 94 Alerting 95 96 // Pending is the eval state for an alert instance condition 97 // that evaluated to true (Alerting) but has not yet met 98 // the For duration defined in AlertRule. 99 Pending 100 101 // NoData is the eval state for an alert rule condition 102 // that evaluated to NoData. 103 NoData 104 105 // Error is the eval state for an alert rule condition 106 // that evaluated to Error. 107 Error 108) 109 110func (s State) String() string { 111 return [...]string{"Normal", "Alerting", "Pending", "NoData", "Error"}[s] 112} 113 114// AlertExecCtx is the context provided for executing an alert condition. 115type AlertExecCtx struct { 116 OrgID int64 117 ExpressionsEnabled bool 118 Log log.Logger 119 120 Ctx context.Context 121} 122 123// GetExprRequest validates the condition and creates a expr.Request from it. 124func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time) (*expr.Request, error) { 125 req := &expr.Request{ 126 OrgId: ctx.OrgID, 127 Headers: map[string]string{ 128 // Some data sources check this in query method as sometimes alerting needs special considerations. 
129 "FromAlert": "true", 130 "X-Cache-Skip": "true", 131 }, 132 } 133 134 for i := range data { 135 q := data[i] 136 model, err := q.GetModel() 137 if err != nil { 138 return nil, fmt.Errorf("failed to get query model: %w", err) 139 } 140 interval, err := q.GetIntervalDuration() 141 if err != nil { 142 return nil, fmt.Errorf("failed to retrieve intervalMs from the model: %w", err) 143 } 144 145 maxDatapoints, err := q.GetMaxDatapoints() 146 if err != nil { 147 return nil, fmt.Errorf("failed to retrieve maxDatapoints from the model: %w", err) 148 } 149 150 req.Queries = append(req.Queries, expr.Query{ 151 TimeRange: expr.TimeRange{ 152 From: q.RelativeTimeRange.ToTimeRange(now).From, 153 To: q.RelativeTimeRange.ToTimeRange(now).To, 154 }, 155 DatasourceUID: q.DatasourceUID, 156 JSON: model, 157 Interval: interval, 158 RefID: q.RefID, 159 MaxDataPoints: maxDatapoints, 160 QueryType: q.QueryType, 161 }) 162 } 163 return req, nil 164} 165 166type NumberValueCapture struct { 167 Var string // RefID 168 Labels data.Labels 169 Value *float64 170} 171 172func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, exprService *expr.Service) ExecutionResults { 173 execResp, err := executeQueriesAndExpressions(ctx, c.Data, now, exprService) 174 if err != nil { 175 return ExecutionResults{Error: err} 176 } 177 178 // eval captures for the '__value_string__' annotation and the Value property of the API response. 
179 captures := make([]NumberValueCapture, 0, len(execResp.Responses)) 180 captureVal := func(refID string, labels data.Labels, value *float64) { 181 captures = append(captures, NumberValueCapture{ 182 Var: refID, 183 Value: value, 184 Labels: labels.Copy(), 185 }) 186 } 187 188 // datasourceUIDsForRefIDs is a short-lived lookup table of RefID to DatasourceUID 189 // for efficient lookups of the DatasourceUID when a RefID returns no data 190 datasourceUIDsForRefIDs := make(map[string]string) 191 for _, next := range c.Data { 192 datasourceUIDsForRefIDs[next.RefID] = next.DatasourceUID 193 } 194 // datasourceExprUID is a special DatasourceUID for expressions 195 datasourceExprUID := strconv.FormatInt(expr.DatasourceID, 10) 196 197 var result ExecutionResults 198 for refID, res := range execResp.Responses { 199 if len(res.Frames) == 0 { 200 // to ensure that NoData is consistent with Results we do not initialize NoData 201 // unless there is at least one RefID that returned no data 202 if result.NoData == nil { 203 result.NoData = make(map[string]string) 204 } 205 if s, ok := datasourceUIDsForRefIDs[refID]; ok && s != datasourceExprUID { 206 result.NoData[refID] = s 207 } 208 } 209 210 // for each frame within each response, the response can contain several data types including time-series data. 211 // For now, we favour simplicity and only care about single scalar values. 212 for _, frame := range res.Frames { 213 if len(frame.Fields) != 1 || frame.Fields[0].Type() != data.FieldTypeNullableFloat64 { 214 continue 215 } 216 var v *float64 217 if frame.Fields[0].Len() == 1 { 218 v = frame.At(0, 0).(*float64) // type checked above 219 } 220 captureVal(frame.RefID, frame.Fields[0].Labels, v) 221 } 222 223 if refID == c.Condition { 224 result.Results = res.Frames 225 } 226 } 227 228 // add capture values as data frame metadata to each result (frame) that has matching labels. 
229 for _, frame := range result.Results { 230 // classic conditions already have metadata set and only have one value, there's no need to add anything in this case. 231 if frame.Meta != nil && frame.Meta.Custom != nil { 232 if _, ok := frame.Meta.Custom.([]classic.EvalMatch); ok { 233 continue // do not overwrite EvalMatch from classic condition. 234 } 235 } 236 237 frame.SetMeta(&data.FrameMeta{}) // overwrite metadata 238 239 if len(frame.Fields) == 1 { 240 theseLabels := frame.Fields[0].Labels 241 for _, cap := range captures { 242 // matching labels are equal labels, or when one set of labels includes the labels of the other. 243 if theseLabels.Equals(cap.Labels) || theseLabels.Contains(cap.Labels) || cap.Labels.Contains(theseLabels) { 244 if frame.Meta.Custom == nil { 245 frame.Meta.Custom = []NumberValueCapture{} 246 } 247 frame.Meta.Custom = append(frame.Meta.Custom.([]NumberValueCapture), cap) 248 } 249 } 250 } 251 } 252 253 return result 254} 255 256func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, exprService *expr.Service) (resp *backend.QueryDataResponse, err error) { 257 defer func() { 258 if e := recover(); e != nil { 259 ctx.Log.Error("alert rule panic", "error", e, "stack", string(debug.Stack())) 260 panicErr := fmt.Errorf("alert rule panic; please check the logs for the full stack") 261 if err != nil { 262 err = fmt.Errorf("queries and expressions execution failed: %w; %v", err, panicErr.Error()) 263 } else { 264 err = panicErr 265 } 266 } 267 }() 268 269 queryDataReq, err := GetExprRequest(ctx, data, now) 270 if err != nil { 271 return nil, err 272 } 273 274 return exprService.TransformData(ctx.Ctx, queryDataReq) 275} 276 277// datasourceUIDsToRefIDs returns a sorted slice of Ref IDs for each Datasource UID. 278// 279// If refIDsToDatasourceUIDs is nil then this function also returns nil. Likewise, 280// if it is an empty map then it too returns an empty map. 
//
// For example, given the following:
//
//	map[string]string{
//		"ref1": "datasource1",
//		"ref2": "datasource1",
//		"ref3": "datasource2",
//	}
//
// we would expect:
//
//	map[string][]string{
//		"datasource1": []string{"ref1", "ref2"},
//		"datasource2": []string{"ref3"},
//	}
func datasourceUIDsToRefIDs(refIDsToDatasourceUIDs map[string]string) map[string][]string {
	if refIDsToDatasourceUIDs == nil {
		return nil
	}

	// The ref IDs must be sorted. However, instead of sorting them once
	// for each Datasource UID we can append them all to a slice and then
	// sort them once
	sortedRefIDs := make([]string, 0, len(refIDsToDatasourceUIDs))
	for refID := range refIDsToDatasourceUIDs {
		sortedRefIDs = append(sortedRefIDs, refID)
	}
	sort.Strings(sortedRefIDs)

	// Appending in globally sorted order leaves every per-datasource slice
	// sorted as well.
	grouped := make(map[string][]string)
	for _, refID := range sortedRefIDs {
		uid := refIDsToDatasourceUIDs[refID]
		grouped[uid] = append(grouped[uid], refID)
	}

	return grouped
}

// evaluateExecutionResult takes the ExecutionResult which includes data.Frames returned
// from SSE (Server Side Expressions). It will create Results (slice of Result) with a State
// extracted from each Frame.
//
// If the ExecutionResults error property is not nil, a single Error result will be returned.
// If there is no error and no results then a single NoData state Result will be returned.
//
// Each non-empty Frame must be a single Field of type []*float64 and of length 1.
// Also, each Frame must be uniquely identified by its Field.Labels or a single Error result will be returned.
//
// Per Frame, data becomes a State based on the following rules:
//  - Empty or zero length Frames result in NoData.
//  - If a value:
//    - 0 results in Normal.
//    - Nonzero (e.g. 1.2, NaN) results in Alerting.
//    - nil results in NoData.
//    - unsupported Frame schemas results in Error.
336func evaluateExecutionResult(execResults ExecutionResults, ts time.Time) Results { 337 evalResults := make([]Result, 0) 338 339 appendErrRes := func(e error) { 340 evalResults = append(evalResults, Result{ 341 State: Error, 342 Error: e, 343 EvaluatedAt: ts, 344 EvaluationDuration: time.Since(ts), 345 }) 346 } 347 348 appendNoData := func(labels data.Labels) { 349 evalResults = append(evalResults, Result{ 350 State: NoData, 351 Instance: labels, 352 EvaluatedAt: ts, 353 EvaluationDuration: time.Since(ts), 354 }) 355 } 356 357 if execResults.Error != nil { 358 appendErrRes(execResults.Error) 359 return evalResults 360 } 361 362 if len(execResults.NoData) > 0 { 363 noData := datasourceUIDsToRefIDs(execResults.NoData) 364 for datasourceUID, refIDs := range noData { 365 appendNoData(data.Labels{ 366 "datasource_uid": datasourceUID, 367 "ref_id": strings.Join(refIDs, ","), 368 }) 369 } 370 return evalResults 371 } 372 373 if len(execResults.Results) == 0 { 374 appendNoData(nil) 375 return evalResults 376 } 377 378 for _, f := range execResults.Results { 379 rowLen, err := f.RowLen() 380 if err != nil { 381 appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: "unable to get frame row length", err: err}) 382 continue 383 } 384 385 if len(f.TypeIndices(data.FieldTypeTime, data.FieldTypeNullableTime)) > 0 { 386 appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: "looks like time series data, only reduced data can be alerted on."}) 387 continue 388 } 389 390 if rowLen == 0 { 391 if len(f.Fields) == 0 { 392 appendNoData(nil) 393 continue 394 } 395 if len(f.Fields) == 1 { 396 appendNoData(f.Fields[0].Labels) 397 continue 398 } 399 } 400 401 if rowLen > 1 { 402 appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("unexpected row length: %d instead of 0 or 1", rowLen)}) 403 continue 404 } 405 406 if len(f.Fields) > 1 { 407 appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("unexpected field 
length: %d instead of 1", len(f.Fields))}) 408 continue 409 } 410 411 if f.Fields[0].Type() != data.FieldTypeNullableFloat64 { 412 appendErrRes(&invalidEvalResultFormatError{refID: f.RefID, reason: fmt.Sprintf("invalid field type: %s", f.Fields[0].Type())}) 413 continue 414 } 415 416 val := f.Fields[0].At(0).(*float64) // type checked by data.FieldTypeNullableFloat64 above 417 418 r := Result{ 419 Instance: f.Fields[0].Labels, 420 EvaluatedAt: ts, 421 EvaluationDuration: time.Since(ts), 422 EvaluationString: extractEvalString(f), 423 Values: extractValues(f), 424 } 425 426 switch { 427 case val == nil: 428 r.State = NoData 429 case *val == 0: 430 r.State = Normal 431 default: 432 r.State = Alerting 433 } 434 435 evalResults = append(evalResults, r) 436 } 437 438 seenLabels := make(map[string]bool) 439 for _, res := range evalResults { 440 labelsStr := res.Instance.String() 441 _, ok := seenLabels[labelsStr] 442 if ok { 443 return Results{ 444 Result{ 445 State: Error, 446 Instance: res.Instance, 447 EvaluatedAt: ts, 448 EvaluationDuration: time.Since(ts), 449 Error: &invalidEvalResultFormatError{reason: fmt.Sprintf("frame cannot uniquely be identified by its labels: has duplicate results with labels {%s}", labelsStr)}, 450 }, 451 } 452 } 453 seenLabels[labelsStr] = true 454 } 455 456 return evalResults 457} 458 459// AsDataFrame forms the EvalResults in Frame suitable for displaying in the table panel of the front end. 460// It displays one row per alert instance, with a column for each label and one for the alerting state. 
461func (evalResults Results) AsDataFrame() data.Frame { 462 fieldLen := len(evalResults) 463 464 uniqueLabelKeys := make(map[string]struct{}) 465 466 for _, evalResult := range evalResults { 467 for k := range evalResult.Instance { 468 uniqueLabelKeys[k] = struct{}{} 469 } 470 } 471 472 labelColumns := make([]string, 0, len(uniqueLabelKeys)) 473 for k := range uniqueLabelKeys { 474 labelColumns = append(labelColumns, k) 475 } 476 477 labelColumns = sort.StringSlice(labelColumns) 478 479 frame := data.NewFrame("evaluation results") 480 for _, lKey := range labelColumns { 481 frame.Fields = append(frame.Fields, data.NewField(lKey, nil, make([]string, fieldLen))) 482 } 483 frame.Fields = append(frame.Fields, data.NewField("State", nil, make([]string, fieldLen))) 484 frame.Fields = append(frame.Fields, data.NewField("Info", nil, make([]string, fieldLen))) 485 486 for evalIdx, evalResult := range evalResults { 487 for lIdx, v := range labelColumns { 488 frame.Set(lIdx, evalIdx, evalResult.Instance[v]) 489 } 490 491 frame.Set(len(labelColumns), evalIdx, evalResult.State.String()) 492 493 switch { 494 case evalResult.Error != nil: 495 frame.Set(len(labelColumns)+1, evalIdx, evalResult.Error.Error()) 496 case evalResult.EvaluationString != "": 497 frame.Set(len(labelColumns)+1, evalIdx, evalResult.EvaluationString) 498 } 499 } 500 return *frame 501} 502 503// ConditionEval executes conditions and evaluates the result. 
504func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, expressionService *expr.Service) (Results, error) { 505 alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout) 506 defer cancelFn() 507 508 alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log} 509 510 execResult := executeCondition(alertExecCtx, condition, now, expressionService) 511 512 evalResults := evaluateExecutionResult(execResult, now) 513 return evalResults, nil 514} 515 516// QueriesAndExpressionsEval executes queries and expressions and returns the result. 517func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time, expressionService *expr.Service) (*backend.QueryDataResponse, error) { 518 alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout) 519 defer cancelFn() 520 521 alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log} 522 523 execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, expressionService) 524 if err != nil { 525 return nil, fmt.Errorf("failed to execute conditions: %w", err) 526 } 527 528 return execResult, nil 529} 530