# These rules are synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet
2groups:
3- name: etcd
4  rules:
5  - alert: etcdInsufficientMembers
6    annotations:
7      message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
8        }}).'
9    expr: |
10      sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
11    for: 3m
12    labels:
13      severity: critical
14  - alert: etcdNoLeader
15    annotations:
16      message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has
17        no leader.'
18    expr: |
19      etcd_server_has_leader{job=~".*etcd.*"} == 0
20    for: 1m
21    labels:
22      severity: critical
23  - alert: etcdHighNumberOfLeaderChanges
24    annotations:
25      message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
26        has seen {{ $value }} leader changes within the last hour.'
27    expr: |
28      rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
29    for: 15m
30    labels:
31      severity: warning
32  - alert: etcdHighNumberOfFailedGRPCRequests
33    annotations:
34      message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
35        $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
36    expr: |
37      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
38        /
39      sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
40        > 1
41    for: 10m
42    labels:
43      severity: warning
44  - alert: etcdHighNumberOfFailedGRPCRequests
45    annotations:
46      message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
47        $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
48    expr: |
49      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
50        /
51      sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
52        > 5
53    for: 5m
54    labels:
55      severity: critical
56  - alert: etcdGRPCRequestsSlow
57    annotations:
58      message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
59        }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
60    expr: |
61      histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
62      > 0.15
63    for: 10m
64    labels:
65      severity: critical
66  - alert: etcdMemberCommunicationSlow
67    annotations:
68      message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
69        }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
70    expr: |
71      histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
72      > 0.15
73    for: 10m
74    labels:
75      severity: warning
76  - alert: etcdHighNumberOfFailedProposals
77    annotations:
78      message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
79        the last hour on etcd instance {{ $labels.instance }}.'
80    expr: |
81      rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
82    for: 15m
83    labels:
84      severity: warning
85  - alert: etcdHighFsyncDurations
86    annotations:
87      message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are
88        {{ $value }}s on etcd instance {{ $labels.instance }}.'
89    expr: |
90      histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
91      > 0.5
92    for: 10m
93    labels:
94      severity: warning
95  - alert: etcdHighCommitDurations
96    annotations:
97      message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
98        {{ $value }}s on etcd instance {{ $labels.instance }}.'
99    expr: |
100      histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
101      > 0.25
102    for: 10m
103    labels:
104      severity: warning
105  - alert: etcdHighNumberOfFailedHTTPRequests
106    annotations:
107      message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
108        instance {{ $labels.instance }}'
109    expr: |
110      sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
111      BY (method) > 0.01
112    for: 10m
113    labels:
114      severity: warning
115  - alert: etcdHighNumberOfFailedHTTPRequests
116    annotations:
117      message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
118        instance {{ $labels.instance }}.'
119    expr: |
120      sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
121      BY (method) > 0.05
122    for: 10m
123    labels:
124      severity: critical
125  - alert: etcdHTTPRequestsSlow
126    annotations:
127      message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
128        }} are slow.
129    expr: |
130      histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
131      > 0.15
132    for: 10m
133    labels:
134      severity: warning
135