1// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18	goruntime "runtime"
19	"time"
20
21	"go.etcd.io/etcd/api/v3/version"
22	"go.etcd.io/etcd/pkg/v3/runtime"
23
24	"github.com/prometheus/client_golang/prometheus"
25	"go.uber.org/zap"
26)
27
28var (
29	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
30		Namespace: "etcd",
31		Subsystem: "server",
32		Name:      "has_leader",
33		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
34	})
35	isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
36		Namespace: "etcd",
37		Subsystem: "server",
38		Name:      "is_leader",
39		Help:      "Whether or not this member is a leader. 1 if is, 0 otherwise.",
40	})
41	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
42		Namespace: "etcd",
43		Subsystem: "server",
44		Name:      "leader_changes_seen_total",
45		Help:      "The number of leader changes seen.",
46	})
47	isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
48		Namespace: "etcd",
49		Subsystem: "server",
50		Name:      "is_learner",
51		Help:      "Whether or not this member is a learner. 1 if is, 0 otherwise.",
52	})
53	learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
54		Namespace: "etcd",
55		Subsystem: "server",
56		Name:      "learner_promote_failures",
57		Help:      "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
58	},
59		[]string{"Reason"},
60	)
61	learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
62		Namespace: "etcd",
63		Subsystem: "server",
64		Name:      "learner_promote_successes",
65		Help:      "The total number of successful learner promotions while this member is leader.",
66	})
67	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
68		Namespace: "etcd",
69		Subsystem: "server",
70		Name:      "heartbeat_send_failures_total",
71		Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
72	})
73	slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
74		Namespace: "etcd",
75		Subsystem: "server",
76		Name:      "slow_apply_total",
77		Help:      "The total number of slow apply requests (likely overloaded from slow disk).",
78	})
79	applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
80		Namespace: "etcd",
81		Subsystem: "server",
82		Name:      "snapshot_apply_in_progress_total",
83		Help:      "1 if the server is applying the incoming snapshot. 0 if none.",
84	})
85	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
86		Namespace: "etcd",
87		Subsystem: "server",
88		Name:      "proposals_committed_total",
89		Help:      "The total number of consensus proposals committed.",
90	})
91	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
92		Namespace: "etcd",
93		Subsystem: "server",
94		Name:      "proposals_applied_total",
95		Help:      "The total number of consensus proposals applied.",
96	})
97	proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
98		Namespace: "etcd",
99		Subsystem: "server",
100		Name:      "proposals_pending",
101		Help:      "The current number of pending proposals to commit.",
102	})
103	proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
104		Namespace: "etcd",
105		Subsystem: "server",
106		Name:      "proposals_failed_total",
107		Help:      "The total number of failed proposals seen.",
108	})
109	slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
110		Namespace: "etcd",
111		Subsystem: "server",
112		Name:      "slow_read_indexes_total",
113		Help:      "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
114	})
115	readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
116		Namespace: "etcd",
117		Subsystem: "server",
118		Name:      "read_indexes_failed_total",
119		Help:      "The total number of failed read indexes seen.",
120	})
121	leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
122		Namespace: "etcd_debugging",
123		Subsystem: "server",
124		Name:      "lease_expired_total",
125		Help:      "The total number of expired leases.",
126	})
127	quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
128		Namespace: "etcd",
129		Subsystem: "server",
130		Name:      "quota_backend_bytes",
131		Help:      "Current backend storage quota size in bytes.",
132	})
133	currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
134		Namespace: "etcd",
135		Subsystem: "server",
136		Name:      "version",
137		Help:      "Which version is running. 1 for 'server_version' label with current version.",
138	},
139		[]string{"server_version"})
140	currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
141		Namespace: "etcd",
142		Subsystem: "server",
143		Name:      "go_version",
144		Help:      "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
145	},
146		[]string{"server_go_version"})
147	serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
148		Namespace: "etcd",
149		Subsystem: "server",
150		Name:      "id",
151		Help:      "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
152	},
153		[]string{"server_id"})
154
155	fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
156		Namespace: "os",
157		Subsystem: "fd",
158		Name:      "used",
159		Help:      "The number of used file descriptors.",
160	})
161	fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
162		Namespace: "os",
163		Subsystem: "fd",
164		Name:      "limit",
165		Help:      "The file descriptor limit.",
166	})
167	applySec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
168		Namespace: "etcd",
169		Subsystem: "server",
170		Name:      "apply_duration_seconds",
171		Help:      "The latency distributions of v2 apply called by backend.",
172
173		// lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2
174		// highest bucket start of 0.0001 sec * 2^19 == 52.4288 sec
175		Buckets: prometheus.ExponentialBuckets(0.0001, 2, 20),
176	},
177		[]string{"version", "op", "success"})
178)
179
180func init() {
181	prometheus.MustRegister(hasLeader)
182	prometheus.MustRegister(isLeader)
183	prometheus.MustRegister(leaderChanges)
184	prometheus.MustRegister(heartbeatSendFailures)
185	prometheus.MustRegister(slowApplies)
186	prometheus.MustRegister(applySnapshotInProgress)
187	prometheus.MustRegister(proposalsCommitted)
188	prometheus.MustRegister(proposalsApplied)
189	prometheus.MustRegister(proposalsPending)
190	prometheus.MustRegister(proposalsFailed)
191	prometheus.MustRegister(slowReadIndex)
192	prometheus.MustRegister(readIndexFailed)
193	prometheus.MustRegister(leaseExpired)
194	prometheus.MustRegister(quotaBackendBytes)
195	prometheus.MustRegister(currentVersion)
196	prometheus.MustRegister(currentGoVersion)
197	prometheus.MustRegister(serverID)
198	prometheus.MustRegister(isLearner)
199	prometheus.MustRegister(learnerPromoteSucceed)
200	prometheus.MustRegister(learnerPromoteFailed)
201	prometheus.MustRegister(fdUsed)
202	prometheus.MustRegister(fdLimit)
203	prometheus.MustRegister(applySec)
204
205	currentVersion.With(prometheus.Labels{
206		"server_version": version.Version,
207	}).Set(1)
208	currentGoVersion.With(prometheus.Labels{
209		"server_go_version": goruntime.Version(),
210	}).Set(1)
211}
212
213func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
214	// This ticker will check File Descriptor Requirements ,and count all fds in used.
215	// And recorded some logs when in used >= limit/5*4. Just recorded message.
216	// If fds was more than 10K,It's low performance due to FDUsage() works.
217	// So need to increase it.
218	// See https://github.com/etcd-io/etcd/issues/11969 for more detail.
219	ticker := time.NewTicker(10 * time.Minute)
220	defer ticker.Stop()
221	for {
222		used, err := runtime.FDUsage()
223		if err != nil {
224			lg.Warn("failed to get file descriptor usage", zap.Error(err))
225			return
226		}
227		fdUsed.Set(float64(used))
228		limit, err := runtime.FDLimit()
229		if err != nil {
230			lg.Warn("failed to get file descriptor limit", zap.Error(err))
231			return
232		}
233		fdLimit.Set(float64(limit))
234		if used >= limit/5*4 {
235			lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
236		}
237		select {
238		case <-ticker.C:
239		case <-done:
240			return
241		}
242	}
243}
244