1// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18	goruntime "runtime"
19	"time"
20
21	"go.etcd.io/etcd/pkg/runtime"
22	"go.etcd.io/etcd/version"
23
24	"github.com/prometheus/client_golang/prometheus"
25	"go.uber.org/zap"
26)
27
// Prometheus collectors published by this package. Every collector declared
// here is registered with the default registry in init. The exported metric
// name is built from Namespace, Subsystem and Name.
var (
	// hasLeader is a gauge: 1 when a leader exists, 0 when it does not.
	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "has_leader",
		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
	})
	// isLeader is a gauge: 1 when this member is the leader, 0 otherwise.
	isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "is_leader",
		Help:      "Whether or not this member is a leader. 1 if is, 0 otherwise.",
	})
	// leaderChanges counts the leader changes seen by this member.
	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "leader_changes_seen_total",
		Help:      "The number of leader changes seen.",
	})
	// isLearner is a gauge: 1 when this member is a learner, 0 otherwise.
	isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "is_learner",
		Help:      "Whether or not this member is a learner. 1 if is, 0 otherwise.",
	})
	// learnerPromoteFailed counts failed learner promotions, partitioned by
	// the "Reason" label.
	// NOTE(review): unlike the other counters in this block the name has no
	// "_total" suffix and the label is capitalized; both are part of the
	// exported metric, so renaming would break existing consumers.
	learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "learner_promote_failures",
		Help:      "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
	},
		[]string{"Reason"},
	)
	// learnerPromoteSucceed counts successful learner promotions.
	// NOTE(review): the name has no "_total" suffix, unlike most counters here.
	learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "learner_promote_successes",
		Help:      "The total number of successful learner promotions while this member is leader.",
	})
	// heartbeatSendFailures counts leader heartbeat send failures.
	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "heartbeat_send_failures_total",
		Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
	})
	// slowApplies counts apply requests that were considered slow.
	slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_apply_total",
		Help:      "The total number of slow apply requests (likely overloaded from slow disk).",
	})
	// applySnapshotInProgress is a gauge: 1 while an incoming snapshot is
	// being applied, 0 otherwise.
	// NOTE(review): the name ends in "_total" although this is a 0/1 gauge.
	applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "snapshot_apply_in_progress_total",
		Help:      "1 if the server is applying the incoming snapshot. 0 if none.",
	})
	// proposalsCommitted reports the number of consensus proposals committed.
	// NOTE(review): declared as a Gauge despite the "_total" suffix; how it is
	// updated is not visible in this file.
	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_committed_total",
		Help:      "The total number of consensus proposals committed.",
	})
	// proposalsApplied reports the number of consensus proposals applied.
	// NOTE(review): declared as a Gauge despite the "_total" suffix; how it is
	// updated is not visible in this file.
	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_applied_total",
		Help:      "The total number of consensus proposals applied.",
	})
	// proposalsPending is a gauge of proposals currently waiting to commit.
	proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_pending",
		Help:      "The current number of pending proposals to commit.",
	})
	// proposalsFailed counts failed proposals.
	proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_failed_total",
		Help:      "The total number of failed proposals seen.",
	})
	// slowReadIndex counts read index requests that were out of sync with the
	// leader's or that timed out.
	slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_read_indexes_total",
		Help:      "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
	})
	// readIndexFailed counts failed read index requests.
	readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "read_indexes_failed_total",
		Help:      "The total number of failed read indexes seen.",
	})
	// leaseExpired counts expired leases. It is the only collector in this
	// block published under the "etcd_debugging" namespace.
	leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd_debugging",
		Subsystem: "server",
		Name:      "lease_expired_total",
		Help:      "The total number of expired leases.",
	})
	// quotaBackendBytes is a gauge of the backend storage quota, in bytes.
	quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "quota_backend_bytes",
		Help:      "Current backend storage quota size in bytes.",
	})
	// currentVersion carries the running etcd version in its "server_version"
	// label; init sets the labelled series to 1.
	currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "version",
		Help:      "Which version is running. 1 for 'server_version' label with current version.",
	},
		[]string{"server_version"})
	// currentGoVersion carries the Go runtime version in its
	// "server_go_version" label; init sets the labelled series to 1.
	currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "go_version",
		Help:      "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
	},
		[]string{"server_go_version"})
	// serverID carries the member ID in its "server_id" label. The labelled
	// series is not set in this file.
	serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "id",
		Help:      "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
	},
		[]string{"server_id"})

	// fdUsed is a gauge of file descriptors currently in use. It lives in the
	// "os"/"fd" namespace rather than "etcd"/"server" and is updated by
	// monitorFileDescriptor.
	fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "used",
		Help:      "The number of used file descriptors.",
	})
	// fdLimit is a gauge of the file descriptor limit, updated by
	// monitorFileDescriptor.
	fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "limit",
		Help:      "The file descriptor limit.",
	})
)
168
169func init() {
170	prometheus.MustRegister(hasLeader)
171	prometheus.MustRegister(isLeader)
172	prometheus.MustRegister(leaderChanges)
173	prometheus.MustRegister(heartbeatSendFailures)
174	prometheus.MustRegister(slowApplies)
175	prometheus.MustRegister(applySnapshotInProgress)
176	prometheus.MustRegister(proposalsCommitted)
177	prometheus.MustRegister(proposalsApplied)
178	prometheus.MustRegister(proposalsPending)
179	prometheus.MustRegister(proposalsFailed)
180	prometheus.MustRegister(slowReadIndex)
181	prometheus.MustRegister(readIndexFailed)
182	prometheus.MustRegister(leaseExpired)
183	prometheus.MustRegister(quotaBackendBytes)
184	prometheus.MustRegister(currentVersion)
185	prometheus.MustRegister(currentGoVersion)
186	prometheus.MustRegister(serverID)
187	prometheus.MustRegister(isLearner)
188	prometheus.MustRegister(learnerPromoteSucceed)
189	prometheus.MustRegister(learnerPromoteFailed)
190	prometheus.MustRegister(fdUsed)
191	prometheus.MustRegister(fdLimit)
192
193	currentVersion.With(prometheus.Labels{
194		"server_version": version.Version,
195	}).Set(1)
196	currentGoVersion.With(prometheus.Labels{
197		"server_go_version": goruntime.Version(),
198	}).Set(1)
199}
200
201func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
202	// This ticker will check File Descriptor Requirements ,and count all fds in used.
203	// And recorded some logs when in used >= limit/5*4. Just recorded message.
204	// If fds was more than 10K,It's low performance due to FDUsage() works.
205	// So need to increase it.
206	// See https://github.com/etcd-io/etcd/issues/11969 for more detail.
207	ticker := time.NewTicker(10 * time.Minute)
208	defer ticker.Stop()
209	for {
210		used, err := runtime.FDUsage()
211		if err != nil {
212			if lg != nil {
213				lg.Warn("failed to get file descriptor usage", zap.Error(err))
214			} else {
215				plog.Errorf("cannot monitor file descriptor usage (%v)", err)
216			}
217			return
218		}
219		fdUsed.Set(float64(used))
220		limit, err := runtime.FDLimit()
221		if err != nil {
222			if lg != nil {
223				lg.Warn("failed to get file descriptor limit", zap.Error(err))
224			} else {
225				plog.Errorf("cannot monitor file descriptor usage (%v)", err)
226			}
227			return
228		}
229		fdLimit.Set(float64(limit))
230		if used >= limit/5*4 {
231			if lg != nil {
232				lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
233			} else {
234				plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit)
235			}
236		}
237		select {
238		case <-ticker.C:
239		case <-done:
240			return
241		}
242	}
243}
244