1// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18	goruntime "runtime"
19	"time"
20
21	"github.com/coreos/etcd/pkg/runtime"
22	"github.com/coreos/etcd/version"
23	"github.com/prometheus/client_golang/prometheus"
24)
25
26var (
27	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
28		Namespace: "etcd",
29		Subsystem: "server",
30		Name:      "has_leader",
31		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
32	})
33	isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
34		Namespace: "etcd",
35		Subsystem: "server",
36		Name:      "is_leader",
37		Help:      "Whether or not this member is a leader. 1 if is, 0 otherwise.",
38	})
39	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
40		Namespace: "etcd",
41		Subsystem: "server",
42		Name:      "leader_changes_seen_total",
43		Help:      "The number of leader changes seen.",
44	})
45	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
46		Namespace: "etcd",
47		Subsystem: "server",
48		Name:      "heartbeat_send_failures_total",
49		Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
50	})
51	slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
52		Namespace: "etcd",
53		Subsystem: "server",
54		Name:      "slow_apply_total",
55		Help:      "The total number of slow apply requests (likely overloaded from slow disk).",
56	})
57	applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
58		Namespace: "etcd",
59		Subsystem: "server",
60		Name:      "snapshot_apply_in_progress_total",
61		Help:      "1 if the server is applying the incoming snapshot. 0 if none.",
62	})
63	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
64		Namespace: "etcd",
65		Subsystem: "server",
66		Name:      "proposals_committed_total",
67		Help:      "The total number of consensus proposals committed.",
68	})
69	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
70		Namespace: "etcd",
71		Subsystem: "server",
72		Name:      "proposals_applied_total",
73		Help:      "The total number of consensus proposals applied.",
74	})
75	proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
76		Namespace: "etcd",
77		Subsystem: "server",
78		Name:      "proposals_pending",
79		Help:      "The current number of pending proposals to commit.",
80	})
81	proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
82		Namespace: "etcd",
83		Subsystem: "server",
84		Name:      "proposals_failed_total",
85		Help:      "The total number of failed proposals seen.",
86	})
87	leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
88		Namespace: "etcd_debugging",
89		Subsystem: "server",
90		Name:      "lease_expired_total",
91		Help:      "The total number of expired leases.",
92	})
93	slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
94		Namespace: "etcd",
95		Subsystem: "server",
96		Name:      "slow_read_indexes_total",
97		Help:      "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
98	})
99	readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
100		Namespace: "etcd",
101		Subsystem: "server",
102		Name:      "read_indexes_failed_total",
103		Help:      "The total number of failed read indexes seen.",
104	})
105	quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
106		Namespace: "etcd",
107		Subsystem: "server",
108		Name:      "quota_backend_bytes",
109		Help:      "Current backend storage quota size in bytes.",
110	})
111	currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
112		Namespace: "etcd",
113		Subsystem: "server",
114		Name:      "version",
115		Help:      "Which version is running. 1 for 'server_version' label with current version.",
116	},
117		[]string{"server_version"})
118	currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
119		Namespace: "etcd",
120		Subsystem: "server",
121		Name:      "go_version",
122		Help:      "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
123	},
124		[]string{"server_go_version"})
125	serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
126		Namespace: "etcd",
127		Subsystem: "server",
128		Name:      "id",
129		Help:      "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
130	},
131		[]string{"server_id"})
132)
133
134func init() {
135	prometheus.MustRegister(hasLeader)
136	prometheus.MustRegister(isLeader)
137	prometheus.MustRegister(leaderChanges)
138	prometheus.MustRegister(heartbeatSendFailures)
139	prometheus.MustRegister(slowApplies)
140	prometheus.MustRegister(applySnapshotInProgress)
141	prometheus.MustRegister(proposalsCommitted)
142	prometheus.MustRegister(proposalsApplied)
143	prometheus.MustRegister(proposalsPending)
144	prometheus.MustRegister(proposalsFailed)
145	prometheus.MustRegister(leaseExpired)
146	prometheus.MustRegister(slowReadIndex)
147	prometheus.MustRegister(readIndexFailed)
148	prometheus.MustRegister(quotaBackendBytes)
149	prometheus.MustRegister(currentVersion)
150	prometheus.MustRegister(currentGoVersion)
151	prometheus.MustRegister(serverID)
152
153	currentVersion.With(prometheus.Labels{
154		"server_version": version.Version,
155	}).Set(1)
156	currentGoVersion.With(prometheus.Labels{
157		"server_go_version": goruntime.Version(),
158	}).Set(1)
159}
160
161func monitorFileDescriptor(done <-chan struct{}) {
162
163	// This ticker will check File Descriptor Requirements ,and count all fds in used.
164	// And recorded some logs when in used >= limit/5*4. Just recorded message.
165	// If fds was more than 10K,It's low performance due to FDUsage() works.
166	// So need to increase it.
167	// See https://github.com/etcd-io/etcd/issues/11969 for more detail.
168	ticker := time.NewTicker(10 * time.Minute)
169	defer ticker.Stop()
170	for {
171		used, err := runtime.FDUsage()
172		if err != nil {
173			plog.Errorf("cannot monitor file descriptor usage (%v)", err)
174			return
175		}
176		limit, err := runtime.FDLimit()
177		if err != nil {
178			plog.Errorf("cannot monitor file descriptor usage (%v)", err)
179			return
180		}
181		if used >= limit/5*4 {
182			plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit)
183		}
184		select {
185		case <-ticker.C:
186		case <-done:
187			return
188		}
189	}
190}
191