1// Copyright 2015 The etcd Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package etcdserver 16 17import ( 18 goruntime "runtime" 19 "time" 20 21 "go.etcd.io/etcd/pkg/runtime" 22 "go.etcd.io/etcd/version" 23 24 "github.com/prometheus/client_golang/prometheus" 25 "go.uber.org/zap" 26) 27 28var ( 29 hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{ 30 Namespace: "etcd", 31 Subsystem: "server", 32 Name: "has_leader", 33 Help: "Whether or not a leader exists. 1 is existence, 0 is not.", 34 }) 35 isLeader = prometheus.NewGauge(prometheus.GaugeOpts{ 36 Namespace: "etcd", 37 Subsystem: "server", 38 Name: "is_leader", 39 Help: "Whether or not this member is a leader. 1 if is, 0 otherwise.", 40 }) 41 leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{ 42 Namespace: "etcd", 43 Subsystem: "server", 44 Name: "leader_changes_seen_total", 45 Help: "The number of leader changes seen.", 46 }) 47 isLearner = prometheus.NewGauge(prometheus.GaugeOpts{ 48 Namespace: "etcd", 49 Subsystem: "server", 50 Name: "is_learner", 51 Help: "Whether or not this member is a learner. 1 if is, 0 otherwise.", 52 }) 53 learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{ 54 Namespace: "etcd", 55 Subsystem: "server", 56 Name: "learner_promote_failures", 57 Help: "The total number of failed learner promotions (likely learner not ready) while this member is leader.", 58 }, 59 []string{"Reason"}, 60 ) 61 learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{ 62 Namespace: "etcd", 63 Subsystem: "server", 64 Name: "learner_promote_successes", 65 Help: "The total number of successful learner promotions while this member is leader.", 66 }) 67 heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{ 68 Namespace: "etcd", 69 Subsystem: "server", 70 Name: "heartbeat_send_failures_total", 71 Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).", 72 }) 73 slowApplies = prometheus.NewCounter(prometheus.CounterOpts{ 74 Namespace: "etcd", 75 Subsystem: "server", 76 Name: "slow_apply_total", 77 Help: "The total number of slow apply requests (likely overloaded from slow disk).", 78 }) 79 applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{ 80 Namespace: "etcd", 81 Subsystem: "server", 82 Name: "snapshot_apply_in_progress_total", 83 Help: "1 if the server is applying the incoming snapshot. 0 if none.", 84 }) 85 proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{ 86 Namespace: "etcd", 87 Subsystem: "server", 88 Name: "proposals_committed_total", 89 Help: "The total number of consensus proposals committed.", 90 }) 91 proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{ 92 Namespace: "etcd", 93 Subsystem: "server", 94 Name: "proposals_applied_total", 95 Help: "The total number of consensus proposals applied.", 96 }) 97 proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{ 98 Namespace: "etcd", 99 Subsystem: "server", 100 Name: "proposals_pending", 101 Help: "The current number of pending proposals to commit.", 102 }) 103 proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{ 104 Namespace: "etcd", 105 Subsystem: "server", 106 Name: "proposals_failed_total", 107 Help: "The total number of failed proposals seen.", 108 }) 109 slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{ 110 Namespace: "etcd", 111 Subsystem: "server", 112 Name: "slow_read_indexes_total", 113 Help: "The total number of pending read indexes not in sync with leader's or timed out read index requests.", 114 }) 115 readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{ 116 Namespace: "etcd", 117 Subsystem: "server", 118 Name: "read_indexes_failed_total", 119 Help: "The total number of failed read indexes seen.", 120 }) 121 leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{ 122 Namespace: "etcd_debugging", 123 Subsystem: "server", 124 Name: "lease_expired_total", 125 Help: "The total number of expired leases.", 126 }) 127 quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{ 128 Namespace: "etcd", 129 Subsystem: "server", 130 Name: "quota_backend_bytes", 131 Help: "Current backend storage quota size in bytes.", 132 }) 133 currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 134 Namespace: "etcd", 135 Subsystem: "server", 136 Name: "version", 137 Help: "Which version is running. 1 for 'server_version' label with current version.", 138 }, 139 []string{"server_version"}) 140 currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 141 Namespace: "etcd", 142 Subsystem: "server", 143 Name: "go_version", 144 Help: "Which Go version server is running with. 1 for 'server_go_version' label with current version.", 145 }, 146 []string{"server_go_version"}) 147 serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 148 Namespace: "etcd", 149 Subsystem: "server", 150 Name: "id", 151 Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.", 152 }, 153 []string{"server_id"}) 154 155 fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{ 156 Namespace: "os", 157 Subsystem: "fd", 158 Name: "used", 159 Help: "The number of used file descriptors.", 160 }) 161 fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{ 162 Namespace: "os", 163 Subsystem: "fd", 164 Name: "limit", 165 Help: "The file descriptor limit.", 166 }) 167) 168 169func init() { 170 prometheus.MustRegister(hasLeader) 171 prometheus.MustRegister(isLeader) 172 prometheus.MustRegister(leaderChanges) 173 prometheus.MustRegister(heartbeatSendFailures) 174 prometheus.MustRegister(slowApplies) 175 prometheus.MustRegister(applySnapshotInProgress) 176 prometheus.MustRegister(proposalsCommitted) 177 prometheus.MustRegister(proposalsApplied) 178 prometheus.MustRegister(proposalsPending) 179 prometheus.MustRegister(proposalsFailed) 180 prometheus.MustRegister(slowReadIndex) 181 prometheus.MustRegister(readIndexFailed) 182 prometheus.MustRegister(leaseExpired) 183 prometheus.MustRegister(quotaBackendBytes) 184 prometheus.MustRegister(currentVersion) 185 prometheus.MustRegister(currentGoVersion) 186 prometheus.MustRegister(serverID) 187 prometheus.MustRegister(isLearner) 188 prometheus.MustRegister(learnerPromoteSucceed) 189 prometheus.MustRegister(learnerPromoteFailed) 190 prometheus.MustRegister(fdUsed) 191 prometheus.MustRegister(fdLimit) 192 193 currentVersion.With(prometheus.Labels{ 194 "server_version": version.Version, 195 }).Set(1) 196 currentGoVersion.With(prometheus.Labels{ 197 "server_go_version": goruntime.Version(), 198 }).Set(1) 199} 200 201func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) { 202 // This ticker will check File Descriptor Requirements ,and count all fds in used. 203 // And recorded some logs when in used >= limit/5*4. Just recorded message. 204 // If fds was more than 10K,It's low performance due to FDUsage() works. 205 // So need to increase it. 206 // See https://github.com/etcd-io/etcd/issues/11969 for more detail. 207 ticker := time.NewTicker(10 * time.Minute) 208 defer ticker.Stop() 209 for { 210 used, err := runtime.FDUsage() 211 if err != nil { 212 if lg != nil { 213 lg.Warn("failed to get file descriptor usage", zap.Error(err)) 214 } else { 215 plog.Errorf("cannot monitor file descriptor usage (%v)", err) 216 } 217 return 218 } 219 fdUsed.Set(float64(used)) 220 limit, err := runtime.FDLimit() 221 if err != nil { 222 if lg != nil { 223 lg.Warn("failed to get file descriptor limit", zap.Error(err)) 224 } else { 225 plog.Errorf("cannot monitor file descriptor usage (%v)", err) 226 } 227 return 228 } 229 fdLimit.Set(float64(limit)) 230 if used >= limit/5*4 { 231 if lg != nil { 232 lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit)) 233 } else { 234 plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit) 235 } 236 } 237 select { 238 case <-ticker.C: 239 case <-done: 240 return 241 } 242 } 243} 244