1package ingester
2
3import (
4	"github.com/prometheus/client_golang/prometheus"
5	"github.com/prometheus/client_golang/prometheus/promauto"
6
7	"github.com/grafana/loki/pkg/validation"
8)
9
10type ingesterMetrics struct {
11	checkpointDeleteFail       prometheus.Counter
12	checkpointDeleteTotal      prometheus.Counter
13	checkpointCreationFail     prometheus.Counter
14	checkpointCreationTotal    prometheus.Counter
15	checkpointDuration         prometheus.Summary
16	checkpointLoggedBytesTotal prometheus.Counter
17
18	walDiskFullFailures     prometheus.Counter
19	walReplayActive         prometheus.Gauge
20	walReplayDuration       prometheus.Gauge
21	walReplaySamplesDropped *prometheus.CounterVec
22	walReplayBytesDropped   *prometheus.CounterVec
23	walCorruptionsTotal     *prometheus.CounterVec
24	walLoggedBytesTotal     prometheus.Counter
25	walRecordsLogged        prometheus.Counter
26
27	recoveredStreamsTotal prometheus.Counter
28	recoveredChunksTotal  prometheus.Counter
29	recoveredEntriesTotal prometheus.Counter
30	duplicateEntriesTotal prometheus.Counter
31	recoveredBytesTotal   prometheus.Counter
32	recoveryBytesInUse    prometheus.Gauge
33	recoveryIsFlushing    prometheus.Gauge
34
35	limiterEnabled prometheus.Gauge
36
37	autoForgetUnhealthyIngestersTotal prometheus.Counter
38}
39
40// setRecoveryBytesInUse bounds the bytes reports to >= 0.
41// TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends.
42func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) {
43	if v < 0 {
44		v = 0
45	}
46	m.recoveryBytesInUse.Set(float64(v))
47}
48
// Names for the two kinds of WAL data.
// NOTE(review): presumably the values for the "type" label of
// walCorruptionsTotal; confirm against the call sites.
const (
	walTypeCheckpoint = "checkpoint"
	walTypeSegment    = "segment"
)

// duplicateReason is a reason string for duplicate entries.
// NOTE(review): presumably used as the validation.ReasonLabel value on the
// WAL discard counters; confirm against the call sites.
const duplicateReason = "duplicate"
55
56func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
57	return &ingesterMetrics{
58		walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
59			Name: "loki_ingester_wal_disk_full_failures_total",
60			Help: "Total number of wal write failures due to full disk.",
61		}),
62		walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
63			Name: "loki_ingester_wal_replay_active",
64			Help: "Whether the WAL is replaying",
65		}),
66		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
67			Name: "loki_ingester_wal_replay_duration_seconds",
68			Help: "Time taken to replay the checkpoint and the WAL.",
69		}),
70		walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
71			Name: "loki_ingester_wal_discarded_samples_total",
72			Help: "WAL segment entries discarded during replay",
73		}, []string{validation.ReasonLabel}),
74		walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
75			Name: "loki_ingester_wal_discarded_bytes_total",
76			Help: "WAL segment bytes discarded during replay",
77		}, []string{validation.ReasonLabel}),
78		walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
79			Name: "loki_ingester_wal_corruptions_total",
80			Help: "Total number of WAL corruptions encountered.",
81		}, []string{"type"}),
82		checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
83			Name: "loki_ingester_checkpoint_deletions_failed_total",
84			Help: "Total number of checkpoint deletions that failed.",
85		}),
86		checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
87			Name: "loki_ingester_checkpoint_deletions_total",
88			Help: "Total number of checkpoint deletions attempted.",
89		}),
90		checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
91			Name: "loki_ingester_checkpoint_creations_failed_total",
92			Help: "Total number of checkpoint creations that failed.",
93		}),
94		checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
95			Name: "loki_ingester_checkpoint_creations_total",
96			Help: "Total number of checkpoint creations attempted.",
97		}),
98		checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
99			Name:       "loki_ingester_checkpoint_duration_seconds",
100			Help:       "Time taken to create a checkpoint.",
101			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
102		}),
103		walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{
104			Name: "loki_ingester_wal_records_logged_total",
105			Help: "Total number of WAL records logged.",
106		}),
107		checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
108			Name: "loki_ingester_checkpoint_logged_bytes_total",
109			Help: "Total number of bytes written to disk for checkpointing.",
110		}),
111		walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
112			Name: "loki_ingester_wal_logged_bytes_total",
113			Help: "Total number of bytes written to disk for WAL records.",
114		}),
115		recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
116			Name: "loki_ingester_wal_recovered_streams_total",
117			Help: "Total number of streams recovered from the WAL.",
118		}),
119		recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
120			Name: "loki_ingester_wal_recovered_chunks_total",
121			Help: "Total number of chunks recovered from the WAL checkpoints.",
122		}),
123		recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
124			Name: "loki_ingester_wal_recovered_entries_total",
125			Help: "Total number of entries recovered from the WAL.",
126		}),
127		duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
128			Name: "loki_ingester_wal_duplicate_entries_total",
129			Help: "Entries discarded during WAL replay due to existing in checkpoints.",
130		}),
131		recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
132			Name: "loki_ingester_wal_recovered_bytes_total",
133			Help: "Total number of bytes recovered from the WAL.",
134		}),
135		recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{
136			Name: "loki_ingester_wal_bytes_in_use",
137			Help: "Total number of bytes in use by the WAL recovery process.",
138		}),
139		recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{
140			Name: "loki_ingester_wal_replay_flushing",
141			Help: "Whether the wal replay is in a flushing phase due to backpressure",
142		}),
143		limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{
144			Name: "loki_ingester_limiter_enabled",
145			Help: "Whether the ingester's limiter is enabled",
146		}),
147		autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
148			Name: "loki_ingester_autoforget_unhealthy_ingesters_total",
149			Help: "Total number of ingesters automatically forgotten",
150		}),
151	}
152}
153