1package ingester 2 3import ( 4 "github.com/prometheus/client_golang/prometheus" 5 "github.com/prometheus/client_golang/prometheus/promauto" 6 7 "github.com/grafana/loki/pkg/validation" 8) 9 10type ingesterMetrics struct { 11 checkpointDeleteFail prometheus.Counter 12 checkpointDeleteTotal prometheus.Counter 13 checkpointCreationFail prometheus.Counter 14 checkpointCreationTotal prometheus.Counter 15 checkpointDuration prometheus.Summary 16 checkpointLoggedBytesTotal prometheus.Counter 17 18 walDiskFullFailures prometheus.Counter 19 walReplayActive prometheus.Gauge 20 walReplayDuration prometheus.Gauge 21 walReplaySamplesDropped *prometheus.CounterVec 22 walReplayBytesDropped *prometheus.CounterVec 23 walCorruptionsTotal *prometheus.CounterVec 24 walLoggedBytesTotal prometheus.Counter 25 walRecordsLogged prometheus.Counter 26 27 recoveredStreamsTotal prometheus.Counter 28 recoveredChunksTotal prometheus.Counter 29 recoveredEntriesTotal prometheus.Counter 30 duplicateEntriesTotal prometheus.Counter 31 recoveredBytesTotal prometheus.Counter 32 recoveryBytesInUse prometheus.Gauge 33 recoveryIsFlushing prometheus.Gauge 34 35 limiterEnabled prometheus.Gauge 36 37 autoForgetUnhealthyIngestersTotal prometheus.Counter 38} 39 40// setRecoveryBytesInUse bounds the bytes reports to >= 0. 41// TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends. 42func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) { 43 if v < 0 { 44 v = 0 45 } 46 m.recoveryBytesInUse.Set(float64(v)) 47} 48 49const ( 50 walTypeCheckpoint = "checkpoint" 51 walTypeSegment = "segment" 52 53 duplicateReason = "duplicate" 54) 55 56func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { 57 return &ingesterMetrics{ 58 walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{ 59 Name: "loki_ingester_wal_disk_full_failures_total", 60 Help: "Total number of wal write failures due to full disk.", 61 }), 62 walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 63 Name: "loki_ingester_wal_replay_active", 64 Help: "Whether the WAL is replaying", 65 }), 66 walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 67 Name: "loki_ingester_wal_replay_duration_seconds", 68 Help: "Time taken to replay the checkpoint and the WAL.", 69 }), 70 walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 71 Name: "loki_ingester_wal_discarded_samples_total", 72 Help: "WAL segment entries discarded during replay", 73 }, []string{validation.ReasonLabel}), 74 walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 75 Name: "loki_ingester_wal_discarded_bytes_total", 76 Help: "WAL segment bytes discarded during replay", 77 }, []string{validation.ReasonLabel}), 78 walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 79 Name: "loki_ingester_wal_corruptions_total", 80 Help: "Total number of WAL corruptions encountered.", 81 }, []string{"type"}), 82 checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{ 83 Name: "loki_ingester_checkpoint_deletions_failed_total", 84 Help: "Total number of checkpoint deletions that failed.", 85 }), 86 checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 87 Name: "loki_ingester_checkpoint_deletions_total", 88 Help: "Total number of checkpoint deletions attempted.", 89 }), 90 checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{ 91 Name: "loki_ingester_checkpoint_creations_failed_total", 92 Help: "Total number of checkpoint creations that failed.", 93 }), 94 checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 95 Name: "loki_ingester_checkpoint_creations_total", 96 Help: "Total number of checkpoint creations attempted.", 97 }), 98 checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{ 99 Name: "loki_ingester_checkpoint_duration_seconds", 100 Help: "Time taken to create a checkpoint.", 101 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 102 }), 103 walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{ 104 Name: "loki_ingester_wal_records_logged_total", 105 Help: "Total number of WAL records logged.", 106 }), 107 checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 108 Name: "loki_ingester_checkpoint_logged_bytes_total", 109 Help: "Total number of bytes written to disk for checkpointing.", 110 }), 111 walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 112 Name: "loki_ingester_wal_logged_bytes_total", 113 Help: "Total number of bytes written to disk for WAL records.", 114 }), 115 recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 116 Name: "loki_ingester_wal_recovered_streams_total", 117 Help: "Total number of streams recovered from the WAL.", 118 }), 119 recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 120 Name: "loki_ingester_wal_recovered_chunks_total", 121 Help: "Total number of chunks recovered from the WAL checkpoints.", 122 }), 123 recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 124 Name: "loki_ingester_wal_recovered_entries_total", 125 Help: "Total number of entries recovered from the WAL.", 126 }), 127 duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 128 Name: "loki_ingester_wal_duplicate_entries_total", 129 Help: "Entries discarded during WAL replay due to existing in checkpoints.", 130 }), 131 recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 132 Name: "loki_ingester_wal_recovered_bytes_total", 133 Help: "Total number of bytes recovered from the WAL.", 134 }), 135 recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 136 Name: "loki_ingester_wal_bytes_in_use", 137 Help: "Total number of bytes in use by the WAL recovery process.", 138 }), 139 recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 140 Name: "loki_ingester_wal_replay_flushing", 141 Help: "Whether the wal replay is in a flushing phase due to backpressure", 142 }), 143 limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 144 Name: "loki_ingester_limiter_enabled", 145 Help: "Whether the ingester's limiter is enabled", 146 }), 147 autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 148 Name: "loki_ingester_autoforget_unhealthy_ingesters_total", 149 Help: "Total number of ingesters automatically forgotten", 150 }), 151 } 152} 153