1local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
2local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
3local dashboard = grafana.dashboard;
4local row = grafana.row;
5local singlestat = grafana.singlestat;
6local prometheus = grafana.prometheus;
7local graphPanel = grafana.graphPanel;
8local tablePanel = grafana.tablePanel;
9local template = grafana.template;
10{
11  grafanaDashboards+:: {
12    'prometheus.json':
13      g.dashboard(
14        '%(prefix)sOverview' % $._config.grafanaPrometheus
15      )
16      .addMultiTemplate('job', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, 'job')
17      .addMultiTemplate('instance', 'prometheus_build_info{job=~"$job"}', 'instance')
18      .addRow(
19        g.row('Prometheus Stats')
20        .addPanel(
21          g.panel('Prometheus Stats') +
22          g.tablePanel([
23            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
24            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
25          ], {
26            job: { alias: 'Job' },
27            instance: { alias: 'Instance' },
28            version: { alias: 'Version' },
29            'Value #A': { alias: 'Count', type: 'hidden' },
30            'Value #B': { alias: 'Uptime' },
31          })
32        )
33      )
34      .addRow(
35        g.row('Discovery')
36        .addPanel(
37          g.panel('Target Sync') +
38          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
39          { yaxes: g.yaxes('ms') }
40        )
41        .addPanel(
42          g.panel('Targets') +
43          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
44          g.stack
45        )
46      )
47      .addRow(
48        g.row('Retrieval')
49        .addPanel(
50          g.panel('Average Scrape Interval Duration') +
51          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
52          { yaxes: g.yaxes('ms') }
53        )
54        .addPanel(
55          g.panel('Scrape failures') +
56          g.queryPanel([
57            'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))',
58            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
59            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
60            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
61            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
62          ], [
63            'exceeded body size limit: {{job}}',
64            'exceeded sample limit: {{job}}',
65            'duplicate timestamp: {{job}}',
66            'out of bounds: {{job}}',
67            'out of order: {{job}}',
68          ]) +
69          g.stack
70        )
71        .addPanel(
72          g.panel('Appended Samples') +
73          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
74          g.stack
75        )
76      )
77      .addRow(
78        g.row('Storage')
79        .addPanel(
80          g.panel('Head Series') +
81          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
82          g.stack
83        )
84        .addPanel(
85          g.panel('Head Chunks') +
86          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
87          g.stack
88        )
89      )
90      .addRow(
91        g.row('Query')
92        .addPanel(
93          g.panel('Query Rate') +
94          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
95          g.stack,
96        )
97        .addPanel(
98          g.panel('Stage Duration') +
99          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
100          { yaxes: g.yaxes('ms') } +
101          g.stack,
102        )
103      ) + {
104        tags: $._config.grafanaPrometheus.tags,
105        refresh: $._config.grafanaPrometheus.refresh,
106      },
107    // Remote write specific dashboard.
108    'prometheus-remote-write.json':
109      local timestampComparison =
110        graphPanel.new(
111          'Highest Timestamp In vs. Highest Timestamp Sent',
112          datasource='$datasource',
113          span=6,
114        )
115        .addTarget(prometheus.target(
116          |||
117            (
118              prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
119            -
120              ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"} != 0)
121            )
122          |||,
123          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
124        ));
125
126      local timestampComparisonRate =
127        graphPanel.new(
128          'Rate[5m]',
129          datasource='$datasource',
130          span=6,
131        )
132        .addTarget(prometheus.target(
133          |||
134            clamp_min(
135              rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
136            -
137              ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
138            , 0)
139          |||,
140          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
141        ));
142
143      local samplesRate =
144        graphPanel.new(
145          'Rate, in vs. succeeded or dropped [5m]',
146          datasource='$datasource',
147          span=12,
148        )
149        .addTarget(prometheus.target(
150          |||
151            rate(
152              prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
153            -
154              ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
155            -
156              (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
157          |||,
158          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
159        ));
160
161      local currentShards =
162        graphPanel.new(
163          'Current Shards',
164          datasource='$datasource',
165          span=12,
166          min_span=6,
167        )
168        .addTarget(prometheus.target(
169          'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}',
170          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
171        ));
172
173      local maxShards =
174        graphPanel.new(
175          'Max Shards',
176          datasource='$datasource',
177          span=4,
178        )
179        .addTarget(prometheus.target(
180          'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}',
181          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
182        ));
183
184      local minShards =
185        graphPanel.new(
186          'Min Shards',
187          datasource='$datasource',
188          span=4,
189        )
190        .addTarget(prometheus.target(
191          'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}',
192          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
193        ));
194
195      local desiredShards =
196        graphPanel.new(
197          'Desired Shards',
198          datasource='$datasource',
199          span=4,
200        )
201        .addTarget(prometheus.target(
202          'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}',
203          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
204        ));
205
206      local shardsCapacity =
207        graphPanel.new(
208          'Shard Capacity',
209          datasource='$datasource',
210          span=6,
211        )
212        .addTarget(prometheus.target(
213          'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}',
214          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
215        ));
216
217
218      local pendingSamples =
219        graphPanel.new(
220          'Pending Samples',
221          datasource='$datasource',
222          span=6,
223        )
224        .addTarget(prometheus.target(
225          'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance"}',
226          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
227        ));
228
229      local walSegment =
230        graphPanel.new(
231          'TSDB Current Segment',
232          datasource='$datasource',
233          span=6,
234          formatY1='none',
235        )
236        .addTarget(prometheus.target(
237          'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
238          legendFormat='{{cluster}}:{{instance}}'
239        ));
240
241      local queueSegment =
242        graphPanel.new(
243          'Remote Write Current Segment',
244          datasource='$datasource',
245          span=6,
246          formatY1='none',
247        )
248        .addTarget(prometheus.target(
249          'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
250          legendFormat='{{cluster}}:{{instance}} {{consumer}}'
251        ));
252
253      local droppedSamples =
254        graphPanel.new(
255          'Dropped Samples',
256          datasource='$datasource',
257          span=3,
258        )
259        .addTarget(prometheus.target(
260          'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
261          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
262        ));
263
264      local failedSamples =
265        graphPanel.new(
266          'Failed Samples',
267          datasource='$datasource',
268          span=3,
269        )
270        .addTarget(prometheus.target(
271          'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
272          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
273        ));
274
275      local retriedSamples =
276        graphPanel.new(
277          'Retried Samples',
278          datasource='$datasource',
279          span=3,
280        )
281        .addTarget(prometheus.target(
282          'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
283          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
284        ));
285
286      local enqueueRetries =
287        graphPanel.new(
288          'Enqueue Retries',
289          datasource='$datasource',
290          span=3,
291        )
292        .addTarget(prometheus.target(
293          'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
294          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
295        ));
296
297      dashboard.new(
298        title='%(prefix)sRemote Write' % $._config.grafanaPrometheus,
299        editable=true
300      )
301      .addTemplate(
302        {
303          hide: 0,
304          label: null,
305          name: 'datasource',
306          options: [],
307          query: 'prometheus',
308          refresh: 1,
309          regex: '',
310          type: 'datasource',
311        },
312      )
313      .addTemplate(
314        template.new(
315          'instance',
316          '$datasource',
317          'label_values(prometheus_build_info, instance)' % $._config,
318          refresh='time',
319          current={
320            selected: true,
321            text: 'All',
322            value: '$__all',
323          },
324          includeAll=true,
325        )
326      )
327      .addTemplate(
328        template.new(
329          'cluster',
330          '$datasource',
331          'label_values(kube_pod_container_info{image=~".*prometheus.*"}, cluster)' % $._config,
332          refresh='time',
333          current={
334            selected: true,
335            text: 'All',
336            value: '$__all',
337          },
338          includeAll=true,
339        )
340      )
341      .addTemplate(
342        template.new(
343          'url',
344          '$datasource',
345          'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config,
346          refresh='time',
347          includeAll=true,
348        )
349      )
350      .addRow(
351        row.new('Timestamps')
352        .addPanel(timestampComparison)
353        .addPanel(timestampComparisonRate)
354      )
355      .addRow(
356        row.new('Samples')
357        .addPanel(samplesRate)
358      )
359      .addRow(
360        row.new(
361          'Shards'
362        )
363        .addPanel(currentShards)
364        .addPanel(maxShards)
365        .addPanel(minShards)
366        .addPanel(desiredShards)
367      )
368      .addRow(
369        row.new('Shard Details')
370        .addPanel(shardsCapacity)
371        .addPanel(pendingSamples)
372      )
373      .addRow(
374        row.new('Segments')
375        .addPanel(walSegment)
376        .addPanel(queueSegment)
377      )
378      .addRow(
379        row.new('Misc. Rates')
380        .addPanel(droppedSamples)
381        .addPanel(failedSamples)
382        .addPanel(retriedSamples)
383        .addPanel(enqueueRetries)
384      ) + {
385        tags: $._config.grafanaPrometheus.tags,
386        refresh: $._config.grafanaPrometheus.refresh,
387      },
388  },
389}
390