// Grafana dashboards for the Prometheus mixin, assembled with
// grafonnet-lib (`grafana.*`) and grafana-builder (`g.*`).
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';

local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;

{
  grafanaDashboards+:: {
    // Overview dashboard: build/uptime stats, target discovery, scraping,
    // TSDB head and query-engine metrics for the Prometheus servers selected
    // via the $job / $instance multi-select template variables.
    'prometheus.json':
      g.dashboard(
        '%(prefix)sOverview' % $._config.grafanaPrometheus
      )
      .addMultiTemplate('job', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, 'job')
      .addMultiTemplate('instance', 'prometheus_build_info{job=~"$job"}', 'instance')
      .addRow(
        // One table row per (job, instance, version) with uptime.
        g.row('Prometheus Stats')
        .addPanel(
          g.panel('Prometheus Stats') +
          g.tablePanel([
            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
          ], {
            job: { alias: 'Job' },
            instance: { alias: 'Instance' },
            version: { alias: 'Version' },
            // Query A only exists to produce the grouping; hide its value.
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      .addRow(
        g.row('Discovery')
        .addPanel(
          g.panel('Target Sync') +
          // * 1e3 converts seconds to the panel's 'ms' unit.
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
          g.stack
        )
      )
      .addRow(
        g.row('Retrieval')
        .addPanel(
          g.panel('Average Scrape Interval Duration') +
          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          // Fixed: these queries previously carried no job/instance matchers,
          // so this panel ignored the dashboard's template variables while
          // every other panel honoured them.
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"$job",instance=~"$instance"}[1m]))',
          ], [
            'exceeded body size limit: {{job}}',
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
          g.stack
        )
      )
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Head Series') +
          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
          g.stack
        )
        .addPanel(
          g.panel('Head Chunks') +
          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
          g.stack
        )
      )
      .addRow(
        g.row('Query')
        .addPanel(
          g.panel('Query Rate') +
          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
          g.stack,
        )
        .addPanel(
          g.panel('Stage Duration') +
          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
          { yaxes: g.yaxes('ms') } +
          g.stack,
        )
      ) + {
        tags: $._config.grafanaPrometheus.tags,
        refresh: $._config.grafanaPrometheus.refresh,
      },
    // Remote write specific dashboard.
    // Remote-write health dashboard: ingest-vs-send timestamp lag, shard
    // counts, WAL/queue segment progress and failure/retry rates per remote
    // endpoint, filtered by the $cluster / $instance / $url variables.
    'prometheus-remote-write.json':
      // Lag (seconds) between the newest sample ingested and the newest
      // sample successfully sent per remote; `!= 0` drops queues that have
      // never sent anything.
      local timestampComparison =
        graphPanel.new(
          'Highest Timestamp In vs. Highest Timestamp Sent',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          |||
            (
              prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
            -
              ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"} != 0)
            )
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        ));

      // Rate of change of the lag above; clamp_min(…, 0) hides negative
      // transients so only a growing lag shows.
      local timestampComparisonRate =
        graphPanel.new(
          'Rate[5m]',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          |||
            clamp_min(
              rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
            -
              ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
            , 0)
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        ));

      // Samples ingested minus samples sent-or-dropped per remote.  The
      // `or` clauses query two metric names for the same quantity —
      // NOTE(review): presumably old vs. new names across Prometheus
      // versions; verify against the target Prometheus release.
      local samplesRate =
        graphPanel.new(
          'Rate, in vs. succeeded or dropped [5m]',
          datasource='$datasource',
          span=12,
        )
        .addTarget(prometheus.target(
          |||
            rate(
              prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
            -
              ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
            -
              (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Number of shards currently running per remote queue.
      local currentShards =
        graphPanel.new(
          'Current Shards',
          datasource='$datasource',
          span=12,
          min_span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Configured upper bound on shard count.
      local maxShards =
        graphPanel.new(
          'Max Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Configured lower bound on shard count.
      local minShards =
        graphPanel.new(
          'Min Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Shard count the sharding calculation wants; compare against
      // max/current to spot throughput saturation.
      local desiredShards =
        graphPanel.new(
          'Desired Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Configured per-shard queue capacity.
      local shardsCapacity =
        graphPanel.new(
          'Shard Capacity',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));


      // Samples waiting in shard queues (`or` covers an alternate metric
      // name — see note on samplesRate).
      local pendingSamples =
        graphPanel.new(
          'Pending Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // WAL segment the TSDB is currently writing.
      local walSegment =
        graphPanel.new(
          'TSDB Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',  // plain number, no unit scaling
        )
        .addTarget(prometheus.target(
          'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}}'
        ));

      // WAL segment the remote-write watcher is reading; lagging behind
      // the TSDB segment above indicates remote write falling behind.
      local queueSegment =
        graphPanel.new(
          'Remote Write Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',  // plain number, no unit scaling
        )
        .addTarget(prometheus.target(
          'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{consumer}}'
        ));

      // Samples intentionally not sent (e.g. filtered); `or` covers an
      // alternate metric name.
      local droppedSamples =
        graphPanel.new(
          'Dropped Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Samples that failed to send (non-recoverable); `or` covers an
      // alternate metric name.
      local failedSamples =
        graphPanel.new(
          'Failed Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Samples re-sent after a recoverable error; `or` covers an
      // alternate metric name.
      local retriedSamples =
        graphPanel.new(
          'Retried Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      // Retries when enqueueing samples into shard queues (queues full).
      local enqueueRetries =
        graphPanel.new(
          'Enqueue Retries',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));

      dashboard.new(
        title='%(prefix)sRemote Write' % $._config.grafanaPrometheus,
        editable=true
      )
      // Raw template object (not template.new): a datasource-type variable.
      .addTemplate(
        {
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      // NOTE(review): the `% $._config` here is a no-op — the query string
      // contains no %(…) placeholders; possibly a selector such as
      // %(prometheusSelector)s was intended. Confirm before changing.
      .addTemplate(
        template.new(
          'instance',
          '$datasource',
          'label_values(prometheus_build_info, instance)' % $._config,
          refresh='time',  // re-evaluate when the time range changes
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        )
      )
      // Cluster label discovered via kube-state-metrics container info;
      // `% $._config` is likewise a no-op here.
      .addTemplate(
        template.new(
          'cluster',
          '$datasource',
          'label_values(kube_pod_container_info{image=~".*prometheus.*"}, cluster)' % $._config,
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        )
      )
      // Remote endpoint URLs, narrowed by the cluster/instance selections;
      // `% $._config` is likewise a no-op here.
      .addTemplate(
        template.new(
          'url',
          '$datasource',
          'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config,
          refresh='time',
          includeAll=true,
        )
      )
      .addRow(
        row.new('Timestamps')
        .addPanel(timestampComparison)
        .addPanel(timestampComparisonRate)
      )
      .addRow(
        row.new('Samples')
        .addPanel(samplesRate)
      )
      .addRow(
        row.new(
          'Shards'
        )
        .addPanel(currentShards)
        .addPanel(maxShards)
        .addPanel(minShards)
        .addPanel(desiredShards)
      )
      .addRow(
        row.new('Shard Details')
        .addPanel(shardsCapacity)
        .addPanel(pendingSamples)
      )
      .addRow(
        row.new('Segments')
        .addPanel(walSegment)
        .addPanel(queueSegment)
      )
      .addRow(
        row.new('Misc. Rates')
        .addPanel(droppedSamples)
        .addPanel(failedSamples)
        .addPanel(retriedSamples)
        .addPanel(enqueueRetries)
      ) + {
        // Dashboard-level tags and auto-refresh come from the mixin config.
        tags: $._config.grafanaPrometheus.tags,
        refresh: $._config.grafanaPrometheus.refresh,
      },
  },
}