1# frozen_string_literal: true
2
3require 'spec_helper'
4
5RSpec.describe Gitlab::UsageData::Topology do
6  include UsageDataHelpers
7
8  describe '#topology_usage_data' do
9    subject { topology.topology_usage_data }
10
11    let(:topology) { described_class.new }
12    let(:prometheus_client) { Gitlab::PrometheusClient.new('http://localhost:9090') }
13    let(:fallback) { {} }
14
15    before do
16      # this pins down time shifts when benchmarking durations
17      allow(Process).to receive(:clock_gettime).and_return(0)
18    end
19
20    shared_examples 'query topology data from Prometheus' do
21      context 'tracking node metrics' do
22        it 'contains node level metrics for each instance' do
23          expect_prometheus_client_to(
24            receive_app_request_volume_query,
25            receive_query_apdex_ratio_query,
26            receive_node_memory_query,
27            receive_node_memory_utilization_query,
28            receive_node_cpu_count_query,
29            receive_node_cpu_utilization_query,
30            receive_node_uname_info_query,
31            receive_node_service_memory_rss_query,
32            receive_node_service_memory_uss_query,
33            receive_node_service_memory_pss_query,
34            receive_node_service_process_count_query,
35            receive_node_service_app_server_workers_query
36          )
37
38          expect(subject[:topology]).to eq({
39            duration_s: 0,
40            application_requests_per_hour: 36,
41            query_apdex_weekly_average: 0.996,
42            failures: [],
43            nodes: [
44              {
45                node_memory_total_bytes: 512,
46                node_memory_utilization: 0.45,
47                node_cpus: 8,
48                node_cpu_utilization: 0.1,
49                node_uname_info: {
50                  machine: 'x86_64',
51                  sysname: 'Linux',
52                  release: '4.19.76-linuxkit'
53                },
54                node_services: [
55                  {
56                    name: 'web',
57                    process_count: 10,
58                    process_memory_rss: 300,
59                    process_memory_uss: 301,
60                    process_memory_pss: 302,
61                    server: 'puma'
62                  },
63                  {
64                    name: 'sidekiq',
65                    process_count: 5,
66                    process_memory_rss: 303
67                  }
68                ]
69              },
70              {
71                node_memory_total_bytes: 1024,
72                node_memory_utilization: 0.25,
73                node_cpus: 16,
74                node_cpu_utilization: 0.2,
75                node_uname_info: {
76                  machine: 'x86_64',
77                  sysname: 'Linux',
78                  release: '4.15.0-101-generic'
79                },
80                node_services: [
81                  {
82                    name: 'sidekiq',
83                    process_count: 15,
84                    process_memory_rss: 400,
85                    process_memory_pss: 401
86                  },
87                  {
88                    name: 'redis',
89                    process_count: 1,
90                    process_memory_rss: 402
91                  },
92                  {
93                    name: 'registry',
94                    process_count: 1
95                  },
96                  {
97                    name: 'web',
98                    server: 'puma'
99                  }
100                ]
101              }
102            ]
103          })
104        end
105      end
106
107      context 'and some node memory metrics are missing' do
108        it 'removes the respective entries and includes the failures' do
109          expect_prometheus_client_to(
110            receive_app_request_volume_query(result: []),
111            receive_query_apdex_ratio_query(result: []),
112            receive_node_memory_query(result: []),
113            receive_node_memory_utilization_query(result: []),
114            receive_node_cpu_count_query,
115            receive_node_cpu_utilization_query,
116            receive_node_uname_info_query,
117            receive_node_service_memory_rss_query(result: []),
118            receive_node_service_memory_uss_query(result: []),
119            receive_node_service_memory_pss_query,
120            receive_node_service_process_count_query,
121            receive_node_service_app_server_workers_query(result: [])
122          )
123
124          expect(subject[:topology]).to eq({
125            duration_s: 0,
126            failures: [
127              { 'app_requests' => 'empty_result' },
128              { 'query_apdex' => 'empty_result' },
129              { 'node_memory' => 'empty_result' },
130              { 'node_memory_utilization' => 'empty_result' },
131              { 'service_rss' => 'empty_result' },
132              { 'service_uss' => 'empty_result' },
133              { 'service_workers' => 'empty_result' }
134            ],
135            nodes: [
136              {
137                node_cpus: 16,
138                node_cpu_utilization: 0.2,
139                node_uname_info: {
140                  machine: 'x86_64',
141                  release: '4.15.0-101-generic',
142                  sysname: 'Linux'
143                },
144                node_services: [
145                  {
146                    name: 'sidekiq',
147                    process_count: 15,
148                    process_memory_pss: 401
149                  },
150                  {
151                    name: 'redis',
152                    process_count: 1
153                  },
154                  {
155                    name: 'registry',
156                    process_count: 1
157                  }
158                ]
159              },
160              {
161                node_cpus: 8,
162                node_cpu_utilization: 0.1,
163                node_uname_info: {
164                  machine: 'x86_64',
165                  release: '4.19.76-linuxkit',
166                  sysname: 'Linux'
167                },
168                node_services: [
169                  {
170                    name: 'web',
171                    process_count: 10,
172                    process_memory_pss: 302
173                  },
174                  {
175                    name: 'sidekiq',
176                    process_count: 5
177                  }
178                ]
179              }
180            ]
181          })
182        end
183      end
184
185      context 'and services run on the same node but report different instance values' do
186        let(:node_memory_response) do
187          [
188            {
189              'metric' => { 'instance' => 'localhost:9100' },
190              'value' =>  [1000, '512']
191            }
192          ]
193        end
194
195        let(:node_memory_utilization_response) do
196          [
197            {
198              'metric' => { 'instance' => 'localhost:9100' },
199              'value' =>  [1000, '0.35']
200            }
201          ]
202        end
203
204        let(:node_uname_info_response) do
205          [
206            {
207              "metric" => {
208                "__name__" => "node_uname_info",
209                "domainname" => "(none)",
210                "instance" => "127.0.0.1:9100",
211                "job" => "node_exporter",
212                "machine" => "x86_64",
213                "nodename" => "127.0.0.1",
214                "release" => "4.19.76-linuxkit",
215                "sysname" => "Linux"
216              },
217              "value" => [1592463033.359, "1"]
218            }
219          ]
220        end
221        # The services in this response should all be mapped to localhost i.e. the same node
222
223        let(:service_memory_response) do
224          [
225            {
226              'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
227              'value' =>  [1000, '10']
228            },
229            {
230              'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
231              'value' =>  [1000, '11']
232            },
233            {
234              'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
235              'value' =>  [1000, '12']
236            },
237            {
238              'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
239              'value' =>  [1000, '13']
240            },
241            {
242              'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
243              'value' =>  [1000, '14']
244            }
245          ]
246        end
247
248        it 'normalizes equivalent instance values and maps them to the same node' do
249          expect_prometheus_client_to(
250            receive_app_request_volume_query(result: []),
251            receive_query_apdex_ratio_query(result: []),
252            receive_node_memory_query(result: node_memory_response),
253            receive_node_memory_utilization_query(result: node_memory_utilization_response),
254            receive_node_cpu_count_query(result: []),
255            receive_node_cpu_utilization_query(result: []),
256            receive_node_uname_info_query(result: node_uname_info_response),
257            receive_node_service_memory_rss_query(result: service_memory_response),
258            receive_node_service_memory_uss_query(result: []),
259            receive_node_service_memory_pss_query(result: []),
260            receive_node_service_process_count_query(result: []),
261            receive_node_service_app_server_workers_query(result: [])
262          )
263
264          expect(subject[:topology]).to eq({
265            duration_s: 0,
266            failures: [
267              { 'app_requests' => 'empty_result' },
268              { 'query_apdex' => 'empty_result' },
269              { 'node_cpus' => 'empty_result' },
270              { 'node_cpu_utilization' => 'empty_result' },
271              { 'service_uss' => 'empty_result' },
272              { 'service_pss' => 'empty_result' },
273              { 'service_process_count' => 'empty_result' },
274              { 'service_workers' => 'empty_result' }
275            ],
276            nodes: [
277              {
278                node_memory_total_bytes: 512,
279                node_memory_utilization: 0.35,
280                node_uname_info: {
281                  machine: 'x86_64',
282                  sysname: 'Linux',
283                  release: '4.19.76-linuxkit'
284                },
285                node_services: [
286                  {
287                    name: 'web',
288                    process_memory_rss: 10
289                  },
290                  {
291                    name: 'sidekiq',
292                    process_memory_rss: 11
293                  },
294                  {
295                    name: 'prometheus',
296                    process_memory_rss: 12
297                  },
298                  {
299                    name: 'redis',
300                    process_memory_rss: 13
301                  },
302                  {
303                    name: 'postgres',
304                    process_memory_rss: 14
305                  }
306                ]
307              }
308            ]
309          })
310        end
311      end
312
313      context 'and node metrics are missing but service metrics exist' do
314        it 'still reports service metrics' do
315          expect_prometheus_client_to(
316            receive_app_request_volume_query(result: []),
317            receive_query_apdex_ratio_query(result: []),
318            receive_node_memory_query(result: []),
319            receive_node_memory_utilization_query(result: []),
320            receive_node_cpu_count_query(result: []),
321            receive_node_cpu_utilization_query(result: []),
322            receive_node_uname_info_query(result: []),
323            receive_node_service_memory_rss_query,
324            receive_node_service_memory_uss_query(result: []),
325            receive_node_service_memory_pss_query(result: []),
326            receive_node_service_process_count_query(result: []),
327            receive_node_service_app_server_workers_query(result: [])
328          )
329
330          expect(subject[:topology]).to eq({
331            duration_s: 0,
332            failures: [
333              { 'app_requests' => 'empty_result' },
334              { 'query_apdex' => 'empty_result' },
335              { 'node_memory' => 'empty_result' },
336              { 'node_memory_utilization' => 'empty_result' },
337              { 'node_cpus' => 'empty_result' },
338              { 'node_cpu_utilization' => 'empty_result' },
339              { 'node_uname_info' => 'empty_result' },
340              { 'service_uss' => 'empty_result' },
341              { 'service_pss' => 'empty_result' },
342              { 'service_process_count' => 'empty_result' },
343              { 'service_workers' => 'empty_result' }
344            ],
345            nodes: [
346              {
347                node_services: [
348                  {
349                    name: 'web',
350                    process_memory_rss: 300
351                  },
352                  {
353                    name: 'sidekiq',
354                    process_memory_rss: 303
355                  }
356                ]
357              },
358              {
359                node_services: [
360                  {
361                    name: 'sidekiq',
362                    process_memory_rss: 400
363                  },
364                  {
365                    name: 'redis',
366                    process_memory_rss: 402
367                  }
368                ]
369              }
370            ]
371          })
372        end
373      end
374
375      context 'and unknown services are encountered' do
376        let(:unknown_service_process_count_response) do
377          [
378            {
379              'metric' => { 'instance' => 'instance2:9000', 'job' => 'unknown-service-A' },
380              'value' => [1000, '42']
381            },
382            {
383              'metric' => { 'instance' => 'instance2:9001', 'job' => 'unknown-service-B' },
384              'value' => [1000, '42']
385            }
386          ]
387        end
388
389        it 'filters out unknown service data and reports the unknown services as a failure' do
390          expect_prometheus_client_to(
391            receive_app_request_volume_query(result: []),
392            receive_query_apdex_ratio_query(result: []),
393            receive_node_memory_query(result: []),
394            receive_node_memory_utilization_query(result: []),
395            receive_node_cpu_count_query(result: []),
396            receive_node_cpu_utilization_query(result: []),
397            receive_node_uname_info_query(result: []),
398            receive_node_service_memory_rss_query(result: []),
399            receive_node_service_memory_uss_query(result: []),
400            receive_node_service_memory_pss_query(result: []),
401            receive_node_service_process_count_query(result: unknown_service_process_count_response),
402            receive_node_service_app_server_workers_query(result: [])
403          )
404
405          expect(subject.dig(:topology, :failures)).to include(
406            { 'service_unknown' => 'unknown-service-A' },
407            { 'service_unknown' => 'unknown-service-B' }
408          )
409        end
410      end
411
412      context 'and an error is raised when querying Prometheus' do
413        context 'without timeout failures' do
414          it 'returns empty result and executes subsequent queries as usual' do
415            expect_prometheus_client_to(
416              receive(:query).at_least(:once).and_raise(Gitlab::PrometheusClient::UnexpectedResponseError)
417            )
418
419            expect(subject[:topology]).to eq({
420              duration_s: 0,
421              failures: [
422                { 'app_requests' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
423                { 'query_apdex' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
424                { 'node_memory' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
425                { 'node_memory_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
426                { 'node_cpus' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
427                { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
428                { 'node_uname_info' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
429                { 'service_rss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
430                { 'service_uss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
431                { 'service_pss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
432                { 'service_process_count' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
433                { 'service_workers' => 'Gitlab::PrometheusClient::UnexpectedResponseError' }
434              ],
435              nodes: []
436            })
437          end
438        end
439
440        context 'with timeout failures' do
441          where(:exception) do
442            described_class::TIMEOUT_ERRORS
443          end
444
445          with_them do
446            it 'returns empty result and cancelled subsequent queries' do
447              expect_prometheus_client_to(
448                receive(:query).and_raise(exception)
449              )
450
451              expect(subject[:topology]).to eq({
452                duration_s: 0,
453                failures: [
454                  { 'app_requests' => exception.to_s },
455                  { 'query_apdex' => 'timeout_cancellation' },
456                  { 'node_memory' => 'timeout_cancellation' },
457                  { 'node_memory_utilization' => 'timeout_cancellation' },
458                  { 'node_cpus' => 'timeout_cancellation' },
459                  { 'node_cpu_utilization' => 'timeout_cancellation' },
460                  { 'node_uname_info' => 'timeout_cancellation' },
461                  { 'service_rss' => 'timeout_cancellation' },
462                  { 'service_uss' => 'timeout_cancellation' },
463                  { 'service_pss' => 'timeout_cancellation' },
464                  { 'service_process_count' => 'timeout_cancellation' },
465                  { 'service_workers' => 'timeout_cancellation' }
466                ],
467                nodes: []
468              })
469            end
470          end
471        end
472      end
473    end
474
475    shared_examples 'returns empty result with no failures' do
476      it do
477        expect(subject[:topology]).to eq({
478          duration_s: 0,
479          failures: []
480        })
481      end
482    end
483
484    context 'can reach a ready Prometheus client' do
485      before do
486        expect(topology).to receive(:with_prometheus_client).and_yield(prometheus_client)
487      end
488
489      it_behaves_like 'query topology data from Prometheus'
490    end
491
492    context 'can not reach a ready Prometheus client' do
493      before do
494        expect(topology).to receive(:with_prometheus_client).and_return(fallback)
495      end
496
497      it_behaves_like 'returns empty result with no failures'
498    end
499
500    context 'when top-level function raises error' do
501      it 'returns empty result with generic failure' do
502        expect(topology).to receive(:with_prometheus_client).and_raise(RuntimeError)
503
504        expect(subject[:topology]).to eq({
505          duration_s: 0,
506          failures: [
507            { 'other' => 'RuntimeError' }
508          ]
509        })
510      end
511    end
512  end
513
514  def receive_ready_check_query(result: nil, raise_error: nil)
515    if raise_error.nil?
516      receive(:ready?).and_return(result.nil? ? true : result)
517    else
518      receive(:ready?).and_raise(raise_error)
519    end
520  end
521
522  def receive_app_request_volume_query(result: nil)
523    receive(:query)
524      .with(/gitlab_usage_ping:ops:rate/)
525      .and_return(result || [
526        {
527          'metric' => { 'component' => 'http_requests', 'service' => 'workhorse' },
528          'value' => [1000, '0.01']
529        }
530      ])
531  end
532
533  def receive_query_apdex_ratio_query(result: nil)
534    receive(:query)
535      .with(/gitlab_usage_ping:sql_duration_apdex:ratio_rate5m/)
536      .and_return(result || [
537        {
538          'metric' => {},
539          'value' => [1000, '0.996']
540        }
541      ])
542  end
543
544  def receive_node_memory_query(result: nil)
545    receive(:query)
546      .with(/node_memory_total_bytes/, an_instance_of(Hash))
547      .and_return(result || [
548        {
549          'metric' => { 'instance' => 'instance1:8080' },
550          'value' => [1000, '512']
551        },
552        {
553          'metric' => { 'instance' => 'instance2:8090' },
554          'value' => [1000, '1024']
555        }
556      ])
557  end
558
559  def receive_node_memory_utilization_query(result: nil)
560    receive(:query)
561      .with(/node_memory_utilization/, an_instance_of(Hash))
562      .and_return(result || [
563        {
564          'metric' => { 'instance' => 'instance1:8080' },
565          'value' => [1000, '0.45']
566        },
567        {
568          'metric' => { 'instance' => 'instance2:8090' },
569          'value' => [1000, '0.25']
570        }
571      ])
572  end
573
574  def receive_node_cpu_count_query(result: nil)
575    receive(:query)
576      .with(/node_cpus/, an_instance_of(Hash))
577      .and_return(result || [
578        {
579          'metric' => { 'instance' => 'instance2:8090' },
580          'value' => [1000, '16']
581        },
582        {
583          'metric' => { 'instance' => 'instance1:8080' },
584          'value' => [1000, '8']
585        }
586      ])
587  end
588
589  def receive_node_cpu_utilization_query(result: nil)
590    receive(:query)
591      .with(/node_cpu_utilization/, an_instance_of(Hash))
592      .and_return(result || [
593        {
594          'metric' => { 'instance' => 'instance2:8090' },
595          'value' => [1000, '0.2']
596        },
597        {
598          'metric' => { 'instance' => 'instance1:8080' },
599          'value' => [1000, '0.1']
600        }
601      ])
602  end
603
604  def receive_node_uname_info_query(result: nil)
605    receive(:query)
606      .with('node_uname_info')
607      .and_return(result || [
608        {
609          "metric" => {
610            "__name__" => "node_uname_info",
611            "domainname" => "(none)",
612            "instance" => "instance1:9100",
613            "job" => "node_exporter",
614            "machine" => "x86_64",
615            "nodename" => "instance1",
616            "release" => "4.19.76-linuxkit",
617            "sysname" => "Linux"
618          },
619          "value" => [1592463033.359, "1"]
620        },
621        {
622          "metric" => {
623            "__name__" => "node_uname_info",
624            "domainname" => "(none)",
625            "instance" => "instance2:9100",
626            "job" => "node_exporter",
627            "machine" => "x86_64",
628            "nodename" => "instance2",
629            "release" => "4.15.0-101-generic",
630            "sysname" => "Linux"
631          },
632          "value" => [1592463033.359, "1"]
633        }
634      ])
635  end
636
637  def receive_node_service_memory_rss_query(result: nil)
638    receive(:query)
639      .with(/process_resident_memory_bytes/, an_instance_of(Hash))
640      .and_return(result || [
641        {
642          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
643          'value' =>  [1000, '300']
644        },
645        {
646          'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
647          'value' => [1000, '303']
648        },
649        # instance 2: runs a dedicated Sidekiq + Redis (which uses a different metric name)
650        {
651          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
652          'value' => [1000, '400']
653        },
654        {
655          'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
656          'value' => [1000, '402']
657        }
658      ])
659  end
660
661  def receive_node_service_memory_uss_query(result: nil)
662    receive(:query)
663      .with(/process_unique_memory_bytes/, an_instance_of(Hash))
664      .and_return(result || [
665        {
666          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
667          'value' => [1000, '301']
668        }
669      ])
670  end
671
672  def receive_node_service_memory_pss_query(result: nil)
673    receive(:query)
674      .with(/process_proportional_memory_bytes/, an_instance_of(Hash))
675      .and_return(result || [
676        {
677          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
678          'value' => [1000, '302']
679        },
680        {
681          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
682          'value' => [1000, '401']
683        }
684      ])
685  end
686
687  def receive_node_service_process_count_query(result: nil)
688    receive(:query)
689      .with(/service_process:count/, an_instance_of(Hash))
690      .and_return(result || [
691        # instance 1
692        {
693          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
694          'value' => [1000, '10']
695        },
696        {
697          'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
698          'value' => [1000, '5']
699        },
700        # instance 2
701        {
702          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
703          'value' => [1000, '15']
704        },
705        {
706          'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
707          'value' => [1000, '1']
708        },
709        {
710          'metric' => { 'instance' => 'instance2:8080', 'job' => 'registry' },
711          'value' => [1000, '1']
712        }
713      ])
714  end
715
716  def receive_node_service_app_server_workers_query(result: nil)
717    receive(:query)
718      .with(/app_server_workers/, an_instance_of(Hash))
719      .and_return(result || [
720        # instance 1
721        {
722          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', 'server' => 'puma' },
723          'value' => [1000, '2']
724        },
725        # instance 2
726        {
727          'metric' => { 'instance' => 'instance2:8080', 'job' => 'gitlab-rails', 'server' => 'puma' },
728          'value' => [1000, '1']
729        }
730      ])
731  end
732end
733