# frozen_string_literal: true

require 'spec_helper'

# Specs for the topology section of the usage data payload. The Prometheus
# client is never contacted: every `query` call is stubbed through the
# `receive_*_query` helpers defined at the bottom of this file.
RSpec.describe Gitlab::UsageData::Topology do
  include UsageDataHelpers

  describe '#topology_usage_data' do
    subject { topology.topology_usage_data }

    let(:topology) { described_class.new }
    let(:prometheus_client) { Gitlab::PrometheusClient.new('http://localhost:9090') }
    let(:fallback) { {} }

    before do
      # Freeze the clock used for benchmarking so that the reported duration
      # is deterministic (every example below expects `duration_s: 0`).
      allow(Process).to receive(:clock_gettime).and_return(0)
    end

    shared_examples 'query topology data from Prometheus' do
      context 'tracking node metrics' do
        it 'contains node level metrics for each instance' do
          expect_prometheus_client_to(
            receive_app_request_volume_query,
            receive_query_apdex_ratio_query,
            receive_node_memory_query,
            receive_node_memory_utilization_query,
            receive_node_cpu_count_query,
            receive_node_cpu_utilization_query,
            receive_node_uname_info_query,
            receive_node_service_memory_rss_query,
            receive_node_service_memory_uss_query,
            receive_node_service_memory_pss_query,
            receive_node_service_process_count_query,
            receive_node_service_app_server_workers_query
          )

          expect(subject[:topology]).to eq({
            duration_s: 0,
            application_requests_per_hour: 36,
            query_apdex_weekly_average: 0.996,
            failures: [],
            nodes: [
              {
                node_memory_total_bytes: 512,
                node_memory_utilization: 0.45,
                node_cpus: 8,
                node_cpu_utilization: 0.1,
                node_uname_info: {
                  machine: 'x86_64',
                  sysname: 'Linux',
                  release: '4.19.76-linuxkit'
                },
                node_services: [
                  {
                    name: 'web',
                    process_count: 10,
                    process_memory_rss: 300,
                    process_memory_uss: 301,
                    process_memory_pss: 302,
                    server: 'puma'
                  },
                  {
                    name: 'sidekiq',
                    process_count: 5,
                    process_memory_rss: 303
                  }
                ]
              },
              {
                node_memory_total_bytes: 1024,
                node_memory_utilization: 0.25,
                node_cpus: 16,
                node_cpu_utilization: 0.2,
                node_uname_info: {
                  machine: 'x86_64',
                  sysname: 'Linux',
                  release: '4.15.0-101-generic'
                },
                node_services: [
                  {
                    name: 'sidekiq',
                    process_count: 15,
                    process_memory_rss: 400,
                    process_memory_pss: 401
                  },
                  {
                    name: 'redis',
                    process_count: 1,
                    process_memory_rss: 402
                  },
                  {
                    name: 'registry',
                    process_count: 1
                  },
                  {
                    name: 'web',
                    server: 'puma'
                  }
                ]
              }
            ]
          })
        end
      end

      context 'and some node memory metrics are missing' do
        it 'removes the respective entries and includes the failures' do
          expect_prometheus_client_to(
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
            receive_node_memory_utilization_query(result: []),
            receive_node_cpu_count_query,
            receive_node_cpu_utilization_query,
            receive_node_uname_info_query,
            receive_node_service_memory_rss_query(result: []),
            receive_node_service_memory_uss_query(result: []),
            receive_node_service_memory_pss_query,
            receive_node_service_process_count_query,
            receive_node_service_app_server_workers_query(result: [])
          )

          # With the memory queries empty, the first query that returns data
          # is the CPU count one, whose default response lists instance2
          # before instance1 -- hence the node order expected below.
          expect(subject[:topology]).to eq({
            duration_s: 0,
            failures: [
              { 'app_requests' => 'empty_result' },
              { 'query_apdex' => 'empty_result' },
              { 'node_memory' => 'empty_result' },
              { 'node_memory_utilization' => 'empty_result' },
              { 'service_rss' => 'empty_result' },
              { 'service_uss' => 'empty_result' },
              { 'service_workers' => 'empty_result' }
            ],
            nodes: [
              {
                node_cpus: 16,
                node_cpu_utilization: 0.2,
                node_uname_info: {
                  machine: 'x86_64',
                  release: '4.15.0-101-generic',
                  sysname: 'Linux'
                },
                node_services: [
                  {
                    name: 'sidekiq',
                    process_count: 15,
                    process_memory_pss: 401
                  },
                  {
                    name: 'redis',
                    process_count: 1
                  },
                  {
                    name: 'registry',
                    process_count: 1
                  }
                ]
              },
              {
                node_cpus: 8,
                node_cpu_utilization: 0.1,
                node_uname_info: {
                  machine: 'x86_64',
                  release: '4.19.76-linuxkit',
                  sysname: 'Linux'
                },
                node_services: [
                  {
                    name: 'web',
                    process_count: 10,
                    process_memory_pss: 302
                  },
                  {
                    name: 'sidekiq',
                    process_count: 5
                  }
                ]
              }
            ]
          })
        end
      end

      context 'and services run on the same node but report different instance values' do
        let(:node_memory_response) do
          [
            {
              'metric' => { 'instance' => 'localhost:9100' },
              'value' => [1000, '512']
            }
          ]
        end

        let(:node_memory_utilization_response) do
          [
            {
              'metric' => { 'instance' => 'localhost:9100' },
              'value' => [1000, '0.35']
            }
          ]
        end

        let(:node_uname_info_response) do
          [
            {
              'metric' => {
                '__name__' => 'node_uname_info',
                'domainname' => '(none)',
                'instance' => '127.0.0.1:9100',
                'job' => 'node_exporter',
                'machine' => 'x86_64',
                'nodename' => '127.0.0.1',
                'release' => '4.19.76-linuxkit',
                'sysname' => 'Linux'
              },
              'value' => [1592463033.359, '1']
            }
          ]
        end

        # The services in this response should all be mapped to localhost i.e. the same node
        let(:service_memory_response) do
          [
            {
              'metric' => { 'instance' => 'localhost:8080', 'job' => 'gitlab-rails' },
              'value' => [1000, '10']
            },
            {
              'metric' => { 'instance' => '127.0.0.1:8090', 'job' => 'gitlab-sidekiq' },
              'value' => [1000, '11']
            },
            {
              'metric' => { 'instance' => '0.0.0.0:9090', 'job' => 'prometheus' },
              'value' => [1000, '12']
            },
            {
              'metric' => { 'instance' => '[::1]:1234', 'job' => 'redis' },
              'value' => [1000, '13']
            },
            {
              'metric' => { 'instance' => '[::]:1234', 'job' => 'postgres' },
              'value' => [1000, '14']
            }
          ]
        end

        it 'normalizes equivalent instance values and maps them to the same node' do
          expect_prometheus_client_to(
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: node_memory_response),
            receive_node_memory_utilization_query(result: node_memory_utilization_response),
            receive_node_cpu_count_query(result: []),
            receive_node_cpu_utilization_query(result: []),
            receive_node_uname_info_query(result: node_uname_info_response),
            receive_node_service_memory_rss_query(result: service_memory_response),
            receive_node_service_memory_uss_query(result: []),
            receive_node_service_memory_pss_query(result: []),
            receive_node_service_process_count_query(result: []),
            receive_node_service_app_server_workers_query(result: [])
          )

          expect(subject[:topology]).to eq({
            duration_s: 0,
            failures: [
              { 'app_requests' => 'empty_result' },
              { 'query_apdex' => 'empty_result' },
              { 'node_cpus' => 'empty_result' },
              { 'node_cpu_utilization' => 'empty_result' },
              { 'service_uss' => 'empty_result' },
              { 'service_pss' => 'empty_result' },
              { 'service_process_count' => 'empty_result' },
              { 'service_workers' => 'empty_result' }
            ],
            nodes: [
              {
                node_memory_total_bytes: 512,
                node_memory_utilization: 0.35,
                node_uname_info: {
                  machine: 'x86_64',
                  sysname: 'Linux',
                  release: '4.19.76-linuxkit'
                },
                node_services: [
                  {
                    name: 'web',
                    process_memory_rss: 10
                  },
                  {
                    name: 'sidekiq',
                    process_memory_rss: 11
                  },
                  {
                    name: 'prometheus',
                    process_memory_rss: 12
                  },
                  {
                    name: 'redis',
                    process_memory_rss: 13
                  },
                  {
                    name: 'postgres',
                    process_memory_rss: 14
                  }
                ]
              }
            ]
          })
        end
      end

      context 'and node metrics are missing but service metrics exist' do
        it 'still reports service metrics' do
          expect_prometheus_client_to(
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
            receive_node_memory_utilization_query(result: []),
            receive_node_cpu_count_query(result: []),
            receive_node_cpu_utilization_query(result: []),
            receive_node_uname_info_query(result: []),
            receive_node_service_memory_rss_query,
            receive_node_service_memory_uss_query(result: []),
            receive_node_service_memory_pss_query(result: []),
            receive_node_service_process_count_query(result: []),
            receive_node_service_app_server_workers_query(result: [])
          )

          expect(subject[:topology]).to eq({
            duration_s: 0,
            failures: [
              { 'app_requests' => 'empty_result' },
              { 'query_apdex' => 'empty_result' },
              { 'node_memory' => 'empty_result' },
              { 'node_memory_utilization' => 'empty_result' },
              { 'node_cpus' => 'empty_result' },
              { 'node_cpu_utilization' => 'empty_result' },
              { 'node_uname_info' => 'empty_result' },
              { 'service_uss' => 'empty_result' },
              { 'service_pss' => 'empty_result' },
              { 'service_process_count' => 'empty_result' },
              { 'service_workers' => 'empty_result' }
            ],
            nodes: [
              {
                node_services: [
                  {
                    name: 'web',
                    process_memory_rss: 300
                  },
                  {
                    name: 'sidekiq',
                    process_memory_rss: 303
                  }
                ]
              },
              {
                node_services: [
                  {
                    name: 'sidekiq',
                    process_memory_rss: 400
                  },
                  {
                    name: 'redis',
                    process_memory_rss: 402
                  }
                ]
              }
            ]
          })
        end
      end

      context 'and unknown services are encountered' do
        let(:unknown_service_process_count_response) do
          [
            {
              'metric' => { 'instance' => 'instance2:9000', 'job' => 'unknown-service-A' },
              'value' => [1000, '42']
            },
            {
              'metric' => { 'instance' => 'instance2:9001', 'job' => 'unknown-service-B' },
              'value' => [1000, '42']
            }
          ]
        end

        it 'filters out unknown service data and reports the unknown services as a failure' do
          expect_prometheus_client_to(
            receive_app_request_volume_query(result: []),
            receive_query_apdex_ratio_query(result: []),
            receive_node_memory_query(result: []),
            receive_node_memory_utilization_query(result: []),
            receive_node_cpu_count_query(result: []),
            receive_node_cpu_utilization_query(result: []),
            receive_node_uname_info_query(result: []),
            receive_node_service_memory_rss_query(result: []),
            receive_node_service_memory_uss_query(result: []),
            receive_node_service_memory_pss_query(result: []),
            receive_node_service_process_count_query(result: unknown_service_process_count_response),
            receive_node_service_app_server_workers_query(result: [])
          )

          expect(subject.dig(:topology, :failures)).to include(
            { 'service_unknown' => 'unknown-service-A' },
            { 'service_unknown' => 'unknown-service-B' }
          )
        end
      end

      context 'and an error is raised when querying Prometheus' do
        context 'without timeout failures' do
          it 'returns empty result and executes subsequent queries as usual' do
            # `at_least(:once)`: every query is still attempted, and each one
            # raises, so each query name shows up in the failures below.
            expect_prometheus_client_to(
              receive(:query).at_least(:once).and_raise(Gitlab::PrometheusClient::UnexpectedResponseError)
            )

            expect(subject[:topology]).to eq({
              duration_s: 0,
              failures: [
                { 'app_requests' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'query_apdex' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'node_memory' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'node_memory_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'node_cpus' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'node_uname_info' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'service_rss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'service_uss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'service_pss' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'service_process_count' => 'Gitlab::PrometheusClient::UnexpectedResponseError' },
                { 'service_workers' => 'Gitlab::PrometheusClient::UnexpectedResponseError' }
              ],
              nodes: []
            })
          end
        end

        context 'with timeout failures' do
          where(:exception) do
            described_class::TIMEOUT_ERRORS
          end

          with_them do
            it 'returns empty result and cancelled subsequent queries' do
              # No count constraint here, so the message expectation defaults
              # to exactly one call: only the first query may reach the client.
              expect_prometheus_client_to(
                receive(:query).and_raise(exception)
              )

              expect(subject[:topology]).to eq({
                duration_s: 0,
                failures: [
                  { 'app_requests' => exception.to_s },
                  { 'query_apdex' => 'timeout_cancellation' },
                  { 'node_memory' => 'timeout_cancellation' },
                  { 'node_memory_utilization' => 'timeout_cancellation' },
                  { 'node_cpus' => 'timeout_cancellation' },
                  { 'node_cpu_utilization' => 'timeout_cancellation' },
                  { 'node_uname_info' => 'timeout_cancellation' },
                  { 'service_rss' => 'timeout_cancellation' },
                  { 'service_uss' => 'timeout_cancellation' },
                  { 'service_pss' => 'timeout_cancellation' },
                  { 'service_process_count' => 'timeout_cancellation' },
                  { 'service_workers' => 'timeout_cancellation' }
                ],
                nodes: []
              })
            end
          end
        end
      end
    end

    shared_examples 'returns empty result with no failures' do
      it do
        expect(subject[:topology]).to eq({
          duration_s: 0,
          failures: []
        })
      end
    end

    context 'can reach a ready Prometheus client' do
      before do
        # Hand the locally built client to the code under test.
        expect(topology).to receive(:with_prometheus_client).and_yield(prometheus_client)
      end

      it_behaves_like 'query topology data from Prometheus'
    end

    context 'can not reach a ready Prometheus client' do
      before do
        # The block is never yielded to; the stub returns the fallback value.
        expect(topology).to receive(:with_prometheus_client).and_return(fallback)
      end

      it_behaves_like 'returns empty result with no failures'
    end

    context 'when top-level function raises error' do
      it 'returns empty result with generic failure' do
        expect(topology).to receive(:with_prometheus_client).and_raise(RuntimeError)

        expect(subject[:topology]).to eq({
          duration_s: 0,
          failures: [
            { 'other' => 'RuntimeError' }
          ]
        })
      end
    end
  end

  # --- Prometheus stubbing helpers -----------------------------------------
  #
  # Each `receive_*_query` helper below returns an rspec-mocks `receive`
  # matcher for one specific query, identified by the argument constraint
  # passed to `with`. The examples hand these matchers to
  # `expect_prometheus_client_to`, which is not defined in this file
  # (presumably provided by UsageDataHelpers -- confirm).
  #
  # All of them accept `result:`. When it is nil the canned default response
  # is returned; any other value is returned as-is. Note that `[]` is truthy
  # in Ruby, so `result: []` yields an empty response rather than the default.

  # Builds a matcher for the client's `ready?` check. Returns `result`
  # (true when omitted) or raises `raise_error` when one is given.
  #
  # NOTE(review): no example in this file calls this helper.
  def receive_ready_check_query(result: nil, raise_error: nil)
    if raise_error.nil?
      # Explicit nil check instead of `result || true`, so that an explicit
      # `result: false` is preserved.
      receive(:ready?).and_return(result.nil? ? true : result)
    else
      receive(:ready?).and_raise(raise_error)
    end
  end

  # Application request volume query. The examples above expect the default
  # sample ('0.01') to be reported as `application_requests_per_hour: 36`.
  def receive_app_request_volume_query(result: nil)
    receive(:query)
      .with(/gitlab_usage_ping:ops:rate/)
      .and_return(result || [
        {
          'metric' => { 'component' => 'http_requests', 'service' => 'workhorse' },
          'value' => [1000, '0.01']
        }
      ])
  end

  # SQL duration apdex query; the default is a single unlabelled sample.
  def receive_query_apdex_ratio_query(result: nil)
    receive(:query)
      .with(/gitlab_usage_ping:sql_duration_apdex:ratio_rate5m/)
      .and_return(result || [
        {
          'metric' => {},
          'value' => [1000, '0.996']
        }
      ])
  end

  # Total node memory query; the default has one sample per instance
  # (instance1 first).
  def receive_node_memory_query(result: nil)
    receive(:query)
      .with(/node_memory_total_bytes/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance1:8080' },
          'value' => [1000, '512']
        },
        {
          'metric' => { 'instance' => 'instance2:8090' },
          'value' => [1000, '1024']
        }
      ])
  end

  # Node memory utilization query; the default has one sample per instance
  # (instance1 first).
  def receive_node_memory_utilization_query(result: nil)
    receive(:query)
      .with(/node_memory_utilization/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance1:8080' },
          'value' => [1000, '0.45']
        },
        {
          'metric' => { 'instance' => 'instance2:8090' },
          'value' => [1000, '0.25']
        }
      ])
  end

  # Node CPU count query. The default lists instance2 before instance1,
  # the reverse of the memory defaults.
  def receive_node_cpu_count_query(result: nil)
    receive(:query)
      .with(/node_cpus/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance2:8090' },
          'value' => [1000, '16']
        },
        {
          'metric' => { 'instance' => 'instance1:8080' },
          'value' => [1000, '8']
        }
      ])
  end

  # Node CPU utilization query; the default lists instance2 before instance1.
  def receive_node_cpu_utilization_query(result: nil)
    receive(:query)
      .with(/node_cpu_utilization/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance2:8090' },
          'value' => [1000, '0.2']
        },
        {
          'metric' => { 'instance' => 'instance1:8080' },
          'value' => [1000, '0.1']
        }
      ])
  end

  # Node uname info query. Unlike the other helpers, this one matches the
  # exact query string and expects no second argument.
  def receive_node_uname_info_query(result: nil)
    receive(:query)
      .with('node_uname_info')
      .and_return(result || [
        {
          'metric' => {
            '__name__' => 'node_uname_info',
            'domainname' => '(none)',
            'instance' => 'instance1:9100',
            'job' => 'node_exporter',
            'machine' => 'x86_64',
            'nodename' => 'instance1',
            'release' => '4.19.76-linuxkit',
            'sysname' => 'Linux'
          },
          'value' => [1592463033.359, '1']
        },
        {
          'metric' => {
            '__name__' => 'node_uname_info',
            'domainname' => '(none)',
            'instance' => 'instance2:9100',
            'job' => 'node_exporter',
            'machine' => 'x86_64',
            'nodename' => 'instance2',
            'release' => '4.15.0-101-generic',
            'sysname' => 'Linux'
          },
          'value' => [1592463033.359, '1']
        }
      ])
  end

  # Per-service resident memory (RSS) query.
  def receive_node_service_memory_rss_query(result: nil)
    receive(:query)
      .with(/process_resident_memory_bytes/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
          'value' => [1000, '300']
        },
        {
          'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
          'value' => [1000, '303']
        },
        # instance 2: runs a dedicated Sidekiq + Redis (which uses a different metric name)
        {
          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
          'value' => [1000, '400']
        },
        {
          'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
          'value' => [1000, '402']
        }
      ])
  end

  # Per-service unique memory (USS) query; by default only gitlab-rails on
  # instance1 reports it.
  def receive_node_service_memory_uss_query(result: nil)
    receive(:query)
      .with(/process_unique_memory_bytes/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
          'value' => [1000, '301']
        }
      ])
  end

  # Per-service proportional memory (PSS) query.
  def receive_node_service_memory_pss_query(result: nil)
    receive(:query)
      .with(/process_proportional_memory_bytes/, an_instance_of(Hash))
      .and_return(result || [
        {
          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
          'value' => [1000, '302']
        },
        {
          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
          'value' => [1000, '401']
        }
      ])
  end

  # Per-service process count query.
  def receive_node_service_process_count_query(result: nil)
    receive(:query)
      .with(/service_process:count/, an_instance_of(Hash))
      .and_return(result || [
        # instance 1
        {
          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails' },
          'value' => [1000, '10']
        },
        {
          'metric' => { 'instance' => 'instance1:8090', 'job' => 'gitlab-sidekiq' },
          'value' => [1000, '5']
        },
        # instance 2
        {
          'metric' => { 'instance' => 'instance2:8090', 'job' => 'gitlab-sidekiq' },
          'value' => [1000, '15']
        },
        {
          'metric' => { 'instance' => 'instance2:9121', 'job' => 'redis' },
          'value' => [1000, '1']
        },
        {
          'metric' => { 'instance' => 'instance2:8080', 'job' => 'registry' },
          'value' => [1000, '1']
        }
      ])
  end

  # App server workers query; the 'server' label of the default samples is
  # what the examples above expect under `server:` for the 'web' service.
  def receive_node_service_app_server_workers_query(result: nil)
    receive(:query)
      .with(/app_server_workers/, an_instance_of(Hash))
      .and_return(result || [
        # instance 1
        {
          'metric' => { 'instance' => 'instance1:8080', 'job' => 'gitlab-rails', 'server' => 'puma' },
          'value' => [1000, '2']
        },
        # instance 2
        {
          'metric' => { 'instance' => 'instance2:8080', 'job' => 'gitlab-rails', 'server' => 'puma' },
          'value' => [1000, '1']
        }
      ])
  end
end