1#!/usr/local/bin/python3.8 2 3# 4# A MongoDB Nagios check script 5# 6 7# Script idea taken from a Tag1 script I found and I modified it a lot 8# 9# Main Author 10# - Mike Zupan <mike@zcentric.com> 11# Contributers 12# - Frank Brandewiede <brande@travel-iq.com> <brande@bfiw.de> <brande@novolab.de> 13# - Sam Perman <sam@brightcove.com> 14# - Shlomo Priymak <shlomoid@gmail.com> 15# - @jhoff909 on github 16# - @jbraeuer on github 17# - Dag Stockstad <dag.stockstad@gmail.com> 18# - @Andor on github 19# 20# USAGE 21# 22# See the README.md 23# 24 25import sys 26import time 27import optparse 28import textwrap 29import re 30import os 31 32try: 33 import pymongo 34except ImportError as e: 35 print(e) 36 sys.exit(2) 37 38# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt 39# to import from there and fall back to pymongo in cases of older pymongo 40if pymongo.version >= "1.9": 41 import bson.son as son 42else: 43 import pymongo.son as son 44 45 46# 47# thanks to http://stackoverflow.com/a/1229667/72987 48# 49def optional_arg(arg_default): 50 def func(option, opt_str, value, parser): 51 if parser.rargs and not parser.rargs[0].startswith('-'): 52 val = parser.rargs[0] 53 parser.rargs.pop(0) 54 else: 55 val = arg_default 56 setattr(parser.values, option.dest, val) 57 return func 58 59 60def performance_data(perf_data, params): 61 data = '' 62 if perf_data: 63 data = " |" 64 for p in params: 65 p += (None, None, None, None) 66 param, param_name, warning, critical = p[0:4] 67 data += "%s=%s" % (param_name, str(param)) 68 if warning or critical: 69 warning = warning or 0 70 critical = critical or 0 71 data += ";%s;%s" % (warning, critical) 72 73 data += " " 74 75 return data 76 77 78def numeric_type(param): 79 if ((type(param) == float or type(param) == int or param == None)): 80 return True 81 return False 82 83 84def check_levels(param, warning, critical, message, ok=[]): 85 if (numeric_type(critical) and numeric_type(warning)): 86 if param >= critical: 87 print("CRITICAL - " + message) 88 sys.exit(2) 89 elif param >= warning: 90 print("WARNING - " + message) 91 sys.exit(1) 92 else: 93 print("OK - " + message) 94 sys.exit(0) 95 else: 96 if param in critical: 97 print("CRITICAL - " + message) 98 sys.exit(2) 99 100 if param in warning: 101 print("WARNING - " + message) 102 sys.exit(1) 103 104 if param in ok: 105 print("OK - " + message) 106 sys.exit(0) 107 108 # unexpected param value 109 print("CRITICAL - Unexpected value : %d" % param + "; " + message) 110 return 2 111 112 113def get_server_status(con): 114 try: 115 set_read_preference(con.admin) 116 data = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)])) 117 except: 118 data = con.admin.command(son.SON([('serverStatus', 1)])) 119 return data 120 121 122def main(argv): 123 p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.") 124 125 p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to') 126 p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on') 127 p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as') 128 p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user') 129 p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set') 130 p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set') 131 p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take', 132 choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock', 133 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 134 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults', 135 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count']) 136 p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)') 137 p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)') 138 p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data') 139 p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check') 140 p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)') 141 p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL') 142 p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset') 143 p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second') 144 p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check') 145 p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults') 146 147 options, arguments = p.parse_args() 148 host = options.host 149 port = options.port 150 user = options.user 151 passwd = options.passwd 152 query_type = options.query_type 153 collection = options.collection 154 sample_time = options.sample_time 155 if (options.action == 'replset_state'): 156 warning = str(options.warning or "") 157 critical = str(options.critical or "") 158 else: 159 warning = float(options.warning or 0) 160 critical = float(options.critical or 0) 161 162 action = options.action 163 perf_data = options.perf_data 164 max_lag = options.max_lag 165 database = options.database 166 ssl = options.ssl 167 replicaset = options.replicaset 168 169 if action == 'replica_primary' and replicaset is None: 170 return "replicaset must be passed in when using replica_primary check" 171 elif not action == 'replica_primary' and replicaset: 172 return "passing a replicaset while not checking replica_primary does not work" 173 174 # 175 # moving the login up here and passing in the connection 176 # 177 start = time.time() 178 err, con = mongo_connect(host, port, ssl, user, passwd, replicaset) 179 if err != 0: 180 return err 181 182 conn_time = time.time() - start 183 conn_time = round(conn_time, 0) 184 185 if action == "connections": 186 return check_connections(con, warning, critical, perf_data) 187 elif action == "replication_lag": 188 return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd) 189 elif action == "replication_lag_percent": 190 return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd) 191 elif action == "replset_state": 192 return check_replset_state(con, perf_data, warning, critical) 193 elif action == "memory": 194 return check_memory(con, warning, critical, perf_data, options.mapped_memory) 195 elif action == "memory_mapped": 196 return check_memory_mapped(con, warning, critical, perf_data) 197 elif action == "queues": 198 return check_queues(con, warning, critical, perf_data) 199 elif action == "lock": 200 return check_lock(con, warning, critical, perf_data) 201 elif action == "current_lock": 202 return check_current_lock(con, host, warning, critical, perf_data) 203 elif action == "flushing": 204 return check_flushing(con, warning, critical, True, perf_data) 205 elif action == "last_flush_time": 206 return check_flushing(con, warning, critical, False, perf_data) 207 elif action == "index_miss_ratio": 208 index_miss_ratio(con, warning, critical, perf_data) 209 elif action == "databases": 210 return check_databases(con, warning, critical, perf_data) 211 elif action == "collections": 212 return check_collections(con, warning, critical, perf_data) 213 elif action == "oplog": 214 return check_oplog(con, warning, critical, perf_data) 215 elif action == "journal_commits_in_wl": 216 return check_journal_commits_in_wl(con, warning, critical, perf_data) 217 elif action == "database_size": 218 if options.all_databases: 219 return check_all_databases_size(con, warning, critical, perf_data) 220 else: 221 return check_database_size(con, database, warning, critical, perf_data) 222 elif action == "database_indexes": 223 return check_database_indexes(con, database, warning, critical, perf_data) 224 elif action == "collection_indexes": 225 return check_collection_indexes(con, database, collection, warning, critical, perf_data) 226 elif action == "journaled": 227 return check_journaled(con, warning, critical, perf_data) 228 elif action == "write_data_files": 229 return check_write_to_datafiles(con, warning, critical, perf_data) 230 elif action == "opcounters": 231 return check_opcounters(con, host, warning, critical, perf_data) 232 elif action == "asserts": 233 return check_asserts(con, host, warning, critical, perf_data) 234 elif action == "replica_primary": 235 return check_replica_primary(con, host, warning, critical, perf_data, replicaset) 236 elif action == "queries_per_second": 237 return check_queries_per_second(con, query_type, warning, critical, perf_data) 238 elif action == "page_faults": 239 check_page_faults(con, sample_time, warning, critical, perf_data) 240 elif action == "chunks_balance": 241 chunks_balance(con, database, collection, warning, critical) 242 elif action == "connect_primary": 243 return check_connect_primary(con, warning, critical, perf_data) 244 elif action == "collection_state": 245 return check_collection_state(con, database, collection) 246 elif action == "row_count": 247 return check_row_count(con, database, collection, warning, critical, perf_data) 248 else: 249 return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time) 250 251 252def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None): 253 try: 254 # ssl connection for pymongo > 2.3 255 if pymongo.version >= "2.3": 256 if replica is None: 257 con = pymongo.MongoClient(host, port) 258 else: 259 con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl, replicaSet=replica, network_timeout=10) 260 else: 261 if replica is None: 262 con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) 263 else: 264 con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) 265 #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10) 266 267 if user and passwd: 268 db = con["admin"] 269 if not db.authenticate(user, passwd): 270 sys.exit("Username/Password incorrect") 271 except Exception as e: 272 if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1: 273 # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server 274 # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter 275 print("OK - State: 7 (Arbiter)") 276 sys.exit(0) 277 return exit_with_general_critical(e), None 278 return 0, con 279 280 281def exit_with_general_warning(e): 282 if isinstance(e, SystemExit): 283 return e 284 else: 285 print("WARNING - General MongoDB warning:", e) 286 return 1 287 288 289def exit_with_general_critical(e): 290 if isinstance(e, SystemExit): 291 return e 292 else: 293 print("CRITICAL - General MongoDB Error:", e) 294 return 2 295 296 297def set_read_preference(db): 298 if pymongo.version >= "2.1": 299 db.read_preference = pymongo.ReadPreference.SECONDARY 300 301 302def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time): 303 warning = warning or 3 304 critical = critical or 6 305 message = "Connection took %i seconds" % conn_time 306 message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)]) 307 308 return check_levels(conn_time, warning, critical, message) 309 310 311def check_connections(con, warning, critical, perf_data): 312 warning = warning or 80 313 critical = critical or 95 314 try: 315 data = get_server_status(con) 316 317 current = float(data['connections']['current']) 318 available = float(data['connections']['available']) 319 320 used_percent = int(float(current / (available + current)) * 100) 321 message = "%i percent (%i of %i connections) used" % (used_percent, current, current + available) 322 message += performance_data(perf_data, [(used_percent, "used_percent", warning, critical), 323 (current, "current_connections"), 324 (available, "available_connections")]) 325 return check_levels(used_percent, warning, critical, message) 326 327 except Exception as e: 328 return exit_with_general_critical(e) 329 330 331def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): 332 # Get mongo to tell us replica set member name when connecting locally 333 if "127.0.0.1" == host: 334 host = con.admin.command("ismaster","1")["me"].split(':')[0] 335 336 if percent: 337 warning = warning or 50 338 critical = critical or 75 339 else: 340 warning = warning or 600 341 critical = critical or 3600 342 rs_status = {} 343 slaveDelays = {} 344 try: 345 set_read_preference(con.admin) 346 347 # Get replica set status 348 try: 349 rs_status = con.admin.command("replSetGetStatus") 350 except pymongo.errors.OperationFailure as e: 351 if e.code == None and str(e).find('failed: not running with --replSet"'): 352 print("OK - Not running with replSet") 353 return 0 354 355 serverVersion = tuple(con.server_info()['version'].split('.')) 356 if serverVersion >= tuple("2.0.0".split(".")): 357 # 358 # check for version greater then 2.0 359 # 360 rs_conf = con.local.system.replset.find_one() 361 for member in rs_conf['members']: 362 if member.get('slaveDelay') is not None: 363 slaveDelays[member['host']] = member.get('slaveDelay') 364 else: 365 slaveDelays[member['host']] = 0 366 367 # Find the primary and/or the current node 368 primary_node = None 369 host_node = None 370 371 for member in rs_status["members"]: 372 if member["stateStr"] == "PRIMARY": 373 primary_node = member 374 if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port: 375 host_node = member 376 377 # Check if we're in the middle of an election and don't have a primary 378 if primary_node is None: 379 print("WARNING - No primary defined. In an election?") 380 return 1 381 382 # Check if we failed to find the current host 383 # below should never happen 384 if host_node is None: 385 print("CRITICAL - Unable to find host '" + host + "' in replica set.") 386 return 2 387 388 # Is the specified host the primary? 389 if host_node["stateStr"] == "PRIMARY": 390 if max_lag == False: 391 print("OK - This is the primary.") 392 return 0 393 else: 394 #get the maximal replication lag 395 data = "" 396 maximal_lag = 0 397 for member in rs_status['members']: 398 if not member['stateStr'] == "ARBITER": 399 lastSlaveOpTime = member['optimeDate'] 400 replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']] 401 data = data + member['name'] + " lag=%d;" % replicationLag 402 maximal_lag = max(maximal_lag, replicationLag) 403 if percent: 404 err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) 405 if err != 0: 406 return err 407 primary_timediff = replication_get_time_diff(con) 408 maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) 409 message = "Maximal lag is " + str(maximal_lag) + " percents" 410 message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)]) 411 else: 412 message = "Maximal lag is " + str(maximal_lag) + " seconds" 413 message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)]) 414 return check_levels(maximal_lag, warning, critical, message) 415 elif host_node["stateStr"] == "ARBITER": 416 print("OK - This is an arbiter") 417 return 0 418 419 # Find the difference in optime between current node and PRIMARY 420 421 optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"]) 422 423 if host_node['name'] in slaveDelays: 424 slave_delay = slaveDelays[host_node['name']] 425 elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays: 426 slave_delay = slaveDelays[host_node['name'][:-len(":27017")]] 427 else: 428 raise Exception("Unable to determine slave delay for {0}".format(host_node['name'])) 429 430 try: # work starting from python2.7 431 lag = optime_lag.total_seconds() 432 except: 433 lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) 434 435 if percent: 436 err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) 437 if err != 0: 438 return err 439 primary_timediff = replication_get_time_diff(con) 440 if primary_timediff != 0: 441 lag = int(float(lag) / float(primary_timediff) * 100) 442 else: 443 lag = 0 444 message = "Lag is " + str(lag) + " percents" 445 message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) 446 else: 447 message = "Lag is " + str(lag) + " seconds" 448 message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) 449 return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) 450 else: 451 # 452 # less than 2.0 check 453 # 454 # Get replica set status 455 rs_status = con.admin.command("replSetGetStatus") 456 457 # Find the primary and/or the current node 458 primary_node = None 459 host_node = None 460 for member in rs_status["members"]: 461 if member["stateStr"] == "PRIMARY": 462 primary_node = (member["name"], member["optimeDate"]) 463 if member["name"].split(":")[0].startswith(host): 464 host_node = member 465 466 # Check if we're in the middle of an election and don't have a primary 467 if primary_node is None: 468 print("WARNING - No primary defined. In an election?") 469 sys.exit(1) 470 471 # Is the specified host the primary? 472 if host_node["stateStr"] == "PRIMARY": 473 print("OK - This is the primary.") 474 sys.exit(0) 475 476 # Find the difference in optime between current node and PRIMARY 477 optime_lag = abs(primary_node[1] - host_node["optimeDate"]) 478 lag = optime_lag.seconds 479 if percent: 480 err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1])) 481 if err != 0: 482 return err 483 primary_timediff = replication_get_time_diff(con) 484 lag = int(float(lag) / float(primary_timediff) * 100) 485 message = "Lag is " + str(lag) + " percents" 486 message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) 487 else: 488 message = "Lag is " + str(lag) + " seconds" 489 message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) 490 return check_levels(lag, warning, critical, message) 491 492 except Exception as e: 493 return exit_with_general_critical(e) 494 495 496def check_memory(con, warning, critical, perf_data, mapped_memory): 497 # 498 # These thresholds are basically meaningless, and must be customized to your system's ram 499 # 500 warning = warning or 8 501 critical = critical or 16 502 try: 503 data = get_server_status(con) 504 if not data['mem']['supported'] and not mapped_memory: 505 print("OK - Platform not supported for memory info") 506 return 0 507 # 508 # convert to gigs 509 # 510 message = "Memory Usage:" 511 try: 512 mem_resident = float(data['mem']['resident']) / 1024.0 513 message += " %.2fGB resident," % (mem_resident) 514 except: 515 mem_resident = 0 516 message += " resident unsupported," 517 try: 518 mem_virtual = float(data['mem']['virtual']) / 1024.0 519 message += " %.2fGB virtual," % mem_virtual 520 except: 521 mem_virtual = 0 522 message += " virtual unsupported," 523 try: 524 mem_mapped = float(data['mem']['mapped']) / 1024.0 525 message += " %.2fGB mapped," % mem_mapped 526 except: 527 mem_mapped = 0 528 message += " mapped unsupported," 529 try: 530 mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0 531 message += " %.2fGB mappedWithJournal" % mem_mapped_journal 532 except: 533 mem_mapped_journal = 0 534 message += performance_data(perf_data, [("%.2f" % mem_resident, "memory_usage", warning, critical), 535 ("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_virtual, "memory_virtual"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")]) 536 #added for unsupported systems like Solaris 537 if mapped_memory and mem_resident == 0: 538 return check_levels(mem_mapped, warning, critical, message) 539 else: 540 return check_levels(mem_resident, warning, critical, message) 541 542 except Exception as e: 543 return exit_with_general_critical(e) 544 545 546def check_memory_mapped(con, warning, critical, perf_data): 547 # 548 # These thresholds are basically meaningless, and must be customized to your application 549 # 550 warning = warning or 8 551 critical = critical or 16 552 try: 553 data = get_server_status(con) 554 if not data['mem']['supported']: 555 print("OK - Platform not supported for memory info") 556 return 0 557 # 558 # convert to gigs 559 # 560 message = "Memory Usage:" 561 try: 562 mem_mapped = float(data['mem']['mapped']) / 1024.0 563 message += " %.2fGB mapped," % mem_mapped 564 except: 565 mem_mapped = -1 566 message += " mapped unsupported," 567 try: 568 mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0 569 message += " %.2fGB mappedWithJournal" % mem_mapped_journal 570 except: 571 mem_mapped_journal = 0 572 message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")]) 573 574 if not mem_mapped == -1: 575 return check_levels(mem_mapped, warning, critical, message) 576 else: 577 print("OK - Server does not provide mem.mapped info") 578 return 0 579 580 except Exception as e: 581 return exit_with_general_critical(e) 582 583 584def check_lock(con, warning, critical, perf_data): 585 warning = warning or 10 586 critical = critical or 30 587 try: 588 data = get_server_status(con) 589 # 590 # calculate percentage 591 # 592 lock_percentage = float(data['globalLock']['lockTime']) / float(data['globalLock']['totalTime']) * 100 593 message = "Lock Percentage: %.2f%%" % lock_percentage 594 message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)]) 595 return check_levels(lock_percentage, warning, critical, message) 596 597 except Exception as e: 598 return exit_with_general_critical(e) 599 600 601def check_flushing(con, warning, critical, avg, perf_data): 602 # 603 # These thresholds mean it's taking 5 seconds to perform a background flush to issue a warning 604 # and 10 seconds to issue a critical. 605 # 606 warning = warning or 5000 607 critical = critical or 15000 608 try: 609 data = get_server_status(con) 610 if avg: 611 flush_time = float(data['backgroundFlushing']['average_ms']) 612 stat_type = "Average" 613 else: 614 flush_time = float(data['backgroundFlushing']['last_ms']) 615 stat_type = "Last" 616 617 message = "%s Flush Time: %.2fms" % (stat_type, flush_time) 618 message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)]) 619 620 return check_levels(flush_time, warning, critical, message) 621 622 except Exception as e: 623 return exit_with_general_critical(e) 624 625 626def index_miss_ratio(con, warning, critical, perf_data): 627 warning = warning or 10 628 critical = critical or 30 629 try: 630 data = get_server_status(con) 631 632 try: 633 serverVersion = tuple(con.server_info()['version'].split('.')) 634 if serverVersion >= tuple("2.4.0".split(".")): 635 miss_ratio = float(data['indexCounters']['missRatio']) 636 else: 637 miss_ratio = float(data['indexCounters']['btree']['missRatio']) 638 except KeyError: 639 not_supported_msg = "not supported on this platform" 640 if data['indexCounters'].has_key('note'): 641 print("OK - MongoDB says: " + not_supported_msg) 642 return 0 643 else: 644 print("WARNING - Can't get counter from MongoDB") 645 return 1 646 647 message = "Miss Ratio: %.2f" % miss_ratio 648 message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)]) 649 650 return check_levels(miss_ratio, warning, critical, message) 651 652 except Exception as e: 653 return exit_with_general_critical(e) 654 655 656def check_replset_state(con, perf_data, warning="", critical=""): 657 try: 658 warning = [int(x) for x in warning.split(",")] 659 except: 660 warning = [0, 3, 5] 661 try: 662 critical = [int(x) for x in critical.split(",")] 663 except: 664 critical = [8, 4, -1] 665 666 ok = list(range(-1, 8)) # should include the range of all posiible values 667 try: 668 try: 669 try: 670 set_read_preference(con.admin) 671 data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)])) 672 except: 673 data = con.admin.command(son.SON([('replSetGetStatus', 1)])) 674 state = int(data['myState']) 675 except pymongo.errors.OperationFailure as e: 676 if e.code == None and str(e).find('failed: not running with --replSet"'): 677 state = -1 678 679 if state == 8: 680 message = "State: %i (Down)" % state 681 elif state == 4: 682 message = "State: %i (Fatal error)" % state 683 elif state == 0: 684 message = "State: %i (Starting up, phase1)" % state 685 elif state == 3: 686 message = "State: %i (Recovering)" % state 687 elif state == 5: 688 message = "State: %i (Starting up, phase2)" % state 689 elif state == 1: 690 message = "State: %i (Primary)" % state 691 elif state == 2: 692 message = "State: %i (Secondary)" % state 693 elif state == 7: 694 message = "State: %i (Arbiter)" % state 695 elif state == -1: 696 message = "Not running with replSet" 697 else: 698 message = "State: %i (Unknown state)" % state 699 message += performance_data(perf_data, [(state, "state")]) 700 return check_levels(state, warning, critical, message, ok) 701 except Exception as e: 702 return exit_with_general_critical(e) 703 704 705def check_databases(con, warning, critical, perf_data=None): 706 try: 707 try: 708 set_read_preference(con.admin) 709 data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)])) 710 except: 711 data = con.admin.command(son.SON([('listDatabases', 1)])) 712 713 count = len(data['databases']) 714 message = "Number of DBs: %.0f" % count 715 message += performance_data(perf_data, [(count, "databases", warning, critical, message)]) 716 return check_levels(count, warning, critical, message) 717 except Exception as e: 718 return exit_with_general_critical(e) 719 720 721def check_collections(con, warning, critical, perf_data=None): 722 try: 723 try: 724 set_read_preference(con.admin) 725 data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)])) 726 except: 727 data = con.admin.command(son.SON([('listDatabases', 1)])) 728 729 count = 0 730 for db in data['databases']: 731 dbase = con[db['name']] 732 set_read_preference(dbase) 733 count += len(dbase.collection_names()) 734 735 message = "Number of collections: %.0f" % count 736 message += performance_data(perf_data, [(count, "collections", warning, critical, message)]) 737 return check_levels(count, warning, critical, message) 738 739 except Exception as e: 740 return exit_with_general_critical(e) 741 742 743def check_all_databases_size(con, warning, critical, perf_data): 744 warning = warning or 100 745 critical = critical or 1000 746 try: 747 set_read_preference(con.admin) 748 all_dbs_data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)])) 749 except: 750 all_dbs_data = con.admin.command(son.SON([('listDatabases', 1)])) 751 752 total_storage_size = 0 753 message = "" 754 perf_data_param = [()] 755 for db in all_dbs_data['databases']: 756 database = db['name'] 757 data = con[database].command('dbstats') 758 storage_size = round(data['storageSize'] / 1024 / 1024, 1) 759 message += "; Database %s size: %.0f MB" % (database, storage_size) 760 perf_data_param.append((storage_size, database + "_database_size")) 761 total_storage_size += storage_size 762 763 perf_data_param[0] = (total_storage_size, "total_size", warning, critical) 764 message += performance_data(perf_data, perf_data_param) 765 message = "Total size: %.0f MB" % total_storage_size + message 766 return check_levels(total_storage_size, warning, critical, message) 767 768 769def check_database_size(con, database, warning, critical, perf_data): 770 warning = warning or 100 771 critical = critical or 1000 772 perfdata = "" 773 try: 774 set_read_preference(con.admin) 775 data = con[database].command('dbstats') 776 storage_size = data['storageSize'] / 1024 / 1024 777 if perf_data: 778 perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical) 779 #perfdata += " database=%s" %(database) 780 781 if storage_size >= critical: 782 print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) 783 return 2 784 elif storage_size >= warning: 785 print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) 786 return 1 787 else: 788 print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)) 789 return 0 790 except Exception as e: 791 return exit_with_general_critical(e) 792 793 794def check_database_indexes(con, database, warning, critical, perf_data): 795 # 796 # These thresholds are basically meaningless, and must be customized to your application 797 # 798 warning = warning or 100 799 critical = critical or 1000 800 perfdata = "" 801 try: 802 set_read_preference(con.admin) 803 data = con[database].command('dbstats') 804 index_size = data['indexSize'] / 1024 / 1024 805 if perf_data: 806 perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical) 807 808 if index_size >= critical: 809 print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) 810 return 2 811 elif index_size >= warning: 812 print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) 813 return 1 814 else: 815 print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)) 816 return 0 817 except Exception as e: 818 return exit_with_general_critical(e) 819 820 821def check_collection_indexes(con, database, collection, warning, critical, perf_data): 822 # 823 # These thresholds are basically meaningless, and must be customized to your application 824 # 825 warning = warning or 100 826 critical = critical or 1000 827 perfdata = "" 828 try: 829 set_read_preference(con.admin) 830 data = con[database].command('collstats', collection) 831 total_index_size = data['totalIndexSize'] / 1024 / 1024 832 if perf_data: 833 perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical) 834 835 if total_index_size >= critical: 836 print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) 837 return 2 838 elif total_index_size >= warning: 839 print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) 840 return 1 841 else: 842 print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)) 843 return 0 844 except Exception as e: 845 return exit_with_general_critical(e) 846 847 848def check_queues(con, warning, critical, perf_data): 849 warning = warning or 10 850 critical = critical or 30 851 try: 852 data = get_server_status(con) 853 854 total_queues = float(data['globalLock']['currentQueue']['total']) 855 readers_queues = float(data['globalLock']['currentQueue']['readers']) 856 writers_queues = float(data['globalLock']['currentQueue']['writers']) 857 message = "Current queue is : total = %d, readers = %d, writers = %d" % (total_queues, readers_queues, writers_queues) 858 message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")]) 859 return check_levels(total_queues, warning, critical, message) 860 861 except Exception as e: 862 return exit_with_general_critical(e) 863 864 865def check_queries_per_second(con, query_type, warning, critical, perf_data): 866 warning = warning or 250 867 critical = critical or 500 868 869 if query_type not in ['insert', 'query', 'update', 'delete', 'getmore', 'command']: 870 return exit_with_general_critical("The query type of '%s' is not valid" % query_type) 871 872 try: 873 db = con.local 874 data = get_server_status(con) 875 876 # grab the count 877 num = int(data['opcounters'][query_type]) 878 879 # do the math 880 last_count = db.nagios_check.find_one({'check': 'query_counts'}) 881 try: 882 ts = int(time.time()) 883 diff_query = num - last_count['data'][query_type]['count'] 884 diff_ts = ts - last_count['data'][query_type]['ts'] 885 886 query_per_sec = float(diff_query) / float(diff_ts) 887 888 # update the count now 889 db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) 890 891 message = "Queries / Sec: %f" % query_per_sec 892 message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)]) 893 except KeyError: 894 # 895 # since it is the first run insert it 896 query_per_sec = 0 897 message = "First run of check.. no data" 898 db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}}) 899 except TypeError: 900 # 901 # since it is the first run insert it 902 query_per_sec = 0 903 message = "First run of check.. no data" 904 db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}}) 905 906 return check_levels(query_per_sec, warning, critical, message) 907 908 except Exception as e: 909 return exit_with_general_critical(e) 910 911 912def check_oplog(con, warning, critical, perf_data): 913 """ Checking the oplog time - the time of the log currntly saved in the oplog collection 914 defaults: 915 critical 4 hours 916 warning 24 hours 917 those can be changed as usual with -C and -W parameters""" 918 warning = warning or 24 919 critical = critical or 4 920 try: 921 db = con.local 922 ol = db.system.namespaces.find_one({"name": "local.oplog.rs"}) 923 if (db.system.namespaces.find_one({"name": "local.oplog.rs"}) != None): 924 oplog = "oplog.rs" 925 else: 926 ol = db.system.namespaces.find_one({"name": "local.oplog.$main"}) 927 if (db.system.namespaces.find_one({"name": "local.oplog.$main"}) != None): 928 oplog = "oplog.$main" 929 else: 930 message = "neither master/slave nor replica set replication detected" 931 return check_levels(None, warning, critical, message) 932 933 try: 934 set_read_preference(con.admin) 935 data = con.local.command(pymongo.son_manipulator.SON([('collstats', oplog)])) 936 except: 937 data = con.admin.command(son.SON([('collstats', oplog)])) 938 939 ol_size = data['size'] 940 ol_storage_size = data['storageSize'] 941 ol_used_storage = int(float(ol_size) / ol_storage_size * 100 + 1) 942 ol = con.local[oplog] 943 firstc = ol.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts'] 944 lastc = ol.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts'] 945 time_in_oplog = (lastc.as_datetime() - firstc.as_datetime()) 946 message = "Oplog saves " + str(time_in_oplog) + " %d%% used" % ol_used_storage 947 try: # work starting from python2.7 948 hours_in_oplog = time_in_oplog.total_seconds() / 60 / 60 949 except: 950 hours_in_oplog = float(time_in_oplog.seconds + time_in_oplog.days * 24 * 3600) / 60 / 60 951 approx_level = hours_in_oplog * 100 / ol_used_storage 952 message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')]) 953 return check_levels(-approx_level, -warning, -critical, message) 954 955 except Exception as e: 956 return exit_with_general_critical(e) 957 958 959def check_journal_commits_in_wl(con, warning, critical, perf_data): 960 """ Checking the number of commits which occurred in the db's write lock. 961Most commits are performed outside of this lock; committed while in the write lock is undesirable. 962Under very high write situations it is normal for this value to be nonzero. """ 963 964 warning = warning or 10 965 critical = critical or 40 966 try: 967 data = get_server_status(con) 968 j_commits_in_wl = data['dur']['commitsInWriteLock'] 969 message = "Journal commits in DB write lock : %d" % j_commits_in_wl 970 message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)]) 971 return check_levels(j_commits_in_wl, warning, critical, message) 972 973 except Exception as e: 974 return exit_with_general_critical(e) 975 976 977def check_journaled(con, warning, critical, perf_data): 978 """ Checking the average amount of data in megabytes written to the recovery log in the last four seconds""" 979 980 warning = warning or 20 981 critical = critical or 40 982 try: 983 data = get_server_status(con) 984 journaled = data['dur']['journaledMB'] 985 message = "Journaled : %.2f MB" % journaled 986 message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)]) 987 return check_levels(journaled, warning, critical, message) 988 989 except Exception as e: 990 return exit_with_general_critical(e) 991 992 993def check_write_to_datafiles(con, warning, critical, perf_data): 994 """ Checking the average amount of data in megabytes written to the databases datafiles in the last four seconds. 995As these writes are already journaled, they can occur lazily, and thus the number indicated here may be lower 996than the amount physically written to disk.""" 997 warning = warning or 20 998 critical = critical or 40 999 try: 1000 data = get_server_status(con) 1001 writes = data['dur']['writeToDataFilesMB'] 1002 message = "Write to data files : %.2f MB" % writes 1003 message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)]) 1004 return check_levels(writes, warning, critical, message) 1005 1006 except Exception as e: 1007 return exit_with_general_critical(e) 1008 1009 1010def get_opcounters(data, opcounters_name, host): 1011 try: 1012 insert = data[opcounters_name]['insert'] 1013 query = data[opcounters_name]['query'] 1014 update = data[opcounters_name]['update'] 1015 delete = data[opcounters_name]['delete'] 1016 getmore = data[opcounters_name]['getmore'] 1017 command = data[opcounters_name]['command'] 1018 except KeyError as e: 1019 return 0, [0] * 100 1020 total_commands = insert + query + update + delete + getmore + command 1021 new_vals = [total_commands, insert, query, update, delete, getmore, command] 1022 return maintain_delta(new_vals, host, opcounters_name) 1023 1024 1025def check_opcounters(con, host, warning, critical, perf_data): 1026 """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl""" 1027 warning = warning or 10000 1028 critical = critical or 15000 1029 1030 data = get_server_status(con) 1031 err1, delta_opcounters = get_opcounters(data, 'opcounters', host) 1032 err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host) 1033 if err1 == 0 and err2 == 0: 1034 delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)] 1035 delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized 1036 per_minute_delta = [int(x / delta[0] * 60) for x in delta[1:]] 1037 message = "Test succeeded , old values missing" 1038 message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta) 1039 message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"), 1040 (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"), 1041 (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")])) 1042 return check_levels(per_minute_delta[0], warning, critical, message) 1043 else: 1044 return exit_with_general_critical("problem reading data from temp file") 1045 1046 1047def check_current_lock(con, host, warning, critical, perf_data): 1048 """ A function to get current lock percentage and not a global one, as check_lock function does""" 1049 warning = warning or 10 1050 critical = critical or 30 1051 data = get_server_status(con) 1052 1053 lockTime = float(data['globalLock']['lockTime']) 1054 totalTime = float(data['globalLock']['totalTime']) 1055 1056 err, delta = maintain_delta([totalTime, lockTime], host, "locktime") 1057 if err == 0: 1058 lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100 1059 message = "Current Lock Percentage: %.2f%%" % lock_percentage 1060 message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)]) 1061 return check_levels(lock_percentage, warning, critical, message) 1062 else: 1063 return exit_with_general_warning("problem reading data from temp file") 1064 1065 1066def check_page_faults(con, host, warning, critical, perf_data): 1067 """ A function to get page_faults per second from the system""" 1068 warning = warning or 10 1069 critical = critical or 30 1070 data = get_server_status(con) 1071 1072 try: 1073 page_faults = float(data['extra_info']['page_faults']) 1074 except: 1075 # page_faults unsupported on the underlaying system 1076 return exit_with_general_critical("page_faults unsupported on the underlaying system") 1077 1078 err, delta = maintain_delta([page_faults], host, "page_faults") 1079 if err == 0: 1080 page_faults_ps = delta[1] / delta[0] 1081 message = "Page faults : %.2f ps" % page_faults_ps 1082 message += performance_data(perf_data, [("%.2f" % page_faults_ps, "page_faults_ps", warning, critical)]) 1083 return check_levels(page_faults_ps, warning, critical, message) 1084 else: 1085 return exit_with_general_warning("problem reading data from temp file") 1086 1087 1088def check_asserts(con, host, warning, critical, perf_data): 1089 """ A function to get asserts from the system""" 1090 warning = warning or 1 1091 critical = critical or 10 1092 data = get_server_status(con) 1093 1094 asserts = data['asserts'] 1095 1096 #{ "regular" : 0, "warning" : 6, "msg" : 0, "user" : 12, "rollovers" : 0 } 1097 regular = asserts['regular'] 1098 warning_asserts = asserts['warning'] 1099 msg = asserts['msg'] 1100 user = asserts['user'] 1101 rollovers = asserts['rollovers'] 1102 1103 err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts") 1104 1105 if err == 0: 1106 if delta[5] != 0: 1107 #the number of rollovers were increased 1108 warning = -1 # no matter the metrics this situation should raise a warning 1109 # if this is normal rollover - the warning will not appear again, but if there will be a lot of asserts 1110 # the warning will stay for a long period of time 1111 # although this is not a usual situation 1112 1113 regular_ps = delta[1] / delta[0] 1114 warning_ps = delta[2] / delta[0] 1115 msg_ps = delta[3] / delta[0] 1116 user_ps = delta[4] / delta[0] 1117 rollovers_ps = delta[5] / delta[0] 1118 total_ps = regular_ps + warning_ps + msg_ps + user_ps 1119 message = "Total asserts : %.2f ps" % total_ps 1120 message += performance_data(perf_data, [(total_ps, "asserts_ps", warning, critical), (regular_ps, "regular"), 1121 (warning_ps, "warning"), (msg_ps, "msg"), (user_ps, "user")]) 1122 return check_levels(total_ps, warning, critical, message) 1123 else: 1124 return exit_with_general_warning("problem reading data from temp file") 1125 1126 1127def get_stored_primary_server_name(db): 1128 """ get the stored primary server name from db. """ 1129 if "last_primary_server" in db.collection_names(): 1130 stored_primary_server = db.last_primary_server.find_one()["server"] 1131 else: 1132 stored_primary_server = None 1133 1134 return stored_primary_server 1135 1136 1137def check_replica_primary(con, host, warning, critical, perf_data, replicaset): 1138 """ A function to check if the primary server of a replica set has changed """ 1139 if warning is None and critical is None: 1140 warning = 1 1141 warning = warning or 2 1142 critical = critical or 2 1143 1144 primary_status = 0 1145 message = "Primary server has not changed" 1146 db = con["nagios"] 1147 data = get_server_status(con) 1148 if replicaset != data['repl'].get('setName'): 1149 message = "Replica set requested: %s differs from the one found: %s" % (replicaset, data['repl'].get('setName')) 1150 primary_status = 2 1151 return check_levels(primary_status, warning, critical, message) 1152 current_primary = data['repl'].get('primary') 1153 saved_primary = get_stored_primary_server_name(db) 1154 if current_primary is None: 1155 current_primary = "None" 1156 if saved_primary is None: 1157 saved_primary = "None" 1158 if current_primary != saved_primary: 1159 last_primary_server_record = {"server": current_primary} 1160 db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True) 1161 message = "Primary server has changed from %s to %s" % (saved_primary, current_primary) 1162 primary_status = 1 1163 return check_levels(primary_status, warning, critical, message) 1164 1165 1166def check_page_faults(con, sample_time, warning, critical, perf_data): 1167 warning = warning or 10 1168 critical = critical or 20 1169 try: 1170 try: 1171 set_read_preference(con.admin) 1172 data1 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)])) 1173 time.sleep(sample_time) 1174 data2 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)])) 1175 except: 1176 data1 = con.admin.command(son.SON([('serverStatus', 1)])) 1177 time.sleep(sample_time) 1178 data2 = con.admin.command(son.SON([('serverStatus', 1)])) 1179 1180 try: 1181 #on linux servers only 1182 page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time 1183 except KeyError: 1184 print("WARNING - Can't get extra_info.page_faults counter from MongoDB") 1185 sys.exit(1) 1186 1187 message = "Page Faults: %i" % (page_faults) 1188 1189 message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)]) 1190 check_levels(page_faults, warning, critical, message) 1191 1192 except Exception as e: 1193 exit_with_general_critical(e) 1194 1195 1196def chunks_balance(con, database, collection, warning, critical): 1197 warning = warning or 10 1198 critical = critical or 20 1199 nsfilter = database + "." + collection 1200 try: 1201 try: 1202 set_read_preference(con.admin) 1203 col = con.config.chunks 1204 nscount = col.find({"ns": nsfilter}).count() 1205 shards = col.distinct("shard") 1206 1207 except: 1208 print("WARNING - Can't get chunks infos from MongoDB") 1209 sys.exit(1) 1210 1211 if nscount == 0: 1212 print("WARNING - Namespace %s is not sharded" % (nsfilter)) 1213 sys.exit(1) 1214 1215 avgchunksnb = nscount / len(shards) 1216 warningnb = avgchunksnb * warning / 100 1217 criticalnb = avgchunksnb * critical / 100 1218 1219 for shard in shards: 1220 delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count()) 1221 message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta) 1222 1223 if delta >= criticalnb and delta > 0: 1224 print("CRITICAL - Chunks not well balanced " + message) 1225 sys.exit(2) 1226 elif delta >= warningnb and delta > 0: 1227 print("WARNING - Chunks not well balanced " + message) 1228 sys.exit(1) 1229 1230 print("OK - Chunks well balanced across shards") 1231 sys.exit(0) 1232 1233 except Exception as e: 1234 exit_with_general_critical(e) 1235 1236 print("OK - Chunks well balanced across shards") 1237 sys.exit(0) 1238 1239 1240def check_connect_primary(con, warning, critical, perf_data): 1241 warning = warning or 3 1242 critical = critical or 6 1243 1244 try: 1245 try: 1246 set_read_preference(con.admin) 1247 data = con.admin.command(pymongo.son_manipulator.SON([('isMaster', 1)])) 1248 except: 1249 data = con.admin.command(son.SON([('isMaster', 1)])) 1250 1251 if data['ismaster'] == True: 1252 print("OK - This server is primary") 1253 return 0 1254 1255 phost = data['primary'].split(':')[0] 1256 pport = int(data['primary'].split(':')[1]) 1257 start = time.time() 1258 1259 err, con = mongo_connect(phost, pport) 1260 if err != 0: 1261 return err 1262 1263 pconn_time = time.time() - start 1264 pconn_time = round(pconn_time, 0) 1265 message = "Connection to primary server " + data['primary'] + " took %i seconds" % pconn_time 1266 message += performance_data(perf_data, [(pconn_time, "connection_time", warning, critical)]) 1267 1268 return check_levels(pconn_time, warning, critical, message) 1269 1270 except Exception as e: 1271 return exit_with_general_critical(e) 1272 1273 1274def check_collection_state(con, database, collection): 1275 try: 1276 con[database][collection].find_one() 1277 print("OK - Collection %s.%s is reachable " % (database, collection)) 1278 return 0 1279 1280 except Exception as e: 1281 return exit_with_general_critical(e) 1282 1283 1284def check_row_count(con, database, collection, warning, critical, perf_data): 1285 try: 1286 count = con[database][collection].count() 1287 message = "Row count: %i" % (count) 1288 message += performance_data(perf_data, [(count, "row_count", warning, critical)]) 1289 1290 return check_levels(count, warning, critical, message) 1291 1292 except Exception as e: 1293 return exit_with_general_critical(e) 1294 1295 1296def build_file_name(host, action): 1297 #done this way so it will work when run independently and from shell 1298 module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2) 1299 return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data" 1300 1301 1302def ensure_dir(f): 1303 d = os.path.dirname(f) 1304 if not os.path.exists(d): 1305 os.makedirs(d) 1306 1307 1308def write_values(file_name, string): 1309 f = None 1310 try: 1311 f = open(file_name, 'w') 1312 except IOError as e: 1313 #try creating 1314 if (e.errno == 2): 1315 ensure_dir(file_name) 1316 f = open(file_name, 'w') 1317 else: 1318 raise IOError(e) 1319 f.write(string) 1320 f.close() 1321 return 0 1322 1323 1324def read_values(file_name): 1325 data = None 1326 try: 1327 f = open(file_name, 'r') 1328 data = f.read() 1329 f.close() 1330 return 0, data 1331 except IOError as e: 1332 if (e.errno == 2): 1333 #no previous data 1334 return 1, '' 1335 except Exception as e: 1336 return 2, None 1337 1338 1339def calc_delta(old, new): 1340 delta = [] 1341 if (len(old) != len(new)): 1342 raise Exception("unequal number of parameters") 1343 for i in range(0, len(old)): 1344 val = float(new[i]) - float(old[i]) 1345 if val < 0: 1346 val = new[i] 1347 delta.append(val) 1348 return 0, delta 1349 1350 1351def maintain_delta(new_vals, host, action): 1352 file_name = build_file_name(host, action) 1353 err, data = read_values(file_name) 1354 old_vals = data.split(';') 1355 new_vals = [str(int(time.time()))] + new_vals 1356 delta = None 1357 try: 1358 err, delta = calc_delta(old_vals, new_vals) 1359 except: 1360 err = 2 1361 write_res = write_values(file_name, ";" . join(str(x) for x in new_vals)) 1362 return err + write_res, delta 1363 1364 1365def replication_get_time_diff(con): 1366 col = 'oplog.rs' 1367 local = con.local 1368 ol = local.system.namespaces.find_one({"name": "local.oplog.$main"}) 1369 if ol: 1370 col = 'oplog.$main' 1371 firstc = local[col].find().sort("$natural", 1).limit(1) 1372 lastc = local[col].find().sort("$natural", -1).limit(1) 1373 first = firstc.next() 1374 last = lastc.next() 1375 tfirst = first["ts"] 1376 tlast = last["ts"] 1377 delta = tlast.time - tfirst.time 1378 return delta 1379 1380# 1381# main app 1382# 1383if __name__ == "__main__": 1384 sys.exit(main(sys.argv[1:])) 1385