#!/usr/local/bin/python3.8

#
# A MongoDB Nagios check script
#

# Script idea taken from a Tag1 script I found and I modified it a lot
#
# Main Author
#   - Mike Zupan <mike@zcentric.com>
# Contributors
#   - Frank Brandewiede <brande@travel-iq.com> <brande@bfiw.de> <brande@novolab.de>
#   - Sam Perman <sam@brightcove.com>
#   - Shlomo Priymak <shlomoid@gmail.com>
#   - @jhoff909 on github
#   - @jbraeuer on github
#   - Dag Stockstad <dag.stockstad@gmail.com>
#   - @Andor on github
#
# USAGE
#
# See the README.md
#
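# Example invocations (illustrative values; the full option list is in the
# README.md and in main() below):
#
#   ./check_mongodb.py -H 127.0.0.1 -P 27017 -A connect -W 2 -C 4
#   ./check_mongodb.py -H 127.0.0.1 -P 27017 -A connections -W 70 -C 80 -D
#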

import sys
import time
import optparse
import textwrap
import re
import os

try:
    import pymongo
except ImportError as e:
    print(e)
    sys.exit(2)

# As of pymongo 1.9 the SON API is part of the BSON package, so attempt
# to import from there and fall back to pymongo for older versions
if pymongo.version >= "1.9":
    import bson.son as son
else:
    import pymongo.son as son


#
# thanks to http://stackoverflow.com/a/1229667/72987
#
def optional_arg(arg_default):
    def func(option, opt_str, value, parser):
        if parser.rargs and not parser.rargs[0].startswith('-'):
            val = parser.rargs[0]
            parser.rargs.pop(0)
        else:
            val = arg_default
        setattr(parser.values, option.dest, val)
    return func
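
# optional_arg lets a flag such as -s/--ssl act both as a boolean switch and
# as an option with a value: "-s" alone stores arg_default (True here), while
# "-s foo" consumes and stores "foo".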


def performance_data(perf_data, params):
    data = ''
    if perf_data:
        data = " |"
        for p in params:
            p += (None, None, None, None)
            param, param_name, warning, critical = p[0:4]
            data += "%s=%s" % (param_name, str(param))
            if warning or critical:
                warning = warning or 0
                critical = critical or 0
                data += ";%s;%s" % (warning, critical)

            data += " "

    return data
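
# Illustrative example of the Nagios perfdata string produced above:
#   performance_data(True, [(5, "connections", 10, 20)]) -> " |connections=5;10;20 "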


def numeric_type(param):
    # Accept ints, floats, and None (None means "no threshold given")
    return isinstance(param, (int, float)) or param is None


def check_levels(param, warning, critical, message, ok=[]):
    if (numeric_type(critical) and numeric_type(warning)):
        if param >= critical:
            print("CRITICAL - " + message)
            sys.exit(2)
        elif param >= warning:
            print("WARNING - " + message)
            sys.exit(1)
        else:
            print("OK - " + message)
            sys.exit(0)
    else:
        # non-numeric thresholds are treated as lists of values to match against
        if param in critical:
            print("CRITICAL - " + message)
            sys.exit(2)

        if param in warning:
            print("WARNING - " + message)
            sys.exit(1)

        if param in ok:
            print("OK - " + message)
            sys.exit(0)

        # unexpected param value
        print("CRITICAL - Unexpected value : %d" % param + "; " + message)
        return 2
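
# Illustrative: check_levels(85, 80, 95, "85 percent used") prints
# "WARNING - 85 percent used" and exits 1 (standard Nagios exit codes:
# 0=OK, 1=WARNING, 2=CRITICAL). With list thresholds, as used by
# check_replset_state(), membership is tested instead of >=.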


def get_server_status(con):
    try:
        set_read_preference(con.admin)
        data = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
    except:
        data = con.admin.command(son.SON([('serverStatus', 1)]))
    return data
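
# The serverStatus document fetched above feeds most checks below: they read
# the 'connections', 'mem', 'globalLock', 'backgroundFlushing', 'indexCounters',
# 'dur', 'opcounters', 'asserts', 'extra_info' and 'repl' sections.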


def main(argv):
    p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of MongoDB.")

    p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
    p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
    p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
    p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
    p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
    p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
    p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
                 choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
                          'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes',
                          'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
                          'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count'])
    p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
    p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
    p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
    p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
    p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
    p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
    p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
    p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
    p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
    p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample the number of page faults')

    options, arguments = p.parse_args()
    host = options.host
    port = options.port
    user = options.user
    passwd = options.passwd
    query_type = options.query_type
    collection = options.collection
    sample_time = options.sample_time
    if (options.action == 'replset_state'):
        warning = str(options.warning or "")
        critical = str(options.critical or "")
    else:
        warning = float(options.warning or 0)
        critical = float(options.critical or 0)

    action = options.action
    perf_data = options.perf_data
    max_lag = options.max_lag
    database = options.database
    ssl = options.ssl
    replicaset = options.replicaset

    if action == 'replica_primary' and replicaset is None:
        return "replicaset must be passed in when using replica_primary check"
    elif not action == 'replica_primary' and replicaset:
        return "passing a replicaset while not checking replica_primary does not work"

    #
    # moving the login up here and passing in the connection
    #
    start = time.time()
    err, con = mongo_connect(host, port, ssl, user, passwd, replicaset)
    if err != 0:
        return err

    conn_time = time.time() - start
    conn_time = round(conn_time, 0)

    if action == "connections":
        return check_connections(con, warning, critical, perf_data)
    elif action == "replication_lag":
        return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
    elif action == "replication_lag_percent":
        return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
    elif action == "replset_state":
        return check_replset_state(con, perf_data, warning, critical)
    elif action == "memory":
        return check_memory(con, warning, critical, perf_data, options.mapped_memory)
    elif action == "memory_mapped":
        return check_memory_mapped(con, warning, critical, perf_data)
    elif action == "queues":
        return check_queues(con, warning, critical, perf_data)
    elif action == "lock":
        return check_lock(con, warning, critical, perf_data)
    elif action == "current_lock":
        return check_current_lock(con, host, warning, critical, perf_data)
    elif action == "flushing":
        return check_flushing(con, warning, critical, True, perf_data)
    elif action == "last_flush_time":
        return check_flushing(con, warning, critical, False, perf_data)
    elif action == "index_miss_ratio":
        return index_miss_ratio(con, warning, critical, perf_data)
    elif action == "databases":
        return check_databases(con, warning, critical, perf_data)
    elif action == "collections":
        return check_collections(con, warning, critical, perf_data)
    elif action == "oplog":
        return check_oplog(con, warning, critical, perf_data)
    elif action == "journal_commits_in_wl":
        return check_journal_commits_in_wl(con, warning, critical, perf_data)
    elif action == "database_size":
        if options.all_databases:
            return check_all_databases_size(con, warning, critical, perf_data)
        else:
            return check_database_size(con, database, warning, critical, perf_data)
    elif action == "database_indexes":
        return check_database_indexes(con, database, warning, critical, perf_data)
    elif action == "collection_indexes":
        return check_collection_indexes(con, database, collection, warning, critical, perf_data)
    elif action == "journaled":
        return check_journaled(con, warning, critical, perf_data)
    elif action == "write_data_files":
        return check_write_to_datafiles(con, warning, critical, perf_data)
    elif action == "opcounters":
        return check_opcounters(con, host, warning, critical, perf_data)
    elif action == "asserts":
        return check_asserts(con, host, warning, critical, perf_data)
    elif action == "replica_primary":
        return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
    elif action == "queries_per_second":
        return check_queries_per_second(con, query_type, warning, critical, perf_data)
    elif action == "page_faults":
        return check_page_faults(con, sample_time, warning, critical, perf_data)
    elif action == "chunks_balance":
        return chunks_balance(con, database, collection, warning, critical)
    elif action == "connect_primary":
        return check_connect_primary(con, warning, critical, perf_data)
    elif action == "collection_state":
        return check_collection_state(con, database, collection)
    elif action == "row_count":
        return check_row_count(con, database, collection, warning, critical, perf_data)
    else:
        return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)


def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None):
    try:
        # ssl connection for pymongo >= 2.3
        if pymongo.version >= "2.3":
            if replica is None:
                con = pymongo.MongoClient(host, port, ssl=ssl)
            else:
                con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl, replicaSet=replica, network_timeout=10)
        else:
            if replica is None:
                con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
            else:
                con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
                #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10)

        if user and passwd:
            db = con["admin"]
            if not db.authenticate(user, passwd):
                sys.exit("Username/Password incorrect")
    except Exception as e:
        if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
            # We got a pymongo AutoReconnect exception that tells us we connected to an arbiter server.
            # This means the arbiter is reachable and can answer requests/votes - that is all we need to know from an arbiter
            print("OK - State: 7 (Arbiter)")
            sys.exit(0)
        return exit_with_general_critical(e), None
    return 0, con
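
# Typical usage, mirroring main() above:
#   err, con = mongo_connect("127.0.0.1", 27017)
#   if err != 0:
#       sys.exit(err)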


def exit_with_general_warning(e):
    if isinstance(e, SystemExit):
        return e
    else:
        print("WARNING - General MongoDB warning:", e)
    return 1


def exit_with_general_critical(e):
    if isinstance(e, SystemExit):
        return e
    else:
        print("CRITICAL - General MongoDB Error:", e)
    return 2


def set_read_preference(db):
    if pymongo.version >= "2.1":
        db.read_preference = pymongo.ReadPreference.SECONDARY


def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
    warning = warning or 3
    critical = critical or 6
    message = "Connection took %i seconds" % conn_time
    message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])

    return check_levels(conn_time, warning, critical, message)


def check_connections(con, warning, critical, perf_data):
    warning = warning or 80
    critical = critical or 95
    try:
        data = get_server_status(con)

        current = float(data['connections']['current'])
        available = float(data['connections']['available'])

        used_percent = int(float(current / (available + current)) * 100)
        message = "%i percent (%i of %i connections) used" % (used_percent, current, current + available)
        message += performance_data(perf_data, [(used_percent, "used_percent", warning, critical),
                (current, "current_connections"),
                (available, "available_connections")])
        return check_levels(used_percent, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
    # Get mongo to tell us the replica set member name when connecting locally
    if "127.0.0.1" == host:
        host = con.admin.command("ismaster")["me"].split(':')[0]

    if percent:
        warning = warning or 50
        critical = critical or 75
    else:
        warning = warning or 600
        critical = critical or 3600
    rs_status = {}
    slaveDelays = {}
    try:
        set_read_preference(con.admin)

        # Get replica set status
        try:
            rs_status = con.admin.command("replSetGetStatus")
        except pymongo.errors.OperationFailure as e:
            if e.code is None and str(e).find('not running with --replSet') != -1:
                print("OK - Not running with replSet")
                return 0

        serverVersion = tuple(con.server_info()['version'].split('.'))
        if serverVersion >= tuple("2.0.0".split(".")):
            #
            # check for version greater than 2.0
            #
            rs_conf = con.local.system.replset.find_one()
            for member in rs_conf['members']:
                if member.get('slaveDelay') is not None:
                    slaveDelays[member['host']] = member.get('slaveDelay')
                else:
                    slaveDelays[member['host']] = 0

            # Find the primary and/or the current node
            primary_node = None
            host_node = None

            for member in rs_status["members"]:
                if member["stateStr"] == "PRIMARY":
                    primary_node = member
                if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
                    host_node = member

            # Check if we're in the middle of an election and don't have a primary
            if primary_node is None:
                print("WARNING - No primary defined. In an election?")
                return 1

            # Check if we failed to find the current host
            # below should never happen
            if host_node is None:
                print("CRITICAL - Unable to find host '" + host + "' in replica set.")
                return 2

            # Is the specified host the primary?
            if host_node["stateStr"] == "PRIMARY":
                if not max_lag:
                    print("OK - This is the primary.")
                    return 0
                else:
                    # get the maximal replication lag
                    data = ""
                    maximal_lag = 0
                    for member in rs_status['members']:
                        if not member['stateStr'] == "ARBITER":
                            lastSlaveOpTime = member['optimeDate']
                            replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']]
                            data = data + member['name'] + " lag=%d;" % replicationLag
                            maximal_lag = max(maximal_lag, replicationLag)
                    if percent:
                        err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
                        if err != 0:
                            return err
                        primary_timediff = replication_get_time_diff(con)
                        maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100)
                        message = "Maximal lag is " + str(maximal_lag) + " percent"
                        message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)])
                    else:
                        message = "Maximal lag is " + str(maximal_lag) + " seconds"
                        message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
                    return check_levels(maximal_lag, warning, critical, message)
            elif host_node["stateStr"] == "ARBITER":
                print("OK - This is an arbiter")
                return 0

            # Find the difference in optime between current node and PRIMARY

            optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"])

            if host_node['name'] in slaveDelays:
                slave_delay = slaveDelays[host_node['name']]
            elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays:
                slave_delay = slaveDelays[host_node['name'][:-len(":27017")]]
            else:
                raise Exception("Unable to determine slave delay for {0}".format(host_node['name']))

            try:  # total_seconds() is available starting with python2.7
                lag = optime_lag.total_seconds()
            except:
                lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)

            if percent:
                err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
                if err != 0:
                    return err
                primary_timediff = replication_get_time_diff(con)
                if primary_timediff != 0:
                    lag = int(float(lag) / float(primary_timediff) * 100)
                else:
                    lag = 0
                message = "Lag is " + str(lag) + " percent"
                message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
            else:
                message = "Lag is " + str(lag) + " seconds"
                message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
            # allow for the member's configured slaveDelay on top of the thresholds
            return check_levels(lag, warning + slave_delay, critical + slave_delay, message)
        else:
            #
            # less than 2.0 check
            #
            # Get replica set status
            rs_status = con.admin.command("replSetGetStatus")

            # Find the primary and/or the current node
            primary_node = None
            host_node = None
            for member in rs_status["members"]:
                if member["stateStr"] == "PRIMARY":
                    primary_node = (member["name"], member["optimeDate"])
                if member["name"].split(":")[0].startswith(host):
                    host_node = member

            # Check if we're in the middle of an election and don't have a primary
            if primary_node is None:
                print("WARNING - No primary defined. In an election?")
                sys.exit(1)

            # Is the specified host the primary?
            if host_node["stateStr"] == "PRIMARY":
                print("OK - This is the primary.")
                sys.exit(0)

            # Find the difference in optime between current node and PRIMARY
            optime_lag = abs(primary_node[1] - host_node["optimeDate"])
            lag = optime_lag.seconds
            if percent:
                err, con = mongo_connect(primary_node[0].split(':')[0], int(primary_node[0].split(':')[1]))
                if err != 0:
                    return err
                primary_timediff = replication_get_time_diff(con)
                lag = int(float(lag) / float(primary_timediff) * 100)
                message = "Lag is " + str(lag) + " percent"
                message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
            else:
                message = "Lag is " + str(lag) + " seconds"
                message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
            return check_levels(lag, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_memory(con, warning, critical, perf_data, mapped_memory):
    #
    # These thresholds are basically meaningless, and must be customized to your system's RAM
    #
    warning = warning or 8
    critical = critical or 16
    try:
        data = get_server_status(con)
        if not data['mem']['supported'] and not mapped_memory:
            print("OK - Platform not supported for memory info")
            return 0
        #
        # convert to gigs
        #
        message = "Memory Usage:"
        try:
            mem_resident = float(data['mem']['resident']) / 1024.0
            message += " %.2fGB resident," % (mem_resident)
        except:
            mem_resident = 0
            message += " resident unsupported,"
        try:
            mem_virtual = float(data['mem']['virtual']) / 1024.0
            message += " %.2fGB virtual," % mem_virtual
        except:
            mem_virtual = 0
            message += " virtual unsupported,"
        try:
            mem_mapped = float(data['mem']['mapped']) / 1024.0
            message += " %.2fGB mapped," % mem_mapped
        except:
            mem_mapped = 0
            message += " mapped unsupported,"
        try:
            mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
            message += " %.2fGB mappedWithJournal" % mem_mapped_journal
        except:
            mem_mapped_journal = 0
        message += performance_data(perf_data, [("%.2f" % mem_resident, "memory_usage", warning, critical),
                    ("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_virtual, "memory_virtual"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
        # added for unsupported systems like Solaris
        if mapped_memory and mem_resident == 0:
            return check_levels(mem_mapped, warning, critical, message)
        else:
            return check_levels(mem_resident, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_memory_mapped(con, warning, critical, perf_data):
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 8
    critical = critical or 16
    try:
        data = get_server_status(con)
        if not data['mem']['supported']:
            print("OK - Platform not supported for memory info")
            return 0
        #
        # convert to gigs
        #
        message = "Memory Usage:"
        try:
            mem_mapped = float(data['mem']['mapped']) / 1024.0
            message += " %.2fGB mapped," % mem_mapped
        except:
            mem_mapped = -1
            message += " mapped unsupported,"
        try:
            mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
            message += " %.2fGB mappedWithJournal" % mem_mapped_journal
        except:
            mem_mapped_journal = 0
        message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])

        if mem_mapped != -1:
            return check_levels(mem_mapped, warning, critical, message)
        else:
            print("OK - Server does not provide mem.mapped info")
            return 0

    except Exception as e:
        return exit_with_general_critical(e)


def check_lock(con, warning, critical, perf_data):
    warning = warning or 10
    critical = critical or 30
    try:
        data = get_server_status(con)
        #
        # calculate percentage
        #
        lock_percentage = float(data['globalLock']['lockTime']) / float(data['globalLock']['totalTime']) * 100
        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_flushing(con, warning, critical, avg, perf_data):
    #
    # These thresholds mean a background flush taking 5 seconds issues a warning
    # and one taking 15 seconds issues a critical.
    #
    warning = warning or 5000
    critical = critical or 15000
    try:
        data = get_server_status(con)
        if avg:
            flush_time = float(data['backgroundFlushing']['average_ms'])
            stat_type = "Average"
        else:
            flush_time = float(data['backgroundFlushing']['last_ms'])
            stat_type = "Last"

        message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
        message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])

        return check_levels(flush_time, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def index_miss_ratio(con, warning, critical, perf_data):
    warning = warning or 10
    critical = critical or 30
    try:
        data = get_server_status(con)

        try:
            serverVersion = tuple(con.server_info()['version'].split('.'))
            if serverVersion >= tuple("2.4.0".split(".")):
                miss_ratio = float(data['indexCounters']['missRatio'])
            else:
                miss_ratio = float(data['indexCounters']['btree']['missRatio'])
        except KeyError:
            not_supported_msg = "not supported on this platform"
            if 'note' in data['indexCounters']:
                print("OK - MongoDB says: " + not_supported_msg)
                return 0
            else:
                print("WARNING - Can't get counter from MongoDB")
                return 1

        message = "Miss Ratio: %.2f" % miss_ratio
        message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])

        return check_levels(miss_ratio, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_replset_state(con, perf_data, warning="", critical=""):
    try:
        warning = [int(x) for x in warning.split(",")]
    except:
        warning = [0, 3, 5]
    try:
        critical = [int(x) for x in critical.split(",")]
    except:
        critical = [8, 4, -1]

    ok = list(range(-1, 8))  # should include the range of all possible values
    try:
        try:
            try:
                set_read_preference(con.admin)
                data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
            except:
                data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
            state = int(data['myState'])
        except pymongo.errors.OperationFailure as e:
            if e.code is None and str(e).find('not running with --replSet') != -1:
                state = -1

        if state == 8:
            message = "State: %i (Down)" % state
        elif state == 4:
            message = "State: %i (Fatal error)" % state
        elif state == 0:
            message = "State: %i (Starting up, phase1)" % state
        elif state == 3:
            message = "State: %i (Recovering)" % state
        elif state == 5:
            message = "State: %i (Starting up, phase2)" % state
        elif state == 1:
            message = "State: %i (Primary)" % state
        elif state == 2:
            message = "State: %i (Secondary)" % state
        elif state == 7:
            message = "State: %i (Arbiter)" % state
        elif state == -1:
            message = "Not running with replSet"
        else:
            message = "State: %i (Unknown state)" % state
        message += performance_data(perf_data, [(state, "state")])
        return check_levels(state, warning, critical, message, ok)
    except Exception as e:
        return exit_with_general_critical(e)


def check_databases(con, warning, critical, perf_data=None):
    try:
        try:
            set_read_preference(con.admin)
            data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
        except:
            data = con.admin.command(son.SON([('listDatabases', 1)]))

        count = len(data['databases'])
        message = "Number of DBs: %.0f" % count
        message += performance_data(perf_data, [(count, "databases", warning, critical)])
        return check_levels(count, warning, critical, message)
    except Exception as e:
        return exit_with_general_critical(e)


def check_collections(con, warning, critical, perf_data=None):
    try:
        try:
            set_read_preference(con.admin)
            data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
        except:
            data = con.admin.command(son.SON([('listDatabases', 1)]))

        count = 0
        for db in data['databases']:
            dbase = con[db['name']]
            set_read_preference(dbase)
            count += len(dbase.collection_names())

        message = "Number of collections: %.0f" % count
        message += performance_data(perf_data, [(count, "collections", warning, critical)])
        return check_levels(count, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_all_databases_size(con, warning, critical, perf_data):
    warning = warning or 100
    critical = critical or 1000
    try:
        set_read_preference(con.admin)
        all_dbs_data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
    except:
        all_dbs_data = con.admin.command(son.SON([('listDatabases', 1)]))

    total_storage_size = 0
    message = ""
    perf_data_param = [()]
    for db in all_dbs_data['databases']:
        database = db['name']
        data = con[database].command('dbstats')
        storage_size = round(data['storageSize'] / 1024 / 1024, 1)
        message += "; Database %s size: %.0f MB" % (database, storage_size)
        perf_data_param.append((storage_size, database + "_database_size"))
        total_storage_size += storage_size

    perf_data_param[0] = (total_storage_size, "total_size", warning, critical)
    message += performance_data(perf_data, perf_data_param)
    message = "Total size: %.0f MB" % total_storage_size + message
    return check_levels(total_storage_size, warning, critical, message)


def check_database_size(con, database, warning, critical, perf_data):
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        data = con[database].command('dbstats')
        storage_size = data['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
            #perfdata += " database=%s" %(database)

        if storage_size >= critical:
            print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
            return 2
        elif storage_size >= warning:
            print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
            return 1
        else:
            print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
            return 0
    except Exception as e:
        return exit_with_general_critical(e)


def check_database_indexes(con, database, warning, critical, perf_data):
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        data = con[database].command('dbstats')
        index_size = data['indexSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)

        if index_size >= critical:
            print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 2
        elif index_size >= warning:
            print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 1
        else:
            print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 0
    except Exception as e:
        return exit_with_general_critical(e)


def check_collection_indexes(con, database, collection, warning, critical, perf_data):
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        data = con[database].command('collstats', collection)
        total_index_size = data['totalIndexSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)

        if total_index_size >= critical:
            print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
            return 2
        elif total_index_size >= warning:
            print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
            return 1
        else:
            print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
            return 0
    except Exception as e:
        return exit_with_general_critical(e)


def check_queues(con, warning, critical, perf_data):
    warning = warning or 10
    critical = critical or 30
    try:
        data = get_server_status(con)

        total_queues = float(data['globalLock']['currentQueue']['total'])
        readers_queues = float(data['globalLock']['currentQueue']['readers'])
        writers_queues = float(data['globalLock']['currentQueue']['writers'])
        message = "Current queue is : total = %d, readers = %d, writers = %d" % (total_queues, readers_queues, writers_queues)
        message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
        return check_levels(total_queues, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_queries_per_second(con, query_type, warning, critical, perf_data):
    warning = warning or 250
    critical = critical or 500

    if query_type not in ['insert', 'query', 'update', 'delete', 'getmore', 'command']:
        return exit_with_general_critical("The query type of '%s' is not valid" % query_type)

    try:
        db = con.local
        data = get_server_status(con)

        # grab the count
        num = int(data['opcounters'][query_type])

        # do the math
        last_count = db.nagios_check.find_one({'check': 'query_counts'})
        try:
            ts = int(time.time())
            diff_query = num - last_count['data'][query_type]['count']
            diff_ts = ts - last_count['data'][query_type]['ts']

            query_per_sec = float(diff_query) / float(diff_ts)

            # update the count now
            db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})

            message = "Queries / Sec: %f" % query_per_sec
            message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical)])
        except KeyError:
            # first run for this query type - store the current count, no rate yet
            query_per_sec = 0
            message = "First run of check.. no data"
            db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
        except TypeError:
            # no tracking document exists at all yet - insert it
            query_per_sec = 0
            message = "First run of check.. no data"
            db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})

        return check_levels(query_per_sec, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_oplog(con, warning, critical, perf_data):
    """ Checking the oplog time window - how much time the entries currently saved
    in the oplog collection span.
    defaults:
        critical 4 hours
        warning 24 hours
    those can be changed as usual with -C and -W parameters"""
    warning = warning or 24
    critical = critical or 4
    try:
        db = con.local
        ol = db.system.namespaces.find_one({"name": "local.oplog.rs"})
        if ol is not None:
            oplog = "oplog.rs"
        else:
            ol = db.system.namespaces.find_one({"name": "local.oplog.$main"})
            if ol is not None:
                oplog = "oplog.$main"
            else:
                message = "neither master/slave nor replica set replication detected"
                return check_levels(None, warning, critical, message)

        try:
            set_read_preference(con.admin)
            data = con.local.command(pymongo.son_manipulator.SON([('collstats', oplog)]))
        except:
            data = con.local.command(son.SON([('collstats', oplog)]))

        ol_size = data['size']
        ol_storage_size = data['storageSize']
        ol_used_storage = int(float(ol_size) / ol_storage_size * 100 + 1)
        ol = con.local[oplog]
        firstc = ol.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts']
        lastc = ol.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts']
        time_in_oplog = (lastc.as_datetime() - firstc.as_datetime())
        message = "Oplog saves " + str(time_in_oplog) + " %d%% used" % ol_used_storage
        try:  # total_seconds() is available starting with python2.7
            hours_in_oplog = time_in_oplog.total_seconds() / 60 / 60
        except:
            hours_in_oplog = float(time_in_oplog.seconds + time_in_oplog.days * 24 * 3600) / 60 / 60
        approx_level = hours_in_oplog * 100 / ol_used_storage
        message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
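        # check_levels() treats higher values as worse, but a short oplog window
        # is the bad case here, so the value and thresholds are negated to
        # invert the comparison (e.g. -3 hours >= -4 hours triggers critical).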
        return check_levels(-approx_level, -warning, -critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_journal_commits_in_wl(con, warning, critical, perf_data):
    """  Checking the number of commits which occurred in the db's write lock.
Most commits are performed outside of this lock; commits made while holding the write lock are undesirable.
Under very high write situations it is normal for this value to be nonzero.  """

    warning = warning or 10
    critical = critical or 40
    try:
        data = get_server_status(con)
        j_commits_in_wl = data['dur']['commitsInWriteLock']
        message = "Journal commits in DB write lock : %d" % j_commits_in_wl
        message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
        return check_levels(j_commits_in_wl, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_journaled(con, warning, critical, perf_data):
    """ Checking the average amount of data in megabytes written to the recovery log in the last four seconds"""

    warning = warning or 20
    critical = critical or 40
    try:
        data = get_server_status(con)
        journaled = data['dur']['journaledMB']
        message = "Journaled : %.2f MB" % journaled
        message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
        return check_levels(journaled, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_write_to_datafiles(con, warning, critical, perf_data):
    """    Checking the average amount of data in megabytes written to the database's data files in the last four seconds.
As these writes are already journaled, they can occur lazily, and thus the number indicated here may be lower
than the amount physically written to disk."""
    warning = warning or 20
    critical = critical or 40
    try:
        data = get_server_status(con)
        writes = data['dur']['writeToDataFilesMB']
        message = "Write to data files : %.2f MB" % writes
        message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
        return check_levels(writes, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def get_opcounters(data, opcounters_name, host):
    try:
        insert = data[opcounters_name]['insert']
        query = data[opcounters_name]['query']
        update = data[opcounters_name]['update']
        delete = data[opcounters_name]['delete']
        getmore = data[opcounters_name]['getmore']
        command = data[opcounters_name]['command']
    except KeyError as e:
        return 0, [0] * 100
    total_commands = insert + query + update + delete + getmore + command
    new_vals = [total_commands, insert, query, update, delete, getmore, command]
    return maintain_delta(new_vals, host, opcounters_name)
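
# maintain_delta() prepends a timestamp, so the delta vector returned above is
# [seconds_elapsed, total, insert, query, update, delete, getmore, command];
# check_opcounters() below indexes into delta[1:] accordingly.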


def check_opcounters(con, host, warning, critical, perf_data):
    """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
    warning = warning or 10000
    critical = critical or 15000

    data = get_server_status(con)
    err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
    err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
    if err1 == 0 and err2 == 0:
        delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
        delta[0] = delta_opcounters[0]  # only the time delta shouldn't be summarized
        per_minute_delta = [int(x / delta[0] * 60) for x in delta[1:]]
        message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
        message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
                    (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
                    (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
        return check_levels(per_minute_delta[0], warning, critical, message)
    else:
        return exit_with_general_critical("problem reading data from temp file")


def check_current_lock(con, host, warning, critical, perf_data):
    """ A function to get current lock percentage and not a global one, as check_lock function does"""
    warning = warning or 10
    critical = critical or 30
    data = get_server_status(con)

    lockTime = float(data['globalLock']['lockTime'])
    totalTime = float(data['globalLock']['totalTime'])

    err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
    if err == 0:
        lock_percentage = delta[2] / delta[1] * 100     # lockTime/totalTime*100
        message = "Current Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)
    else:
        return exit_with_general_warning("problem reading data from temp file")


# NOTE: this delta-based definition is shadowed by the sampling-based
# check_page_faults(con, sample_time, ...) defined further below, which is the
# one main() actually calls.
def check_page_faults(con, host, warning, critical, perf_data):
    """ A function to get page_faults per second from the system"""
    warning = warning or 10
    critical = critical or 30
    data = get_server_status(con)

    try:
        page_faults = float(data['extra_info']['page_faults'])
    except:
        # page_faults unsupported on the underlying system
        return exit_with_general_critical("page_faults unsupported on the underlying system")

    err, delta = maintain_delta([page_faults], host, "page_faults")
    if err == 0:
        page_faults_ps = delta[1] / delta[0]
        message = "Page faults : %.2f ps" % page_faults_ps
        message += performance_data(perf_data, [("%.2f" % page_faults_ps, "page_faults_ps", warning, critical)])
        return check_levels(page_faults_ps, warning, critical, message)
    else:
        return exit_with_general_warning("problem reading data from temp file")


def check_asserts(con, host, warning, critical, perf_data):
    """ A function to get asserts from the system"""
    warning = warning or 1
    critical = critical or 10
    data = get_server_status(con)

    asserts = data['asserts']

    #{ "regular" : 0, "warning" : 6, "msg" : 0, "user" : 12, "rollovers" : 0 }
    regular = asserts['regular']
    warning_asserts = asserts['warning']
    msg = asserts['msg']
    user = asserts['user']
    rollovers = asserts['rollovers']

    err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")

    if err == 0:
        if delta[5] != 0:
            # the number of rollovers increased
            warning = -1  # no matter the metrics this situation should raise a warning
            # if this is a normal rollover the warning will not appear again, but if there are a lot of asserts
            # the warning will stay for a long period of time
            # although this is not a usual situation

        regular_ps = delta[1] / delta[0]
        warning_ps = delta[2] / delta[0]
        msg_ps = delta[3] / delta[0]
        user_ps = delta[4] / delta[0]
        rollovers_ps = delta[5] / delta[0]
        total_ps = regular_ps + warning_ps + msg_ps + user_ps
        message = "Total asserts : %.2f ps" % total_ps
        message += performance_data(perf_data, [(total_ps, "asserts_ps", warning, critical), (regular_ps, "regular"),
                    (warning_ps, "warning"), (msg_ps, "msg"), (user_ps, "user")])
        return check_levels(total_ps, warning, critical, message)
    else:
        return exit_with_general_warning("problem reading data from temp file")


def get_stored_primary_server_name(db):
    """ get the stored primary server name from db. """
    if "last_primary_server" in db.collection_names():
        stored_primary_server = db.last_primary_server.find_one()["server"]
    else:
        stored_primary_server = None

    return stored_primary_server


def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
    """ A function to check if the primary server of a replica set has changed """
    if warning is None and critical is None:
        warning = 1
    warning = warning or 2
    critical = critical or 2

    primary_status = 0
    message = "Primary server has not changed"
    db = con["nagios"]
    data = get_server_status(con)
    if replicaset != data['repl'].get('setName'):
        message = "Replica set requested: %s differs from the one found: %s" % (replicaset, data['repl'].get('setName'))
        primary_status = 2
        return check_levels(primary_status, warning, critical, message)
    current_primary = data['repl'].get('primary')
    saved_primary = get_stored_primary_server_name(db)
    if current_primary is None:
        current_primary = "None"
    if saved_primary is None:
        saved_primary = "None"
    if current_primary != saved_primary:
        last_primary_server_record = {"server": current_primary}
        db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
        message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
        primary_status = 1
    return check_levels(primary_status, warning, critical, message)


def check_page_faults(con, sample_time, warning, critical, perf_data):
    warning = warning or 10
    critical = critical or 20
    try:
        try:
            set_read_preference(con.admin)
            data1 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
            time.sleep(sample_time)
            data2 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
        except:
            data1 = con.admin.command(son.SON([('serverStatus', 1)]))
            time.sleep(sample_time)
            data2 = con.admin.command(son.SON([('serverStatus', 1)]))

        try:
            # on linux servers only
            page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
        except KeyError:
            print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
            sys.exit(1)

        message = "Page Faults: %i" % (page_faults)

        message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
        return check_levels(page_faults, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def chunks_balance(con, database, collection, warning, critical):
    warning = warning or 10
    critical = critical or 20
    nsfilter = database + "." + collection
    try:
        try:
            set_read_preference(con.admin)
            col = con.config.chunks
            nscount = col.find({"ns": nsfilter}).count()
            shards = col.distinct("shard")

        except:
            print("WARNING - Can't get chunks infos from MongoDB")
            sys.exit(1)

        if nscount == 0:
            print("WARNING - Namespace %s is not sharded" % (nsfilter))
            sys.exit(1)

        avgchunksnb = nscount / len(shards)
        warningnb = avgchunksnb * warning / 100
        criticalnb = avgchunksnb * critical / 100

        for shard in shards:
            delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
            message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)

            if delta >= criticalnb and delta > 0:
                print("CRITICAL - Chunks not well balanced " + message)
                sys.exit(2)
            elif delta >= warningnb and delta > 0:
                print("WARNING - Chunks not well balanced " + message)
                sys.exit(1)

        print("OK - Chunks well balanced across shards")
        sys.exit(0)

    except Exception as e:
        sys.exit(exit_with_general_critical(e))


def check_connect_primary(con, warning, critical, perf_data):
    warning = warning or 3
    critical = critical or 6

    try:
        try:
            set_read_preference(con.admin)
            data = con.admin.command(pymongo.son_manipulator.SON([('isMaster', 1)]))
        except:
            data = con.admin.command(son.SON([('isMaster', 1)]))

        if data['ismaster']:
            print("OK - This server is primary")
            return 0

        phost = data['primary'].split(':')[0]
        pport = int(data['primary'].split(':')[1])
        start = time.time()

        err, con = mongo_connect(phost, pport)
        if err != 0:
            return err

        pconn_time = time.time() - start
        pconn_time = round(pconn_time, 0)
        message = "Connection to primary server " + data['primary'] + " took %i seconds" % pconn_time
        message += performance_data(perf_data, [(pconn_time, "connection_time", warning, critical)])

        return check_levels(pconn_time, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def check_collection_state(con, database, collection):
    try:
        con[database][collection].find_one()
        print("OK - Collection %s.%s is reachable" % (database, collection))
        return 0

    except Exception as e:
        return exit_with_general_critical(e)


def check_row_count(con, database, collection, warning, critical, perf_data):
    try:
        count = con[database][collection].count()
        message = "Row count: %i" % (count)
        message += performance_data(perf_data, [(count, "row_count", warning, critical)])

        return check_levels(count, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)


def build_file_name(host, action):
    # done this way so it will work when run independently and from shell
    module_name = re.match(r'(.*//*)*(.*)\..*', __file__).group(2)
    return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
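
# Illustrative (assuming this file is named check_mongodb.py):
#   build_file_name("127.0.0.1", "asserts") -> "/tmp/check_mongodb_data/127.0.0.1-asserts.data"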


def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)


def write_values(file_name, string):
    f = None
    try:
        f = open(file_name, 'w')
    except IOError as e:
        # try creating the containing directory first
        if (e.errno == 2):
            ensure_dir(file_name)
            f = open(file_name, 'w')
        else:
            raise
    f.write(string)
    f.close()
    return 0


def read_values(file_name):
    data = None
    try:
        f = open(file_name, 'r')
        data = f.read()
        f.close()
        return 0, data
    except IOError as e:
        if (e.errno == 2):
            # no previous data
            return 1, ''
        # other I/O error: return empty data so callers can still split it
        return 2, ''
    except Exception as e:
        return 2, ''


def calc_delta(old, new):
    delta = []
    if (len(old) != len(new)):
        raise Exception("unequal number of parameters")
    for i in range(0, len(old)):
        val = float(new[i]) - float(old[i])
        if val < 0:
            # the counter was reset (e.g. mongod restart); fall back to the raw new value
            val = new[i]
        delta.append(val)
    return 0, delta
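
# Illustrative: calc_delta([100, 5], [160, 7]) -> (0, [60.0, 2.0])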


def maintain_delta(new_vals, host, action):
    file_name = build_file_name(host, action)
    err, data = read_values(file_name)
    old_vals = data.split(';')
    new_vals = [str(int(time.time()))] + new_vals
    delta = None
    try:
        err, delta = calc_delta(old_vals, new_vals)
    except:
        err = 2
    write_res = write_values(file_name, ";".join(str(x) for x in new_vals))
    return err + write_res, delta
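
# Illustrative: with "1000;50" stored in the state file, calling
# maintain_delta([70], host, action) at t=1060 returns (0, [60.0, 20.0])
# and rewrites the file as "1060;70".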


def replication_get_time_diff(con):
    col = 'oplog.rs'
    local = con.local
    ol = local.system.namespaces.find_one({"name": "local.oplog.$main"})
    if ol:
        col = 'oplog.$main'
    firstc = local[col].find().sort("$natural", 1).limit(1)
    lastc = local[col].find().sort("$natural", -1).limit(1)
    first = next(firstc)
    last = next(lastc)
    tfirst = first["ts"]
    tlast = last["ts"]
    delta = tlast.time - tfirst.time
    return delta

#
# main app
#
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
