1#!python
2# Copyright (c) gocept gmbh & co. kg
3# See also LICENSE.txt
4
5"""haproxy log check for request time and error rate.
6
7This check shows an advanced programming technique: we allow the user to
8define the thresholds dynamically. You can specify a list of conditions
9like:
10
11* the N1th percentile of t_tot must match range R1
12* the N2th percentile of t_tot must match range R2
13
14Implementation-wise, the command line parameter "percentiles" is used to
15compute both metric and context names. The default is to check for the
50th and 95th percentile. The `MultiArg` class is used to specify sets
of thresholds. It has the nice property of filling up missing values, so
the user is free to specify as many or as few thresholds as desired.
19"""
20
21import argparse
22import itertools
23import nagiosplugin
24import numpy
25import re
26
27
class HAProxyLog(nagiosplugin.Resource):
    """haproxy.log parser.

    Goes through a haproxy log file and extracts total request time
    (t_tot) and error status for each request. The error status is used
    to compute the error rate.

    :param logfile: path of the haproxy log file to scan
    :param statefile: path of the cookie file in which the last log file
        position is saved between runs
    :param percentiles: iterable of percentile specs (strings or
        numbers, e.g. ``['50', '99.9']``); one ``ttot<N>`` metric is
        emitted per entry
    """

    # Captures the last timer field (total time) and the HTTP status
    # code. haproxy logs "-1" for timers of phases that never happened
    # (aborted requests) and may prefix the total time with "+", so both
    # are accepted here -- otherwise exactly the failed requests would
    # be dropped and the error rate underestimated.
    r_logline = re.compile(
        r'haproxy.*: [0-9.:]+ \[\S+\] .* '
        r'-?\d+/-?\d+/-?\d+/-?\d+/\+?(\d+) (\d\d\d) ')

    def __init__(self, logfile, statefile, percentiles):
        self.logfile = logfile
        self.statefile = statefile
        self.percentiles = percentiles

    def parse_log(self):
        """Yields ttot and error status for each log line.

        Lines that do not match `r_logline` are skipped. A request
        counts as an error unless its status code starts with 2 or 3.
        """
        cookie = nagiosplugin.Cookie(self.statefile)
        with nagiosplugin.LogTail(self.logfile, cookie) as lf:
            for line in lf:
                # Log lines can contain arbitrary bytes (e.g. in request
                # URLs); an undecodable byte must not abort the check.
                match = self.r_logline.search(line.decode(errors='replace'))
                if not match:
                    continue
                ttot, stat = match.groups()
                err = not stat.startswith(('2', '3'))
                yield int(ttot), err

    def probe(self):
        """Computes error rate and t_tot percentiles.

        Returns a list of metrics: one ``ttot<N>`` metric (in seconds)
        per configured percentile if at least one request was seen,
        followed by ``error_rate`` (in percent) and ``request_total``.
        """
        d = numpy.fromiter(self.parse_log(), dtype=[
            ('ttot', numpy.int32), ('err', numpy.uint16)])
        requests = len(d)
        metrics = []
        if requests:
            for pct in self.percentiles:
                # float() instead of int() so that fractional
                # percentiles such as 99.9 are accepted. The logged
                # value is divided by 1000 to report seconds.
                metrics.append(nagiosplugin.Metric(
                    'ttot%s' % pct, numpy.percentile(
                        d['ttot'], float(pct)) / 1000.0, 's', 0))
        # Sum first, divide once: avoids a temporary array and the
        # accumulated rounding error of adding many small fractions.
        error_rate = (100.0 * numpy.sum(d['err']) / requests
                      if requests else 0)
        metrics += [nagiosplugin.Metric('error_rate', error_rate, '%', 0, 100),
                    nagiosplugin.Metric('request_total', requests, min=0,
                                        context='default')]
        return metrics
73
74
def parse_args(argv=None):
    """Parse the command line of the check.

    :param argv: list of argument strings to parse; when omitted,
        argparse falls back to ``sys.argv[1:]``
    :returns: the populated :class:`argparse.Namespace`
    """
    argp = argparse.ArgumentParser()
    argp.add_argument('logfile')
    argp.add_argument('--ew', '--error-warning', metavar='RANGE', default='',
                      help='warning range for the error rate in percent')
    argp.add_argument('--ec', '--error-critical', metavar='RANGE', default='',
                      help='critical range for the error rate in percent')
    # argparse runs string defaults through `type`, so the empty default
    # also ends up as a MultiArg that can be indexed per percentile.
    argp.add_argument('--tw', '--ttot-warning', metavar='RANGE[,RANGE,...]',
                      type=nagiosplugin.MultiArg, default='',
                      help='warning range(s) for the total time in '
                      'seconds, one per percentile')
    argp.add_argument('--tc', '--ttot-critical', metavar='RANGE[,RANGE,...]',
                      type=nagiosplugin.MultiArg, default='',
                      help='critical range(s) for the total time in '
                      'seconds, one per percentile')
    argp.add_argument('-p', '--percentiles', metavar='N,N,...',
                      default='50,95', help='check Nth percentiles of '
                      'total time (default: %(default)s)')
    argp.add_argument('-v', '--verbose', action='count', default=0,
                      help='increase output verbosity (use up to 3 times)')
    # type=int makes argparse reject non-numeric timeouts with a usage
    # error instead of handing an unvalidated string downstream.
    argp.add_argument('-t', '--timeout', type=int, default=30,
                      help='abort execution after TIMEOUT seconds')
    argp.add_argument('-s', '--state-file', default='check_haproxy_log.state',
                      help='cookie file to save last log file position '
                      '(default: "%(default)s")')
    return argp.parse_args(argv)
95
96
@nagiosplugin.guarded
def main():
    """Assemble the check from the command line arguments and run it.

    One ``ttot<N>`` context is registered per requested percentile; its
    warning and critical ranges are taken from the matching positions of
    the ``--tw`` and ``--tc`` arguments.
    """
    args = parse_args()
    # Tolerate blanks and stray commas ("50, 95," -> ['50', '95']) so
    # they end up neither in metric names nor as empty percentiles.
    percentiles = [pct.strip() for pct in args.percentiles.split(',')
                   if pct.strip()]
    check = nagiosplugin.Check(
        HAProxyLog(args.logfile, args.state_file, percentiles),
        nagiosplugin.ScalarContext('error_rate', args.ew, args.ec))
    for i, pct in enumerate(percentiles):
        check.add(nagiosplugin.ScalarContext(
            'ttot%s' % pct, args.tw[i], args.tc[i],
            'total time (%s.pct) is {valueunit}' % pct))
    check.main(args.verbose, args.timeout)
109
# Run the check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
112