1# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2#  This source code is licensed under both the GPLv2 (found in the
3#  COPYING file in the root directory) and Apache 2.0 License
4#  (found in the LICENSE.Apache file in the root directory).
5
6from abc import ABC, abstractmethod
7from calendar import timegm
8from enum import Enum
9import glob
10import re
11import time
12
13
# Sentinel column-family name assigned to a log record that cannot be
# attributed to any specific column family (see Log.__init__ below).
NO_COL_FAMILY = 'DB_WIDE'
15
16
class DataSource(ABC):
    """Abstract base for a source of RocksDB diagnostic data.

    Subclasses (e.g. DatabaseLogs below) scan their data and record, on
    each satisfied condition, why it was triggered via the condition's
    set_trigger() method.
    """

    class Type(Enum):
        # The kinds of data source handled by the advisor.
        LOG = 1
        DB_OPTIONS = 2
        TIME_SERIES = 3

    def __init__(self, type):
        # type: a DataSource.Type value identifying this source's kind.
        # NOTE(review): the parameter name shadows the builtin 'type';
        # kept as-is to preserve the keyword interface for callers.
        self.type = type

    @abstractmethod
    def check_and_trigger_conditions(self, conditions):
        # Scan this source and set triggers on any condition that is met.
        pass
29
30
31class Log:
32    @staticmethod
33    def is_new_log(log_line):
34        # The assumption is that a new log will start with a date printed in
35        # the below regex format.
36        date_regex = '\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}'
37        return re.match(date_regex, log_line)
38
39    def __init__(self, log_line, column_families):
40        token_list = log_line.strip().split()
41        self.time = token_list[0]
42        self.context = token_list[1]
43        self.message = " ".join(token_list[2:])
44        self.column_family = None
45        # example log for 'default' column family:
46        # "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
47        # [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
48        for col_fam in column_families:
49            search_for_str = '\[' + col_fam + '\]'
50            if re.search(search_for_str, self.message):
51                self.column_family = col_fam
52                break
53        if not self.column_family:
54            self.column_family = NO_COL_FAMILY
55
56    def get_human_readable_time(self):
57        # example from a log line: '2018/07/25-11:25:45.782710'
58        return self.time
59
60    def get_column_family(self):
61        return self.column_family
62
63    def get_context(self):
64        return self.context
65
66    def get_message(self):
67        return self.message
68
69    def append_message(self, remaining_log):
70        self.message = self.message + '\n' + remaining_log.strip()
71
72    def get_timestamp(self):
73        # example: '2018/07/25-11:25:45.782710' will be converted to the GMT
74        # Unix timestamp 1532517945 (note: this method assumes that self.time
75        # is in GMT)
76        hr_time = self.time + 'GMT'
77        timestamp = timegm(time.strptime(hr_time, "%Y/%m/%d-%H:%M:%S.%f%Z"))
78        return timestamp
79
80    def __repr__(self):
81        return (
82            'time: ' + self.time + '; context: ' + self.context +
83            '; col_fam: ' + self.column_family +
84            '; message: ' + self.message
85        )
86
87
class DatabaseLogs(DataSource):
    """DataSource over the RocksDB LOG files matching a path prefix."""

    def __init__(self, logs_path_prefix, column_families):
        super().__init__(DataSource.Type.LOG)
        self.logs_path_prefix = logs_path_prefix
        self.column_families = column_families

    def trigger_conditions_for_log(self, conditions, log):
        # For a LogCondition object, trigger is:
        # Dict[column_family_name, List[Log]]. This explains why the condition
        # was triggered and for which column families.
        for cond in conditions:
            if re.search(cond.regex, log.get_message(), re.IGNORECASE):
                trigger = cond.get_trigger() or {}
                trigger.setdefault(log.get_column_family(), []).append(log)
                cond.set_trigger(trigger)

    def check_and_trigger_conditions(self, conditions):
        """Parse every LOG file under the prefix and trigger conditions.

        Files whose names contain 'old' are skipped (rotated logs).
        """
        for file_name in glob.glob(self.logs_path_prefix + '*'):
            # TODO(poojam23): find a way to distinguish between log files
            # - generated in the current experiment but are labeled 'old'
            # because they LOGs exceeded the file size limit  AND
            # - generated in some previous experiment that are also labeled
            # 'old' and were not deleted for some reason
            if re.search('old', file_name, re.IGNORECASE):
                continue
            with open(file_name, 'r') as db_logs:
                new_log = None
                for line in db_logs:
                    if Log.is_new_log(line):
                        if new_log:
                            self.trigger_conditions_for_log(
                                conditions, new_log
                            )
                        new_log = Log(line, self.column_families)
                    elif new_log:
                        # To account for logs split into multiple lines.
                        new_log.append_message(line)
                    # else: a continuation line before any log header —
                    # previously this crashed with AttributeError on None;
                    # such stray lines are now skipped.
            # Check for the last log in the file.
            if new_log:
                self.trigger_conditions_for_log(conditions, new_log)
132