1 /**
2  * Copyright (c) 2007-2017, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * @file log_format_impls.cc
30  */
31 
32 #include "config.h"
33 
34 #include <stdio.h>
35 
36 #include <utility>
37 #include <algorithm>
38 
39 #include "pcrepp/pcrepp.hh"
40 #include "sql_util.hh"
41 #include "log_format.hh"
42 #include "log_vtab_impl.hh"
43 #include "base/opt_util.hh"
44 #include "base/injector.bind.hh"
45 #include "yajlpp/yajlpp.hh"
46 #include "formats/logfmt/logfmt.parser.hh"
47 
48 using namespace std;
49 
50 static pcrepp RDNS_PATTERN("^(?:com|net|org|edu|[a-z][a-z])"
51                            "(\\.\\w+)+(.+)");
52 
53 /**
54  * Attempt to scrub a reverse-DNS string.
55  *
56  * @param  str The string to scrub.  If the string looks like a reverse-DNS
57  *   string, the leading components of the name will be reduced to a single
58  *   letter.  For example, "com.example.foo" will be reduced to "c.e.foo".
59  * @return     The scrubbed version of the input string or the original string
60  *   if it is not a reverse-DNS string.
61  */
scrub_rdns(const string & str)62 static string scrub_rdns(const string &str)
63 {
64     pcre_context_static<30> context;
65     pcre_input input(str);
66     string     retval;
67 
68     if (RDNS_PATTERN.match(context, input)) {
69         pcre_context::capture_t *cap;
70 
71         cap = context.begin();
72         for (int index = 0; index < cap->c_begin; index++) {
73             if (index == 0 || str[index - 1] == '.') {
74                 if (index > 0) {
75                     retval.append(1, '.');
76                 }
77                 retval.append(1, str[index]);
78             }
79         }
80         retval += input.get_substr(cap);
81         retval += input.get_substr(cap + 1);
82     }
83     else {
84         retval = str;
85     }
86     return retval;
87 }
88 
89 class generic_log_format : public log_format {
scrub_pattern()90     static pcrepp &scrub_pattern()
91     {
92         static pcrepp SCRUB_PATTERN(
93             "\\d+-(\\d+-\\d+ \\d+:\\d+:\\d+(?:,\\d+)?:)\\w+:(.*)");
94 
95         return SCRUB_PATTERN;
96     }
97 
get_pcre_log_formats()98     static pcre_format *get_pcre_log_formats() {
99         static pcre_format log_fmt[] = {
100             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>@[0-9a-zA-Z]{16,24})(.*)"),
101             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\dTZ: +/\\-,\\.-]+)([^:]+)"),
102             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:+/\\.-]+) \\[\\w (.*)"),
103             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:,/\\.-]+) (.*)"),
104             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w:,/\\.-]+) - (.*)"),
105             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: \\.,/-]+) - (.*)"),
106             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: \\.,/-]+)\\[[^\\]]+\\](.*)"),
107             pcre_format("^(?:\\*\\*\\*\\s+)?(?<timestamp>[\\w: \\.,/-]+) (.*)"),
108 
109             pcre_format(R"(^(?:\*\*\*\s+)?\[(?<timestamp>[\w: \.,+/-]+)\]\s*(\w+):?)"),
110             pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: \\.,+/-]+)\\] (.*)"),
111             pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: \\.,+/-]+)\\] \\[(\\w+)\\]"),
112             pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: \\.,+/-]+)\\] \\w+ (.*)"),
113             pcre_format("^(?:\\*\\*\\*\\s+)?\\[(?<timestamp>[\\w: ,+/-]+)\\] \\(\\d+\\) (.*)"),
114 
115             pcre_format()
116         };
117 
118         return log_fmt;
119     };
120 
get_pattern_regex(uint64_t line_number) const121     std::string get_pattern_regex(uint64_t line_number) const {
122         int pat_index = this->pattern_index_for_line(line_number);
123         return get_pcre_log_formats()[pat_index].name;
124     }
125 
get_name() const126     const intern_string_t get_name() const {
127         return intern_string::lookup("generic_log");
128     };
129 
scrub(string & line)130     void scrub(string &line)
131     {
132         pcre_context_static<30> context;
133         pcre_input pi(line);
134         string     new_line;
135 
136         if (scrub_pattern().match(context, pi)) {
137             pcre_context::capture_t *cap;
138 
139             for (cap = context.begin(); cap != context.end(); cap++) {
140                 new_line += scrub_rdns(pi.get_substr(cap));
141             }
142 
143             line = new_line;
144         }
145     };
146 
scan(logfile & lf,vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)147     scan_result_t scan(logfile &lf,
148                        vector<logline> &dst,
149                        const line_info &li,
150                        shared_buffer_ref &sbr)
151     {
152         struct exttm log_time;
153         struct timeval log_tv;
154         pcre_context::capture_t ts, level;
155         const char *last_pos;
156 
157         if ((last_pos = this->log_scanf(
158                 dst.size(),
159                 sbr.get_data(),
160                 sbr.length(),
161                 get_pcre_log_formats(),
162                 nullptr,
163                 &log_time,
164                 &log_tv,
165 
166                 &ts,
167                 &level)) != nullptr) {
168             const char *level_str = &sbr.get_data()[level.c_begin];
169             log_level_t level_val = string2level(level_str, level.length());
170 
171             if (!((log_time.et_flags & ETF_DAY_SET) &&
172                   (log_time.et_flags & ETF_MONTH_SET) &&
173                   (log_time.et_flags & ETF_YEAR_SET))) {
174                 this->check_for_new_year(dst, log_time, log_tv);
175             }
176 
177             dst.emplace_back(li.li_file_range.fr_offset, log_tv, level_val);
178             return SCAN_MATCH;
179         }
180 
181         return SCAN_NO_MATCH;
182     };
183 
annotate(uint64_t line_number,shared_buffer_ref & line,string_attrs_t & sa,std::vector<logline_value> & values,bool annotate_module) const184     void annotate(uint64_t line_number, shared_buffer_ref &line, string_attrs_t &sa,
185                       std::vector<logline_value> &values, bool annotate_module) const
186     {
187         int pat_index = this->pattern_index_for_line(line_number);
188         pcre_format &fmt = get_pcre_log_formats()[pat_index];
189         struct line_range lr;
190         int prefix_len = 0;
191         pcre_input pi(line.get_data(), 0, line.length());
192         pcre_context_static<30> pc;
193 
194         if (!fmt.pcre.match(pc, pi)) {
195             return;
196         }
197 
198         lr.lr_start = pc[0]->c_begin;
199         lr.lr_end   = pc[0]->c_end;
200         sa.emplace_back(lr, &logline::L_TIMESTAMP);
201 
202         const char *level = &line.get_data()[pc[1]->c_begin];
203 
204         if (string2level(level, pc[1]->length(), true) == LEVEL_UNKNOWN) {
205             prefix_len = pc[0]->c_end;
206         }
207         else {
208             prefix_len = pc[1]->c_end;
209         }
210 
211         lr.lr_start = 0;
212         lr.lr_end   = prefix_len;
213         sa.emplace_back(lr, &logline::L_PREFIX);
214 
215         lr.lr_start = prefix_len;
216         lr.lr_end   = line.length();
217         sa.emplace_back(lr, &SA_BODY);
218     };
219 
specialized(int fmt_lock)220     shared_ptr<log_format> specialized(int fmt_lock)
221     {
222         return std::make_shared<generic_log_format>(*this);
223     };
224 };
225 
/**
 * Decode "\xHH" hexadecimal escape sequences in a character range.
 *
 * @param str The characters to decode; the range does not need to be
 *   NUL-terminated.
 * @param len The number of characters in str to process.
 * @return A copy of the input in which every well-formed "\xHH" sequence
 *   (a backslash, an 'x' and exactly two hexadecimal digits, all inside the
 *   first len characters) is replaced by the byte it names.  A backslash
 *   that does not start such a sequence is dropped and the characters that
 *   follow it are copied as-is.
 */
std::string from_escaped_string(const char *str, size_t len)
{
    // Value of a single hexadecimal digit, or -1 for any other character.
    // Decoding by hand keeps every read inside [str, str + len) and rejects
    // the whitespace, signs and short numbers that sscanf("%2x") accepts.
    const auto hex_value = [](char ch) -> int {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    };

    std::string retval;

    retval.reserve(len);
    for (size_t lpc = 0; lpc < len; lpc++) {
        if (str[lpc] != '\\') {
            retval.append(1, str[lpc]);
            continue;
        }

        // All four characters of the escape must be inside the range.
        if ((lpc + 3) < len && str[lpc + 1] == 'x') {
            const int hi = hex_value(str[lpc + 2]);
            const int lo = hex_value(str[lpc + 3]);

            if (hi >= 0 && lo >= 0) {
                retval.append(1, static_cast<char>((hi << 4) | lo));
                // Skip the "xHH" that was just consumed.
                lpc += 3;
            }
        }
        // Otherwise only the backslash itself is dropped.
    }

    return retval;
}
250 
251 nonstd::optional<const char *>
lnav_strnstr(const char * s,const char * find,size_t slen)252 lnav_strnstr(const char *s, const char *find, size_t slen)
253 {
254 	char c, sc;
255 	size_t len;
256 
257 	if ((c = *find++) != '\0') {
258 		len = strlen(find);
259 		do {
260 			do {
261 				if (slen < 1 || (sc = *s) == '\0') {
262                     return nonstd::nullopt;
263                 }
264 				--slen;
265 				++s;
266 			} while (sc != c);
267 			if (len > slen) {
268                 return nonstd::nullopt;
269             }
270 		} while (strncmp(s, find, len) != 0);
271 		s--;
272 	}
273 	return s;
274 }
275 
276 struct separated_string {
277     const char *ss_str;
278     size_t ss_len;
279     const char *ss_separator;
280     size_t ss_separator_len;
281 
separated_stringseparated_string282     separated_string(const char *str, size_t len)
283         : ss_str(str), ss_len(len), ss_separator(",") {
284         this->ss_separator_len = strlen(this->ss_separator);
285     };
286 
with_separatorseparated_string287     separated_string &with_separator(const char *sep) {
288         this->ss_separator = sep;
289         this->ss_separator_len = strlen(sep);
290         return *this;
291     };
292 
293     struct iterator {
294         const separated_string &i_parent;
295         const char *i_pos;
296         const char *i_next_pos;
297         size_t i_index;
298 
iteratorseparated_string::iterator299         iterator(const separated_string &ss, const char *pos)
300             : i_parent(ss), i_pos(pos), i_next_pos(pos), i_index(0) {
301             this->update();
302         };
303 
updateseparated_string::iterator304         void update() {
305             const separated_string &ss = this->i_parent;
306             auto next_field = lnav_strnstr(
307                 this->i_pos,
308                 ss.ss_separator,
309                 ss.ss_len - (this->i_pos - ss.ss_str));
310             if (next_field) {
311                 this->i_next_pos = next_field.value() + ss.ss_separator_len;
312             } else {
313                 this->i_next_pos = ss.ss_str + ss.ss_len;
314             }
315         };
316 
operator ++separated_string::iterator317         iterator &operator++() {
318             this->i_pos = this->i_next_pos;
319             this->update();
320             this->i_index += 1;
321 
322             return *this;
323         };
324 
operator *separated_string::iterator325         string_fragment operator*() {
326             const separated_string &ss = this->i_parent;
327             int end;
328 
329             if (this->i_next_pos < (ss.ss_str + ss.ss_len)) {
330                 end = this->i_next_pos - ss.ss_str - ss.ss_separator_len;
331             } else {
332                 end = this->i_next_pos - ss.ss_str;
333             }
334             return string_fragment(ss.ss_str, this->i_pos - ss.ss_str, end);
335         };
336 
operator ==separated_string::iterator337         bool operator==(const iterator &other) const {
338             return (&this->i_parent == &other.i_parent) &&
339                    (this->i_pos == other.i_pos);
340         };
341 
operator !=separated_string::iterator342         bool operator!=(const iterator &other) const {
343             return !(*this == other);
344         };
345 
indexseparated_string::iterator346         size_t index() const {
347             return this->i_index;
348         };
349     };
350 
beginseparated_string351     iterator begin() {
352         return {*this, this->ss_str};
353     };
354 
endseparated_string355     iterator end() {
356         return {*this, this->ss_str + this->ss_len};
357     };
358 };
359 
360 class bro_log_format : public log_format {
361 public:
362 
363     struct field_def {
364         logline_value_meta fd_meta;
365         std::string fd_collator;
366         int fd_numeric_index;
367 
field_defbro_log_format::field_def368         explicit field_def(const intern_string_t name, int col, log_format *format)
369             : fd_meta(name, value_kind_t::VALUE_TEXT, col, format),
370               fd_numeric_index(-1) {
371         };
372 
with_kindbro_log_format::field_def373         field_def &with_kind(value_kind_t kind,
374                              bool identifier = false,
375                              const std::string &collator = "") {
376             this->fd_meta.lvm_kind = kind;
377             this->fd_meta.lvm_identifier = identifier;
378             this->fd_collator = collator;
379             return *this;
380         };
381 
with_numeric_indexbro_log_format::field_def382         field_def &with_numeric_index(int index) {
383             this->fd_numeric_index = index;
384             return *this;
385         }
386     };
387 
bro_log_format()388     bro_log_format() {
389         this->lf_is_self_describing = true;
390         this->lf_time_ordered = false;
391     };
392 
get_name() const393     const intern_string_t get_name() const {
394         static const intern_string_t name(intern_string::lookup("bro"));
395 
396         return this->blf_format_name.empty() ? name : this->blf_format_name;
397     };
398 
clear()399     virtual void clear() {
400         this->log_format::clear();
401         this->blf_format_name.clear();
402         this->blf_field_defs.clear();
403     };
404 
scan_int(std::vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)405     scan_result_t scan_int(std::vector<logline> &dst,
406                            const line_info &li,
407                            shared_buffer_ref &sbr) {
408         static const intern_string_t STATUS_CODE = intern_string::lookup("bro_status_code");
409         static const intern_string_t TS = intern_string::lookup("bro_ts");
410         static const intern_string_t UID = intern_string::lookup("bro_uid");
411 
412         separated_string ss(sbr.get_data(), sbr.length());
413         struct timeval tv;
414         struct exttm tm;
415         bool found_ts = false;
416         log_level_t level = LEVEL_INFO;
417         uint8_t opid = 0;
418 
419         ss.with_separator(this->blf_separator.get());
420 
421         for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
422             if (iter.index() == 0 && *iter == "#close") {
423                 return SCAN_MATCH;
424             }
425 
426             if (iter.index() >= this->blf_field_defs.size()) {
427                 break;
428             }
429 
430             const auto &fd = this->blf_field_defs[iter.index()];
431 
432             if (TS == fd.fd_meta.lvm_name) {
433                 string_fragment sf = *iter;
434 
435                 if (this->lf_date_time.scan(sf.data(),
436                                             sf.length(),
437                                             nullptr,
438                                             &tm,
439                                             tv)) {
440                     this->lf_timestamp_flags = tm.et_flags;
441                     found_ts = true;
442                 }
443             } else if (STATUS_CODE == fd.fd_meta.lvm_name) {
444                 string_fragment sf = *iter;
445 
446                 if (!sf.empty() && sf[0] >= '4') {
447                     level = LEVEL_ERROR;
448                 }
449             } else if (UID == fd.fd_meta.lvm_name) {
450                 string_fragment sf = *iter;
451 
452                 opid = hash_str(sf.data(), sf.length());
453             }
454 
455             if (fd.fd_numeric_index >= 0) {
456                 switch (fd.fd_meta.lvm_kind) {
457                     case value_kind_t::VALUE_INTEGER:
458                     case value_kind_t::VALUE_FLOAT: {
459                         string_fragment sf = *iter;
460                         char field_copy[sf.length() + 1];
461                         double val;
462 
463                         if (sscanf(sf.to_string(field_copy), "%lf", &val) == 1) {
464                             this->lf_value_stats[fd.fd_numeric_index].add_value(val);
465                         }
466                         break;
467                     }
468                     default:
469                         break;
470                 }
471             }
472         }
473 
474         if (found_ts) {
475             dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0, opid);
476             return SCAN_MATCH;
477         } else {
478             return SCAN_NO_MATCH;
479         }
480     }
481 
scan(logfile & lf,std::vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)482     scan_result_t scan(logfile &lf,
483                        std::vector<logline> &dst,
484                        const line_info &li,
485                        shared_buffer_ref &sbr) {
486         static pcrepp SEP_RE(R"(^#separator\s+(.+))");
487 
488         if (!this->blf_format_name.empty()) {
489             return this->scan_int(dst, li, sbr);
490         }
491 
492         if (dst.empty() || dst.size() > 20 || sbr.empty() || sbr.get_data()[0] == '#') {
493             return SCAN_NO_MATCH;
494         }
495 
496         pcre_context_static<20> pc;
497         auto line_iter = dst.begin();
498         auto read_result = lf.read_line(line_iter);
499 
500         if (read_result.isErr()) {
501             return SCAN_NO_MATCH;
502         }
503 
504         auto line = read_result.unwrap();
505         pcre_input pi(line.get_data(), 0, line.length());
506 
507         if (!SEP_RE.match(pc, pi)) {
508             return SCAN_NO_MATCH;
509         }
510 
511         this->clear();
512 
513         string sep = from_escaped_string(pi.get_substr_start(pc[0]), pc[0]->length());
514         this->blf_separator = intern_string::lookup(sep);
515 
516         for (++line_iter; line_iter != dst.end(); ++line_iter) {
517             auto next_read_result = lf.read_line(line_iter);
518 
519             if (next_read_result.isErr()) {
520                 return SCAN_NO_MATCH;
521             }
522 
523             line = next_read_result.unwrap();
524             separated_string ss(line.get_data(), line.length());
525 
526             ss.with_separator(this->blf_separator.get());
527             auto iter = ss.begin();
528 
529             string_fragment directive = *iter;
530 
531             if (directive.empty() || directive[0] != '#') {
532                 continue;
533             }
534 
535             ++iter;
536             if (iter == ss.end()) {
537                 continue;
538             }
539 
540             if (directive == "#set_separator") {
541                 this->blf_set_separator = intern_string::lookup(*iter);
542             } else if (directive == "#empty_field") {
543                 this->blf_empty_field = intern_string::lookup(*iter);
544             } else if (directive == "#unset_field") {
545                 this->blf_unset_field = intern_string::lookup(*iter);
546             } else if (directive == "#path") {
547                 string path = to_string(*iter);
548                 char full_name[128];
549                 snprintf(full_name, sizeof(full_name), "bro_%s_log", path.c_str());
550                 this->blf_format_name = intern_string::lookup(full_name);
551             } else if (directive == "#fields") {
552                 do {
553                     this->blf_field_defs.emplace_back(
554                         intern_string::lookup("bro_" + sql_safe_ident(*iter)),
555                         this->blf_field_defs.size(),
556                         this);
557                     ++iter;
558                 } while (iter != ss.end());
559             } else if (directive == "#types") {
560                 static const char *KNOWN_IDS[] = {
561                     "bro_conn_uids",
562                     "bro_fuid",
563                     "bro_host",
564                     "bro_info_code",
565                     "bro_method",
566                     "bro_mime_type",
567                     "bro_orig_fuids",
568                     "bro_parent_fuid",
569                     "bro_proto",
570                     "bro_referrer",
571                     "bro_resp_fuids",
572                     "bro_service",
573                     "bro_status_code",
574                     "bro_uid",
575                     "bro_uri",
576                     "bro_user_agent",
577                     "bro_username",
578                 };
579 
580                 int numeric_count = 0;
581 
582                 do {
583                     string_fragment field_type = *iter;
584                     auto &fd = this->blf_field_defs[iter.index() - 1];
585 
586                     if (field_type == "time") {
587                         fd.with_kind(value_kind_t::VALUE_TIMESTAMP);
588                     } else if (field_type == "string") {
589                         bool ident = binary_search(begin(KNOWN_IDS), end(KNOWN_IDS), fd.fd_meta.lvm_name);
590                         fd.with_kind(value_kind_t::VALUE_TEXT, ident);
591                     } else if (field_type == "count") {
592                         bool ident = binary_search(begin(KNOWN_IDS), end(KNOWN_IDS), fd.fd_meta.lvm_name);
593                         fd.with_kind(value_kind_t::VALUE_INTEGER, ident)
594                           .with_numeric_index(numeric_count);
595                         numeric_count += 1;
596                     } else if (field_type == "bool") {
597                         fd.with_kind(value_kind_t::VALUE_BOOLEAN);
598                     } else if (field_type == "addr") {
599                         fd.with_kind(value_kind_t::VALUE_TEXT, true, "ipaddress");
600                     } else if (field_type == "port") {
601                         fd.with_kind(value_kind_t::VALUE_INTEGER, true);
602                     } else if (field_type == "interval") {
603                         fd.with_kind(value_kind_t::VALUE_FLOAT)
604                           .with_numeric_index(numeric_count);
605                         numeric_count += 1;
606                     }
607 
608                     ++iter;
609                 } while (iter != ss.end());
610 
611                 this->lf_value_stats.resize(numeric_count);
612             }
613         }
614 
615         if (!this->blf_format_name.empty() &&
616             !this->blf_separator.empty() &&
617             !this->blf_field_defs.empty()) {
618             dst.clear();
619             return this->scan_int(dst, li, sbr);
620         }
621 
622         this->blf_format_name.clear();
623         this->lf_value_stats.clear();
624 
625         return SCAN_NO_MATCH;
626     };
627 
annotate(uint64_t line_number,shared_buffer_ref & sbr,string_attrs_t & sa,std::vector<logline_value> & values,bool annotate_module) const628     void annotate(uint64_t line_number, shared_buffer_ref &sbr, string_attrs_t &sa,
629                       std::vector<logline_value> &values, bool annotate_module) const {
630         static const intern_string_t TS = intern_string::lookup("bro_ts");
631         static const intern_string_t UID = intern_string::lookup("bro_uid");
632 
633         separated_string ss(sbr.get_data(), sbr.length());
634 
635         ss.with_separator(this->blf_separator.get());
636 
637         for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
638             if (iter.index() >= this->blf_field_defs.size()) {
639                 return;
640             }
641 
642             const field_def &fd = this->blf_field_defs[iter.index()];
643             string_fragment sf = *iter;
644 
645             if (sf == this->blf_empty_field) {
646                 sf.clear();
647             } else if (sf == this->blf_unset_field) {
648                 sf.invalidate();
649             }
650 
651             auto lr = line_range(sf.sf_begin, sf.sf_end);
652 
653             if (fd.fd_meta.lvm_name == TS) {
654                 sa.emplace_back(lr, &logline::L_TIMESTAMP);
655             } else if (fd.fd_meta.lvm_name == UID) {
656                 sa.emplace_back(lr, &logline::L_OPID);
657             }
658 
659             if (lr.is_valid()) {
660                 values.emplace_back(fd.fd_meta, sbr, lr);
661             } else {
662                 values.emplace_back(fd.fd_meta);
663             }
664         }
665     };
666 
stats_for_value(const intern_string_t & name) const667     const logline_value_stats *stats_for_value(const intern_string_t &name) const {
668         const logline_value_stats *retval = nullptr;
669 
670         for (size_t lpc = 0; lpc < this->blf_field_defs.size(); lpc++) {
671             if (this->blf_field_defs[lpc].fd_meta.lvm_name == name) {
672                 if (this->blf_field_defs[lpc].fd_numeric_index < 0) {
673                     break;
674                 }
675                 retval = &this->lf_value_stats[this->blf_field_defs[lpc].fd_numeric_index];
676                 break;
677             }
678         }
679 
680         return retval;
681     };
682 
specialized(int fmt_lock=-1)683     std::shared_ptr<log_format> specialized(int fmt_lock = -1) {
684         return make_shared<bro_log_format>(*this);
685     };
686 
687     class bro_log_table : public log_format_vtab_impl {
688     public:
bro_log_table(const bro_log_format & format)689         bro_log_table(const bro_log_format &format)
690             : log_format_vtab_impl(format), blt_format(format) {
691 
692         }
693 
get_columns(vector<vtab_column> & cols) const694         void get_columns(vector<vtab_column> &cols) const override {
695             for (const auto &fd : this->blt_format.blf_field_defs) {
696                 std::pair<int, unsigned int> type_pair = log_vtab_impl::logline_value_to_sqlite_type(fd.fd_meta.lvm_kind);
697 
698                 cols.emplace_back(fd.fd_meta.lvm_name.to_string(), type_pair.first, fd.fd_collator, false, "", type_pair.second);
699             }
700         };
701 
get_foreign_keys(std::vector<std::string> & keys_inout) const702         void get_foreign_keys(std::vector<std::string> &keys_inout) const override {
703             this->log_vtab_impl::get_foreign_keys(keys_inout);
704 
705             for (const auto &fd : this->blt_format.blf_field_defs) {
706                 if (fd.fd_meta.lvm_identifier) {
707                     keys_inout.push_back(fd.fd_meta.lvm_name.to_string());
708                 }
709             }
710         }
711 
712         const bro_log_format &blt_format;
713     };
714 
get_tables()715     static map<intern_string_t, std::shared_ptr<bro_log_table>> &get_tables() {
716         static map<intern_string_t, std::shared_ptr<bro_log_table>> retval;
717 
718         return retval;
719     };
720 
get_vtab_impl() const721     std::shared_ptr<log_vtab_impl> get_vtab_impl() const {
722         if (this->blf_format_name.empty()) {
723             return nullptr;
724         }
725 
726         std::shared_ptr<bro_log_table> retval = nullptr;
727 
728         auto &tables = get_tables();
729         auto iter = tables.find(this->blf_format_name);
730         if (iter == tables.end()) {
731             retval = std::make_shared<bro_log_table>(*this);
732             tables[this->blf_format_name] = retval;
733         }
734 
735         return retval;
736     };
737 
get_subline(const logline & ll,shared_buffer_ref & sbr,bool full_message)738     void get_subline(const logline &ll,
739                      shared_buffer_ref &sbr,
740                      bool full_message) {
741     }
742 
743     intern_string_t blf_format_name;
744     intern_string_t blf_separator;
745     intern_string_t blf_set_separator;
746     intern_string_t blf_empty_field;
747     intern_string_t blf_unset_field;
748     vector<field_def> blf_field_defs;
749 
750 };
751 
752 struct ws_separated_string {
753     const char *ss_str;
754     size_t ss_len;
755 
ws_separated_stringws_separated_string756     explicit ws_separated_string(const char *str = nullptr, size_t len = -1)
757         : ss_str(str), ss_len(len) {
758     };
759 
760     struct iterator {
761         enum class state_t {
762             NORMAL,
763             QUOTED,
764         };
765 
766         const ws_separated_string &i_parent;
767         const char *i_pos;
768         const char *i_next_pos;
769         size_t i_index{0};
770         state_t i_state{state_t::NORMAL};
771 
iteratorws_separated_string::iterator772         iterator(const ws_separated_string &ss, const char *pos)
773             : i_parent(ss), i_pos(pos), i_next_pos(pos) {
774             this->update();
775         };
776 
updatews_separated_string::iterator777         void update() {
778             const auto &ss = this->i_parent;
779             bool done = false;
780 
781             while (!done && this->i_next_pos < (ss.ss_str + ss.ss_len)) {
782                 switch (this->i_state) {
783                     case state_t::NORMAL:
784                         if (*this->i_next_pos == '"') {
785                             this->i_state = state_t::QUOTED;
786                         } else if (isspace(*this->i_next_pos)) {
787                             done = true;
788                         }
789                         break;
790                     case state_t::QUOTED:
791                         if (*this->i_next_pos == '"') {
792                             this->i_state = state_t::NORMAL;
793                         }
794                         break;
795                 }
796                 if (!done) {
797                     this->i_next_pos += 1;
798                 }
799             }
800         };
801 
operator ++ws_separated_string::iterator802         iterator &operator++() {
803             const auto &ss = this->i_parent;
804 
805             this->i_pos = this->i_next_pos;
806             while (this->i_pos < (ss.ss_str + ss.ss_len) &&
807                    isspace(*this->i_pos)) {
808                 this->i_pos += 1;
809                 this->i_next_pos += 1;
810             }
811             this->update();
812             this->i_index += 1;
813 
814             return *this;
815         };
816 
operator *ws_separated_string::iterator817         string_fragment operator*() {
818             const auto &ss = this->i_parent;
819             int end = this->i_next_pos - ss.ss_str;
820 
821             return string_fragment(ss.ss_str, this->i_pos - ss.ss_str, end);
822         };
823 
operator ==ws_separated_string::iterator824         bool operator==(const iterator &other) const {
825             return (&this->i_parent == &other.i_parent) &&
826                    (this->i_pos == other.i_pos);
827         };
828 
operator !=ws_separated_string::iterator829         bool operator!=(const iterator &other) const {
830             return !(*this == other);
831         };
832 
indexws_separated_string::iterator833         size_t index() const {
834             return this->i_index;
835         };
836     };
837 
beginws_separated_string838     iterator begin() {
839         return {*this, this->ss_str};
840     };
841 
endws_separated_string842     iterator end() {
843         return {*this, this->ss_str + this->ss_len};
844     };
845 };
846 
847 class w3c_log_format : public log_format {
848 public:
849 
850     struct field_def {
851         const intern_string_t fd_name;
852         logline_value_meta fd_meta;
853         std::string fd_collator;
854         int fd_numeric_index;
855 
field_defw3c_log_format::field_def856         explicit field_def(const intern_string_t name)
857             : fd_name(name),
858               fd_meta(intern_string::lookup(sql_safe_ident(name.to_string_fragment())),
859                       value_kind_t::VALUE_TEXT),
860               fd_numeric_index(-1) {
861         };
862 
field_defw3c_log_format::field_def863         field_def(const intern_string_t name, logline_value_meta meta)
864             : fd_name(name), fd_meta(meta), fd_numeric_index(-1) {
865         }
866 
field_defw3c_log_format::field_def867         field_def(int col, const char *name, value_kind_t kind, bool ident = false, std::string coll = "")
868             : fd_name(intern_string::lookup(name)),
869               fd_meta(intern_string::lookup(sql_safe_ident(string_fragment(name))),
870                       kind,
871                       col),
872               fd_collator(std::move(coll)),
873               fd_numeric_index(-1) {
874             this->fd_meta.lvm_identifier = ident;
875         }
876 
with_kindw3c_log_format::field_def877         field_def &with_kind(value_kind_t kind,
878                              bool identifier = false,
879                              const std::string &collator = "") {
880             this->fd_meta.lvm_kind = kind;
881             this->fd_meta.lvm_identifier = identifier;
882             this->fd_collator = collator;
883             return *this;
884         };
885 
with_numeric_indexw3c_log_format::field_def886         field_def &with_numeric_index(int index) {
887             this->fd_numeric_index = index;
888             return *this;
889         }
890     };
891 
892     struct field_to_struct_t {
field_to_struct_tw3c_log_format::field_to_struct_t893         field_to_struct_t(const char *prefix, const char *struct_name)
894             : fs_prefix(prefix),
895               fs_struct_name(intern_string::lookup(struct_name)) {
896         }
897 
898         const char *fs_prefix;
899         intern_string_t fs_struct_name;
900     };
901 
    // Fields that are matched by name when a "#Fields:" directive is parsed
    // and that each get their own table column.
    static const std::vector<field_def> KNOWN_FIELDS;
    // Field-name prefixes whose fields are grouped under a named struct
    // instead of getting a column of their own.
    const static std::vector<field_to_struct_t> KNOWN_STRUCT_FIELDS;
904 
w3c_log_format()905     w3c_log_format() {
906         this->lf_is_self_describing = true;
907         this->lf_time_ordered = false;
908     };
909 
get_name() const910     const intern_string_t get_name() const override {
911         static const intern_string_t name(intern_string::lookup("w3c"));
912 
913         return this->wlf_format_name.empty() ? name : this->wlf_format_name;
914     };
915 
clear()916     void clear() override {
917         this->log_format::clear();
918         this->wlf_time_scanner.clear();
919         this->wlf_format_name.clear();
920         this->wlf_field_defs.clear();
921     };
922 
scan_int(std::vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)923     scan_result_t scan_int(std::vector<logline> &dst,
924                            const line_info &li,
925                            shared_buffer_ref &sbr) {
926         static const intern_string_t F_DATE = intern_string::lookup("date");
927         static const intern_string_t F_DATE_LOCAL = intern_string::lookup("date-local");
928         static const intern_string_t F_DATE_UTC = intern_string::lookup("date-UTC");
929         static const intern_string_t F_TIME = intern_string::lookup("time");
930         static const intern_string_t F_TIME_LOCAL = intern_string::lookup("time-local");
931         static const intern_string_t F_TIME_UTC = intern_string::lookup("time-UTC");
932         static const intern_string_t F_STATUS_CODE = intern_string::lookup("sc-status");
933 
934         ws_separated_string ss(sbr.get_data(), sbr.length());
935         struct timeval date_tv{0, 0}, time_tv{0, 0};
936         struct exttm date_tm, time_tm;
937         bool found_date = false, found_time = false;
938         log_level_t level = LEVEL_INFO;
939 
940         for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
941             if (iter.index() >= this->wlf_field_defs.size()) {
942                 level = LEVEL_INVALID;
943                 break;
944             }
945 
946             const field_def &fd = this->wlf_field_defs[iter.index()];
947             string_fragment sf = *iter;
948 
949             if (sf.startswith("#")) {
950                 if (sf == "#Date:") {
951                     date_time_scanner dts;
952                     struct exttm tm;
953                     struct timeval tv;
954 
955                     if (dts.scan(sbr.get_data_at(sf.length() + 1),
956                                  sbr.length() - sf.length() - 1,
957                                  nullptr,
958                                  &tm,
959                                  tv)) {
960                         this->lf_date_time.set_base_time(tv.tv_sec);
961                         this->wlf_time_scanner.set_base_time(tv.tv_sec);
962                     }
963                 }
964                 dst.emplace_back(li.li_file_range.fr_offset, 0, 0, LEVEL_IGNORE, 0);
965                 return SCAN_MATCH;
966             }
967 
968             sf.trim("\" \t");
969             if (F_DATE == fd.fd_name ||
970                 F_DATE_LOCAL == fd.fd_name ||
971                 F_DATE_UTC == fd.fd_name) {
972                 if (this->lf_date_time.scan(sf.data(),
973                                             sf.length(),
974                                             nullptr,
975                                             &date_tm,
976                                             date_tv)) {
977                     this->lf_timestamp_flags |= date_tm.et_flags;
978                     found_date = true;
979                 }
980             } else if (F_TIME == fd.fd_name ||
981                        F_TIME_LOCAL == fd.fd_name ||
982                        F_TIME_UTC == fd.fd_name) {
983                 if (this->wlf_time_scanner.scan(sf.data(),
984                                                 sf.length(),
985                                                 nullptr,
986                                                 &time_tm,
987                                                 time_tv)) {
988                     this->lf_timestamp_flags |= time_tm.et_flags;
989                     found_time = true;
990                 }
991             } else if (F_STATUS_CODE == fd.fd_name) {
992                 if (!sf.empty() && sf[0] >= '4') {
993                     level = LEVEL_ERROR;
994                 }
995             }
996 
997             if (fd.fd_numeric_index >= 0) {
998                 switch (fd.fd_meta.lvm_kind) {
999                     case value_kind_t::VALUE_INTEGER:
1000                     case value_kind_t::VALUE_FLOAT: {
1001                         char field_copy[sf.length() + 1];
1002                         double val;
1003 
1004                         if (sscanf(sf.to_string(field_copy), "%lf", &val) == 1) {
1005                             this->lf_value_stats[fd.fd_numeric_index].add_value(val);
1006                         }
1007                         break;
1008                     }
1009                     default:
1010                         break;
1011                 }
1012             }
1013         }
1014 
1015         if (found_time) {
1016             struct exttm tm = time_tm;
1017             struct timeval tv;
1018 
1019             if (found_date) {
1020                 tm.et_tm.tm_year = date_tm.et_tm.tm_year;
1021                 tm.et_tm.tm_mday = date_tm.et_tm.tm_mday;
1022                 tm.et_tm.tm_mon = date_tm.et_tm.tm_mon;
1023                 tm.et_tm.tm_wday = date_tm.et_tm.tm_wday;
1024                 tm.et_tm.tm_yday = date_tm.et_tm.tm_yday;
1025             }
1026 
1027             tv.tv_sec = tm2sec(&tm.et_tm);
1028             tv.tv_usec = tm.et_nsec / 1000;
1029 
1030             dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0);
1031             return SCAN_MATCH;
1032         } else {
1033             return SCAN_NO_MATCH;
1034         }
1035     }
1036 
scan(logfile & lf,std::vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)1037     scan_result_t scan(logfile &lf,
1038                        std::vector<logline> &dst,
1039                        const line_info &li,
1040                        shared_buffer_ref &sbr) override {
1041         static auto W3C_LOG_NAME = intern_string::lookup("w3c_log");
1042         static auto X_FIELDS_NAME = intern_string::lookup("x_fields");
1043         static auto X_FIELDS_IDX = 0;
1044 
1045         if (!this->wlf_format_name.empty()) {
1046             return this->scan_int(dst, li, sbr);
1047         }
1048 
1049         if (dst.empty() || dst.size() > 20 || sbr.empty() || sbr.get_data()[0] == '#') {
1050             return SCAN_NO_MATCH;
1051         }
1052 
1053         this->clear();
1054 
1055         for (auto line_iter = dst.begin(); line_iter != dst.end(); ++line_iter) {
1056             auto next_read_result = lf.read_line(line_iter);
1057 
1058             if (next_read_result.isErr()) {
1059                 return SCAN_NO_MATCH;
1060             }
1061 
1062             auto line = next_read_result.unwrap();
1063             ws_separated_string ss(line.get_data(), line.length());
1064             auto iter = ss.begin();
1065 
1066             string_fragment directive = *iter;
1067 
1068             if (directive.empty() || directive[0] != '#') {
1069                 continue;
1070             }
1071 
1072             ++iter;
1073             if (iter == ss.end()) {
1074                 continue;
1075             }
1076 
1077             if (directive == "#Date:") {
1078                 date_time_scanner dts;
1079                 struct exttm tm;
1080                 struct timeval tv;
1081 
1082                 if (dts.scan(line.get_data_at(directive.length() + 1),
1083                              line.length() - directive.length() - 1,
1084                              nullptr,
1085                              &tm,
1086                              tv)) {
1087                     this->lf_date_time.set_base_time(tv.tv_sec);
1088                     this->wlf_time_scanner.set_base_time(tv.tv_sec);
1089                 }
1090             } else if (directive == "#Fields:") {
1091                 int numeric_count = 0;
1092 
1093                 do {
1094                     string_fragment sf = *iter;
1095 
1096                     sf.trim(")");
1097                     auto field_iter = std::find_if(begin(KNOWN_FIELDS),
1098                                                    end(KNOWN_FIELDS),
1099                                                    [&sf](auto elem) {
1100                                                        return sf == elem.fd_name;
1101                                                    });
1102                     if (field_iter != end(KNOWN_FIELDS)) {
1103                         this->wlf_field_defs.emplace_back(*field_iter);
1104                     } else if (sf == "date" || sf == "time") {
1105                         this->wlf_field_defs.emplace_back(
1106                             intern_string::lookup(sf));
1107                     } else {
1108                         const auto fs_iter = std::find_if(
1109                             begin(KNOWN_STRUCT_FIELDS),
1110                             end(KNOWN_STRUCT_FIELDS),
1111                             [&sf](auto elem) {
1112                                 return sf.startswith(elem.fs_prefix);
1113                             });
1114                         if (fs_iter != end(KNOWN_STRUCT_FIELDS)) {
1115                             auto field_name = intern_string::lookup(sf.substr(3));
1116                             this->wlf_field_defs.emplace_back(
1117                                 field_name, logline_value_meta(
1118                                     field_name,
1119                                     value_kind_t::VALUE_TEXT,
1120                                     KNOWN_FIELDS.size() + 1 +
1121                                     std::distance(begin(KNOWN_STRUCT_FIELDS), fs_iter),
1122                                     this)
1123                                     .with_struct_name(fs_iter->fs_struct_name));
1124                         } else {
1125                             auto field_name = intern_string::lookup(sf);
1126                             this->wlf_field_defs.emplace_back(
1127                                 field_name,
1128                                 logline_value_meta(field_name,
1129                                                    value_kind_t::VALUE_TEXT,
1130                                                    KNOWN_FIELDS.size() +
1131                                                    X_FIELDS_IDX,
1132                                                    this)
1133                                     .with_struct_name(X_FIELDS_NAME));
1134                         }
1135                     }
1136                     auto& fd = this->wlf_field_defs.back();
1137                     fd.fd_meta.lvm_format = nonstd::make_optional(this);
1138                     switch (fd.fd_meta.lvm_kind) {
1139                         case value_kind_t::VALUE_FLOAT:
1140                         case value_kind_t::VALUE_INTEGER:
1141                             fd.with_numeric_index(numeric_count);
1142                             numeric_count += 1;
1143                             break;
1144                         default:
1145                             break;
1146                     }
1147 
1148                     ++iter;
1149                 } while (iter != ss.end());
1150 
1151                 this->wlf_format_name = W3C_LOG_NAME;
1152                 this->lf_value_stats.resize(numeric_count);
1153             }
1154         }
1155 
1156         if (!this->wlf_format_name.empty() &&
1157             !this->wlf_field_defs.empty()) {
1158             dst.clear();
1159             return this->scan_int(dst, li, sbr);
1160         }
1161 
1162         this->wlf_format_name.clear();
1163         this->lf_value_stats.clear();
1164 
1165         return SCAN_NO_MATCH;
1166     };
1167 
annotate(uint64_t line_number,shared_buffer_ref & sbr,string_attrs_t & sa,std::vector<logline_value> & values,bool annotate_module) const1168     void annotate(uint64_t line_number, shared_buffer_ref &sbr, string_attrs_t &sa,
1169                   std::vector<logline_value> &values, bool annotate_module) const override {
1170         ws_separated_string ss(sbr.get_data(), sbr.length());
1171 
1172         for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
1173             string_fragment sf = *iter;
1174 
1175             if (iter.index() >= this->wlf_field_defs.size()) {
1176                 sa.emplace_back(line_range{sf.sf_begin, -1},
1177                                 &SA_INVALID,
1178                                 (void *) "extra fields detected");
1179                 return;
1180             }
1181 
1182             const field_def &fd = this->wlf_field_defs[iter.index()];
1183 
1184             if (sf == "-") {
1185                 sf.invalidate();
1186             }
1187 
1188             auto lr = line_range(sf.sf_begin, sf.sf_end);
1189 
1190             if (lr.is_valid()) {
1191                 values.emplace_back(fd.fd_meta, sbr, lr);
1192                 if (sf.startswith("\"")) {
1193                     auto& meta = values.back().lv_meta;
1194 
1195                     if (meta.lvm_kind == value_kind_t::VALUE_TEXT) {
1196                         meta.lvm_kind = value_kind_t::VALUE_W3C_QUOTED;
1197                     } else {
1198                         meta.lvm_kind = value_kind_t::VALUE_NULL;
1199                     }
1200                 }
1201             } else {
1202                 values.emplace_back(fd.fd_meta);
1203             }
1204         }
1205     };
1206 
stats_for_value(const intern_string_t & name) const1207     const logline_value_stats *stats_for_value(const intern_string_t &name) const override {
1208         const logline_value_stats *retval = nullptr;
1209 
1210         for (const auto & wlf_field_def : this->wlf_field_defs) {
1211             if (wlf_field_def.fd_meta.lvm_name == name) {
1212                 if (wlf_field_def.fd_numeric_index < 0) {
1213                     break;
1214                 }
1215                 retval = &this->lf_value_stats[wlf_field_def.fd_numeric_index];
1216                 break;
1217             }
1218         }
1219 
1220         return retval;
1221     };
1222 
specialized(int fmt_lock=-1)1223     std::shared_ptr<log_format> specialized(int fmt_lock = -1) override {
1224         return make_shared<w3c_log_format>(*this);
1225     };
1226 
1227     class w3c_log_table : public log_format_vtab_impl {
1228     public:
w3c_log_table(const w3c_log_format & format)1229         explicit w3c_log_table(const w3c_log_format &format)
1230             : log_format_vtab_impl(format), wlt_format(format) {
1231 
1232         }
1233 
get_columns(vector<vtab_column> & cols) const1234         void get_columns(vector<vtab_column> &cols) const override {
1235             for (const auto &fd : KNOWN_FIELDS) {
1236                 auto type_pair = log_vtab_impl::logline_value_to_sqlite_type(
1237                     fd.fd_meta.lvm_kind);
1238 
1239                 cols.emplace_back(fd.fd_meta.lvm_name.to_string(),
1240                                   type_pair.first,
1241                                   fd.fd_collator,
1242                                   false,
1243                                   "",
1244                                   type_pair.second);
1245             }
1246             cols.emplace_back("x_fields");
1247             cols.back().with_comment(
1248                 "A JSON-object that contains fields that are not first-class columns");
1249             for (const auto& fs : KNOWN_STRUCT_FIELDS) {
1250                 cols.emplace_back(fs.fs_struct_name.to_string());
1251             }
1252         };
1253 
get_foreign_keys(std::vector<std::string> & keys_inout) const1254         void get_foreign_keys(std::vector<std::string> &keys_inout) const override {
1255             this->log_vtab_impl::get_foreign_keys(keys_inout);
1256 
1257             for (const auto &fd : KNOWN_FIELDS) {
1258                 if (fd.fd_meta.lvm_identifier) {
1259                     keys_inout.push_back(fd.fd_meta.lvm_name.to_string());
1260                 }
1261             }
1262         }
1263 
1264         const w3c_log_format &wlt_format;
1265     };
1266 
get_tables()1267     static map<intern_string_t, std::shared_ptr<w3c_log_table>> &get_tables() {
1268         static map<intern_string_t, std::shared_ptr<w3c_log_table>> retval;
1269 
1270         return retval;
1271     };
1272 
get_vtab_impl() const1273     std::shared_ptr<log_vtab_impl> get_vtab_impl() const override {
1274         if (this->wlf_format_name.empty()) {
1275             return nullptr;
1276         }
1277 
1278         std::shared_ptr<w3c_log_table> retval = nullptr;
1279 
1280         auto &tables = get_tables();
1281         auto iter = tables.find(this->wlf_format_name);
1282         if (iter == tables.end()) {
1283             retval = std::make_shared<w3c_log_table>(*this);
1284             tables[this->wlf_format_name] = retval;
1285         }
1286 
1287         return retval;
1288     };
1289 
    // Intentionally a no-op: none of the arguments are read or modified.
    void get_subline(const logline &ll,
                     shared_buffer_ref &sbr,
                     bool full_message) override {
    }
1294 
    // Scanner used for the time-of-day fields; the date fields are parsed
    // with the lf_date_time scanner instead.
    date_time_scanner wlf_time_scanner;
    // Empty until a "#Fields:" directive has been processed.
    intern_string_t wlf_format_name;
    // One entry per field listed in the "#Fields:" directive, in order.
    vector<field_def> wlf_field_defs;
1298 };
1299 
// Counter that is post-incremented by every entry of KNOWN_FIELDS, so the
// position of an entry in the list determines the column number it is given.
static int KNOWN_FIELD_INDEX = 0;
// The fields that are recognized by name.  Each entry lists the column
// number, the field name, the kind of value, whether the field is flagged as
// an identifier and, optionally, the name of a collator.
const std::vector<w3c_log_format::field_def> w3c_log_format::KNOWN_FIELDS = {
    {
        KNOWN_FIELD_INDEX++,
        "cs-method",
        value_kind_t::VALUE_TEXT,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "c-ip",
        value_kind_t::VALUE_TEXT,
        true,
        "ipaddress",
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-bytes",
        value_kind_t::VALUE_INTEGER,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-host",
        value_kind_t::VALUE_TEXT,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-uri-stem",
        value_kind_t::VALUE_TEXT,
        true,
        "naturalnocase",
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-uri-query",
        value_kind_t::VALUE_TEXT,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-username",
        value_kind_t::VALUE_TEXT,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "cs-version",
        value_kind_t::VALUE_TEXT,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "s-ip",
        value_kind_t::VALUE_TEXT,
        true,
        "ipaddress",
    },
    {
        KNOWN_FIELD_INDEX++,
        "s-port",
        value_kind_t::VALUE_INTEGER,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "s-computername",
        value_kind_t::VALUE_TEXT,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "s-sitename",
        value_kind_t::VALUE_TEXT,
        true,
    },
    {
        KNOWN_FIELD_INDEX++,
        "sc-bytes",
        value_kind_t::VALUE_INTEGER,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "sc-status",
        value_kind_t::VALUE_INTEGER,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "sc-substatus",
        value_kind_t::VALUE_INTEGER,
        false,
    },
    {
        KNOWN_FIELD_INDEX++,
        "time-taken",
        value_kind_t::VALUE_FLOAT,
        false,
    },
};
1402 
// Field-name prefixes and the name of the struct that fields with that
// prefix are grouped under.  Every prefix is three characters long.
const std::vector<w3c_log_format::field_to_struct_t> w3c_log_format::KNOWN_STRUCT_FIELDS = {
    {"cs(", "cs_headers"},
    {"sc(", "sc_headers"},
    {"rs(", "rs_headers"},
    {"sr(", "sr_headers"},
};
1409 
1410 struct logfmt_pair_handler {
logfmt_pair_handlerlogfmt_pair_handler1411     explicit logfmt_pair_handler(date_time_scanner &dts) : lph_dt_scanner(dts)
1412     {
1413     }
1414 
process_valuelogfmt_pair_handler1415     bool process_value(const string_fragment& value_frag) {
1416         if (this->lph_key_frag == "time" ||
1417             this->lph_key_frag == "ts") {
1418             if (!this->lph_dt_scanner.scan(value_frag.data(),
1419                                            value_frag.length(),
1420                                            nullptr,
1421                                            &this->lph_time_tm,
1422                                            this->lph_tv)) {
1423                 return false;
1424             }
1425             this->lph_found_time = true;
1426         } else if (this->lph_key_frag == "level") {
1427             this->lph_level = string2level(value_frag.data(), value_frag.length());
1428         }
1429         return true;
1430     }
1431 
1432     date_time_scanner &lph_dt_scanner;
1433     bool lph_found_time{false};
1434     struct exttm lph_time_tm{};
1435     struct timeval lph_tv{0, 0};
1436     log_level_t lph_level{log_level_t::LEVEL_INFO};
1437     string_fragment lph_key_frag{""};
1438 };
1439 
1440 class logfmt_format : public log_format {
1441 public:
get_name() const1442     const intern_string_t get_name() const override
1443     {
1444         const static auto NAME = intern_string::lookup("logfmt_log");
1445 
1446         return NAME;
1447     }
1448 
1449     class logfmt_log_table : public log_format_vtab_impl {
1450     public:
logfmt_log_table(const log_format & format)1451         logfmt_log_table(const log_format &format) : log_format_vtab_impl(format) {}
1452 
get_columns(vector<vtab_column> & cols) const1453         void get_columns(vector<vtab_column> &cols) const override {
1454             static const auto FIELDS = std::string("fields");
1455 
1456             cols.emplace_back(FIELDS);
1457         };
1458     };
1459 
get_vtab_impl() const1460     shared_ptr<log_vtab_impl> get_vtab_impl() const override
1461     {
1462         static auto retval = std::make_shared<logfmt_log_table>(*this);
1463 
1464         return retval;
1465     }
1466 
scan(logfile & lf,vector<logline> & dst,const line_info & li,shared_buffer_ref & sbr)1467     scan_result_t scan(logfile &lf, vector<logline> &dst, const line_info &li,
1468                        shared_buffer_ref &sbr) override
1469     {
1470         auto p = logfmt::parser(string_fragment{sbr.get_data(), 0, (int) sbr.length()});
1471         scan_result_t retval = scan_result_t::SCAN_NO_MATCH;
1472         bool done = false;
1473         logfmt_pair_handler lph(this->lf_date_time);
1474 
1475         while (!done) {
1476             auto parse_result = p.step();
1477 
1478             done = parse_result.match(
1479                 [](const logfmt::parser::end_of_input &) {
1480                     return true;
1481                 },
1482                 [&lph](const logfmt::parser::kvpair &kvp) {
1483                     lph.lph_key_frag = kvp.first;
1484 
1485                     return kvp.second.match(
1486                         [](const logfmt::parser::bool_value& bv) {
1487                             return false;
1488                         },
1489                         [&lph](const logfmt::parser::float_value& fv) {
1490                             return lph.process_value(fv.fv_str_value);
1491                         },
1492                         [&lph](const logfmt::parser::int_value& iv) {
1493                             return lph.process_value(iv.iv_str_value);
1494                         },
1495                         [&lph](const logfmt::parser::quoted_value &qv) {
1496                             auto_mem<yajl_handle_t> handle(yajl_free);
1497                             yajl_callbacks cb;
1498 
1499                             handle = yajl_alloc(&cb, nullptr, &lph);
1500                             memset(&cb, 0, sizeof(cb));
1501                             cb.yajl_string = +[](void *ctx, const unsigned char* str, size_t len) -> int {
1502                                 auto& lph = *((logfmt_pair_handler *)ctx);
1503                                 string_fragment value_frag{str, 0, (int) len};
1504 
1505                                 return lph.process_value(value_frag);
1506                             };
1507 
1508                             if (yajl_parse(handle,
1509                                            (const unsigned char *) qv.qv_value.data(),
1510                                            qv.qv_value.length()) != yajl_status_ok ||
1511                                 yajl_complete_parse(handle) != yajl_status_ok) {
1512                                 log_debug("json parsing failed");
1513                                 string_fragment unq_frag{
1514                                     qv.qv_value.sf_string,
1515                                     qv.qv_value.sf_begin + 1,
1516                                     qv.qv_value.sf_end - 1,
1517                                 };
1518 
1519                                 return lph.process_value(unq_frag);
1520                             }
1521 
1522                             return false;
1523                         },
1524                         [&lph](const logfmt::parser::unquoted_value &uv) {
1525                             return lph.process_value(uv.uv_value);
1526                         }
1527                     );
1528                 },
1529                 [](const logfmt::parser::error &err) {
1530                     // log_error("logfmt parse error: %s", err.e_msg.c_str());
1531                     return true;
1532                 }
1533             );
1534         }
1535 
1536         if (lph.lph_found_time) {
1537             dst.emplace_back(li.li_file_range.fr_offset, lph.lph_tv, lph.lph_level);
1538             retval = scan_result_t::SCAN_MATCH;
1539         }
1540 
1541         return retval;
1542     }
1543 
1544     void
annotate(uint64_t line_number,shared_buffer_ref & sbr,string_attrs_t & sa,vector<logline_value> & values,bool annotate_module) const1545     annotate(uint64_t line_number, shared_buffer_ref &sbr, string_attrs_t &sa,
1546              vector<logline_value> &values, bool annotate_module) const override
1547     {
1548         static const auto FIELDS_NAME = intern_string::lookup("fields");
1549 
1550         auto p = logfmt::parser(
1551             string_fragment{sbr.get_data(), 0, (int) sbr.length()});
1552         bool done = false;
1553 
1554         while (!done) {
1555             auto parse_result = p.step();
1556 
1557             done = parse_result.match(
1558                 [](const logfmt::parser::end_of_input &) {
1559                     return true;
1560                 },
1561                 [this, &sa, &values, &sbr](const logfmt::parser::kvpair &kvp) {
1562                     auto value_frag = kvp.second.match(
1563                         [this, &kvp, &values](const logfmt::parser::bool_value& bv) {
1564                             auto lvm = logline_value_meta{
1565                                 intern_string::lookup(kvp.first),
1566                                 value_kind_t::VALUE_INTEGER,
1567                                 0,
1568                                 (log_format *) this
1569                             }
1570                                 .with_struct_name(FIELDS_NAME);
1571                             values.emplace_back(lvm, bv.bv_value);
1572 
1573                             return bv.bv_str_value;
1574                         },
1575                         [this, &kvp, &values](const logfmt::parser::int_value& iv) {
1576                             auto lvm = logline_value_meta{
1577                                 intern_string::lookup(kvp.first),
1578                                 value_kind_t::VALUE_INTEGER,
1579                                 0,
1580                                 (log_format *) this
1581                             }
1582                                 .with_struct_name(FIELDS_NAME);
1583                             values.emplace_back(lvm, iv.iv_value);
1584 
1585                             return iv.iv_str_value;
1586                         },
1587                         [this, &kvp, &values](const logfmt::parser::float_value& fv) {
1588                             auto lvm = logline_value_meta{
1589                                 intern_string::lookup(kvp.first),
1590                                 value_kind_t::VALUE_INTEGER,
1591                                 0,
1592                                 (log_format *) this
1593                             }
1594                                 .with_struct_name(FIELDS_NAME);
1595                             values.emplace_back(lvm, fv.fv_value);
1596 
1597                             return fv.fv_str_value;
1598                         },
1599                         [](const logfmt::parser::quoted_value &qv) {
1600                             return qv.qv_value;
1601                         },
1602                         [](const logfmt::parser::unquoted_value &uv) {
1603                             return uv.uv_value;
1604                         }
1605                     );
1606                     auto value_lr = line_range{
1607                         value_frag.sf_begin, value_frag.sf_end
1608                     };
1609 
1610                     if (kvp.first == "time" || kvp.first == "ts") {
1611                         sa.emplace_back(value_lr, &logline::L_TIMESTAMP);
1612                     } else if (kvp.first == "level") {
1613                     } else if (kvp.first == "msg") {
1614                         sa.emplace_back(value_lr, &SA_BODY);
1615                     } else if (!kvp.second.is<logfmt::parser::int_value>() &&
1616                                !kvp.second.is<logfmt::parser::bool_value>()) {
1617                         auto lvm = logline_value_meta{
1618                             intern_string::lookup(kvp.first),
1619                             value_frag.startswith("\"") ?
1620                             value_kind_t::VALUE_JSON :
1621                             value_kind_t::VALUE_TEXT,
1622                             0,
1623                             (log_format *) this
1624                         }
1625                             .with_struct_name(FIELDS_NAME);
1626                         shared_buffer_ref value_sbr;
1627 
1628                         value_sbr.subset(sbr, value_frag.sf_begin, value_frag.length());
1629                         values.emplace_back(lvm, value_sbr);
1630                     }
1631 
1632                     return false;
1633                 },
1634                 [line_number, &sbr](const logfmt::parser::error &err) {
1635                     log_error("bad line %.*s", sbr.length(), sbr.get_data());
1636                     log_error("%lld:logfmt parse error: %s", line_number, err.e_msg.c_str());
1637                     return true;
1638                 }
1639             );
1640         }
1641     }
1642 
specialized(int fmt_lock)1643     shared_ptr<log_format> specialized(int fmt_lock) override
1644     {
1645         return std::make_shared<logfmt_format>(*this);
1646     };
1647 };
1648 
1649 static auto format_binder = injector::bind_multiple<log_format>()
1650     .add<logfmt_format>()
1651     .add<bro_log_format>()
1652     .add<w3c_log_format>()
1653     .add<generic_log_format>();
1654