1 /*
2  * Written by Alexey Tourbin <at@altlinux.org>.
3  *
4  * The author has dedicated the code to the public domain.  Anyone is free
5  * to copy, modify, publish, use, compile, sell, or distribute the original
6  * code, either in source code form or as a compiled binary, for any purpose,
7  * commercial or non-commercial, and by any means.
8  */
9 
10 #include "config.h"
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <sqlite3.h>
15 
16 #include <unordered_map>
17 
18 #include "pcrepp/pcrepp.hh"
19 
20 #include "base/humanize.hh"
21 #include "base/lnav.gzip.hh"
22 #include "base/string_util.hh"
23 #include "formats/logfmt/logfmt.parser.hh"
24 #include "yajlpp/yajlpp.hh"
25 #include "yajlpp/json_op.hh"
26 #include "column_namer.hh"
27 #include "yajl/api/yajl_gen.h"
28 #include "sqlite-extension-func.hh"
29 #include "data_scanner.hh"
30 #include "data_parser.hh"
31 #include "elem_to_json.hh"
32 #include "vtab_module.hh"
33 #include "vtab_module_json.hh"
34 #include "safe/safe.h"
35 #include "spookyhash/SpookyV2.h"
36 
37 #include "optional.hpp"
38 #include "mapbox/variant.hpp"
39 
40 using namespace std;
41 using namespace mapbox;
42 
43 typedef struct {
44     shared_ptr<pcrepp> re2;
45 } cache_entry;
46 
find_re(const char * re)47 static cache_entry *find_re(const char *re)
48 {
49     using safe_cache = safe::Safe<unordered_map<string, cache_entry>>;
50     static safe_cache CACHE;
51 
52     safe::WriteAccess<safe_cache> wcache(CACHE);
53     string re_str = re;
54     auto iter = wcache->find(re_str);
55 
56     if (iter == wcache->end()) {
57         cache_entry c;
58 
59         c.re2 = make_shared<pcrepp>(re_str);
60         auto pair = wcache->insert(std::make_pair(re_str, c));
61 
62         iter = pair.first;
63     }
64 
65     return &iter->second;
66 }
67 
regexp(const char * re,const char * str)68 static bool regexp(const char *re, const char *str)
69 {
70     cache_entry *reobj = find_re(re);
71     pcre_context_static<30> pc;
72     pcre_input pi(str);
73 
74     return reobj->re2->match(pc, pi);
75 }
76 
77 static
78 util::variant<int64_t, double, const char*, string_fragment, json_string>
regexp_match(const char * re,const char * str)79 regexp_match(const char *re, const char *str)
80 {
81     cache_entry *reobj = find_re(re);
82     pcre_context_static<30> pc;
83     pcre_input pi(str);
84     pcrepp &extractor = *reobj->re2;
85 
86     if (extractor.get_capture_count() == 0) {
87         throw pcrepp::error("regular expression does not have any captures");
88     }
89 
90     if (!extractor.match(pc, pi)) {
91         return static_cast<const char *>(nullptr);
92     }
93 
94     yajlpp_gen gen;
95     yajl_gen_config(gen, yajl_gen_beautify, false);
96 
97     if (extractor.get_capture_count() == 1) {
98         pcre_context::capture_t *cap = pc[0];
99         const char *cap_start = pi.get_substr_start(cap);
100 
101         if (!cap->is_valid()) {
102             return static_cast<const char *>(nullptr);
103         }
104         else {
105             char *cap_copy = (char *)alloca(cap->length() + 1);
106             long long int i_value;
107             double d_value;
108             int end_index;
109 
110             memcpy(cap_copy, cap_start, cap->length());
111             cap_copy[cap->length()] = '\0';
112 
113             if (sscanf(cap_copy, "%lld%n", &i_value, &end_index) == 1 &&
114                 (end_index == cap->length())) {
115                 return (int64_t)i_value;
116             }
117             else if (sscanf(cap_copy, "%lf%n", &d_value, &end_index) == 1 &&
118                      (end_index == cap->length())) {
119                 return d_value;
120             }
121             else {
122                 return string_fragment(str, cap->c_begin, cap->c_end);
123             }
124         }
125     }
126     else {
127         yajlpp_map root_map(gen);
128         column_namer cn;
129 
130         for (int lpc = 0; lpc < extractor.get_capture_count(); lpc++) {
131             string colname = cn.add_column(extractor.name_for_capture(lpc));
132             pcre_context::capture_t *cap = pc[lpc];
133 
134             yajl_gen_string(gen, colname);
135 
136             if (!cap->is_valid()) {
137                 yajl_gen_null(gen);
138             }
139             else {
140                 const char *cap_start = pi.get_substr_start(cap);
141                 char *cap_copy = (char *) alloca(cap->length() + 1);
142                 long long int i_value;
143                 double d_value;
144                 int end_index;
145 
146                 memcpy(cap_copy, cap_start, cap->length());
147                 cap_copy[cap->length()] = '\0';
148 
149                 if (sscanf(cap_copy, "%lld%n", &i_value, &end_index) == 1 &&
150                     (end_index == cap->length())) {
151                     yajl_gen_integer(gen, i_value);
152                 }
153                 else if (sscanf(cap_copy, "%lf%n", &d_value, &end_index) == 1 &&
154                          (end_index == cap->length())) {
155                     yajl_gen_number(gen, cap_start, cap->length());
156                 }
157                 else {
158                     yajl_gen_pstring(gen, cap_start, cap->length());
159                 }
160             }
161         }
162     }
163 
164     return json_string(gen);
165 #if 0
166     sqlite3_result_text(ctx, (const char *) buf, len, SQLITE_TRANSIENT);
167 #ifdef HAVE_SQLITE3_VALUE_SUBTYPE
168     sqlite3_result_subtype(ctx, JSON_SUBTYPE);
169 #endif
170 #endif
171 }
172 
extract(const char * str)173 json_string extract(const char *str)
174 {
175     data_scanner ds(str);
176     data_parser dp(&ds);
177 
178     dp.parse();
179     // dp.print(stderr, dp.dp_pairs);
180 
181     yajlpp_gen gen;
182     yajl_gen_config(gen, yajl_gen_beautify, false);
183 
184     elements_to_json(gen, dp, &dp.dp_pairs);
185 
186     return json_string(gen);
187 }
188 
logfmt2json(string_fragment line)189 json_string logfmt2json(string_fragment line)
190 {
191     logfmt::parser p(line);
192     yajlpp_gen gen;
193     yajl_gen_config(gen, yajl_gen_beautify, false);
194 
195     {
196         yajlpp_map root(gen);
197         bool done = false;
198 
199         while (!done) {
200             auto pair = p.step();
201 
202             done = pair.match(
203                 [](const logfmt::parser::end_of_input& eoi) {
204                     return true;
205                 },
206                 [&root, &gen](const logfmt::parser::kvpair& kvp) {
207                     root.gen(kvp.first);
208 
209                     kvp.second.match(
210                         [&root](const logfmt::parser::bool_value& bv) {
211                             root.gen(bv.bv_value);
212                         },
213                         [&root](const logfmt::parser::int_value& iv) {
214                             root.gen(iv.iv_value);
215                         },
216                         [&root](const logfmt::parser::float_value& fv) {
217                             root.gen(fv.fv_value);
218                         },
219                         [&root, &gen](const logfmt::parser::quoted_value& qv) {
220                             auto_mem<yajl_handle_t> parse_handle(yajl_free);
221                             json_ptr jp("");
222                             json_op jo(jp);
223 
224                             jo.jo_ptr_callbacks = json_op::gen_callbacks;
225                             jo.jo_ptr_data = gen;
226                             parse_handle.reset(yajl_alloc(&json_op::ptr_callbacks, nullptr, &jo));
227 
228                             auto json_in = (const unsigned char *) qv.qv_value.data();
229                             auto json_len = qv.qv_value.length();
230 
231                             if (yajl_parse(parse_handle.in(), json_in, json_len) != yajl_status_ok ||
232                                 yajl_complete_parse(parse_handle.in()) != yajl_status_ok) {
233                                 root.gen(qv.qv_value);
234                             }
235                         },
236                         [&root](const logfmt::parser::unquoted_value& uv) {
237                             root.gen(uv.uv_value);
238                         }
239                     );
240 
241                     return false;
242                 },
243                 [](const logfmt::parser::error& e) -> bool {
244                     throw sqlite_func_error("Invalid logfmt: {}", e.e_msg);
245                 }
246             );
247         }
248     }
249 
250     return json_string(gen);
251 }
252 
253 static
regexp_replace(const char * str,const char * re,const char * repl)254 string regexp_replace(const char *str, const char *re, const char *repl)
255 {
256     cache_entry *reobj = find_re(re);
257 
258     return reobj->re2->replace(str, repl);
259 }
260 
261 static
spooky_hash(const vector<const char * > & args)262 string spooky_hash(const vector<const char *> &args)
263 {
264     byte_array<2, uint64> hash;
265     SpookyHash context;
266 
267     context.Init(0, 0);
268     for (const auto arg : args) {
269         int64_t len = arg != nullptr ? strlen(arg) : 0;
270 
271         context.Update(&len, sizeof(len));
272         if (arg == nullptr) {
273             continue;
274         }
275         context.Update(arg, len);
276     }
277     context.Final(hash.out(0), hash.out(1));
278 
279     return hash.to_string();
280 }
281 
sql_spooky_hash_step(sqlite3_context * context,int argc,sqlite3_value ** argv)282 static void sql_spooky_hash_step(sqlite3_context *context,
283                                  int argc,
284                                  sqlite3_value **argv)
285 {
286     auto *hasher = (SpookyHash *)sqlite3_aggregate_context(context,
287         sizeof(SpookyHash));
288 
289     for (int lpc = 0; lpc < argc; lpc++) {
290         auto value = sqlite3_value_text(argv[lpc]);
291         int64_t len = value != nullptr ? strlen((const char *) value) : 0;
292 
293         hasher->Update(&len, sizeof(len));
294         if (value == nullptr) {
295             continue;
296         }
297         hasher->Update(value, len);
298     }
299 }
300 
sql_spooky_hash_final(sqlite3_context * context)301 static void sql_spooky_hash_final(sqlite3_context *context)
302 {
303     auto *hasher = (SpookyHash *)sqlite3_aggregate_context(
304         context, sizeof(SpookyHash));
305 
306     if (hasher == nullptr) {
307         sqlite3_result_null(context);
308     } else {
309         byte_array<2, uint64> hash;
310 
311         hasher->Final(hash.out(0), hash.out(1));
312 
313         string hex = hash.to_string();
314         sqlite3_result_text(context, hex.c_str(), hex.length(),
315                             SQLITE_TRANSIENT);
316     }
317 }
318 
319 struct sparkline_context {
320     bool sc_initialized{true};
321     double sc_max_value{0.0};
322     std::vector<double> sc_values;
323 };
324 
sparkline_step(sqlite3_context * context,int argc,sqlite3_value ** argv)325 static void sparkline_step(sqlite3_context *context,
326                            int argc,
327                            sqlite3_value **argv)
328 {
329     auto *sc = (sparkline_context *)
330         sqlite3_aggregate_context(context, sizeof(sparkline_context));
331 
332     if (!sc->sc_initialized) {
333         new (sc) sparkline_context;
334     }
335 
336     if (argc == 0) {
337         return;
338     }
339 
340     sc->sc_values.push_back(sqlite3_value_double(argv[0]));
341     sc->sc_max_value = std::max(sc->sc_max_value, sc->sc_values.back());
342 
343     if (argc >= 2) {
344         sc->sc_max_value = std::max(sc->sc_max_value,
345                                     sqlite3_value_double(argv[1]));
346     }
347 }
348 
sparkline_final(sqlite3_context * context)349 static void sparkline_final(sqlite3_context *context)
350 {
351     auto *sc = (sparkline_context *)
352         sqlite3_aggregate_context(context, sizeof(sparkline_context));
353 
354     if (!sc->sc_initialized) {
355         sqlite3_result_text(context, "", 0, SQLITE_STATIC);
356         return;
357     }
358 
359     auto retval = (char *) malloc(sc->sc_values.size() * 3 + 1);
360     auto start = retval;
361 
362     for (const auto& value : sc->sc_values) {
363         auto bar = humanize::sparkline(value, sc->sc_max_value);
364 
365         strcpy(start, bar.c_str());
366         start += bar.length();
367     }
368     *start = '\0';
369 
370     sqlite3_result_text(context, retval, -1, free);
371 
372     sc->~sparkline_context();
373 }
374 
375 nonstd::optional<util::variant<blob_auto_buffer, sqlite3_int64, double>>
sql_gunzip(sqlite3_value * val)376 sql_gunzip(sqlite3_value *val)
377 {
378     switch (sqlite3_value_type(val)) {
379         case SQLITE3_TEXT:
380         case SQLITE_BLOB: {
381             auto buffer = sqlite3_value_blob(val);
382             auto len = sqlite3_value_bytes(val);
383 
384             if (!lnav::gzip::is_gzipped((const char *) buffer, len)) {
385                 auto retval = auto_buffer::alloc(len);
386 
387                 memcpy(retval.in(), buffer, len);
388                 return blob_auto_buffer{ std::move(retval) };
389             }
390 
391             auto res = lnav::gzip::uncompress("", buffer, len);
392 
393             if (res.isErr()) {
394                 throw sqlite_func_error("unable to uncompress -- {}",
395                                         res.unwrapErr());
396             }
397 
398             return blob_auto_buffer{ res.unwrap() };
399         }
400         case SQLITE_INTEGER:
401             return sqlite3_value_int64(val);
402         case SQLITE_FLOAT:
403             return sqlite3_value_double(val);
404     }
405 
406     return nonstd::nullopt;
407 }
408 
409 nonstd::optional<blob_auto_buffer>
sql_gzip(sqlite3_value * val)410 sql_gzip(sqlite3_value *val)
411 {
412     switch (sqlite3_value_type(val)) {
413         case SQLITE3_TEXT:
414         case SQLITE_BLOB: {
415             auto buffer = sqlite3_value_blob(val);
416             auto len = sqlite3_value_bytes(val);
417             auto res = lnav::gzip::compress(buffer, len);
418 
419             if (res.isErr()) {
420                 throw sqlite_func_error("unable to compress -- {}",
421                                         res.unwrapErr());
422             }
423 
424             return blob_auto_buffer{ res.unwrap() };
425         }
426         case SQLITE_INTEGER:
427         case SQLITE_FLOAT: {
428             auto buffer = sqlite3_value_text(val);
429             log_debug("buf %s", buffer);
430             auto res = lnav::gzip::compress(buffer, strlen((const char *) buffer));
431 
432             if (res.isErr()) {
433                 throw sqlite_func_error("unable to compress -- {}",
434                                         res.unwrapErr());
435             }
436 
437             return blob_auto_buffer{ res.unwrap() };
438         }
439     }
440 
441     return nonstd::nullopt;
442 }
443 
string_extension_functions(struct FuncDef ** basic_funcs,struct FuncDefAgg ** agg_funcs)444 int string_extension_functions(struct FuncDef **basic_funcs,
445                                struct FuncDefAgg **agg_funcs)
446 {
447     static struct FuncDef string_funcs[] = {
448         sqlite_func_adapter<decltype(&regexp), regexp>::builder(
449             help_text("regexp",
450                       "Test if a string matches a regular expression")
451                 .sql_function()
452                 .with_parameter({"re", "The regular expression to use"})
453                 .with_parameter({"str", "The string to test against the regular expression"})
454         ),
455 
456         sqlite_func_adapter<decltype(&regexp_match), regexp_match>::builder(
457             help_text("regexp_match",
458                       "Match a string against a regular expression and return the capture groups as JSON.")
459                 .sql_function()
460                 .with_parameter({"re", "The regular expression to use"})
461                 .with_parameter({"str", "The string to test against the regular expression"})
462                 .with_tags({"string", "regex"})
463                 .with_example({
464                     "To capture the digits from the string '123'",
465                     "SELECT regexp_match('(\\d+)', '123')"
466                 })
467                 .with_example({
468                     "To capture a number and word into a JSON object with the properties 'col_0' and 'col_1'",
469                     "SELECT regexp_match('(\\d+) (\\w+)', '123 four')"
470                 })
471                 .with_example({
472                     "To capture a number and word into a JSON object with the named properties 'num' and 'str'",
473                     "SELECT regexp_match('(?<num>\\d+) (?<str>\\w+)', '123 four')"
474                 })
475         ),
476 
477         sqlite_func_adapter<decltype(&regexp_replace), regexp_replace>::builder(
478             help_text("regexp_replace",
479                       "Replace the parts of a string that match a regular expression.")
480                 .sql_function()
481                 .with_parameter({"str", "The string to perform replacements on"})
482                 .with_parameter({"re", "The regular expression to match"})
483                 .with_parameter({"repl", "The replacement string.  "
484                     "You can reference capture groups with a backslash followed by the number of the "
485                     "group, starting with 1."})
486                 .with_tags({"string", "regex"})
487                 .with_example({
488                     "To replace the word at the start of the string 'Hello, World!' with 'Goodbye'",
489                     "SELECT regexp_replace('Hello, World!', '^(\\w+)', 'Goodbye')"
490                 })
491                 .with_example({
492                     "To wrap alphanumeric words with angle brackets",
493                     "SELECT regexp_replace('123 abc', '(\\w+)', '<\\1>')"
494                 })
495         ),
496 
497         sqlite_func_adapter<decltype(&humanize::file_size), humanize::file_size>::builder(
498             help_text("humanize_file_size",
499                       "Format the given file size as a human-friendly string")
500                 .sql_function()
501                 .with_parameter({"value", "The file size to format"})
502                 .with_tags({"string"})
503                 .with_example({
504                     "To format an amount",
505                     "SELECT humanize_file_size(10 * 1024 * 1024)"
506                 })
507         ),
508 
509         sqlite_func_adapter<decltype(&humanize::sparkline), humanize::sparkline>::builder(
510             help_text("sparkline",
511                       "Function used to generate a sparkline bar chart.  "
512                       "The non-aggregate version converts a single numeric "
513                       "value on a range to a bar chart character.  The "
514                       "aggregate version returns a string with a bar "
515                       "character for every numeric input")
516                 .sql_function()
517                 .with_parameter({"value", "The numeric value to convert"})
518                 .with_parameter(help_text(
519                     "upper",
520                     "The upper bound of the numeric range.  The non-aggregate "
521                     "version defaults to 100.  The aggregate version uses the "
522                     "largest value in the inputs.")
523                                     .optional())
524                 .with_tags({"string"})
525                 .with_example({
526                     "To get the unicode block element for the value 32 in the "
527                     "range of 0-128",
528                     "SELECT sparkline(32, 128)"
529                 })
530                 .with_example({
531                     "To chart the values in a JSON array",
532                     "SELECT sparkline(value) FROM json_each('[0, 1, 2, 3, 4, 5, 6, 7, 8]')"
533                 })
534         ),
535 
536         sqlite_func_adapter<decltype(&extract), extract>::builder(
537             help_text("extract",
538                       "Automatically Parse and extract data from a string")
539                 .sql_function()
540                 .with_parameter({"str", "The string to parse"})
541                 .with_tags({"string"})
542                 .with_example({
543                     "To extract key/value pairs from a string",
544                     "SELECT extract('foo=1 bar=2 name=\"Rolo Tomassi\"')"
545                 })
546                 .with_example({
547                     "To extract columnar data from a string",
548                     "SELECT extract('1.0 abc 2.0')"
549                 })
550         ),
551 
552         sqlite_func_adapter<decltype(&logfmt2json), logfmt2json>::builder(
553             help_text("logfmt2json",
554                       "Convert a logfmt-encoded string into JSON")
555                 .sql_function()
556                 .with_parameter({"str", "The logfmt message to parse"})
557                 .with_tags({"string"})
558                 .with_example({
559                     "To extract key/value pairs from a log message",
560                     "SELECT logfmt2json('foo=1 bar=2 name=\"Rolo Tomassi\"')"
561                 })
562         ),
563 
564         sqlite_func_adapter<decltype(
565              static_cast<bool (*)(const char *, const char *)>(&startswith)),
566             startswith>::builder(
567             help_text("startswith",
568                       "Test if a string begins with the given prefix")
569                 .sql_function()
570                 .with_parameter({"str", "The string to test"})
571                 .with_parameter({"prefix", "The prefix to check in the string"})
572                 .with_tags({"string"})
573                 .with_example({
574                     "To test if the string 'foobar' starts with 'foo'",
575                     "SELECT startswith('foobar', 'foo')"
576                 })
577                 .with_example({
578                     "To test if the string 'foobar' starts with 'bar'",
579                     "SELECT startswith('foobar', 'bar')"
580                 })
581         ),
582 
583         sqlite_func_adapter<decltype(
584             static_cast<bool (*)(const char *, const char *)>(&endswith)),
585             endswith>::builder(
586             help_text("endswith",
587                       "Test if a string ends with the given suffix")
588                 .sql_function()
589                 .with_parameter({"str", "The string to test"})
590                 .with_parameter({"suffix", "The suffix to check in the string"})
591                 .with_tags({"string"})
592                 .with_example({
593                     "To test if the string 'notbad.jpg' ends with '.jpg'",
594                     "SELECT endswith('notbad.jpg', '.jpg')"
595                 })
596                 .with_example({
597                     "To test if the string 'notbad.png' starts with '.jpg'",
598                     "SELECT endswith('notbad.png', '.jpg')"
599                 })
600         ),
601 
602         sqlite_func_adapter<decltype(&spooky_hash), spooky_hash>::builder(
603             help_text("spooky_hash",
604                       "Compute the hash value for the given arguments.")
605                 .sql_function()
606                 .with_parameter(help_text("str", "The string to hash")
607                                     .one_or_more())
608                 .with_tags({"string"})
609                 .with_example({
610                     "To produce a hash for the string 'Hello, World!'",
611                     "SELECT spooky_hash('Hello, World!')"
612                 })
613                 .with_example({
614                     "To produce a hash for the parameters where one is NULL",
615                     "SELECT spooky_hash('Hello, World!', NULL)"
616                 })
617                 .with_example({
618                     "To produce a hash for the parameters where one is an empty string",
619                     "SELECT spooky_hash('Hello, World!', '')"
620                 })
621                 .with_example({
622                     "To produce a hash for the parameters where one is a number",
623                     "SELECT spooky_hash('Hello, World!', 123)"
624                 })
625         ),
626 
627         sqlite_func_adapter<decltype(&sql_gunzip), sql_gunzip>::builder(
628             help_text("gunzip", "Decompress a gzip file")
629                 .sql_function()
630                 .with_parameter(help_text("b", "The blob to decompress")
631                                     .one_or_more())
632                 .with_tags({"string"})
633         ),
634 
635         sqlite_func_adapter<decltype(&sql_gzip), sql_gzip>::builder(
636             help_text("gzip", "Compress a string into a gzip file")
637                 .sql_function()
638                 .with_parameter(help_text("value", "The value to compress")
639                                     .one_or_more())
640                 .with_tags({"string"})
641         ),
642 
643         {nullptr}
644     };
645 
646     static struct FuncDefAgg str_agg_funcs[] = {
647         {"group_spooky_hash", -1, 0,
648             sql_spooky_hash_step, sql_spooky_hash_final,
649             help_text("group_spooky_hash",
650                       "Compute the hash value for the given arguments")
651                 .sql_agg_function()
652                 .with_parameter(help_text("str", "The string to hash")
653                                     .one_or_more())
654                 .with_tags({"string"})
655                 .with_example({
656                     "To produce a hash of all of the values of 'column1'",
657                     "SELECT group_spooky_hash(column1) FROM (VALUES ('abc'), ('123'))"
658                 })
659         },
660 
661         {
662             "sparkline", -1, 0,
663             sparkline_step, sparkline_final,
664         },
665 
666         {nullptr}
667     };
668 
669     *basic_funcs = string_funcs;
670     *agg_funcs = str_agg_funcs;
671 
672     return SQLITE_OK;
673 }
674