1 /**
2  * Copyright (c) 2017, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #include "config.h"
31 
32 #include "base/lnav_log.hh"
33 #include "pcrepp/pcrepp.hh"
34 #include "sql_util.hh"
35 #include "vtab_module.hh"
36 
37 using namespace std;
38 
/**
 * Column numbers of the regexp_capture table.  The values follow the order in
 * which the columns are declared in regexp_capture::CREATE_STMT and are the
 * numbers compared against the column index in get_column() and against
 * iColumn in rcBestIndex().
 */
enum {
    RC_COL_MATCH_INDEX = 0,    // match_index
    RC_COL_INDEX = 1,          // capture_index
    RC_COL_NAME = 2,           // capture_name
    RC_COL_CAPTURE_COUNT = 3,  // capture_count
    RC_COL_RANGE_START = 4,    // range_start
    RC_COL_RANGE_STOP = 5,     // range_stop
    RC_COL_CONTENT = 6,        // content
    RC_COL_VALUE = 7,          // value (HIDDEN)
    RC_COL_PATTERN = 8,        // pattern (HIDDEN)
};
50 
51 struct regexp_capture {
52     static constexpr const char *NAME = "regexp_capture";
53     static constexpr const char *CREATE_STMT = R"(
54 -- The regexp_capture() table-valued function allows you to execute a regular-
55 -- expression over a given string and get the captured data as rows in a table.
56 CREATE TABLE regexp_capture (
57     match_index integer,
58     capture_index integer,
59     capture_name text,
60     capture_count integer,
61     range_start integer,
62     range_stop integer,
63     content text,
64     value text HIDDEN,
65     pattern text HIDDEN
66 );
67 )";
68 
69     struct cursor {
70         sqlite3_vtab_cursor base;
71         pcrepp c_pattern;
72         pcre_context_static<30> c_context;
73         unique_ptr<pcre_input> c_input;
74         string c_content;
75         bool c_content_as_blob{false};
76         int c_index;
77         int c_start_index;
78         bool c_matched{false};
79         int c_match_index;
80         sqlite3_int64 c_rowid;
81 
cursorregexp_capture::cursor82         cursor(sqlite3_vtab *vt)
83                 : base({vt}),
84                   c_index(0),
85                   c_start_index(0),
86                   c_match_index(0),
87                   c_rowid(0) {
88             this->c_context.set_count(0);
89         };
90 
resetregexp_capture::cursor91         int reset() {
92             return SQLITE_OK;
93         };
94 
nextregexp_capture::cursor95         int next() {
96             if (this->c_index >= (this->c_context.get_count() - 1)) {
97                 this->c_input->pi_offset = this->c_input->pi_next_offset;
98                 this->c_matched = this->c_pattern.match(this->c_context, *(this->c_input));
99                 this->c_index = -1;
100                 this->c_match_index += 1;
101             }
102 
103             if (this->c_pattern.empty() || !this->c_matched) {
104                 return SQLITE_OK;
105             }
106 
107             this->c_index += 1;
108 
109             return SQLITE_OK;
110         };
111 
eofregexp_capture::cursor112         int eof() {
113             return this->c_pattern.empty() || !this->c_matched;
114         };
115 
get_rowidregexp_capture::cursor116         int get_rowid(sqlite3_int64 &rowid_out) {
117             rowid_out = this->c_rowid;
118 
119             return SQLITE_OK;
120         };
121     };
122 
get_columnregexp_capture123     int get_column(const cursor &vc, sqlite3_context *ctx, int col) {
124         pcre_context::capture_t &cap = vc.c_context.all()[vc.c_index];
125 
126         switch (col) {
127             case RC_COL_MATCH_INDEX:
128                 sqlite3_result_int64(ctx, vc.c_match_index);
129                 break;
130             case RC_COL_INDEX:
131                 sqlite3_result_int64(ctx, vc.c_index);
132                 break;
133             case RC_COL_NAME:
134                 if (vc.c_index == 0) {
135                     sqlite3_result_null(ctx);
136                 } else {
137                     sqlite3_result_text(ctx, vc.c_pattern.name_for_capture(
138                         vc.c_index - 1), -1, SQLITE_TRANSIENT);
139                 }
140                 break;
141             case RC_COL_CAPTURE_COUNT:
142                 sqlite3_result_int64(ctx, vc.c_context.get_count());
143                 break;
144             case RC_COL_RANGE_START:
145                 sqlite3_result_int64(ctx, cap.c_begin + 1);
146                 break;
147             case RC_COL_RANGE_STOP:
148                 sqlite3_result_int64(ctx, cap.c_end + 1);
149                 break;
150             case RC_COL_CONTENT:
151                 if (cap.is_valid()) {
152                     sqlite3_result_text(ctx,
153                                         vc.c_input->get_substr_start(&cap),
154                                         cap.length(),
155                                         SQLITE_TRANSIENT);
156                 } else {
157                     sqlite3_result_null(ctx);
158                 }
159                 break;
160             case RC_COL_VALUE:
161                 if (vc.c_content_as_blob) {
162                     sqlite3_result_blob64(ctx,
163                                           vc.c_content.c_str(),
164                                           vc.c_content.length(),
165                                           SQLITE_STATIC);
166                 } else {
167                     sqlite3_result_text(ctx,
168                                         vc.c_content.c_str(),
169                                         vc.c_content.length(),
170                                         SQLITE_STATIC);
171                 }
172                 break;
173             case RC_COL_PATTERN: {
174                 auto str = vc.c_pattern.get_pattern();
175 
176                 sqlite3_result_text(ctx, str.c_str(), str.length(),
177                                     SQLITE_TRANSIENT);
178                 break;
179             }
180         }
181 
182         return SQLITE_OK;
183     }
184 };
185 
rcBestIndex(sqlite3_vtab * tab,sqlite3_index_info * pIdxInfo)186 static int rcBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo)
187 {
188     vtab_index_constraints vic(pIdxInfo);
189     vtab_index_usage viu(pIdxInfo);
190 
191     for (auto iter = vic.begin(); iter != vic.end(); ++iter) {
192         if (iter->op != SQLITE_INDEX_CONSTRAINT_EQ) {
193             continue;
194         }
195 
196         switch (iter->iColumn) {
197             case RC_COL_VALUE:
198             case RC_COL_PATTERN:
199                 viu.column_used(iter);
200                 break;
201         }
202     }
203 
204     viu.allocate_args(2);
205     return SQLITE_OK;
206 }
207 
rcFilter(sqlite3_vtab_cursor * pVtabCursor,int idxNum,const char * idxStr,int argc,sqlite3_value ** argv)208 static int rcFilter(sqlite3_vtab_cursor *pVtabCursor,
209                     int idxNum, const char *idxStr,
210                     int argc, sqlite3_value **argv)
211 {
212     regexp_capture::cursor *pCur = (regexp_capture::cursor *)pVtabCursor;
213 
214     if (argc != 2) {
215         pCur->c_content.clear();
216         pCur->c_pattern.clear();
217         return SQLITE_OK;
218     }
219 
220     auto byte_count = sqlite3_value_bytes(argv[0]);
221     auto blob = (const char *) sqlite3_value_blob(argv[0]);
222 
223     pCur->c_content_as_blob = (sqlite3_value_type(argv[0]) == SQLITE_BLOB);
224     pCur->c_content.assign(blob, byte_count);
225 
226     const char *pattern = (const char *) sqlite3_value_text(argv[1]);
227     auto re_res = pcrepp::from_str(pattern);
228     if (re_res.isErr()) {
229         pVtabCursor->pVtab->zErrMsg = sqlite3_mprintf(
230             "Invalid regular expression: %s", re_res.unwrapErr().ce_msg);
231         return SQLITE_ERROR;
232     }
233 
234     pCur->c_pattern = re_res.unwrap();
235 
236     pCur->c_index = 0;
237     pCur->c_context.set_count(0);
238 
239     pCur->c_input = make_unique<pcre_input>(pCur->c_content);
240     pCur->c_matched = pCur->c_pattern.match(pCur->c_context, *(pCur->c_input));
241 
242     log_debug("matched %d", pCur->c_matched);
243 
244     return SQLITE_OK;
245 }
246 
register_regexp_vtab(sqlite3 * db)247 int register_regexp_vtab(sqlite3 *db)
248 {
249     static vtab_module<tvt_no_update<regexp_capture>> REGEXP_CAPTURE_MODULE;
250     static help_text regexp_capture_help = help_text("regexp_capture",
251         "A table-valued function that executes a regular-expression over a "
252         "string and returns the captured values.  If the regex only matches a "
253         "subset of the input string, it will be rerun on the remaining parts "
254         "of the string until no more matches are found.")
255         .sql_table_valued_function()
256         .with_parameter({"string",
257                          "The string to match against the given pattern."})
258         .with_parameter({"pattern",
259                          "The regular expression to match."})
260         .with_result({"match_index",
261                       "The match iteration.  This value will increase "
262                       "each time a new match is found in the input string."})
263         .with_result({"capture_index",
264                       "The index of the capture in the regex."})
265         .with_result({"capture_name",
266                       "The name of the capture in the regex."})
267         .with_result({"capture_count",
268                       "The total number of captures in the regex."})
269         .with_result({"range_start",
270                       "The start of the capture in the input string."})
271         .with_result({"range_stop",
272                       "The stop of the capture in the input string."})
273         .with_result({"content",
274                       "The captured value from the string."})
275         .with_tags({"string"})
276         .with_example({
277             "To extract the key/value pairs 'a'/1 and 'b'/2 from the string 'a=1; b=2'",
278             "SELECT * FROM regexp_capture('a=1; b=2', '(\\w+)=(\\d+)')"
279         });
280 
281     int rc;
282 
283     REGEXP_CAPTURE_MODULE.vm_module.xBestIndex = rcBestIndex;
284     REGEXP_CAPTURE_MODULE.vm_module.xFilter = rcFilter;
285 
286     rc = REGEXP_CAPTURE_MODULE.create(db, "regexp_capture");
287     sqlite_function_help.insert(make_pair("regexp_capture", &regexp_capture_help));
288     regexp_capture_help.index_tags();
289 
290     ensure(rc == SQLITE_OK);
291 
292     return rc;
293 }
294