1 /**
2 * Copyright (c) 2017, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #include "config.h"
31
32 #include "base/lnav_log.hh"
33 #include "pcrepp/pcrepp.hh"
34 #include "sql_util.hh"
35 #include "vtab_module.hh"
36
37 using namespace std;
38
/**
 * Column indexes of the regexp_capture virtual table.
 *
 * The values follow the order in which the columns are declared in
 * regexp_capture::CREATE_STMT and are what regexp_capture::get_column() and
 * rcBestIndex() compare SQLite's column numbers against.
 */
enum {
    RC_COL_MATCH_INDEX = 0,    // match_index
    RC_COL_INDEX = 1,          // capture_index
    RC_COL_NAME = 2,           // capture_name
    RC_COL_CAPTURE_COUNT = 3,  // capture_count
    RC_COL_RANGE_START = 4,    // range_start
    RC_COL_RANGE_STOP = 5,     // range_stop
    RC_COL_CONTENT = 6,        // content
    RC_COL_VALUE = 7,          // value (HIDDEN: the input string argument)
    RC_COL_PATTERN = 8,        // pattern (HIDDEN: the regex argument)
};
50
51 struct regexp_capture {
52 static constexpr const char *NAME = "regexp_capture";
53 static constexpr const char *CREATE_STMT = R"(
54 -- The regexp_capture() table-valued function allows you to execute a regular-
55 -- expression over a given string and get the captured data as rows in a table.
56 CREATE TABLE regexp_capture (
57 match_index integer,
58 capture_index integer,
59 capture_name text,
60 capture_count integer,
61 range_start integer,
62 range_stop integer,
63 content text,
64 value text HIDDEN,
65 pattern text HIDDEN
66 );
67 )";
68
69 struct cursor {
70 sqlite3_vtab_cursor base;
71 pcrepp c_pattern;
72 pcre_context_static<30> c_context;
73 unique_ptr<pcre_input> c_input;
74 string c_content;
75 bool c_content_as_blob{false};
76 int c_index;
77 int c_start_index;
78 bool c_matched{false};
79 int c_match_index;
80 sqlite3_int64 c_rowid;
81
cursorregexp_capture::cursor82 cursor(sqlite3_vtab *vt)
83 : base({vt}),
84 c_index(0),
85 c_start_index(0),
86 c_match_index(0),
87 c_rowid(0) {
88 this->c_context.set_count(0);
89 };
90
resetregexp_capture::cursor91 int reset() {
92 return SQLITE_OK;
93 };
94
nextregexp_capture::cursor95 int next() {
96 if (this->c_index >= (this->c_context.get_count() - 1)) {
97 this->c_input->pi_offset = this->c_input->pi_next_offset;
98 this->c_matched = this->c_pattern.match(this->c_context, *(this->c_input));
99 this->c_index = -1;
100 this->c_match_index += 1;
101 }
102
103 if (this->c_pattern.empty() || !this->c_matched) {
104 return SQLITE_OK;
105 }
106
107 this->c_index += 1;
108
109 return SQLITE_OK;
110 };
111
eofregexp_capture::cursor112 int eof() {
113 return this->c_pattern.empty() || !this->c_matched;
114 };
115
get_rowidregexp_capture::cursor116 int get_rowid(sqlite3_int64 &rowid_out) {
117 rowid_out = this->c_rowid;
118
119 return SQLITE_OK;
120 };
121 };
122
get_columnregexp_capture123 int get_column(const cursor &vc, sqlite3_context *ctx, int col) {
124 pcre_context::capture_t &cap = vc.c_context.all()[vc.c_index];
125
126 switch (col) {
127 case RC_COL_MATCH_INDEX:
128 sqlite3_result_int64(ctx, vc.c_match_index);
129 break;
130 case RC_COL_INDEX:
131 sqlite3_result_int64(ctx, vc.c_index);
132 break;
133 case RC_COL_NAME:
134 if (vc.c_index == 0) {
135 sqlite3_result_null(ctx);
136 } else {
137 sqlite3_result_text(ctx, vc.c_pattern.name_for_capture(
138 vc.c_index - 1), -1, SQLITE_TRANSIENT);
139 }
140 break;
141 case RC_COL_CAPTURE_COUNT:
142 sqlite3_result_int64(ctx, vc.c_context.get_count());
143 break;
144 case RC_COL_RANGE_START:
145 sqlite3_result_int64(ctx, cap.c_begin + 1);
146 break;
147 case RC_COL_RANGE_STOP:
148 sqlite3_result_int64(ctx, cap.c_end + 1);
149 break;
150 case RC_COL_CONTENT:
151 if (cap.is_valid()) {
152 sqlite3_result_text(ctx,
153 vc.c_input->get_substr_start(&cap),
154 cap.length(),
155 SQLITE_TRANSIENT);
156 } else {
157 sqlite3_result_null(ctx);
158 }
159 break;
160 case RC_COL_VALUE:
161 if (vc.c_content_as_blob) {
162 sqlite3_result_blob64(ctx,
163 vc.c_content.c_str(),
164 vc.c_content.length(),
165 SQLITE_STATIC);
166 } else {
167 sqlite3_result_text(ctx,
168 vc.c_content.c_str(),
169 vc.c_content.length(),
170 SQLITE_STATIC);
171 }
172 break;
173 case RC_COL_PATTERN: {
174 auto str = vc.c_pattern.get_pattern();
175
176 sqlite3_result_text(ctx, str.c_str(), str.length(),
177 SQLITE_TRANSIENT);
178 break;
179 }
180 }
181
182 return SQLITE_OK;
183 }
184 };
185
rcBestIndex(sqlite3_vtab * tab,sqlite3_index_info * pIdxInfo)186 static int rcBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo)
187 {
188 vtab_index_constraints vic(pIdxInfo);
189 vtab_index_usage viu(pIdxInfo);
190
191 for (auto iter = vic.begin(); iter != vic.end(); ++iter) {
192 if (iter->op != SQLITE_INDEX_CONSTRAINT_EQ) {
193 continue;
194 }
195
196 switch (iter->iColumn) {
197 case RC_COL_VALUE:
198 case RC_COL_PATTERN:
199 viu.column_used(iter);
200 break;
201 }
202 }
203
204 viu.allocate_args(2);
205 return SQLITE_OK;
206 }
207
rcFilter(sqlite3_vtab_cursor * pVtabCursor,int idxNum,const char * idxStr,int argc,sqlite3_value ** argv)208 static int rcFilter(sqlite3_vtab_cursor *pVtabCursor,
209 int idxNum, const char *idxStr,
210 int argc, sqlite3_value **argv)
211 {
212 regexp_capture::cursor *pCur = (regexp_capture::cursor *)pVtabCursor;
213
214 if (argc != 2) {
215 pCur->c_content.clear();
216 pCur->c_pattern.clear();
217 return SQLITE_OK;
218 }
219
220 auto byte_count = sqlite3_value_bytes(argv[0]);
221 auto blob = (const char *) sqlite3_value_blob(argv[0]);
222
223 pCur->c_content_as_blob = (sqlite3_value_type(argv[0]) == SQLITE_BLOB);
224 pCur->c_content.assign(blob, byte_count);
225
226 const char *pattern = (const char *) sqlite3_value_text(argv[1]);
227 auto re_res = pcrepp::from_str(pattern);
228 if (re_res.isErr()) {
229 pVtabCursor->pVtab->zErrMsg = sqlite3_mprintf(
230 "Invalid regular expression: %s", re_res.unwrapErr().ce_msg);
231 return SQLITE_ERROR;
232 }
233
234 pCur->c_pattern = re_res.unwrap();
235
236 pCur->c_index = 0;
237 pCur->c_context.set_count(0);
238
239 pCur->c_input = make_unique<pcre_input>(pCur->c_content);
240 pCur->c_matched = pCur->c_pattern.match(pCur->c_context, *(pCur->c_input));
241
242 log_debug("matched %d", pCur->c_matched);
243
244 return SQLITE_OK;
245 }
246
register_regexp_vtab(sqlite3 * db)247 int register_regexp_vtab(sqlite3 *db)
248 {
249 static vtab_module<tvt_no_update<regexp_capture>> REGEXP_CAPTURE_MODULE;
250 static help_text regexp_capture_help = help_text("regexp_capture",
251 "A table-valued function that executes a regular-expression over a "
252 "string and returns the captured values. If the regex only matches a "
253 "subset of the input string, it will be rerun on the remaining parts "
254 "of the string until no more matches are found.")
255 .sql_table_valued_function()
256 .with_parameter({"string",
257 "The string to match against the given pattern."})
258 .with_parameter({"pattern",
259 "The regular expression to match."})
260 .with_result({"match_index",
261 "The match iteration. This value will increase "
262 "each time a new match is found in the input string."})
263 .with_result({"capture_index",
264 "The index of the capture in the regex."})
265 .with_result({"capture_name",
266 "The name of the capture in the regex."})
267 .with_result({"capture_count",
268 "The total number of captures in the regex."})
269 .with_result({"range_start",
270 "The start of the capture in the input string."})
271 .with_result({"range_stop",
272 "The stop of the capture in the input string."})
273 .with_result({"content",
274 "The captured value from the string."})
275 .with_tags({"string"})
276 .with_example({
277 "To extract the key/value pairs 'a'/1 and 'b'/2 from the string 'a=1; b=2'",
278 "SELECT * FROM regexp_capture('a=1; b=2', '(\\w+)=(\\d+)')"
279 });
280
281 int rc;
282
283 REGEXP_CAPTURE_MODULE.vm_module.xBestIndex = rcBestIndex;
284 REGEXP_CAPTURE_MODULE.vm_module.xFilter = rcFilter;
285
286 rc = REGEXP_CAPTURE_MODULE.create(db, "regexp_capture");
287 sqlite_function_help.insert(make_pair("regexp_capture", ®exp_capture_help));
288 regexp_capture_help.index_tags();
289
290 ensure(rc == SQLITE_OK);
291
292 return rc;
293 }
294