1 /*
2 * Parse a regular expression, and extract a static suffix.
3 *
4 * Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 * Copyright (C) 2007-2013 Sourcefire, Inc.
6 *
7 * Authors: Török Edvin
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 * MA 02110-1301, USA.
22 */
23 #if HAVE_CONFIG_H
24 #include "clamav-config.h"
25 #endif
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <assert.h>
31
32 #include "clamav.h"
33 #include "others.h"
34 #include "jsparse/textbuf.h"
35 #include "regex_suffix.h"
36 #define MODULE "regex_suffix: "
37
38 enum node_type {
39 root = 0,
40 concat,
41 alternate, /* | */
42 optional, /* ?, * */
43 leaf, /* a character */
44 leaf_class /* character class */
45 /* (x)+ is transformed into (x)*(x) */
46 };
47
48 struct node {
49 enum node_type type; /* must be first field */
50 struct node *parent;
51 union {
52 struct {
53 struct node *left;
54 struct node *right;
55 } children;
56 uint8_t *leaf_class_bitmap;
57 uint8_t leaf_char;
58 } u;
59 };
60
61 /* --- Prototypes --*/
62 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex);
63 /* -----------------*/
64
65 static uint8_t dot_bitmap[32] = {
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
70
make_node(enum node_type type,struct node * left,struct node * right)71 static struct node *make_node(enum node_type type, struct node *left, struct node *right)
72 {
73 struct node *n;
74 if (type == concat) {
75 if (left == NULL)
76 return right;
77 if (right == NULL)
78 return left;
79 }
80 n = cli_malloc(sizeof(*n));
81 if (!n) {
82 cli_errmsg("make_node: Unable to allocate memory for new node\n");
83 return NULL;
84 }
85 n->type = type;
86 n->parent = NULL;
87 n->u.children.left = left;
88 n->u.children.right = right;
89 if (left)
90 left->parent = n;
91 if (right)
92 right->parent = n;
93 return n;
94 }
95
dup_node(struct node * p)96 static struct node *dup_node(struct node *p)
97 {
98 struct node *node_left, *node_right;
99 struct node *d;
100
101 if (!p)
102 return NULL;
103 d = cli_malloc(sizeof(*d));
104 if (!d) {
105 cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n");
106 return NULL;
107 }
108 d->type = p->type;
109 d->parent = NULL;
110 switch (p->type) {
111 case leaf:
112 d->u.leaf_char = p->u.leaf_char;
113 break;
114 case leaf_class:
115 d->u.leaf_class_bitmap = cli_malloc(32);
116 if (!d->u.leaf_class_bitmap) {
117 cli_errmsg("make_node: Unable to allocate memory for leaf class\n");
118 free(d);
119 return NULL;
120 }
121 memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
122 break;
123 default:
124 node_left = dup_node(p->u.children.left);
125 node_right = dup_node(p->u.children.right);
126 d->u.children.left = node_left;
127 d->u.children.right = node_right;
128 if (node_left)
129 node_left->parent = d;
130 if (node_right)
131 node_right->parent = d;
132 break;
133 }
134 return d;
135 }
136
make_charclass(uint8_t * bitmap)137 static struct node *make_charclass(uint8_t *bitmap)
138 {
139 struct node *v = cli_malloc(sizeof(*v));
140 if (!v) {
141 cli_errmsg("make_charclass: Unable to allocate memory for character class\n");
142 return NULL;
143 }
144 v->type = leaf_class;
145 v->parent = NULL;
146 v->u.leaf_class_bitmap = bitmap;
147 return v;
148 }
149
make_leaf(char c)150 static struct node *make_leaf(char c)
151 {
152 struct node *v = cli_malloc(sizeof(*v));
153 if (!v)
154 return NULL;
155 v->type = leaf;
156 v->parent = NULL;
157 v->u.leaf_char = c;
158 return v;
159 }
160
destroy_tree(struct node * n)161 static void destroy_tree(struct node *n)
162 {
163 if (!n)
164 return;
165 switch (n->type) {
166 case concat:
167 case alternate:
168 case optional:
169 destroy_tree(n->u.children.left);
170 destroy_tree(n->u.children.right);
171 break;
172 case leaf_class:
173 if (n->u.leaf_class_bitmap != dot_bitmap)
174 free(n->u.leaf_class_bitmap);
175 break;
176 case root:
177 case leaf:
178 break;
179 }
180 free(n);
181 }
182
parse_char_class(const char * pat,size_t * pos)183 static uint8_t *parse_char_class(const char *pat, size_t *pos)
184 {
185 unsigned char range_start = 0;
186 int hasprev = 0;
187 uint8_t *bitmap = cli_malloc(32);
188 if (!bitmap) {
189 cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n");
190 return NULL;
191 }
192 if (pat[*pos] == '^') {
193 memset(bitmap, 0xFF, 32); /*match chars not in brackets*/
194 ++*pos;
195 } else
196 memset(bitmap, 0x00, 32);
197 do {
198 /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
199 if (pat[*pos] == '-' && hasprev) {
200 /* it is a range*/
201 unsigned char range_end;
202 unsigned int c;
203 assert(range_start);
204 ++*pos;
205 if (pat[*pos] == '[')
206 if (pat[*pos + 1] == '.') {
207 /* collating sequence not handled */
208 free(bitmap);
209 /* we are parsing the regex for a
210 * filter, be conservative and
211 * tell the filter that anything could
212 * match here */
213 while (pat[*pos] != ']') ++*pos;
214 ++*pos;
215 while (pat[*pos] != ']') ++*pos;
216 return dot_bitmap;
217 } else
218 range_end = pat[*pos];
219 else
220 range_end = pat[*pos];
221 for (c = range_start + 1; c <= range_end; c++)
222 bitmap[c >> 3] ^= 1 << (c & 0x7);
223 hasprev = 0;
224 } else if (pat[*pos] == '[' && pat[*pos] == ':') {
225 /* char class */
226 free(bitmap);
227 while (pat[*pos] != ']') ++*pos;
228 ++*pos;
229 while (pat[*pos] != ']') ++*pos;
230 return dot_bitmap;
231 } else {
232 bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7);
233 range_start = pat[*pos];
234 ++*pos;
235 hasprev = 1;
236 }
237 } while (pat[*pos] != ']');
238 return bitmap;
239 }
240
parse_regex(const char * p,size_t * last)241 static struct node *parse_regex(const char *p, size_t *last)
242 {
243 struct node *v = NULL;
244 struct node *right;
245 struct node *tmp;
246
247 while (p[*last] != '$' && p[*last] != '\0') {
248 switch (p[*last]) {
249 case '|':
250 ++*last;
251 right = parse_regex(p, last);
252 v = make_node(alternate, v, right);
253 if (!v)
254 return NULL;
255 break;
256 case '*':
257 case '?':
258 v = make_node(optional, v, NULL);
259 if (!v)
260 return NULL;
261 ++*last;
262 break;
263 case '+':
264 /* (x)* */
265 tmp = make_node(optional, v, NULL);
266 if (!tmp)
267 return NULL;
268 /* (x) */
269 right = dup_node(v);
270 if (!right) {
271 free(tmp);
272 return NULL;
273 }
274 /* (x)*(x) => (x)+ */
275 v = make_node(concat, tmp, right);
276 if (!v)
277 return NULL;
278 ++*last;
279 break;
280 case '(':
281 ++*last;
282 right = parse_regex(p, last);
283 if (!right)
284 return NULL;
285 ++*last;
286 v = make_node(concat, v, right);
287 break;
288 case ')':
289 return v;
290 case '.':
291 right = make_charclass(dot_bitmap);
292 if (!right)
293 return NULL;
294 v = make_node(concat, v, right);
295 if (!v)
296 return NULL;
297 ++*last;
298 break;
299 case '[':
300 ++*last;
301 right = make_charclass(parse_char_class(p, last));
302 if (!right)
303 return NULL;
304 v = make_node(concat, v, right);
305 if (!v)
306 return NULL;
307 ++*last;
308 break;
309 case '\\':
310 /* next char is escaped, advance pointer
311 * and let fall-through handle it */
312 ++*last;
313 default:
314 right = make_leaf(p[*last]);
315 v = make_node(concat, v, right);
316 if (!v)
317 return NULL;
318 ++*last;
319 break;
320 }
321 }
322 return v;
323 }
324
325 #define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7)))
326
build_suffixtree_ascend(struct node * n,struct text_buffer * buf,struct node * prev,suffix_callback cb,void * cbdata,struct regex_list * regex)327 static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex)
328 {
329 size_t i, cnt;
330 while (n) {
331 struct node *q = n;
332 switch (n->type) {
333 case root:
334 textbuffer_putc(buf, '\0');
335 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
336 return CL_EMEM;
337 return CL_SUCCESS;
338 case leaf:
339 textbuffer_putc(buf, n->u.leaf_char);
340 n = n->parent;
341 break;
342 case leaf_class:
343 cnt = 0;
344 for (i = 0; i < 255; i++)
345 if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
346 cnt++;
347 if (cnt > 16) {
348 textbuffer_putc(buf, '\0');
349 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
350 return CL_EMEM;
351 return CL_SUCCESS;
352 }
353 /* handle small classes by expanding */
354 for (i = 0; i < 255; i++) {
355 if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
356 size_t pos;
357 pos = buf->pos;
358 textbuffer_putc(buf, (char)i);
359 if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS)
360 return CL_EMEM;
361 buf->pos = pos;
362 }
363 }
364 return 0;
365 case concat:
366 if (prev != n->u.children.left) {
367 if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
368 return CL_EMEM;
369 /* we're done here, descend will call
370 * ascend if needed */
371 return CL_SUCCESS;
372 } else {
373 n = n->parent;
374 }
375 break;
376 case alternate:
377 n = n->parent;
378 break;
379 case optional:
380 textbuffer_putc(buf, '\0');
381 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
382 return CL_EMEM;
383 return CL_SUCCESS;
384 }
385 prev = q;
386 }
387 return CL_SUCCESS;
388 }
389
build_suffixtree_descend(struct node * n,struct text_buffer * buf,suffix_callback cb,void * cbdata,struct regex_list * regex)390 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex)
391 {
392 size_t pos;
393 while (n && n->type == concat) {
394 n = n->u.children.right;
395 }
396 if (!n)
397 return CL_SUCCESS;
398 /* find out end of the regular expression,
399 * if it ends with a static pattern */
400 switch (n->type) {
401 case alternate:
402 /* save pos as restart point */
403 pos = buf->pos;
404 if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
405 return CL_EMEM;
406 buf->pos = pos;
407 if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS)
408 return CL_EMEM;
409 buf->pos = pos;
410 break;
411 case optional:
412 textbuffer_putc(buf, '\0');
413 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
414 return CL_EMEM;
415 return CL_SUCCESS;
416 case leaf:
417 case leaf_class:
418 if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS)
419 return CL_EMEM;
420 return CL_SUCCESS;
421 default:
422 break;
423 }
424 return CL_SUCCESS;
425 }
426
cli_regex2suffix(const char * pattern,regex_t * preg,suffix_callback cb,void * cbdata)427 cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata)
428 {
429 struct regex_list regex;
430 struct text_buffer buf;
431 struct node root_node;
432 struct node *n;
433 size_t last = 0;
434 int rc;
435
436 assert(pattern);
437
438 regex.preg = preg;
439 rc = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
440 if (rc) {
441 size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
442 char *errbuf = cli_malloc(buflen);
443 if (errbuf) {
444 cli_regerror(rc, regex.preg, errbuf, buflen);
445 cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
446 free(errbuf);
447 } else {
448 cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
449 }
450 return rc;
451 }
452 regex.nxt = NULL;
453 regex.pattern = cli_strdup(pattern);
454
455 n = parse_regex(pattern, &last);
456 if (!n)
457 return REG_ESPACE;
458 memset(&buf, 0, sizeof(buf));
459 memset(&root_node, 0, sizeof(buf));
460 n->parent = &root_node;
461
462 rc = build_suffixtree_descend(n, &buf, cb, cbdata, ®ex);
463 free(regex.pattern);
464 free(buf.data);
465 destroy_tree(n);
466 return rc;
467 }
468