1 /*
2  *  Parse a regular expression, and extract a static suffix.
3  *
4  *  Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *  Copyright (C) 2007-2013 Sourcefire, Inc.
6  *
7  *  Authors: Török Edvin
8  *
9  *  This program is free software; you can redistribute it and/or modify
10  *  it under the terms of the GNU General Public License version 2 as
11  *  published by the Free Software Foundation.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program; if not, write to the Free Software
20  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21  *  MA 02110-1301, USA.
22  */
23 #if HAVE_CONFIG_H
24 #include "clamav-config.h"
25 #endif
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <assert.h>
31 
32 #include "clamav.h"
33 #include "others.h"
34 #include "jsparse/textbuf.h"
35 #include "regex_suffix.h"
36 #define MODULE "regex_suffix: "
37 
38 enum node_type {
39     root = 0,
40     concat,
41     alternate, /* | */
42     optional,  /* ?, * */
43     leaf,      /* a character */
44     leaf_class /* character class */
45                /* (x)+ is transformed into (x)*(x) */
46 };
47 
48 struct node {
49     enum node_type type; /* must be first field */
50     struct node *parent;
51     union {
52         struct {
53             struct node *left;
54             struct node *right;
55         } children;
56         uint8_t *leaf_class_bitmap;
57         uint8_t leaf_char;
58     } u;
59 };
60 
61 /* --- Prototypes --*/
62 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex);
63 /* -----------------*/
64 
65 static uint8_t dot_bitmap[32] = {
66     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
70 
make_node(enum node_type type,struct node * left,struct node * right)71 static struct node *make_node(enum node_type type, struct node *left, struct node *right)
72 {
73     struct node *n;
74     if (type == concat) {
75         if (left == NULL)
76             return right;
77         if (right == NULL)
78             return left;
79     }
80     n = cli_malloc(sizeof(*n));
81     if (!n) {
82         cli_errmsg("make_node: Unable to allocate memory for new node\n");
83         return NULL;
84     }
85     n->type             = type;
86     n->parent           = NULL;
87     n->u.children.left  = left;
88     n->u.children.right = right;
89     if (left)
90         left->parent = n;
91     if (right)
92         right->parent = n;
93     return n;
94 }
95 
dup_node(struct node * p)96 static struct node *dup_node(struct node *p)
97 {
98     struct node *node_left, *node_right;
99     struct node *d;
100 
101     if (!p)
102         return NULL;
103     d = cli_malloc(sizeof(*d));
104     if (!d) {
105         cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n");
106         return NULL;
107     }
108     d->type   = p->type;
109     d->parent = NULL;
110     switch (p->type) {
111         case leaf:
112             d->u.leaf_char = p->u.leaf_char;
113             break;
114         case leaf_class:
115             d->u.leaf_class_bitmap = cli_malloc(32);
116             if (!d->u.leaf_class_bitmap) {
117                 cli_errmsg("make_node: Unable to allocate memory for leaf class\n");
118                 free(d);
119                 return NULL;
120             }
121             memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
122             break;
123         default:
124             node_left           = dup_node(p->u.children.left);
125             node_right          = dup_node(p->u.children.right);
126             d->u.children.left  = node_left;
127             d->u.children.right = node_right;
128             if (node_left)
129                 node_left->parent = d;
130             if (node_right)
131                 node_right->parent = d;
132             break;
133     }
134     return d;
135 }
136 
make_charclass(uint8_t * bitmap)137 static struct node *make_charclass(uint8_t *bitmap)
138 {
139     struct node *v = cli_malloc(sizeof(*v));
140     if (!v) {
141         cli_errmsg("make_charclass: Unable to allocate memory for character class\n");
142         return NULL;
143     }
144     v->type                = leaf_class;
145     v->parent              = NULL;
146     v->u.leaf_class_bitmap = bitmap;
147     return v;
148 }
149 
make_leaf(char c)150 static struct node *make_leaf(char c)
151 {
152     struct node *v = cli_malloc(sizeof(*v));
153     if (!v)
154         return NULL;
155     v->type        = leaf;
156     v->parent      = NULL;
157     v->u.leaf_char = c;
158     return v;
159 }
160 
destroy_tree(struct node * n)161 static void destroy_tree(struct node *n)
162 {
163     if (!n)
164         return;
165     switch (n->type) {
166         case concat:
167         case alternate:
168         case optional:
169             destroy_tree(n->u.children.left);
170             destroy_tree(n->u.children.right);
171             break;
172         case leaf_class:
173             if (n->u.leaf_class_bitmap != dot_bitmap)
174                 free(n->u.leaf_class_bitmap);
175             break;
176         case root:
177         case leaf:
178             break;
179     }
180     free(n);
181 }
182 
parse_char_class(const char * pat,size_t * pos)183 static uint8_t *parse_char_class(const char *pat, size_t *pos)
184 {
185     unsigned char range_start = 0;
186     int hasprev               = 0;
187     uint8_t *bitmap           = cli_malloc(32);
188     if (!bitmap) {
189         cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n");
190         return NULL;
191     }
192     if (pat[*pos] == '^') {
193         memset(bitmap, 0xFF, 32); /*match chars not in brackets*/
194         ++*pos;
195     } else
196         memset(bitmap, 0x00, 32);
197     do {
198         /* literal ] can be first character, so test for it at the end of the loop, for example: []] */
199         if (pat[*pos] == '-' && hasprev) {
200             /* it is a range*/
201             unsigned char range_end;
202             unsigned int c;
203             assert(range_start);
204             ++*pos;
205             if (pat[*pos] == '[')
206                 if (pat[*pos + 1] == '.') {
207                     /* collating sequence not handled */
208                     free(bitmap);
209                     /* we are parsing the regex for a
210 					 * filter, be conservative and
211 					 * tell the filter that anything could
212 					 * match here */
213                     while (pat[*pos] != ']') ++*pos;
214                     ++*pos;
215                     while (pat[*pos] != ']') ++*pos;
216                     return dot_bitmap;
217                 } else
218                     range_end = pat[*pos];
219             else
220                 range_end = pat[*pos];
221             for (c = range_start + 1; c <= range_end; c++)
222                 bitmap[c >> 3] ^= 1 << (c & 0x7);
223             hasprev = 0;
224         } else if (pat[*pos] == '[' && pat[*pos] == ':') {
225             /* char class */
226             free(bitmap);
227             while (pat[*pos] != ']') ++*pos;
228             ++*pos;
229             while (pat[*pos] != ']') ++*pos;
230             return dot_bitmap;
231         } else {
232             bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7);
233             range_start = pat[*pos];
234             ++*pos;
235             hasprev = 1;
236         }
237     } while (pat[*pos] != ']');
238     return bitmap;
239 }
240 
parse_regex(const char * p,size_t * last)241 static struct node *parse_regex(const char *p, size_t *last)
242 {
243     struct node *v = NULL;
244     struct node *right;
245     struct node *tmp;
246 
247     while (p[*last] != '$' && p[*last] != '\0') {
248         switch (p[*last]) {
249             case '|':
250                 ++*last;
251                 right = parse_regex(p, last);
252                 v     = make_node(alternate, v, right);
253                 if (!v)
254                     return NULL;
255                 break;
256             case '*':
257             case '?':
258                 v = make_node(optional, v, NULL);
259                 if (!v)
260                     return NULL;
261                 ++*last;
262                 break;
263             case '+':
264                 /* (x)* */
265                 tmp = make_node(optional, v, NULL);
266                 if (!tmp)
267                     return NULL;
268                 /* (x) */
269                 right = dup_node(v);
270                 if (!right) {
271                     free(tmp);
272                     return NULL;
273                 }
274                 /* (x)*(x) => (x)+ */
275                 v = make_node(concat, tmp, right);
276                 if (!v)
277                     return NULL;
278                 ++*last;
279                 break;
280             case '(':
281                 ++*last;
282                 right = parse_regex(p, last);
283                 if (!right)
284                     return NULL;
285                 ++*last;
286                 v = make_node(concat, v, right);
287                 break;
288             case ')':
289                 return v;
290             case '.':
291                 right = make_charclass(dot_bitmap);
292                 if (!right)
293                     return NULL;
294                 v = make_node(concat, v, right);
295                 if (!v)
296                     return NULL;
297                 ++*last;
298                 break;
299             case '[':
300                 ++*last;
301                 right = make_charclass(parse_char_class(p, last));
302                 if (!right)
303                     return NULL;
304                 v = make_node(concat, v, right);
305                 if (!v)
306                     return NULL;
307                 ++*last;
308                 break;
309             case '\\':
310                 /* next char is escaped, advance pointer
311 				 * and let fall-through handle it */
312                 ++*last;
313             default:
314                 right = make_leaf(p[*last]);
315                 v     = make_node(concat, v, right);
316                 if (!v)
317                     return NULL;
318                 ++*last;
319                 break;
320         }
321     }
322     return v;
323 }
324 
325 #define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7)))
326 
build_suffixtree_ascend(struct node * n,struct text_buffer * buf,struct node * prev,suffix_callback cb,void * cbdata,struct regex_list * regex)327 static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex)
328 {
329     size_t i, cnt;
330     while (n) {
331         struct node *q = n;
332         switch (n->type) {
333             case root:
334                 textbuffer_putc(buf, '\0');
335                 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
336                     return CL_EMEM;
337                 return CL_SUCCESS;
338             case leaf:
339                 textbuffer_putc(buf, n->u.leaf_char);
340                 n = n->parent;
341                 break;
342             case leaf_class:
343                 cnt = 0;
344                 for (i = 0; i < 255; i++)
345                     if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
346                         cnt++;
347                 if (cnt > 16) {
348                     textbuffer_putc(buf, '\0');
349                     if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
350                         return CL_EMEM;
351                     return CL_SUCCESS;
352                 }
353                 /* handle small classes by expanding */
354                 for (i = 0; i < 255; i++) {
355                     if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
356                         size_t pos;
357                         pos = buf->pos;
358                         textbuffer_putc(buf, (char)i);
359                         if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS)
360                             return CL_EMEM;
361                         buf->pos = pos;
362                     }
363                 }
364                 return 0;
365             case concat:
366                 if (prev != n->u.children.left) {
367                     if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
368                         return CL_EMEM;
369                     /* we're done here, descend will call
370 					 * ascend if needed */
371                     return CL_SUCCESS;
372                 } else {
373                     n = n->parent;
374                 }
375                 break;
376             case alternate:
377                 n = n->parent;
378                 break;
379             case optional:
380                 textbuffer_putc(buf, '\0');
381                 if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
382                     return CL_EMEM;
383                 return CL_SUCCESS;
384         }
385         prev = q;
386     }
387     return CL_SUCCESS;
388 }
389 
build_suffixtree_descend(struct node * n,struct text_buffer * buf,suffix_callback cb,void * cbdata,struct regex_list * regex)390 static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex)
391 {
392     size_t pos;
393     while (n && n->type == concat) {
394         n = n->u.children.right;
395     }
396     if (!n)
397         return CL_SUCCESS;
398     /* find out end of the regular expression,
399 	 * if it ends with a static pattern */
400     switch (n->type) {
401         case alternate:
402             /* save pos as restart point */
403             pos = buf->pos;
404             if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
405                 return CL_EMEM;
406             buf->pos = pos;
407             if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS)
408                 return CL_EMEM;
409             buf->pos = pos;
410             break;
411         case optional:
412             textbuffer_putc(buf, '\0');
413             if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
414                 return CL_EMEM;
415             return CL_SUCCESS;
416         case leaf:
417         case leaf_class:
418             if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS)
419                 return CL_EMEM;
420             return CL_SUCCESS;
421         default:
422             break;
423     }
424     return CL_SUCCESS;
425 }
426 
cli_regex2suffix(const char * pattern,regex_t * preg,suffix_callback cb,void * cbdata)427 cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata)
428 {
429     struct regex_list regex;
430     struct text_buffer buf;
431     struct node root_node;
432     struct node *n;
433     size_t last = 0;
434     int rc;
435 
436     assert(pattern);
437 
438     regex.preg = preg;
439     rc         = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
440     if (rc) {
441         size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
442         char *errbuf  = cli_malloc(buflen);
443         if (errbuf) {
444             cli_regerror(rc, regex.preg, errbuf, buflen);
445             cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
446             free(errbuf);
447         } else {
448             cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
449         }
450         return rc;
451     }
452     regex.nxt     = NULL;
453     regex.pattern = cli_strdup(pattern);
454 
455     n = parse_regex(pattern, &last);
456     if (!n)
457         return REG_ESPACE;
458     memset(&buf, 0, sizeof(buf));
459     memset(&root_node, 0, sizeof(buf));
460     n->parent = &root_node;
461 
462     rc = build_suffixtree_descend(n, &buf, cb, cbdata, &regex);
463     free(regex.pattern);
464     free(buf.data);
465     destroy_tree(n);
466     return rc;
467 }
468