1 /*
2 * Small POSIX-only regex engine.
3 *
4 * Copyright (c) 2009 Marko Kreen
5 *
6 * Permission to use, copy, modify, and/or distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 /*
20 * Simple recursive matcher, only features are small size
21 * and POSIX compatibility.
22 *
23 * ERE syntax: . * ^ $ [] [[:cname:]] () {} | + ?
24 * BRE syntax: . * ^ $ [] [[:cname:]] \(\) \{\} \1-9
25 *
26 * With REG_RELAXED_SYNTAX, following common escapes will be available:
27 * \b\B\d\D\s\S\w\W BRE: \| ERE: \1-9
28 *
29 * With REG_RELAXED_MATCHING it returns the first match found after applying
30 * leftmost-longest to all elements. It skips the combinatorics to turn it
31 * into guaranteed-longest match.
32 *
33 * Skipped POSIX features:
34 * - collation classes: [[. .]]
35 * - equivalence classes: [[= =]]
36 * - char ranges by locale order: [a-z] (byte order will be used)
37 * - multi-byte chars: UTF-8
38 */
39
40 #include <usual/regex.h>
41
42 #ifndef USE_SYSTEM_REGEX
43
44 #include <usual/mempool.h>
45 #include <usual/ctype.h>
46 #include <string.h>
47 #include <stdio.h>
48
49 #undef STRICT
50
51 /* either dynamic or static decision */
52 #define STRICT (ctx->strict)
53
54 /* how many regmatch_t can be reported */
55 #define MAX_GROUPS 128
56
57 /* max count we want to store, means 'infinite' for simple atoms */
58 #define MAX_COUNT 0x7fff
59
60 /* max count for simple atoms: char, any or class */
61 #define SIMPLE_MAXCNT(op) (((op)->maxcnt == MAX_COUNT) ? 0x7FFFFFFF : (op)->maxcnt)
62
63 #define is_word(c) (isalnum(c) || (c) == '_')
64
65 struct Op;
66 struct ExecCtx;
67 struct GMatch;
68
69 /* Operation type */
70 enum OpType {
71 /* ops that take count */
72 OP_CHAR,
73 OP_ANY,
74 OP_CLASS,
75 OP_GROUP,
76 OP_BREF,
77 /* ops that dont take count */
78 OP_BOL,
79 OP_EOL,
80 OP_WCHANGE,
81 OP_NWCHANGE,
82 OP_GMATCH,
83 OP_FULLMATCH,
84 };
85 #define NONCOUNT_OPS_START OP_BOL
86
87 /* regex_t->internal */
88 struct RegexInt {
89 struct Op *root;
90 struct Op *glist;
91 struct MemPool *pool;
92 int flags;
93 };
94
95 /* match function and its setter */
96 typedef int (*matcher_f)(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm);
97 static void set_op_type(struct Op *op, enum OpType op_type);
98
99 /* List of tokens to be AND-ed together */
100 struct AndList {
101 struct AndList *next;
102 struct Op *op_list;
103 };
104
105 /* extra data for group Op */
106 struct GroupData {
107 struct Op *parent; /* parent group or NULL for first group */
108 struct AndList *or_list;/* alternative AndLists */
109 struct Op *glist_prev; /* prev group Op */
110 bool has_refs; /* if bref references it */
111 };
112
113 /* char class data */
114 struct ClassData {
115 uint32_t bitmap[256 / 32];
116 };
117
118 /* operation data */
119 struct Op {
120 struct Op *next;
121 matcher_f matcher;
122 uint16_t mincnt;
123 uint16_t maxcnt;
124 uint8_t type;
125 union {
126 uint8_t grp_no; /* OP_GROUP: group nr, 0-toplevel */
127 char lit; /* OP_CHAR */
128 uint8_t bref; /* OP_BREF */
129 };
130 union {
131 struct ClassData cdata;
132 struct GroupData gdata;
133 };
134 };
135 #define OP_BASE (offsetof(struct Op, cdata))
136
137 /*
138 * Operations on ClassData
139 */
140
class_isset(const struct ClassData * cd,unsigned char c)141 static bool class_isset(const struct ClassData *cd, unsigned char c)
142 {
143 return cd->bitmap[c / 32] & (1 << (c % 32));
144 }
145
class_set(struct ClassData * cd,unsigned char c)146 static void class_set(struct ClassData *cd, unsigned char c)
147 {
148 cd->bitmap[c / 32] |= (1 << (c % 32));
149 }
150
class_negate(struct ClassData * cd)151 static void class_negate(struct ClassData *cd)
152 {
153 int i;
154 class_set(cd, 0);
155 for (i = 0; i < 256/32; i++) cd->bitmap[i] ^= -1;
156 }
157
158 /*
159 * Parsing code
160 */
161
162 /* top-level context */
163 struct ParseCtx {
164 regex_t *rx;
165 struct RegexInt *rxi;
166 struct Op *last_group;
167 struct AndList *last_andlist;
168 struct Op *last_elem; /* last op in current OR branch */
169 bool gotcnt; /* count was attached to last op */
170 bool strict; /* strict syntax */
171 };
172
new_andlist(struct ParseCtx * ctx,struct Op * g)173 static struct AndList *new_andlist(struct ParseCtx *ctx, struct Op *g)
174 {
175 struct AndList *al = mempool_alloc(&ctx->rxi->pool, sizeof(*al));
176 if (!al)
177 return NULL;
178 if (ctx->last_andlist) {
179 ctx->last_andlist->next = al;
180 } else {
181 g->gdata.or_list = al;
182 }
183 ctx->last_andlist = al;
184 return al;
185 }
186
new_op(struct ParseCtx * ctx,enum OpType t,int extra)187 static struct Op *new_op(struct ParseCtx *ctx, enum OpType t, int extra)
188 {
189 struct Op *op = mempool_alloc(&ctx->rxi->pool, OP_BASE + extra);
190 if (!op)
191 return NULL;
192 set_op_type(op, t);
193 op->mincnt = op->maxcnt = 1;
194 ctx->gotcnt = false;
195
196 /* append */
197 if (ctx->last_elem) {
198 ctx->last_elem->next = op;
199 } else if (ctx->last_andlist) {
200 ctx->last_andlist->op_list = op;
201 } else if (ctx->last_group) {
202 struct AndList *alist;
203 alist = new_andlist(ctx, ctx->last_group);
204 if (!alist)
205 return NULL;
206 alist->op_list = op;
207 }
208 ctx->last_elem = op;
209
210 if (t == OP_GROUP) {
211 struct Op *parent = ctx->last_group;
212 int gno = ++ctx->rx->re_nsub;
213 op->grp_no = gno;
214 op->gdata.parent = parent;
215 op->gdata.glist_prev = ctx->rxi->glist;
216 ctx->rxi->glist = op;
217 ctx->last_group = op;
218 ctx->last_andlist = NULL;
219 ctx->last_elem = NULL;
220 if (!ctx->rxi->root)
221 ctx->rxi->root = op;
222 }
223 return op;
224 }
225
op_char(struct ParseCtx * ctx,unsigned c)226 static int op_char(struct ParseCtx *ctx, unsigned c)
227 {
228 struct Op *op = new_op(ctx, OP_CHAR, 0);
229 if (!op)
230 return REG_ESPACE;
231 op->lit = c;
232 if ((ctx->rxi->flags & REG_ICASE) && isalpha(c))
233 op->lit = tolower(c);
234 return 0;
235 }
236
op_bref(struct ParseCtx * ctx,unsigned c)237 static int op_bref(struct ParseCtx *ctx, unsigned c)
238 {
239 struct Op *g, *op;
240
241 op = new_op(ctx, OP_BREF, 0);
242 if (!op)
243 return REG_ESPACE;
244 op->bref = c - '0';
245
246 /* check if valid ref */
247 for (g = ctx->last_group; g; g = g->gdata.parent) {
248 if (g->grp_no == op->bref)
249 return REG_ESUBREG;
250 }
251 /* tag the group as referenced */
252 for (g = ctx->rxi->glist; g; g = g->gdata.glist_prev) {
253 if (g->grp_no == op->bref) {
254 g->gdata.has_refs = true;
255 return 0;
256 }
257 }
258 return REG_ESUBREG;
259 }
260
op_simple(struct ParseCtx * ctx,enum OpType t)261 static int op_simple(struct ParseCtx *ctx, enum OpType t)
262 {
263 struct Op *op = new_op(ctx, t, 0);
264 if (!op)
265 return REG_ESPACE;
266 return 0;
267 }
268
op_count_simple(struct ParseCtx * ctx,int min,int max)269 static int op_count_simple(struct ParseCtx *ctx, int min, int max)
270 {
271 struct Op *op = ctx->last_elem;
272 if (!op || ctx->gotcnt)
273 return REG_BADRPT;
274 if (op->type >= NONCOUNT_OPS_START)
275 return REG_BADRPT;
276 ctx->gotcnt = true;
277 op->mincnt = min;
278 op->maxcnt = max;
279 return 0;
280 }
281
op_count_full(struct ParseCtx * ctx,const char ** re)282 static int op_count_full(struct ParseCtx *ctx, const char **re)
283 {
284 unsigned a, b;
285 char *end = (char *)*re;
286 bool ext = ctx->rxi->flags & REG_EXTENDED;
287 int err;
288
289 /* apply sanity check */
290 err = op_count_simple(ctx, 1, 1);
291 if (err)
292 return err;
293
294 /* parse */
295 a = b = strtoul(*re, &end, 10);
296 if (end == *re)
297 return REG_EBRACE;
298 if (*end == ',') {
299 *re = end + 1;
300 end = (char*)*re;
301 b = strtoul(*re, &end, 10);
302 if (end == *re)
303 b = MAX_COUNT;
304 }
305 if (a > b || b > MAX_COUNT || a >= MAX_COUNT)
306 return REG_BADBR;
307
308 /* check for correct termination */
309 if (ext && end[0] == '}') {
310 *re = end + 1;
311 goto done;
312 } else if (!ext && end[0] == '\\' && end[1] == '}') {
313 *re = end + 2;
314 goto done;
315 }
316
317 /* bad fmt, decide between error codes */
318 return strchr(end, '}') ? REG_BADBR : REG_EBRACE;
319
320 done:
321 ctx->last_elem->mincnt = a;
322 ctx->last_elem->maxcnt = b;
323 return 0;
324 }
325
op_gstart(struct ParseCtx * ctx)326 static int op_gstart(struct ParseCtx *ctx)
327 {
328 struct Op *op;
329 op = new_op(ctx, OP_GROUP, sizeof(struct GroupData));
330 if (!op)
331 return REG_ESPACE;
332 if (op->grp_no >= MAX_GROUPS)
333 return REG_BADPAT;
334 return 0;
335 }
336
finish_branch(struct ParseCtx * ctx)337 static int finish_branch(struct ParseCtx *ctx)
338 {
339 int err;
340
341 /* disallow empty OR fragments, but not empty groups */
342 if (!ctx->last_elem && ctx->last_andlist && STRICT)
343 return REG_BADPAT;
344
345 if (ctx->last_group->gdata.parent)
346 err = op_simple(ctx, OP_GMATCH);
347 else
348 err = op_simple(ctx, OP_FULLMATCH);
349 if (err)
350 return err;
351 ctx->last_elem = NULL;
352 return 0;
353 }
354
op_gend(struct ParseCtx * ctx)355 static int op_gend(struct ParseCtx *ctx)
356 {
357 struct Op *op = ctx->last_group;
358 struct AndList *alist;
359 int err;
360
361 if (!op)
362 return REG_EPAREN;
363
364 err = finish_branch(ctx);
365 if (err)
366 return err;
367 ctx->last_group = op->gdata.parent;
368 ctx->last_elem = op;
369
370 /* recover previous andlist... */
371 alist = ctx->last_group->gdata.or_list;
372 while (alist && alist->next)
373 alist = alist->next;
374 ctx->last_andlist = alist;
375
376 return 0;
377 }
378
op_or(struct ParseCtx * ctx)379 static int op_or(struct ParseCtx *ctx)
380 {
381 struct Op *gop = ctx->last_group;
382 struct AndList *alist;
383 int err;
384
385 /* disallow empty OR branches */
386 if (!ctx->last_elem && STRICT)
387 return REG_BADPAT;
388
389 /* start new branch */
390 err = finish_branch(ctx);
391 if (err)
392 return err;
393 alist = new_andlist(ctx, gop);
394 if (!alist)
395 return REG_ESPACE;
396 ctx->last_andlist = alist;
397 ctx->last_elem = NULL;
398
399 return 0;
400 }
401
402 /*
403 * Parse bracketed classes.
404 */
405
add_char(struct ClassData * cd,unsigned char c,bool icase)406 static void add_char(struct ClassData *cd, unsigned char c, bool icase)
407 {
408 if (icase && isalpha(c)) {
409 class_set(cd, tolower(c));
410 class_set(cd, toupper(c));
411 } else {
412 class_set(cd, c);
413 }
414 }
415
416 struct NamedClass {
417 const char name[7];
418 unsigned char name_len;
419 int (*check_func)(int c);
420 };
421 static const struct NamedClass ctype_list[] = {
422 { "alnum", 5, isalnum },
423 { "alpha", 5, isalpha },
424 { "blank", 5, isblank },
425 { "cntrl", 5, iscntrl },
426 { "digit", 5, isdigit },
427 { "graph", 5, isgraph },
428 { "lower", 5, islower },
429 { "print", 5, isprint },
430 { "punct", 5, ispunct },
431 { "space", 5, isspace },
432 { "upper", 5, isupper },
433 { "xdigit", 6, isxdigit },
434 };
435
fill_class(struct ClassData * cd,const char * name,const char ** s_p,bool icase)436 static int fill_class(struct ClassData *cd, const char *name, const char **s_p, bool icase)
437 {
438 unsigned c;
439 const struct NamedClass *cc = ctype_list;
440 for (c = 0; c < ARRAY_NELEM(ctype_list); c++) {
441 cc = ctype_list + c;
442 if (strncmp(name, cc->name, cc->name_len) != 0)
443 continue;
444 name += cc->name_len;
445 if (name[0] == ':' && name[1] == ']')
446 goto found;
447 break;
448 }
449 return *name ? REG_ECTYPE : REG_EBRACK;
450 found:
451 /* fill map */
452 for (c = 1; c < 256; c++) {
453 if (cc->check_func(c))
454 add_char(cd, c, icase);
455 }
456 *s_p = name + 2;
457 return 0;
458 }
459
460 #define MAP_RANGE 0x7FFF0001
461 #define MAP_END 0x7FFF0002
462 #define MAP_OTHER 0x7FFF0003
463
get_map_token(struct ParseCtx * ctx,const char ** s_p,unsigned * dst_p,bool start,struct ClassData * cd,bool icase)464 static int get_map_token(struct ParseCtx *ctx, const char **s_p, unsigned *dst_p,
465 bool start, struct ClassData *cd, bool icase)
466 {
467 const char *s = *s_p;
468 unsigned res;
469 if (*s == '-') {
470 if (start || s[1] == ']')
471 res = '-';
472 else
473 res = MAP_RANGE;
474 s += 1;
475 } else if (*s == ']' && !start) {
476 res = MAP_END;
477 s++;
478 } else if (*s == '[' && (s[1] == '.' || s[1] == ':' || s[1] == '=')) {
479 if (s[1] == ':') {
480 s += 2;
481 *dst_p = MAP_OTHER;
482 return fill_class(cd, s, s_p, icase);
483 }
484 return REG_BADPAT;
485 } else {
486 res = (unsigned char)*s++;
487 }
488 *dst_p = res;
489 *s_p = s;
490 return 0;
491 }
492
op_class(struct ParseCtx * ctx,const char ** re)493 static int op_class(struct ParseCtx *ctx, const char **re)
494 {
495 const char *s = *re;
496 struct ClassData *cd;
497 struct Op *op;
498 bool not = false, icase = ctx->rxi->flags & REG_ICASE;
499 const char *start;
500 unsigned tk, c, prevtk = 0;
501 bool is_range = false;
502 int err;
503
504 if (*s == '^') {
505 s++;
506 not = true;
507 }
508 start = s;
509
510 op = new_op(ctx, OP_CLASS, sizeof(struct ClassData));
511 if (!op)
512 return REG_ESPACE;
513 cd = &op->cdata;
514
515 if (not && (ctx->rxi->flags & REG_NEWLINE))
516 class_set(cd, '\n');
517
518 while (*s) {
519 err = get_map_token(ctx, &s, &tk, s == start, cd, icase);
520 if (err)
521 return err;
522
523 if (tk == MAP_END) {
524 if (prevtk)
525 add_char(cd, prevtk, icase);
526 goto done;
527 } else if (tk == MAP_OTHER) {
528 if (is_range)
529 return REG_ERANGE;
530 if (prevtk)
531 add_char(cd, prevtk, icase);
532 prevtk = 0;
533 } else if (tk == MAP_RANGE) {
534 if (!prevtk)
535 return REG_ERANGE;
536 is_range = true;
537 } else if (is_range) {
538 if (tk < prevtk)
539 return REG_ERANGE;
540 for (c = prevtk; c <= tk; c++)
541 add_char(cd, c, icase);
542 is_range = false;
543 prevtk = 0;
544 } else {
545 if (prevtk)
546 add_char(cd, prevtk, icase);
547 prevtk = tk;
548 }
549 }
550 return REG_EBRACK;
551 done:
552 *re = s;
553 if (not) class_negate(cd);
554 return 0;
555 }
556
op_class_const(struct ParseCtx * ctx,const char * def)557 static int op_class_const(struct ParseCtx *ctx, const char *def)
558 {
559 const char *p = def + 1;
560 return op_class(ctx, &p);
561 }
562
563 /*
564 * Top-level syntax
565 */
566
parse_relaxed_escapes(struct ParseCtx * ctx,char c)567 static int parse_relaxed_escapes(struct ParseCtx *ctx, char c)
568 {
569 if (STRICT)
570 return REG_BADPAT;
571 switch (c) {
572 case 'b': return op_simple(ctx, OP_WCHANGE);
573 case 'B': return op_simple(ctx, OP_NWCHANGE);
574 case 'w': return op_class_const(ctx, "[_[:alnum:]]");
575 case 'W': return op_class_const(ctx, "[^_[:alnum:]]");
576 case 'd': return op_class_const(ctx, "[[:digit:]]");
577 case 'D': return op_class_const(ctx, "[^[:digit:]]");
578 case 's': return op_class_const(ctx, "[[:space:]]");
579 case 'S': return op_class_const(ctx, "[^[:space:]]");
580 }
581 return REG_BADPAT;
582 }
583
parse_posix_ext(struct ParseCtx * ctx,const char * re)584 static int parse_posix_ext(struct ParseCtx *ctx, const char *re)
585 {
586 int err = 0;
587 unsigned c;
588 int glevel = 0;
589 loop:
590 if (err)
591 return err;
592 c = *re++;
593 switch (c) {
594 case 0:
595 return (glevel == 0) ? 0 : REG_EPAREN;
596 case '(':
597 glevel++;
598 err = op_gstart(ctx);
599 break;
600 case ')':
601 if (glevel > 0) {
602 glevel--;
603 err = op_gend(ctx);
604 } else {
605 err = op_char(ctx, c); /* POSIX bug */
606 }
607 break;
608 case '|':
609 err = op_or(ctx);
610 break;
611 case '*':
612 err = op_count_simple(ctx, 0, MAX_COUNT);
613 break;
614 case '?':
615 err = op_count_simple(ctx, 0, 1);
616 break;
617 case '+':
618 err = op_count_simple(ctx, 1, MAX_COUNT);
619 break;
620 case '[':
621 err = op_class(ctx, &re);
622 break;
623 case '{':
624 err = op_count_full(ctx, &re);
625 break;
626 case '.':
627 err = op_simple(ctx, OP_ANY);
628 break;
629 case '^':
630 err = op_simple(ctx, OP_BOL);
631 break;
632 case '$':
633 err = op_simple(ctx, OP_EOL);
634 break;
635 case '\\':
636 goto escaped;
637 default:
638 err = op_char(ctx, c);
639 }
640 goto loop;
641
642 escaped:
643 c = *re++;
644 if (c == 0)
645 err = REG_EESCAPE;
646 else if (c >= '0' && c <= '9')
647 err = STRICT ? REG_BADPAT : op_bref(ctx, c);
648 else if (isalpha(c))
649 err = parse_relaxed_escapes(ctx, c);
650 else
651 err = op_char(ctx, c);
652 goto loop;
653 }
654
parse_posix_basic(struct ParseCtx * ctx,const char * re)655 static int parse_posix_basic(struct ParseCtx *ctx, const char *re)
656 {
657 int err = 0;
658 unsigned c;
659 int glevel = 0;
660 loop:
661 if (err)
662 return err;
663 c = *re++;
664 switch (c) {
665 case 0:
666 return (glevel == 0) ? 0 : REG_EPAREN;
667 case '*':
668 if (ctx->last_elem && ctx->last_elem->type != OP_BOL)
669 err = op_count_simple(ctx, 0, MAX_COUNT);
670 else
671 err = op_char(ctx, '*');
672 break;
673 case '.':
674 err = op_simple(ctx, OP_ANY);
675 break;
676 case '[':
677 err = op_class(ctx, &re);
678 break;
679 case '^':
680 if (!ctx->last_elem)
681 err = op_simple(ctx, OP_BOL);
682 else
683 err = op_char(ctx, c);
684 break;
685 case '$':
686 if (!*re || (re[0] == '\\' && re[1] == ')'))
687 err = op_simple(ctx, OP_EOL);
688 else
689 err = op_char(ctx, c);
690 break;
691 case '\\':
692 goto escaped;
693 default:
694 err = op_char(ctx, c);
695 }
696 goto loop;
697
698 escaped:
699 c = *re++;
700 switch (c) {
701 case 0:
702 return REG_EESCAPE;
703 case '(':
704 glevel++;
705 err = op_gstart(ctx);
706 break;
707 case ')':
708 glevel--;
709 if (glevel < 0)
710 return REG_EPAREN;
711 err = op_gend(ctx);
712 break;
713 case '{':
714 err = op_count_full(ctx, &re);
715 break;
716 case '.': case '^': case '$': case '*':
717 case '[': case ']': case '\\':
718 err = op_char(ctx, c);
719 break;
720 case '1': case '2': case '3': case '4': case '5':
721 case '6': case '7': case '8': case '9':
722 err = op_bref(ctx, c);
723 break;
724 case '|':
725 err = STRICT ? REG_BADPAT : op_or(ctx);
726 break;
727 default:
728 err = parse_relaxed_escapes(ctx, c);
729 }
730 goto loop;
731 }
732
733 /*
734 * Public compiling API.
735 */
736
regcomp(regex_t * rx,const char * re,int flags)737 int regcomp(regex_t *rx, const char *re, int flags)
738 {
739 struct ParseCtx ctx;
740 struct RegexInt *rxi;
741 int err;
742 struct MemPool *pool = NULL;
743
744 /* do it first, to allow regfree() */
745 memset(rx, 0, sizeof(*rx));
746
747 if (flags & ~(REG_EXTENDED | REG_ICASE | REG_NOSUB | REG_NEWLINE | REG_RELAXED))
748 return REG_BADPAT;
749 if (!*re)
750 return REG_BADPAT;
751 rxi = mempool_alloc(&pool, sizeof(*rxi));
752 if (!rxi)
753 return REG_ESPACE;
754 rx->internal = rxi;
755 rxi->pool = pool;
756
757 /* initialize rx and local context */
758 memset(&ctx, 0, sizeof(ctx));
759 ctx.rx = rx;
760 ctx.rxi = rxi;
761 ctx.strict = !(flags & REG_RELAXED_SYNTAX);
762 rxi->flags = flags;
763
764 /* setup group #0 */
765 rx->re_nsub = -1;
766 err = op_gstart(&ctx);
767 if (err)
768 goto failed;
769
770 /* launch proper parser */
771 if (flags & REG_EXTENDED)
772 err = parse_posix_ext(&ctx, re);
773 else
774 err = parse_posix_basic(&ctx, re);
775
776 /* finalize group #0 */
777 if (!err)
778 err = finish_branch(&ctx);
779
780 /* relax if details are not needed */
781 if (flags & REG_NOSUB) {
782 rxi->flags |= REG_RELAXED_MATCHING;
783 rx->re_nsub = 0;
784 }
785 failed:
786 /* clean up if problems */
787 if (err)
788 regfree(rx);
789 return err;
790 }
791
792 /*
793 * Matching code
794 */
795
796 /* historical best match */
797 struct HMatch {
798 const char *hist_start;
799 const char *hist_end;
800 int rep_len; /* if repeated seq, full len thus far */
801 };
802
803 /* per-group-match context */
804 struct GMatch {
805 struct GMatch *parent; /* parent group */
806 const struct Op *owner; /* Op for this group */
807 const char *start; /* match start */
808 const char *end; /* match end, NULL if no match */
809 struct GMatch *prevgm; /* older stack entry */
810 struct HMatch hm_next; /* best match for following stack entry */
811 int count; /* match nr in repeated seq */
812 };
813
814 /* top context */
815 struct ExecCtx {
816 const regex_t *rx;
817 const struct RegexInt *rxi;
818 const char *str_start;
819 regmatch_t *pmatch;
820 int nmatch;
821 int flags;
822 bool strict;
823 const char *last_endpos;
824 struct HMatch hm_first[MAX_GROUPS];
825 struct GMatch *gm_stack[MAX_GROUPS];
826 struct GMatch *gm_cache[MAX_GROUPS];
827 };
828
push_gm(struct ExecCtx * ctx,struct GMatch * gm)829 static void push_gm(struct ExecCtx *ctx, struct GMatch *gm)
830 {
831 int gno = gm->owner->grp_no;
832 gm->prevgm = ctx->gm_stack[gno];
833 ctx->gm_stack[gno] = gm;
834 }
835
pop_gm(struct ExecCtx * ctx,struct GMatch * gm)836 static void pop_gm(struct ExecCtx *ctx, struct GMatch *gm)
837 {
838 int gno = gm->owner->grp_no;
839 ctx->gm_stack[gno] = gm->prevgm;
840 }
841
do_match(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)842 static inline int do_match(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
843 {
844 return op->matcher(ctx, op, str, gm);
845 }
846
scan_next(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm,int curcnt,int alen)847 static int scan_next(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm, int curcnt, int alen)
848 {
849 int err = REG_NOMATCH;
850 bool gotmatch = false;
851
852 if (curcnt == op->mincnt)
853 return do_match(ctx, op->next, str, gm);
854
855 for (; curcnt >= op->mincnt; curcnt--) {
856 err = do_match(ctx, op->next, str, gm);
857 if (STRICT && err == 0)
858 gotmatch = true;
859 else if (err != REG_NOMATCH)
860 break;
861 str -= alen;
862 }
863 if (err == REG_NOMATCH && gotmatch)
864 err = 0;
865 return err;
866 }
867
match_char(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)868 static int match_char(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
869 {
870 bool icase = (ctx->flags & REG_ICASE);
871 int c, i, maxcnt = SIMPLE_MAXCNT(op);
872
873 for (i = 0; (i < maxcnt) && str[i]; i++) {
874 c = icase ? tolower((unsigned char)str[i]) : str[i];
875 if (c != op->lit)
876 break;
877 }
878 return scan_next(ctx, op, str + i, gm, i, 1);
879 }
880
match_any(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)881 static int match_any(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
882 {
883 bool nl = (ctx->flags & REG_NEWLINE);
884 int i, maxcnt = SIMPLE_MAXCNT(op);
885
886 for (i = 0; (i < maxcnt) && str[i]; i++) {
887 if (nl && str[i] == '\n')
888 break;
889 }
890 return scan_next(ctx, op, str + i, gm, i, 1);
891 }
892
match_class(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)893 static int match_class(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
894 {
895 int i, maxcnt = SIMPLE_MAXCNT(op);
896
897 for (i = 0; (i < maxcnt); i++) {
898 if (!class_isset(&op->cdata, str[i]))
899 break;
900 }
901 return scan_next(ctx, op, str + i, gm, i, 1);
902 }
903
match_bol(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)904 static int match_bol(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
905 {
906 if (str == ctx->str_start && !(ctx->flags & REG_NOTBOL))
907 return do_match(ctx, op->next, str, gm);
908 else if (str != ctx->str_start && str[-1] == '\n' && (ctx->flags & REG_NEWLINE))
909 return do_match(ctx, op->next, str, gm);
910 return REG_NOMATCH;
911 }
912
match_eol(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)913 static int match_eol(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
914 {
915 if (*str == '\n' && (ctx->flags & REG_NEWLINE))
916 return do_match(ctx, op->next, str, gm);
917 else if (*str == 0 && !(ctx->flags & REG_NOTEOL))
918 return do_match(ctx, op->next, str, gm);
919 return REG_NOMATCH;
920 }
921
match_wchange(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)922 static int match_wchange(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
923 {
924 bool prevw = (str == ctx->str_start) ? false : is_word(str[-1]);
925 bool curw = is_word(str[0]);
926 bool ischange = prevw ^ curw;
927
928 if ((op->type == OP_WCHANGE) ? ischange : !ischange)
929 return do_match(ctx, op->next, str, gm);
930 return REG_NOMATCH;
931 }
932
match_bref(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)933 static int match_bref(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
934 {
935 bool icase = ctx->flags & REG_ICASE;
936 int i;
937 struct GMatch *bgm = ctx->gm_stack[op->bref];
938 int blen = (bgm && bgm->end) ? (bgm->end - bgm->start) : -1;
939
940 /* handle no-match, zero-len, zero-count */
941 if (blen < 0 && op->mincnt > 0)
942 return REG_NOMATCH;
943 if (blen <= 0 || op->maxcnt == 0)
944 return do_match(ctx, op->next, str, gm);
945
946 /* find max matches */
947 for (i = 0; (i < op->maxcnt) && *str; i++) {
948 if (icase && strncasecmp(str, bgm->start, blen) != 0)
949 break;
950 else if (!icase && strncmp(str, bgm->start, blen) != 0)
951 break;
952 str += blen;
953 }
954 return scan_next(ctx, op, str, gm, i, blen);
955 }
956
match_group(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)957 static int match_group(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
958 {
959 int err = REG_NOMATCH;
960 bool gotmatch = false;
961 struct GMatch gthis;
962
963 /* per-group-match context */
964 memset(>his, 0, sizeof(gthis));
965 gthis.owner = op;
966 gthis.start = str;
967 gthis.parent = gm;
968 if (gm && gm->owner == op) {
969 gthis.parent = gm->parent;
970 gthis.count = gm->count + 1;
971 }
972 gm = >his;
973 push_gm(ctx, gm);
974
975 if (op->maxcnt > 0) {
976 struct AndList *alist = op->gdata.or_list;
977 /* check all branches, unless relaxed matching */
978 while (alist) {
979 err = do_match(ctx, alist->op_list, str, gm);
980 if (err == 0 && STRICT) {
981 gm->end = NULL;
982 gotmatch = true;
983 } else if (err != REG_NOMATCH)
984 break;
985 alist = alist->next;
986 }
987 }
988
989 /* is no-match allowed? */
990 if ((op->mincnt == 0) && (gm->count == 0)
991 && (err == REG_NOMATCH || (err == 0 && STRICT))) {
992 gm->end = NULL;
993 err = do_match(ctx, op->next, str, gm->parent);
994 }
995
996 pop_gm(ctx, gm);
997 return gotmatch ? 0 : err;
998 }
999
match_gend(struct ExecCtx * ctx,const struct Op * f_op,const char * str,struct GMatch * gm)1000 static int match_gend(struct ExecCtx *ctx, const struct Op *f_op, const char *str, struct GMatch *gm)
1001 {
1002 int err = REG_NOMATCH;
1003 const struct Op *op = gm->owner;
1004 bool zeromatch = (str == gm->start);
1005 bool gotmatch = false;
1006
1007 /* ignore follow-up empty matches, unless it has backrefs */
1008 if (zeromatch && gm->count > 0 && gm->count >= op->mincnt && !gm->owner->gdata.has_refs)
1009 return REG_NOMATCH;
1010
1011 /* tag as matched */
1012 gm->end = str;
1013
1014 /* try more repeats, stop if count full or last match was zero-length */
1015 if (gm->count + 1 < op->maxcnt && !zeromatch) {
1016 err = match_group(ctx, op, str, gm);
1017 if (err == 0 && STRICT)
1018 gotmatch = true;
1019 else if (err != REG_NOMATCH)
1020 return err;
1021 }
1022
1023 /* fail if not enough repeats */
1024 if (!zeromatch && gm->count + 1 < op->mincnt)
1025 return err;
1026
1027 /* continue with parent branch */
1028 err = do_match(ctx, op->next, str, gm->parent);
1029 if (err == REG_NOMATCH && gotmatch)
1030 err = 0;
1031 return err;
1032 }
1033
1034 /*
1035 * The juice of POSIX - match weighting.
1036 */
1037
gmatch_hist_cmp(struct ExecCtx * ctx,int gno,struct GMatch * gm,int replen)1038 static int gmatch_hist_cmp(struct ExecCtx *ctx, int gno, struct GMatch *gm, int replen)
1039 {
1040 struct HMatch *hm = (gm->prevgm) ? &gm->prevgm->hm_next : &ctx->hm_first[gno];
1041 int gmlen = (gm->end) ? (gm->end - gm->start) : -1;
1042 int hmlen = (hm->hist_end) ? (hm->hist_end - hm->hist_start) : -1;
1043 int gmreplen = (gmlen >= 0) ? (gmlen + replen) : replen;
1044 int hmreplen = ((hmlen >= 0) ? hmlen : 0) + hm->rep_len;
1045 int gmofs = (gm->end) ? (gm->start - ctx->str_start) : -1;
1046 int hmofs = (hm->hist_start) ? (hm->hist_start - ctx->str_start) : -1;
1047
1048 /* prefer rightmost match, to allow preceding elements match more */
1049 int res = (gmofs - hmofs);
1050
1051 /* prefer longer repeated match */
1052 if (res == 0 && gm->count == 0)
1053 res = (gmreplen - hmreplen);
1054
1055 /* prefer longer single match */
1056 if (res == 0)
1057 res = (gmlen - hmlen);
1058
1059 return res;
1060 }
1061
cmp_gmatches(struct ExecCtx * ctx,int gno,struct GMatch * gm,int replen)1062 static int cmp_gmatches(struct ExecCtx *ctx, int gno, struct GMatch *gm, int replen)
1063 {
1064 int cmp = 0, gmlen;
1065 if (gm) {
1066 /* need to compare preceding groups first */
1067 gmlen = gm->end ? gm->end - gm->start : 0;
1068 cmp = cmp_gmatches(ctx, gno, gm->prevgm,
1069 (gm->count == 0) ? 0 : (replen + gmlen));
1070 /* actual comparision */
1071 if (!cmp) cmp = gmatch_hist_cmp(ctx, gno, gm, replen);
1072 }
1073 return cmp;
1074 }
1075
gm_resolve_tie(struct ExecCtx * ctx,int gno)1076 static int gm_resolve_tie(struct ExecCtx *ctx, int gno)
1077 {
1078 struct GMatch *gm = ctx->gm_stack[gno];
1079 if (!gm) /* 0-count match is better than no match */
1080 return ctx->hm_first[gno].hist_start ? -1 : 0;
1081
1082 return cmp_gmatches(ctx, gno, gm, 0);
1083 }
1084
fill_history(struct ExecCtx * ctx,int gno)1085 static void fill_history(struct ExecCtx *ctx, int gno)
1086 {
1087 struct HMatch *hm;
1088 int gmlen, rep_len = 0;
1089 struct GMatch *gm = ctx->gm_stack[gno];
1090 while (STRICT && gm) {
1091 hm = (gm->prevgm) ? &gm->prevgm->hm_next : &ctx->hm_first[gno];
1092 hm->hist_start = gm->start;
1093 hm->hist_end = gm->end;
1094 hm->rep_len = rep_len;
1095 gmlen = gm->end ? (gm->end - gm->start) : 0;
1096 rep_len += gmlen;
1097 if (gm->count == 0)
1098 rep_len = 0;
1099 gm = gm->prevgm;
1100 }
1101 }
1102
publish_gm(struct ExecCtx * ctx,int gno)1103 static void publish_gm(struct ExecCtx *ctx, int gno)
1104 {
1105 struct GMatch *gm = ctx->gm_stack[gno];
1106 regmatch_t *rm = ctx->pmatch + gno;
1107
1108 /* ignore non-matches */
1109 while (gm && !gm->end)
1110 gm = gm->prevgm;
1111
1112 /* require it to be inside reported parent */
1113 if (gm && gm->parent) {
1114 int pno = gm->parent->owner->grp_no;
1115 if (gm->parent != ctx->gm_cache[pno])
1116 gm = NULL;
1117 }
1118 ctx->gm_cache[gno] = gm;
1119
1120 /* publish new match */
1121 if (gm) {
1122 rm->rm_so = gm->start - ctx->str_start;
1123 rm->rm_eo = gm->end - ctx->str_start;
1124 } else {
1125 rm->rm_so = -1;
1126 rm->rm_eo = -1;
1127 }
1128 }
1129
1130 /* compare and publish */
got_full_match(struct ExecCtx * ctx,const struct Op * f_op,const char * str,struct GMatch * gm)1131 static int got_full_match(struct ExecCtx *ctx, const struct Op *f_op, const char *str, struct GMatch *gm)
1132 {
1133 int gno, cmp;
1134
1135 /* tag group as matched */
1136 gm->end = str;
1137
1138 /* ignore shorter matches */
1139 if (ctx->last_endpos && str < ctx->last_endpos)
1140 return 0;
1141
1142 /* longer or equal length */
1143 if (str > ctx->last_endpos) {
1144 ctx->last_endpos = str;
1145 goto better_match;
1146 } else if (STRICT && ctx->nmatch > 1) {
1147 for (gno = 0; gno < ctx->nmatch; gno++) {
1148 cmp = gm_resolve_tie(ctx, gno);
1149 if (cmp < 0)
1150 break;
1151 if (cmp > 0)
1152 goto better_match;
1153 }
1154 }
1155 return 0;
1156
1157 better_match:
1158 for (gno = 0; gno < ctx->nmatch; gno++) {
1159 publish_gm(ctx, gno);
1160 fill_history(ctx, gno);
1161 }
1162 return 0;
1163 }
1164
1165 /* fill in proper matcher */
set_op_type(struct Op * op,enum OpType op_type)1166 static void set_op_type(struct Op *op, enum OpType op_type)
1167 {
1168 static const matcher_f mlist[] = {
1169 match_char, match_any, match_class, match_group, match_bref,
1170 match_bol, match_eol, match_wchange, match_wchange,
1171 match_gend, got_full_match
1172 };
1173 op->matcher = mlist[op_type];
1174 op->type = op_type;
1175 }
1176
1177 /*
1178 * Public matching API
1179 */
1180
regexec(const regex_t * rx,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)1181 int regexec(const regex_t *rx, const char *str, size_t nmatch, regmatch_t pmatch[], int eflags)
1182 {
1183 int err;
1184 struct ExecCtx ctx;
1185
1186 if (eflags & ~(REG_NOTBOL | REG_NOTEOL))
1187 return REG_BADPAT;
1188
1189 /* init local context */
1190 memset(&ctx, 0, sizeof(ctx));
1191 ctx.pmatch = pmatch;
1192 ctx.nmatch = nmatch;
1193 ctx.str_start = str;
1194 ctx.rx = rx;
1195 ctx.rxi = rx->internal;
1196 ctx.flags = ctx.rxi->flags | eflags;
1197
1198 /* reset pmatch area */
1199 if (!(ctx.flags & REG_NOSUB))
1200 memset(pmatch, -1, nmatch * sizeof(regmatch_t));
1201
1202 /* decide pmatch area that will be used */
1203 if (!pmatch || (ctx.flags & REG_NOSUB))
1204 ctx.nmatch = 0;
1205 else if (nmatch > (size_t)rx->re_nsub + 1)
1206 ctx.nmatch = rx->re_nsub + 1;
1207 ctx.strict = !(ctx.flags & REG_RELAXED_MATCHING) && (ctx.nmatch > 0);
1208
1209 /* execute search */
1210 str--;
1211 do {
1212 str++;
1213 err = do_match(&ctx, ctx.rxi->root, str, NULL);
1214 } while ((err == REG_NOMATCH) && *str);
1215
1216 return err;
1217 }
1218
1219 /*
1220 * Free parse tree
1221 */
1222
regfree(regex_t * rx)1223 void regfree(regex_t *rx)
1224 {
1225 struct RegexInt *rxi;
1226 if (rx) {
1227 rxi = rx->internal;
1228 if (rxi)
1229 mempool_destroy(&rxi->pool);
1230 memset(rx, 0, sizeof(*rx));
1231 }
1232 }
1233
1234 /*
1235 * Error strings
1236 */
1237
regerror(int err,const regex_t * rx,char * dst,size_t dstlen)1238 size_t regerror(int err, const regex_t *rx, char *dst, size_t dstlen)
1239 {
1240 static const char errlist[][9] = {
1241 "NOERROR", /* 0 */
1242 "NOMATCH", /* 1 */
1243 "BADBR", /* 2 */
1244 "BADPAT", /* 3 */
1245 "BADRPT", /* 4 */
1246 "EBRACE", /* 5 */
1247 "EBRACK", /* 6 */
1248 "ECOLLATE", /* 7 */
1249 "ECTYPE", /* 8 */
1250 "EESCAPE", /* 9 */
1251 "EPAREN", /* 10 */
1252 "ERANGE", /* 11 */
1253 "ESPACE", /* 12 */
1254 "ESUBREG", /* 13 */
1255 };
1256 const char *s = "EUNKNOWN";
1257 if ((size_t)err < ARRAY_NELEM(errlist))
1258 s = errlist[err];
1259 return snprintf(dst, dstlen, "%s", s);
1260 }
1261
1262 #endif /* !USE_SYSTEM_REGEX */
1263