1 /*
2  * Small POSIX-only regex engine.
3  *
4  * Copyright (c) 2009  Marko Kreen
5  *
6  * Permission to use, copy, modify, and/or distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 /*
20  * Simple recursive matcher, only features are small size
21  * and POSIX compatibility.
22  *
23  * ERE syntax: . * ^ $ [] [[:cname:]] () {} | + ?
24  * BRE syntax: . * ^ $ [] [[:cname:]] \(\) \{\} \1-9
25  *
26  * With REG_RELAXED_SYNTAX, following common escapes will be available:
27  *    \b\B\d\D\s\S\w\W   BRE: \|   ERE: \1-9
28  *
29  * With REG_RELAXED_MATCHING it returns the first match found after applying
30  * leftmost-longest to all elements.  It skips the combinatorics to turn it
31  * into guaranteed-longest match.
32  *
33  * Skipped POSIX features:
34  * - collation classes: [[. .]]
35  * - equivalence classes: [[= =]]
36  * - char ranges by locale order: [a-z]  (byte order will be used)
37  * - multi-byte chars: UTF-8
38  */
39 
40 #include <usual/regex.h>
41 
42 #ifndef USE_SYSTEM_REGEX
43 
44 #include <usual/mempool.h>
45 #include <usual/ctype.h>
46 #include <string.h>
47 #include <stdio.h>
48 
49 #undef STRICT
50 
51 /* either dynamic or static decision */
52 #define STRICT (ctx->strict)
53 
54 /* how many regmatch_t can be reported */
55 #define MAX_GROUPS		128
56 
57 /* max count we want to store, means 'infinite' for simple atoms */
58 #define MAX_COUNT		0x7fff
59 
60 /* max count for simple atoms: char, any or class */
61 #define SIMPLE_MAXCNT(op) (((op)->maxcnt == MAX_COUNT) ? 0x7FFFFFFF : (op)->maxcnt)
62 
63 #define is_word(c) (isalnum(c) || (c) == '_')
64 
65 struct Op;
66 struct ExecCtx;
67 struct GMatch;
68 
69 /* Operation type */
70 enum OpType {
71 	/* ops that take count */
72 	OP_CHAR,
73 	OP_ANY,
74 	OP_CLASS,
75 	OP_GROUP,
76 	OP_BREF,
77 	/* ops that dont take count */
78 	OP_BOL,
79 	OP_EOL,
80 	OP_WCHANGE,
81 	OP_NWCHANGE,
82 	OP_GMATCH,
83 	OP_FULLMATCH,
84 };
85 #define NONCOUNT_OPS_START  OP_BOL
86 
87 /* regex_t->internal */
88 struct RegexInt {
89 	struct Op *root;
90 	struct Op *glist;
91 	struct MemPool *pool;
92 	int flags;
93 };
94 
95 /* match function and its setter */
96 typedef int (*matcher_f)(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm);
97 static void set_op_type(struct Op *op, enum OpType op_type);
98 
99 /* List of tokens to be AND-ed together */
100 struct AndList {
101 	struct AndList *next;
102 	struct Op *op_list;
103 };
104 
105 /* extra data for group Op */
106 struct GroupData {
107 	struct Op *parent;	/* parent group or NULL for first group */
108 	struct AndList *or_list;/* alternative AndLists */
109 	struct Op *glist_prev;	/* prev group Op */
110 	bool has_refs;		/* if bref references it */
111 };
112 
113 /* char class data */
114 struct ClassData {
115 	uint32_t bitmap[256 / 32];
116 };
117 
118 /* operation data */
119 struct Op {
120 	struct Op *next;
121 	matcher_f matcher;
122 	uint16_t mincnt;
123 	uint16_t maxcnt;
124 	uint8_t type;
125 	union {
126 		uint8_t grp_no;		/* OP_GROUP: group nr, 0-toplevel */
127 		char lit;		/* OP_CHAR */
128 		uint8_t bref;		/* OP_BREF */
129 	};
130 	union {
131 		struct ClassData cdata;
132 		struct GroupData gdata;
133 	};
134 };
135 #define OP_BASE (offsetof(struct Op, cdata))
136 
137 /*
138  * Operations on ClassData
139  */
140 
class_isset(const struct ClassData * cd,unsigned char c)141 static bool class_isset(const struct ClassData *cd, unsigned char c)
142 {
143 	return cd->bitmap[c / 32] & (1 << (c % 32));
144 }
145 
class_set(struct ClassData * cd,unsigned char c)146 static void class_set(struct ClassData *cd, unsigned char c)
147 {
148 	cd->bitmap[c / 32] |= (1 << (c % 32));
149 }
150 
class_negate(struct ClassData * cd)151 static void class_negate(struct ClassData *cd)
152 {
153 	int i;
154 	class_set(cd, 0);
155 	for (i = 0; i < 256/32; i++) cd->bitmap[i] ^= -1;
156 }
157 
158 /*
159  * Parsing code
160  */
161 
162 /* top-level context */
163 struct ParseCtx {
164 	regex_t *rx;
165 	struct RegexInt *rxi;
166 	struct Op *last_group;
167 	struct AndList *last_andlist;
168 	struct Op *last_elem;	/* last op in current OR branch */
169 	bool gotcnt;		/* count was attached to last op */
170 	bool strict;		/* strict syntax */
171 };
172 
new_andlist(struct ParseCtx * ctx,struct Op * g)173 static struct AndList *new_andlist(struct ParseCtx *ctx, struct Op *g)
174 {
175 	struct AndList *al = mempool_alloc(&ctx->rxi->pool, sizeof(*al));
176 	if (!al)
177 		return NULL;
178 	if (ctx->last_andlist) {
179 		ctx->last_andlist->next = al;
180 	} else {
181 		g->gdata.or_list = al;
182 	}
183 	ctx->last_andlist = al;
184 	return al;
185 }
186 
new_op(struct ParseCtx * ctx,enum OpType t,int extra)187 static struct Op *new_op(struct ParseCtx *ctx, enum OpType t, int extra)
188 {
189 	struct Op *op = mempool_alloc(&ctx->rxi->pool, OP_BASE + extra);
190 	if (!op)
191 		return NULL;
192 	set_op_type(op, t);
193 	op->mincnt = op->maxcnt = 1;
194 	ctx->gotcnt = false;
195 
196 	/* append */
197 	if (ctx->last_elem) {
198 		ctx->last_elem->next = op;
199 	} else if (ctx->last_andlist) {
200 		ctx->last_andlist->op_list = op;
201 	} else if (ctx->last_group) {
202 		struct AndList *alist;
203 		alist = new_andlist(ctx, ctx->last_group);
204 		if (!alist)
205 			return NULL;
206 		alist->op_list = op;
207 	}
208 	ctx->last_elem = op;
209 
210 	if (t == OP_GROUP) {
211 		struct Op *parent = ctx->last_group;
212 		int gno = ++ctx->rx->re_nsub;
213 		op->grp_no = gno;
214 		op->gdata.parent = parent;
215 		op->gdata.glist_prev = ctx->rxi->glist;
216 		ctx->rxi->glist = op;
217 		ctx->last_group = op;
218 		ctx->last_andlist = NULL;
219 		ctx->last_elem = NULL;
220 		if (!ctx->rxi->root)
221 			ctx->rxi->root = op;
222 	}
223 	return op;
224 }
225 
op_char(struct ParseCtx * ctx,unsigned c)226 static int op_char(struct ParseCtx *ctx, unsigned c)
227 {
228 	struct Op *op = new_op(ctx, OP_CHAR, 0);
229 	if (!op)
230 		return REG_ESPACE;
231 	op->lit = c;
232 	if ((ctx->rxi->flags & REG_ICASE) && isalpha(c))
233 		op->lit = tolower(c);
234 	return 0;
235 }
236 
op_bref(struct ParseCtx * ctx,unsigned c)237 static int op_bref(struct ParseCtx *ctx, unsigned c)
238 {
239 	struct Op *g, *op;
240 
241 	op = new_op(ctx, OP_BREF, 0);
242 	if (!op)
243 		return REG_ESPACE;
244 	op->bref = c - '0';
245 
246 	/* check if valid ref */
247 	for (g = ctx->last_group; g; g = g->gdata.parent) {
248 		if (g->grp_no == op->bref)
249 			return REG_ESUBREG;
250 	}
251 	/* tag the group as referenced */
252 	for (g = ctx->rxi->glist; g; g = g->gdata.glist_prev) {
253 		if (g->grp_no == op->bref) {
254 			g->gdata.has_refs = true;
255 			return 0;
256 		}
257 	}
258 	return REG_ESUBREG;
259 }
260 
op_simple(struct ParseCtx * ctx,enum OpType t)261 static int op_simple(struct ParseCtx *ctx, enum OpType t)
262 {
263 	struct Op *op = new_op(ctx, t, 0);
264 	if (!op)
265 		return REG_ESPACE;
266 	return 0;
267 }
268 
op_count_simple(struct ParseCtx * ctx,int min,int max)269 static int op_count_simple(struct ParseCtx *ctx, int min, int max)
270 {
271 	struct Op *op = ctx->last_elem;
272 	if (!op || ctx->gotcnt)
273 		return REG_BADRPT;
274 	if (op->type >= NONCOUNT_OPS_START)
275 		return REG_BADRPT;
276 	ctx->gotcnt = true;
277 	op->mincnt = min;
278 	op->maxcnt = max;
279 	return 0;
280 }
281 
op_count_full(struct ParseCtx * ctx,const char ** re)282 static int op_count_full(struct ParseCtx *ctx, const char **re)
283 {
284 	unsigned a, b;
285 	char *end = (char *)*re;
286 	bool ext = ctx->rxi->flags & REG_EXTENDED;
287 	int err;
288 
289 	/* apply sanity check */
290 	err = op_count_simple(ctx, 1, 1);
291 	if (err)
292 		return err;
293 
294 	/* parse */
295 	a = b = strtoul(*re, &end, 10);
296 	if (end == *re)
297 		return REG_EBRACE;
298 	if (*end == ',') {
299 		*re = end + 1;
300 		end = (char*)*re;
301 		b = strtoul(*re, &end, 10);
302 		if (end == *re)
303 			b = MAX_COUNT;
304 	}
305 	if (a > b || b > MAX_COUNT || a >= MAX_COUNT)
306 		return REG_BADBR;
307 
308 	/* check for correct termination */
309 	if (ext && end[0] == '}') {
310 		*re = end + 1;
311 		goto done;
312 	} else if (!ext && end[0] == '\\' && end[1] == '}') {
313 		*re = end + 2;
314 		goto done;
315 	}
316 
317 	/* bad fmt, decide between error codes */
318 	return strchr(end, '}') ? REG_BADBR : REG_EBRACE;
319 
320 done:
321 	ctx->last_elem->mincnt = a;
322 	ctx->last_elem->maxcnt = b;
323 	return 0;
324 }
325 
op_gstart(struct ParseCtx * ctx)326 static int op_gstart(struct ParseCtx *ctx)
327 {
328 	struct Op *op;
329 	op = new_op(ctx, OP_GROUP, sizeof(struct GroupData));
330 	if (!op)
331 		return REG_ESPACE;
332 	if (op->grp_no >= MAX_GROUPS)
333 		return REG_BADPAT;
334 	return 0;
335 }
336 
finish_branch(struct ParseCtx * ctx)337 static int finish_branch(struct ParseCtx *ctx)
338 {
339 	int err;
340 
341 	/* disallow empty OR fragments, but not empty groups */
342 	if (!ctx->last_elem && ctx->last_andlist && STRICT)
343 		return REG_BADPAT;
344 
345 	if (ctx->last_group->gdata.parent)
346 		err = op_simple(ctx, OP_GMATCH);
347 	else
348 		err = op_simple(ctx, OP_FULLMATCH);
349 	if (err)
350 		return err;
351 	ctx->last_elem = NULL;
352 	return 0;
353 }
354 
op_gend(struct ParseCtx * ctx)355 static int op_gend(struct ParseCtx *ctx)
356 {
357 	struct Op *op = ctx->last_group;
358 	struct AndList *alist;
359 	int err;
360 
361 	if (!op)
362 		return REG_EPAREN;
363 
364 	err = finish_branch(ctx);
365 	if (err)
366 		return err;
367 	ctx->last_group = op->gdata.parent;
368 	ctx->last_elem = op;
369 
370 	/* recover previous andlist... */
371 	alist = ctx->last_group->gdata.or_list;
372 	while (alist && alist->next)
373 		alist = alist->next;
374 	ctx->last_andlist = alist;
375 
376 	return 0;
377 }
378 
op_or(struct ParseCtx * ctx)379 static int op_or(struct ParseCtx *ctx)
380 {
381 	struct Op *gop = ctx->last_group;
382 	struct AndList *alist;
383 	int err;
384 
385 	/* disallow empty OR branches */
386 	if (!ctx->last_elem && STRICT)
387 		return REG_BADPAT;
388 
389 	/* start new branch */
390 	err = finish_branch(ctx);
391 	if (err)
392 		return err;
393 	alist = new_andlist(ctx, gop);
394 	if (!alist)
395 		return REG_ESPACE;
396 	ctx->last_andlist = alist;
397 	ctx->last_elem = NULL;
398 
399 	return 0;
400 }
401 
402 /*
403  * Parse bracketed classes.
404  */
405 
add_char(struct ClassData * cd,unsigned char c,bool icase)406 static void add_char(struct ClassData *cd, unsigned char c, bool icase)
407 {
408 	if (icase && isalpha(c)) {
409 		class_set(cd, tolower(c));
410 		class_set(cd, toupper(c));
411 	} else {
412 		class_set(cd, c);
413 	}
414 }
415 
416 struct NamedClass {
417 	const char name[7];
418 	unsigned char name_len;
419 	int (*check_func)(int c);
420 };
421 static const struct NamedClass ctype_list[] = {
422 	{ "alnum", 5, isalnum },
423 	{ "alpha", 5, isalpha },
424 	{ "blank", 5, isblank },
425 	{ "cntrl", 5, iscntrl },
426 	{ "digit", 5, isdigit },
427 	{ "graph", 5, isgraph },
428 	{ "lower", 5, islower },
429 	{ "print", 5, isprint },
430 	{ "punct", 5, ispunct },
431 	{ "space", 5, isspace },
432 	{ "upper", 5, isupper },
433 	{ "xdigit", 6, isxdigit },
434 };
435 
fill_class(struct ClassData * cd,const char * name,const char ** s_p,bool icase)436 static int fill_class(struct ClassData *cd, const char *name, const char **s_p, bool icase)
437 {
438 	unsigned c;
439 	const struct NamedClass *cc = ctype_list;
440 	for (c = 0; c < ARRAY_NELEM(ctype_list); c++) {
441 		cc = ctype_list + c;
442 		if (strncmp(name, cc->name, cc->name_len) != 0)
443 			continue;
444 		name += cc->name_len;
445 		if (name[0] == ':' && name[1] == ']')
446 			goto found;
447 		break;
448 	}
449 	return *name ? REG_ECTYPE : REG_EBRACK;
450 found:
451 	/* fill map */
452 	for (c = 1; c < 256; c++) {
453 		if (cc->check_func(c))
454 			add_char(cd, c, icase);
455 	}
456 	*s_p = name + 2;
457 	return 0;
458 }
459 
460 #define MAP_RANGE 0x7FFF0001
461 #define MAP_END 0x7FFF0002
462 #define MAP_OTHER 0x7FFF0003
463 
get_map_token(struct ParseCtx * ctx,const char ** s_p,unsigned * dst_p,bool start,struct ClassData * cd,bool icase)464 static int get_map_token(struct ParseCtx *ctx, const char **s_p, unsigned *dst_p,
465 			 bool start, struct ClassData *cd, bool icase)
466 {
467 	const char *s = *s_p;
468 	unsigned res;
469 	if (*s == '-') {
470 		if (start || s[1] == ']')
471 			res = '-';
472 		else
473 			res = MAP_RANGE;
474 		s += 1;
475 	} else if (*s == ']' && !start) {
476 		res = MAP_END;
477 		s++;
478 	} else if (*s == '[' && (s[1] == '.' || s[1] == ':' || s[1] == '=')) {
479 		if (s[1] == ':') {
480 			s += 2;
481 			*dst_p = MAP_OTHER;
482 			return fill_class(cd, s, s_p, icase);
483 		}
484 		return REG_BADPAT;
485 	} else {
486 		res = (unsigned char)*s++;
487 	}
488 	*dst_p = res;
489 	*s_p = s;
490 	return 0;
491 }
492 
op_class(struct ParseCtx * ctx,const char ** re)493 static int op_class(struct ParseCtx *ctx, const char **re)
494 {
495 	const char *s = *re;
496 	struct ClassData *cd;
497 	struct Op *op;
498 	bool not = false, icase = ctx->rxi->flags & REG_ICASE;
499 	const char *start;
500 	unsigned tk, c, prevtk = 0;
501 	bool is_range = false;
502 	int err;
503 
504 	if (*s == '^') {
505 		s++;
506 		not = true;
507 	}
508 	start = s;
509 
510 	op = new_op(ctx, OP_CLASS, sizeof(struct ClassData));
511 	if (!op)
512 		return REG_ESPACE;
513 	cd = &op->cdata;
514 
515 	if (not && (ctx->rxi->flags & REG_NEWLINE))
516 		class_set(cd, '\n');
517 
518 	while (*s) {
519 		err = get_map_token(ctx, &s, &tk, s == start, cd, icase);
520 		if (err)
521 			return err;
522 
523 		if (tk == MAP_END) {
524 			if (prevtk)
525 				add_char(cd, prevtk, icase);
526 			goto done;
527 		} else if (tk == MAP_OTHER) {
528 			if (is_range)
529 				return REG_ERANGE;
530 			if (prevtk)
531 				add_char(cd, prevtk, icase);
532 			prevtk = 0;
533 		} else if (tk == MAP_RANGE) {
534 			if (!prevtk)
535 				return REG_ERANGE;
536 			is_range = true;
537 		} else if (is_range) {
538 			if (tk < prevtk)
539 				return REG_ERANGE;
540 			for (c = prevtk; c <= tk; c++)
541 				add_char(cd, c, icase);
542 			is_range = false;
543 			prevtk = 0;
544 		} else {
545 			if (prevtk)
546 				add_char(cd, prevtk, icase);
547 			prevtk = tk;
548 		}
549 	}
550 	return REG_EBRACK;
551 done:
552 	*re = s;
553 	if (not) class_negate(cd);
554 	return 0;
555 }
556 
op_class_const(struct ParseCtx * ctx,const char * def)557 static int op_class_const(struct ParseCtx *ctx, const char *def)
558 {
559 	const char *p = def + 1;
560 	return op_class(ctx, &p);
561 }
562 
563 /*
564  * Top-level syntax
565  */
566 
parse_relaxed_escapes(struct ParseCtx * ctx,char c)567 static int parse_relaxed_escapes(struct ParseCtx *ctx, char c)
568 {
569 	if (STRICT)
570 		return REG_BADPAT;
571 	switch (c) {
572 	case 'b': return op_simple(ctx, OP_WCHANGE);
573 	case 'B': return op_simple(ctx, OP_NWCHANGE);
574 	case 'w': return op_class_const(ctx, "[_[:alnum:]]");
575 	case 'W': return op_class_const(ctx, "[^_[:alnum:]]");
576 	case 'd': return op_class_const(ctx, "[[:digit:]]");
577 	case 'D': return op_class_const(ctx, "[^[:digit:]]");
578 	case 's': return op_class_const(ctx, "[[:space:]]");
579 	case 'S': return op_class_const(ctx, "[^[:space:]]");
580 	}
581 	return REG_BADPAT;
582 }
583 
parse_posix_ext(struct ParseCtx * ctx,const char * re)584 static int parse_posix_ext(struct ParseCtx *ctx, const char *re)
585 {
586 	int err = 0;
587 	unsigned c;
588 	int glevel = 0;
589 loop:
590 	if (err)
591 		return err;
592 	c = *re++;
593 	switch (c) {
594 	case 0:
595 		return (glevel == 0) ? 0 : REG_EPAREN;
596 	case '(':
597 		glevel++;
598 		err = op_gstart(ctx);
599 		break;
600 	case ')':
601 		if (glevel > 0) {
602 			glevel--;
603 			err = op_gend(ctx);
604 		} else  {
605 			err = op_char(ctx, c); /* POSIX bug */
606 		}
607 		break;
608 	case '|':
609 		err = op_or(ctx);
610 		break;
611 	case '*':
612 		err = op_count_simple(ctx, 0, MAX_COUNT);
613 		break;
614 	case '?':
615 		err = op_count_simple(ctx, 0, 1);
616 		break;
617 	case '+':
618 		err = op_count_simple(ctx, 1, MAX_COUNT);
619 		break;
620 	case '[':
621 		err = op_class(ctx, &re);
622 		break;
623 	case '{':
624 		err = op_count_full(ctx, &re);
625 		break;
626 	case '.':
627 		err = op_simple(ctx, OP_ANY);
628 		break;
629 	case '^':
630 		err = op_simple(ctx, OP_BOL);
631 		break;
632 	case '$':
633 		err = op_simple(ctx, OP_EOL);
634 		break;
635 	case '\\':
636 		goto escaped;
637 	default:
638 		err = op_char(ctx, c);
639 	}
640 	goto loop;
641 
642 escaped:
643 	c = *re++;
644 	if (c == 0)
645 		err = REG_EESCAPE;
646 	else if (c >= '0' && c <= '9')
647 		err = STRICT ? REG_BADPAT : op_bref(ctx, c);
648 	else if (isalpha(c))
649 		err = parse_relaxed_escapes(ctx, c);
650 	else
651 		err = op_char(ctx, c);
652 	goto loop;
653 }
654 
parse_posix_basic(struct ParseCtx * ctx,const char * re)655 static int parse_posix_basic(struct ParseCtx *ctx, const char *re)
656 {
657 	int err = 0;
658 	unsigned c;
659 	int glevel = 0;
660 loop:
661 	if (err)
662 		return err;
663 	c = *re++;
664 	switch (c) {
665 	case 0:
666 		return (glevel == 0) ? 0 : REG_EPAREN;
667 	case '*':
668 		if (ctx->last_elem && ctx->last_elem->type != OP_BOL)
669 			err = op_count_simple(ctx, 0, MAX_COUNT);
670 		else
671 			err = op_char(ctx, '*');
672 		break;
673 	case '.':
674 		err = op_simple(ctx, OP_ANY);
675 		break;
676 	case '[':
677 		err = op_class(ctx, &re);
678 		break;
679 	case '^':
680 		if (!ctx->last_elem)
681 			err = op_simple(ctx, OP_BOL);
682 		else
683 			err = op_char(ctx, c);
684 		break;
685 	case '$':
686 		if (!*re || (re[0] == '\\' && re[1] == ')'))
687 			err = op_simple(ctx, OP_EOL);
688 		else
689 			err = op_char(ctx, c);
690 		break;
691 	case '\\':
692 		goto escaped;
693 	default:
694 		err = op_char(ctx, c);
695 	}
696 	goto loop;
697 
698 escaped:
699 	c = *re++;
700 	switch (c) {
701 	case 0:
702 		return REG_EESCAPE;
703 	case '(':
704 		glevel++;
705 		err = op_gstart(ctx);
706 		break;
707 	case ')':
708 		glevel--;
709 		if (glevel < 0)
710 			return REG_EPAREN;
711 		err = op_gend(ctx);
712 		break;
713 	case '{':
714 		err = op_count_full(ctx, &re);
715 		break;
716 	case '.': case '^': case '$': case '*':
717 	case '[': case ']': case '\\':
718 		err = op_char(ctx, c);
719 		break;
720 	case '1': case '2': case '3': case '4': case '5':
721 	case '6': case '7': case '8': case '9':
722 		err = op_bref(ctx, c);
723 		break;
724 	case '|':
725 		err = STRICT ? REG_BADPAT : op_or(ctx);
726 		break;
727 	default:
728 		err = parse_relaxed_escapes(ctx, c);
729 	}
730 	goto loop;
731 }
732 
733 /*
734  * Public compiling API.
735  */
736 
regcomp(regex_t * rx,const char * re,int flags)737 int regcomp(regex_t *rx, const char *re, int flags)
738 {
739 	struct ParseCtx ctx;
740 	struct RegexInt *rxi;
741 	int err;
742 	struct MemPool *pool = NULL;
743 
744 	/* do it first, to allow regfree() */
745 	memset(rx, 0, sizeof(*rx));
746 
747 	if (flags & ~(REG_EXTENDED | REG_ICASE | REG_NOSUB | REG_NEWLINE | REG_RELAXED))
748 		return REG_BADPAT;
749 	if (!*re)
750 		return REG_BADPAT;
751 	rxi = mempool_alloc(&pool, sizeof(*rxi));
752 	if (!rxi)
753 		return REG_ESPACE;
754 	rx->internal = rxi;
755 	rxi->pool = pool;
756 
757 	/* initialize rx and local context */
758 	memset(&ctx, 0, sizeof(ctx));
759 	ctx.rx = rx;
760 	ctx.rxi = rxi;
761 	ctx.strict = !(flags & REG_RELAXED_SYNTAX);
762 	rxi->flags = flags;
763 
764 	/* setup group #0 */
765 	rx->re_nsub = -1;
766 	err = op_gstart(&ctx);
767 	if (err)
768 		goto failed;
769 
770 	/* launch proper parser */
771 	if (flags & REG_EXTENDED)
772 		err = parse_posix_ext(&ctx, re);
773 	else
774 		err = parse_posix_basic(&ctx, re);
775 
776 	/* finalize group #0 */
777 	if (!err)
778 		err = finish_branch(&ctx);
779 
780 	/* relax if details are not needed */
781 	if (flags & REG_NOSUB) {
782 		rxi->flags |= REG_RELAXED_MATCHING;
783 		rx->re_nsub = 0;
784 	}
785 failed:
786 	/* clean up if problems */
787 	if (err)
788 		regfree(rx);
789 	return err;
790 }
791 
792 /*
793  * Matching code
794  */
795 
796 /* historical best match */
797 struct HMatch {
798 	const char *hist_start;
799 	const char *hist_end;
800 	int rep_len; 		/* if repeated seq, full len thus far */
801 };
802 
803 /* per-group-match context */
804 struct GMatch {
805 	struct GMatch *parent;	/* parent group */
806 	const struct Op *owner;	/* Op for this group */
807 	const char *start;	/* match start */
808 	const char *end;	/* match end, NULL if no match */
809 	struct GMatch *prevgm;	/* older stack entry */
810 	struct HMatch hm_next;	/* best match for following stack entry */
811 	int count;		/* match nr in repeated seq */
812 };
813 
814 /* top context */
815 struct ExecCtx {
816 	const regex_t *rx;
817 	const struct RegexInt *rxi;
818 	const char *str_start;
819 	regmatch_t *pmatch;
820 	int nmatch;
821 	int flags;
822 	bool strict;
823 	const char *last_endpos;
824 	struct HMatch hm_first[MAX_GROUPS];
825 	struct GMatch *gm_stack[MAX_GROUPS];
826 	struct GMatch *gm_cache[MAX_GROUPS];
827 };
828 
push_gm(struct ExecCtx * ctx,struct GMatch * gm)829 static void push_gm(struct ExecCtx *ctx, struct GMatch *gm)
830 {
831 	int gno = gm->owner->grp_no;
832 	gm->prevgm = ctx->gm_stack[gno];
833 	ctx->gm_stack[gno] = gm;
834 }
835 
pop_gm(struct ExecCtx * ctx,struct GMatch * gm)836 static void pop_gm(struct ExecCtx *ctx, struct GMatch *gm)
837 {
838 	int gno = gm->owner->grp_no;
839 	ctx->gm_stack[gno] = gm->prevgm;
840 }
841 
do_match(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)842 static inline int do_match(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
843 {
844 	return op->matcher(ctx, op, str, gm);
845 }
846 
scan_next(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm,int curcnt,int alen)847 static int scan_next(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm, int curcnt, int alen)
848 {
849 	int err = REG_NOMATCH;
850 	bool gotmatch = false;
851 
852 	if (curcnt == op->mincnt)
853 		return do_match(ctx, op->next, str, gm);
854 
855 	for (; curcnt >= op->mincnt; curcnt--) {
856 		err = do_match(ctx, op->next, str, gm);
857 		if (STRICT && err == 0)
858 			gotmatch = true;
859 		else if (err != REG_NOMATCH)
860 			break;
861 		str -= alen;
862 	}
863 	if (err == REG_NOMATCH && gotmatch)
864 		err = 0;
865 	return err;
866 }
867 
match_char(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)868 static int match_char(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
869 {
870 	bool icase = (ctx->flags & REG_ICASE);
871 	int c, i, maxcnt = SIMPLE_MAXCNT(op);
872 
873 	for (i = 0; (i < maxcnt) && str[i]; i++) {
874 		c = icase ? tolower((unsigned char)str[i]) : str[i];
875 		if (c != op->lit)
876 			break;
877 	}
878 	return scan_next(ctx, op, str + i, gm, i, 1);
879 }
880 
match_any(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)881 static int match_any(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
882 {
883 	bool nl = (ctx->flags & REG_NEWLINE);
884 	int i, maxcnt = SIMPLE_MAXCNT(op);
885 
886 	for (i = 0; (i < maxcnt) && str[i]; i++) {
887 		if (nl && str[i] == '\n')
888 			break;
889 	}
890 	return scan_next(ctx, op, str + i, gm, i, 1);
891 }
892 
match_class(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)893 static int match_class(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
894 {
895 	int i, maxcnt = SIMPLE_MAXCNT(op);
896 
897 	for (i = 0; (i < maxcnt); i++) {
898 		if (!class_isset(&op->cdata, str[i]))
899 			break;
900 	}
901 	return scan_next(ctx, op, str + i, gm, i, 1);
902 }
903 
match_bol(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)904 static int match_bol(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
905 {
906 	if (str == ctx->str_start && !(ctx->flags & REG_NOTBOL))
907 		return do_match(ctx, op->next, str, gm);
908 	else if (str != ctx->str_start && str[-1] == '\n' && (ctx->flags & REG_NEWLINE))
909 		return do_match(ctx, op->next, str, gm);
910 	return REG_NOMATCH;
911 }
912 
match_eol(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)913 static int match_eol(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
914 {
915 	if (*str == '\n' && (ctx->flags & REG_NEWLINE))
916 		return do_match(ctx, op->next, str, gm);
917 	else if (*str == 0 && !(ctx->flags & REG_NOTEOL))
918 		return do_match(ctx, op->next, str, gm);
919 	return REG_NOMATCH;
920 }
921 
match_wchange(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)922 static int match_wchange(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
923 {
924 	bool prevw = (str == ctx->str_start) ? false : is_word(str[-1]);
925 	bool curw = is_word(str[0]);
926 	bool ischange = prevw ^ curw;
927 
928 	if ((op->type == OP_WCHANGE) ? ischange : !ischange)
929 		return do_match(ctx, op->next, str, gm);
930 	return REG_NOMATCH;
931 }
932 
match_bref(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)933 static int match_bref(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
934 {
935 	bool icase = ctx->flags & REG_ICASE;
936 	int i;
937 	struct GMatch *bgm = ctx->gm_stack[op->bref];
938 	int blen = (bgm && bgm->end) ? (bgm->end - bgm->start) : -1;
939 
940 	/* handle no-match, zero-len, zero-count */
941 	if (blen < 0 && op->mincnt > 0)
942 		return REG_NOMATCH;
943 	if (blen <= 0 || op->maxcnt == 0)
944 		return do_match(ctx, op->next, str, gm);
945 
946 	/* find max matches */
947 	for (i = 0; (i < op->maxcnt) && *str; i++) {
948 		if (icase && strncasecmp(str, bgm->start, blen) != 0)
949 			break;
950 		else if (!icase && strncmp(str, bgm->start, blen) != 0)
951 			break;
952 		str += blen;
953 	}
954 	return scan_next(ctx, op, str, gm, i, blen);
955 }
956 
match_group(struct ExecCtx * ctx,const struct Op * op,const char * str,struct GMatch * gm)957 static int match_group(struct ExecCtx *ctx, const struct Op *op, const char *str, struct GMatch *gm)
958 {
959 	int err = REG_NOMATCH;
960 	bool gotmatch = false;
961 	struct GMatch gthis;
962 
963 	/* per-group-match context */
964 	memset(&gthis, 0, sizeof(gthis));
965 	gthis.owner = op;
966 	gthis.start = str;
967 	gthis.parent = gm;
968 	if (gm && gm->owner == op) {
969 		gthis.parent = gm->parent;
970 		gthis.count = gm->count + 1;
971 	}
972 	gm = &gthis;
973 	push_gm(ctx, gm);
974 
975 	if (op->maxcnt > 0) {
976 		struct AndList *alist = op->gdata.or_list;
977 		/* check all branches, unless relaxed matching */
978 		while (alist) {
979 			err = do_match(ctx, alist->op_list, str, gm);
980 			if (err == 0 && STRICT) {
981 				gm->end = NULL;
982 				gotmatch = true;
983 			} else if (err != REG_NOMATCH)
984 				break;
985 			alist = alist->next;
986 		}
987 	}
988 
989 	/* is no-match allowed? */
990 	if ((op->mincnt == 0) && (gm->count == 0)
991 	    && (err == REG_NOMATCH || (err == 0 && STRICT))) {
992 		gm->end = NULL;
993 		err = do_match(ctx, op->next, str, gm->parent);
994 	}
995 
996 	pop_gm(ctx, gm);
997 	return gotmatch ? 0 : err;
998 }
999 
match_gend(struct ExecCtx * ctx,const struct Op * f_op,const char * str,struct GMatch * gm)1000 static int match_gend(struct ExecCtx *ctx, const struct Op *f_op, const char *str, struct GMatch *gm)
1001 {
1002 	int err = REG_NOMATCH;
1003 	const struct Op *op = gm->owner;
1004 	bool zeromatch = (str == gm->start);
1005 	bool gotmatch = false;
1006 
1007 	/* ignore follow-up empty matches, unless it has backrefs */
1008 	if (zeromatch && gm->count > 0 && gm->count >= op->mincnt && !gm->owner->gdata.has_refs)
1009 		return REG_NOMATCH;
1010 
1011 	/* tag as matched */
1012 	gm->end = str;
1013 
1014 	/* try more repeats, stop if count full or last match was zero-length */
1015 	if (gm->count + 1 < op->maxcnt && !zeromatch) {
1016 		err = match_group(ctx, op, str, gm);
1017 		if (err == 0 && STRICT)
1018 			gotmatch = true;
1019 		else if (err != REG_NOMATCH)
1020 			return err;
1021 	}
1022 
1023 	/* fail if not enough repeats */
1024 	if (!zeromatch && gm->count + 1 < op->mincnt)
1025 		return err;
1026 
1027 	/* continue with parent branch */
1028 	err = do_match(ctx, op->next, str, gm->parent);
1029 	if (err == REG_NOMATCH && gotmatch)
1030 		err = 0;
1031 	return err;
1032 }
1033 
1034 /*
1035  * The juice of POSIX - match weighting.
1036  */
1037 
gmatch_hist_cmp(struct ExecCtx * ctx,int gno,struct GMatch * gm,int replen)1038 static int gmatch_hist_cmp(struct ExecCtx *ctx, int gno, struct GMatch *gm, int replen)
1039 {
1040 	struct HMatch *hm = (gm->prevgm) ? &gm->prevgm->hm_next : &ctx->hm_first[gno];
1041 	int gmlen = (gm->end) ? (gm->end - gm->start) : -1;
1042 	int hmlen = (hm->hist_end) ? (hm->hist_end - hm->hist_start) : -1;
1043 	int gmreplen = (gmlen >= 0) ? (gmlen + replen) : replen;
1044 	int hmreplen = ((hmlen >= 0) ? hmlen : 0) + hm->rep_len;
1045 	int gmofs = (gm->end) ? (gm->start - ctx->str_start) : -1;
1046 	int hmofs = (hm->hist_start) ? (hm->hist_start - ctx->str_start) : -1;
1047 
1048 	/* prefer rightmost match, to allow preceding elements match more */
1049 	int res = (gmofs - hmofs);
1050 
1051 	/* prefer longer repeated match */
1052 	if (res == 0 && gm->count == 0)
1053 		res = (gmreplen - hmreplen);
1054 
1055 	/* prefer longer single match */
1056 	if (res == 0)
1057 		res = (gmlen - hmlen);
1058 
1059 	return res;
1060 }
1061 
cmp_gmatches(struct ExecCtx * ctx,int gno,struct GMatch * gm,int replen)1062 static int cmp_gmatches(struct ExecCtx *ctx, int gno, struct GMatch *gm, int replen)
1063 {
1064 	int cmp = 0, gmlen;
1065 	if (gm) {
1066 		/* need to compare preceding groups first */
1067 		gmlen = gm->end ? gm->end - gm->start : 0;
1068 		cmp = cmp_gmatches(ctx, gno, gm->prevgm,
1069 				   (gm->count == 0) ? 0 : (replen + gmlen));
1070 		/* actual comparision */
1071 		if (!cmp) cmp = gmatch_hist_cmp(ctx, gno, gm, replen);
1072 	}
1073 	return cmp;
1074 }
1075 
gm_resolve_tie(struct ExecCtx * ctx,int gno)1076 static int gm_resolve_tie(struct ExecCtx *ctx, int gno)
1077 {
1078 	struct GMatch *gm = ctx->gm_stack[gno];
1079 	if (!gm) /* 0-count match is better than no match */
1080 		return ctx->hm_first[gno].hist_start ? -1 : 0;
1081 
1082 	return cmp_gmatches(ctx, gno, gm, 0);
1083 }
1084 
fill_history(struct ExecCtx * ctx,int gno)1085 static void fill_history(struct ExecCtx *ctx, int gno)
1086 {
1087 	struct HMatch *hm;
1088 	int gmlen, rep_len = 0;
1089 	struct GMatch *gm = ctx->gm_stack[gno];
1090 	while (STRICT && gm) {
1091 		hm = (gm->prevgm) ? &gm->prevgm->hm_next : &ctx->hm_first[gno];
1092 		hm->hist_start = gm->start;
1093 		hm->hist_end = gm->end;
1094 		hm->rep_len = rep_len;
1095 		gmlen = gm->end ? (gm->end - gm->start) : 0;
1096 		rep_len += gmlen;
1097 		if (gm->count == 0)
1098 			rep_len = 0;
1099 		gm = gm->prevgm;
1100 	}
1101 }
1102 
publish_gm(struct ExecCtx * ctx,int gno)1103 static void publish_gm(struct ExecCtx *ctx, int gno)
1104 {
1105 	struct GMatch *gm = ctx->gm_stack[gno];
1106 	regmatch_t *rm = ctx->pmatch + gno;
1107 
1108 	/* ignore non-matches */
1109 	while (gm && !gm->end)
1110 		gm = gm->prevgm;
1111 
1112 	/* require it to be inside reported parent */
1113 	if (gm && gm->parent) {
1114 		int pno = gm->parent->owner->grp_no;
1115 		if (gm->parent != ctx->gm_cache[pno])
1116 			gm = NULL;
1117 	}
1118 	ctx->gm_cache[gno] = gm;
1119 
1120 	/* publish new match */
1121 	if (gm) {
1122 		rm->rm_so = gm->start - ctx->str_start;
1123 		rm->rm_eo = gm->end - ctx->str_start;
1124 	} else {
1125 		rm->rm_so = -1;
1126 		rm->rm_eo = -1;
1127 	}
1128 }
1129 
1130 /* compare and publish */
got_full_match(struct ExecCtx * ctx,const struct Op * f_op,const char * str,struct GMatch * gm)1131 static int got_full_match(struct ExecCtx *ctx, const struct Op *f_op, const char *str, struct GMatch *gm)
1132 {
1133 	int gno, cmp;
1134 
1135 	/* tag group as matched */
1136 	gm->end = str;
1137 
1138 	/* ignore shorter matches */
1139 	if (ctx->last_endpos && str < ctx->last_endpos)
1140 		return 0;
1141 
1142 	/* longer or equal length */
1143 	if (str > ctx->last_endpos) {
1144 		ctx->last_endpos = str;
1145 		goto better_match;
1146 	} else if (STRICT && ctx->nmatch > 1) {
1147 		for (gno = 0; gno < ctx->nmatch; gno++) {
1148 			cmp = gm_resolve_tie(ctx, gno);
1149 			if (cmp < 0)
1150 				break;
1151 			if (cmp > 0)
1152 				goto better_match;
1153 		}
1154 	}
1155 	return 0;
1156 
1157 better_match:
1158 	for (gno = 0; gno < ctx->nmatch; gno++) {
1159 		publish_gm(ctx, gno);
1160 		fill_history(ctx, gno);
1161 	}
1162 	return 0;
1163 }
1164 
1165 /* fill in proper matcher */
set_op_type(struct Op * op,enum OpType op_type)1166 static void set_op_type(struct Op *op, enum OpType op_type)
1167 {
1168 	static const matcher_f mlist[] = {
1169 		match_char, match_any, match_class, match_group, match_bref,
1170 		match_bol, match_eol, match_wchange, match_wchange,
1171 		match_gend, got_full_match
1172 	};
1173 	op->matcher = mlist[op_type];
1174 	op->type = op_type;
1175 }
1176 
1177 /*
1178  * Public matching API
1179  */
1180 
regexec(const regex_t * rx,const char * str,size_t nmatch,regmatch_t pmatch[],int eflags)1181 int regexec(const regex_t *rx, const char *str, size_t nmatch, regmatch_t pmatch[], int eflags)
1182 {
1183 	int err;
1184 	struct ExecCtx ctx;
1185 
1186 	if (eflags & ~(REG_NOTBOL | REG_NOTEOL))
1187 		return REG_BADPAT;
1188 
1189 	/* init local context */
1190 	memset(&ctx, 0, sizeof(ctx));
1191 	ctx.pmatch = pmatch;
1192 	ctx.nmatch = nmatch;
1193 	ctx.str_start = str;
1194 	ctx.rx = rx;
1195 	ctx.rxi = rx->internal;
1196 	ctx.flags = ctx.rxi->flags | eflags;
1197 
1198 	/* reset pmatch area */
1199 	if (!(ctx.flags & REG_NOSUB))
1200 		memset(pmatch, -1, nmatch * sizeof(regmatch_t));
1201 
1202 	/* decide pmatch area that will be used */
1203 	if (!pmatch || (ctx.flags & REG_NOSUB))
1204 		ctx.nmatch = 0;
1205 	else if (nmatch > (size_t)rx->re_nsub + 1)
1206 		ctx.nmatch = rx->re_nsub + 1;
1207 	ctx.strict = !(ctx.flags & REG_RELAXED_MATCHING) && (ctx.nmatch > 0);
1208 
1209 	/* execute search */
1210 	str--;
1211 	do {
1212 		str++;
1213 		err = do_match(&ctx, ctx.rxi->root, str, NULL);
1214 	} while ((err == REG_NOMATCH) && *str);
1215 
1216 	return err;
1217 }
1218 
1219 /*
1220  * Free parse tree
1221  */
1222 
regfree(regex_t * rx)1223 void regfree(regex_t *rx)
1224 {
1225 	struct RegexInt *rxi;
1226 	if (rx) {
1227 		rxi = rx->internal;
1228 		if (rxi)
1229 			mempool_destroy(&rxi->pool);
1230 		memset(rx, 0, sizeof(*rx));
1231 	}
1232 }
1233 
1234 /*
1235  * Error strings
1236  */
1237 
regerror(int err,const regex_t * rx,char * dst,size_t dstlen)1238 size_t regerror(int err, const regex_t *rx, char *dst, size_t dstlen)
1239 {
1240 	static const char errlist[][9] = {
1241 		"NOERROR",	/* 0 */
1242 		"NOMATCH",	/* 1 */
1243 		"BADBR",	/* 2 */
1244 		"BADPAT",	/* 3 */
1245 		"BADRPT",	/* 4 */
1246 		"EBRACE",	/* 5 */
1247 		"EBRACK",	/* 6 */
1248 		"ECOLLATE",	/* 7 */
1249 		"ECTYPE",	/* 8 */
1250 		"EESCAPE",	/* 9 */
1251 		"EPAREN",	/* 10 */
1252 		"ERANGE",	/* 11 */
1253 		"ESPACE",	/* 12 */
1254 		"ESUBREG",	/* 13 */
1255 	};
1256 	const char *s = "EUNKNOWN";
1257 	if ((size_t)err < ARRAY_NELEM(errlist))
1258 		s = errlist[err];
1259 	return snprintf(dst, dstlen, "%s", s);
1260 }
1261 
1262 #endif /* !USE_SYSTEM_REGEX */
1263