1 /*
2  * vi:se ts=8:
3  *
4  * regcomp and regexec -- regsub and regerror are elsewhere
5  *
6  *	Copyright (c) 1986 by University of Toronto.
7  *	Written by Henry Spencer.  Not derived from licensed software.
8  *
9  *	Permission is granted to anyone to use this software for any
10  *	purpose on any computer system, and to redistribute it freely,
11  *	subject to the following restrictions:
12  *
13  *	1. The author is not responsible for the consequences of use of
14  *		this software, no matter how awful, even if they arise
15  *		from defects in it.
16  *
17  *	2. The origin of this software must not be misrepresented, either
18  *		by explicit claim or by omission.
19  *
20  *	3. Altered versions must be plainly marked as such, and must not
21  *		be misrepresented as being the original software.
22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24  *** to assist in implementing egrep.
25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27  *** as in BSD grep and ex.
28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
35  *** to avoid conflicting with the system regexp.h.  Const, bless its
36  *** soul, was removed so it can compile everywhere.  The declaration
37  *** of strchr() was in conflict on AIX, so it was removed (as it is
38  *** happily defined in string.h).
39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
43  *
44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46  *   This includes counted repetitions, UTF-8 support, character classes,
47  *   shorthand character classes, increased number of parentheses to 100,
48  *   backslash escape sequences. It also removes \n as an alternative to |.
49  *
50  *** THIS IS AN ALTERED VERSION.  It was altered to offer POSIX-like
51  *** regular expression routines of regcomp/regexec/regerror/regfree,
52  *** with UTF-8 support, by NIIBE Yutaka <gniibe@fsij.org> on
53  *** 2020-02-14.
54  *
55  * Beware that some of this code is subtly aware of the way operator
56  * precedence is structured in regular expressions.  Serious changes in
57  * regular-expression syntax might require a total rethink.
58  */
59 
60 #if defined(JIM_REGEXP)
61 #include <stdio.h>
62 #include <ctype.h>
63 #include <stdlib.h>
64 #include <string.h>
65 
66 #include "jimregexp.h"
67 #include "utf8.h"
68 
69 #define UCHAR(c) ((unsigned char)(c))
70 
71 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
72 #define REG_MAX_PAREN 100
73 
74 /*
75  * Structure for regexp "program".  This is essentially a linear encoding
76  * of a nondeterministic finite-state machine (aka syntax charts or
77  * "railroad normal form" in parsing technology).  Each node is an opcode
78  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
79  * all nodes except BRANCH implement concatenation; a "next" pointer with
80  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
81  * have one of the subtle syntax dependencies:  an individual BRANCH (as
82  * opposed to a collection of them) is never concatenated with anything
83  * because of operator precedence.)  The operand of some types of node is
84  * a literal string; for others, it is a node leading into a sub-FSM.  In
85  * particular, the operand of a BRANCH node is the first node of the branch.
86  * (NB this is *not* a tree structure:  the tail of the branch connects
87  * to the thing following the set of BRANCHes.)  The opcodes are:
88  */
89 
90 /* definition		number	opnd?	meaning */
91 #define	END	0	/* no	End of program. */
92 #define	BOL	1	/* no	Match "" at beginning of line. */
93 #define	EOL	2	/* no	Match "" at end of line. */
94 #define	ANY	3	/* no	Match any one character. */
95 #define	ANYOF	4	/* str	Match any character in this string. */
96 #define	ANYBUT	5	/* str	Match any character not in this string. */
97 #define	BRANCH	6	/* node	Match this alternative, or the next... */
98 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
99 #define	EXACTLY	8	/* str	Match this string. */
100 #define	NOTHING	9	/* no	Match empty string. */
101 #define	REP	10	/* max,min	Match this (simple) thing [min,max] times. */
102 #define	REPMIN	11	/* max,min	Match this (simple) thing [min,max] times, minimal match. */
103 #define	REPX	12	/* max,min	Match this (complex) thing [min,max] times. */
104 #define	REPXMIN	13	/* max,min	Match this (complex) thing [min,max] times, minimal match. */
105 #define	BOLX	14	/* no	Match "" at beginning of input. */
106 #define	EOLX	15	/* no	Match "" at end of input. */
107 #define	WORDA	16	/* no	Match "" at wordchar, where prev is nonword */
108 #define	WORDZ	17	/* no	Match "" at nonwordchar, where prev is word */
109 
110 #define	OPENNC 	1000	/* no	Non-capturing parentheses - must be OPEN-1 */
111 #define	OPEN   	1001	/* no	Mark this point in input as start of #n. */
112 			/*	OPEN+1 is number 1, etc. */
113 
114 /* must not be any other opts between OPEN and CLOSE */
115 
116 #define	CLOSENC	2000 	/* no	Non-capturing parentheses - must be CLOSE-1 */
117 #define	CLOSE	2001 	/* no	Analogous to OPEN. */
118 #define	CLOSE_END	(CLOSE+REG_MAX_PAREN)
119 
120 /*
121  * The first word of the regexp internal "program" is actually this magic
122  * number; the start node begins in the second word.
123  */
124 #define	REG_MAGIC	0xFADED00D
125 
126 /*
127  * Opcode notes:
128  *
129  * BRANCH	The set of branches constituting a single choice are hooked
130  *		together with their "next" pointers, since precedence prevents
131  *		anything being concatenated to any individual branch.  The
132  *		"next" pointer of the last BRANCH in a choice points to the
133  *		thing following the whole choice.  This is also where the
134  *		final "next" pointer of each individual branch points; each
135  *		branch starts with the operand node of a BRANCH node.
136  *
137  * BACK		Normal "next" pointers all implicitly point forward; BACK
138  *		exists to make loop structures possible.
139  *
140  * REP,REPX	Repeated matches ('?', '*', '+' and {min,max}) are implemented
141  *              as either simple repeats (REP) or complex repeats (REPX).
142  *              These opcodes include a "min" and "max" count after the opcode.
143  *		This is followed by a fourth "current count" word that is
144  *		only used by REPX, as it implements a recursive match.
145  *		REPMIN and REPXMIN are identical except they implement minimal repeats.
146  *
147  * OPEN,CLOSE	...are numbered at compile time.
148  */
149 
150 /*
151  * A node is one word of opcode followed by one word of "next" pointer.
152  * The "next" pointer value is a positive offset from the opcode of the node
153  * containing it.
154  * An operand, if any, simply follows the node.  (Note that much of the
155  * code generation knows about this implicit relationship.)
156  */
157 #define	OP(preg, p)	(preg->program[p])
158 #define	NEXT(preg, p)	(preg->program[p + 1])
159 #define	OPERAND(p)	((p) + 2)
160 
161 /*
162  * See regmagic.h for one further detail of program structure.
163  */
164 
165 
166 /*
167  * Utility definitions.
168  */
169 
170 #define	FAIL(R,M)	{ (R)->err = (M); return (M); }
171 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
172 #define	META		"^$.[()|?{+*"
173 
174 /*
175  * Flags to be passed up and down.
176  */
177 #define	HASWIDTH	1	/* Known never to match null string. */
178 #define	SIMPLE		2	/* Simple enough to be STAR/PLUS operand. */
179 #define	SPSTART		4	/* Starts with * or +. */
180 #define	WORST		0	/* Worst case. */
181 
182 #define MAX_REP_COUNT 1000000
183 
184 /*
185  * Forward declarations for regcomp()'s friends.
186  */
187 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
188 static int regpiece(regex_t *preg, int *flagp );
189 static int regbranch(regex_t *preg, int *flagp );
190 static int regatom(regex_t *preg, int *flagp );
191 static int regnode(regex_t *preg, int op );
192 static int regnext(regex_t *preg, int p );
193 static void regc(regex_t *preg, int b );
194 static int reginsert(regex_t *preg, int op, int size, int opnd );
195 static void regtail(regex_t *preg, int p, int val);
196 static void regoptail(regex_t *preg, int p, int val );
197 static int regopsize(regex_t *preg, int p );
198 
199 static int reg_range_find(const int *string, int c);
200 static const char *str_find(const char *string, int c, int nocase);
201 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
202 
203 /*#define DEBUG*/
204 #ifdef DEBUG
205 static int regnarrate = 0;
206 static void regdump(regex_t *preg);
207 static const char *regprop( int op );
208 #endif
209 
210 
211 /**
212  * Returns the length of the null-terminated integer sequence.
213  */
str_int_len(const int * seq)214 static int str_int_len(const int *seq)
215 {
216 	int n = 0;
217 	while (*seq++) {
218 		n++;
219 	}
220 	return n;
221 }
222 
223 /*
224  - regcomp - compile a regular expression into internal code
225  *
226  * We can't allocate space until we know how big the compiled form will be,
227  * but we can't compile it (and thus know how big it is) until we've got a
228  * place to put the code.  So we cheat:  we compile it twice, once with code
229  * generation turned off and size counting turned on, and once "for real".
230  * This also means that we don't allocate space until we are sure that the
231  * thing really will compile successfully, and we never have to move the
232  * code and thus invalidate pointers into it.  (Note that it has to be in
233  * one piece because free() must be able to free it all.)
234  *
235  * Beware that the optimization-preparation code in here knows about some
236  * of the structure of the compiled regexp.
237  */
regcomp(regex_t * preg,const char * exp,int cflags)238 int regcomp(regex_t *preg, const char *exp, int cflags)
239 {
240 	int scan;
241 	int longest;
242 	unsigned len;
243 	int flags;
244 
245 #ifdef DEBUG
246 	fprintf(stderr, "Compiling: '%s'\n", exp);
247 #endif
248 	memset(preg, 0, sizeof(*preg));
249 
250 	if (exp == NULL)
251 		FAIL(preg, REG_ERR_NULL_ARGUMENT);
252 
253 	/* First pass: determine size, legality. */
254 	preg->cflags = cflags;
255 	preg->regparse = exp;
256 
257 	/* Allocate space. */
258 	preg->proglen = (strlen(exp) + 1) * 5;
259 	preg->program = malloc(preg->proglen * sizeof(int));
260 	if (preg->program == NULL)
261 		FAIL(preg, REG_ERR_NOMEM);
262 
263 	/* Note that since we store a magic value as the first item in the program,
264 	 * program offsets will never be 0
265 	 */
266 	regc(preg, REG_MAGIC);
267 	if (reg(preg, 0, &flags) == 0) {
268 		return preg->err;
269 	}
270 
271 	/* Small enough for pointer-storage convention? */
272 	if (preg->re_nsub >= REG_MAX_PAREN)		/* Probably could be 65535L. */
273 		FAIL(preg,REG_ERR_TOO_BIG);
274 
275 	/* Dig out information for optimizations. */
276 	preg->regstart = 0;	/* Worst-case defaults. */
277 	preg->reganch = 0;
278 	preg->regmust = 0;
279 	preg->regmlen = 0;
280 	scan = 1;			/* First BRANCH. */
281 	if (OP(preg, regnext(preg, scan)) == END) {		/* Only one top-level choice. */
282 		scan = OPERAND(scan);
283 
284 		/* Starting-point info. */
285 		if (OP(preg, scan) == EXACTLY) {
286 			preg->regstart = preg->program[OPERAND(scan)];
287 		}
288 		else if (OP(preg, scan) == BOL)
289 			preg->reganch++;
290 
291 		/*
292 		 * If there's something expensive in the r.e., find the
293 		 * longest literal string that must appear and make it the
294 		 * regmust.  Resolve ties in favor of later strings, since
295 		 * the regstart check works with the beginning of the r.e.
296 		 * and avoiding duplication strengthens checking.  Not a
297 		 * strong reason, but sufficient in the absence of others.
298 		 */
299 		if (flags&SPSTART) {
300 			longest = 0;
301 			len = 0;
302 			for (; scan != 0; scan = regnext(preg, scan)) {
303 				if (OP(preg, scan) == EXACTLY) {
304 					int plen = str_int_len(preg->program + OPERAND(scan));
305 					if (plen >= len) {
306 						longest = OPERAND(scan);
307 						len = plen;
308 					}
309 				}
310 			}
311 			preg->regmust = longest;
312 			preg->regmlen = len;
313 		}
314 	}
315 
316 #ifdef DEBUG
317 	regdump(preg);
318 #endif
319 
320 	return 0;
321 }
322 
323 /*
324  - reg - regular expression, i.e. main body or parenthesized thing
325  *
326  * Caller must absorb opening parenthesis.
327  *
328  * Combining parenthesis handling with the base level of regular expression
329  * is a trifle forced, but the need to tie the tails of the branches to what
330  * follows makes it hard to avoid.
331  */
reg(regex_t * preg,int paren,int * flagp)332 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
333 {
334 	int ret;
335 	int br;
336 	int ender;
337 	int parno = 0;
338 	int flags;
339 
340 	*flagp = HASWIDTH;	/* Tentatively. */
341 
342 	/* Make an OPEN node, if parenthesized. */
343 	if (paren) {
344 		if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
345 			/* non-capturing paren */
346 			preg->regparse += 2;
347 			parno = -1;
348 		}
349 		else {
350 			parno = ++preg->re_nsub;
351 		}
352 		ret = regnode(preg, OPEN+parno);
353 	} else
354 		ret = 0;
355 
356 	/* Pick up the branches, linking them together. */
357 	br = regbranch(preg, &flags);
358 	if (br == 0)
359 		return 0;
360 	if (ret != 0)
361 		regtail(preg, ret, br);	/* OPEN -> first. */
362 	else
363 		ret = br;
364 	if (!(flags&HASWIDTH))
365 		*flagp &= ~HASWIDTH;
366 	*flagp |= flags&SPSTART;
367 	while (*preg->regparse == '|') {
368 		preg->regparse++;
369 		br = regbranch(preg, &flags);
370 		if (br == 0)
371 			return 0;
372 		regtail(preg, ret, br);	/* BRANCH -> BRANCH. */
373 		if (!(flags&HASWIDTH))
374 			*flagp &= ~HASWIDTH;
375 		*flagp |= flags&SPSTART;
376 	}
377 
378 	/* Make a closing node, and hook it on the end. */
379 	ender = regnode(preg, (paren) ? CLOSE+parno : END);
380 	regtail(preg, ret, ender);
381 
382 	/* Hook the tails of the branches to the closing node. */
383 	for (br = ret; br != 0; br = regnext(preg, br))
384 		regoptail(preg, br, ender);
385 
386 	/* Check for proper termination. */
387 	if (paren && *preg->regparse++ != ')') {
388 		preg->err = REG_ERR_UNMATCHED_PAREN;
389 		return 0;
390 	} else if (!paren && *preg->regparse != '\0') {
391 		if (*preg->regparse == ')') {
392 			preg->err = REG_ERR_UNMATCHED_PAREN;
393 			return 0;
394 		} else {
395 			preg->err = REG_ERR_JUNK_ON_END;
396 			return 0;
397 		}
398 	}
399 
400 	return(ret);
401 }
402 
403 /*
404  - regbranch - one alternative of an | operator
405  *
406  * Implements the concatenation operator.
407  */
regbranch(regex_t * preg,int * flagp)408 static int regbranch(regex_t *preg, int *flagp )
409 {
410 	int ret;
411 	int chain;
412 	int latest;
413 	int flags;
414 
415 	*flagp = WORST;		/* Tentatively. */
416 
417 	ret = regnode(preg, BRANCH);
418 	chain = 0;
419 	while (*preg->regparse != '\0' && *preg->regparse != ')' &&
420 	       *preg->regparse != '|') {
421 		latest = regpiece(preg, &flags);
422 		if (latest == 0)
423 			return 0;
424 		*flagp |= flags&HASWIDTH;
425 		if (chain == 0) {/* First piece. */
426 			*flagp |= flags&SPSTART;
427 		}
428 		else {
429 			regtail(preg, chain, latest);
430 		}
431 		chain = latest;
432 	}
433 	if (chain == 0)	/* Loop ran zero times. */
434 		(void) regnode(preg, NOTHING);
435 
436 	return(ret);
437 }
438 
439 /*
440  - regpiece - something followed by possible [*+?]
441  *
442  * Note that the branching code sequences used for ? and the general cases
443  * of * and + are somewhat optimized:  they use the same NOTHING node as
444  * both the endmarker for their branch list and the body of the last branch.
445  * It might seem that this node could be dispensed with entirely, but the
446  * endmarker role is not redundant.
447  */
regpiece(regex_t * preg,int * flagp)448 static int regpiece(regex_t *preg, int *flagp)
449 {
450 	int ret;
451 	char op;
452 	int next;
453 	int flags;
454 	int min;
455 	int max;
456 
457 	ret = regatom(preg, &flags);
458 	if (ret == 0)
459 		return 0;
460 
461 	op = *preg->regparse;
462 	if (!ISMULT(op)) {
463 		*flagp = flags;
464 		return(ret);
465 	}
466 
467 	if (!(flags&HASWIDTH) && op != '?') {
468 		preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
469 		return 0;
470 	}
471 
472 	/* Handle braces (counted repetition) by expansion */
473 	if (op == '{') {
474 		char *end;
475 
476 		min = strtoul(preg->regparse + 1, &end, 10);
477 		if (end == preg->regparse + 1) {
478 			preg->err = REG_ERR_BAD_COUNT;
479 			return 0;
480 		}
481 		if (*end == '}') {
482 			max = min;
483 		}
484 		else if (*end == '\0') {
485 			preg->err = REG_ERR_UNMATCHED_BRACES;
486 			return 0;
487 		}
488 		else {
489 			preg->regparse = end;
490 			max = strtoul(preg->regparse + 1, &end, 10);
491 			if (*end != '}') {
492 				preg->err = REG_ERR_UNMATCHED_BRACES;
493 				return 0;
494 			}
495 		}
496 		if (end == preg->regparse + 1) {
497 			max = MAX_REP_COUNT;
498 		}
499 		else if (max < min || max >= 100) {
500 			preg->err = REG_ERR_BAD_COUNT;
501 			return 0;
502 		}
503 		if (min >= 100) {
504 			preg->err = REG_ERR_BAD_COUNT;
505 			return 0;
506 		}
507 
508 		preg->regparse = strchr(preg->regparse, '}');
509 	}
510 	else {
511 		min = (op == '+');
512 		max = (op == '?' ? 1 : MAX_REP_COUNT);
513 	}
514 
515 	if (preg->regparse[1] == '?') {
516 		preg->regparse++;
517 		next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
518 	}
519 	else {
520 		next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
521 	}
522 	preg->program[ret + 2] = max;
523 	preg->program[ret + 3] = min;
524 	preg->program[ret + 4] = 0;
525 
526 	*flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
527 
528 	if (!(flags & SIMPLE)) {
529 		int back = regnode(preg, BACK);
530 		regtail(preg, back, ret);
531 		regtail(preg, next, back);
532 	}
533 
534 	preg->regparse++;
535 	if (ISMULT(*preg->regparse)) {
536 		preg->err = REG_ERR_NESTED_COUNT;
537 		return 0;
538 	}
539 
540 	return ret;
541 }
542 
543 /**
544  * Add all characters in the inclusive range between lower and upper.
545  *
546  * Handles a swapped range (upper < lower).
547  */
reg_addrange(regex_t * preg,int lower,int upper)548 static void reg_addrange(regex_t *preg, int lower, int upper)
549 {
550 	if (lower > upper) {
551 		reg_addrange(preg, upper, lower);
552 	}
553 	/* Add a range as length, start */
554 	regc(preg, upper - lower + 1);
555 	regc(preg, lower);
556 }
557 
558 /**
559  * Add a null-terminated literal string as a set of ranges.
560  */
reg_addrange_str(regex_t * preg,const char * str)561 static void reg_addrange_str(regex_t *preg, const char *str)
562 {
563 	while (*str) {
564 		reg_addrange(preg, *str, *str);
565 		str++;
566 	}
567 }
568 
569 /**
570  * Extracts the next unicode char from utf8.
571  *
572  * If 'upper' is set, converts the char to uppercase.
573  */
reg_utf8_tounicode_case(const char * s,int * uc,int upper)574 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
575 {
576 	int l = utf8_tounicode(s, uc);
577 	if (upper) {
578 		*uc = utf8_upper(*uc);
579 	}
580 	return l;
581 }
582 
583 /**
584  * Converts a hex digit to decimal.
585  *
586  * Returns -1 for an invalid hex digit.
587  */
hexdigitval(int c)588 static int hexdigitval(int c)
589 {
590 	if (c >= '0' && c <= '9')
591 		return c - '0';
592 	if (c >= 'a' && c <= 'f')
593 		return c - 'a' + 10;
594 	if (c >= 'A' && c <= 'F')
595 		return c - 'A' + 10;
596 	return -1;
597 }
598 
599 /**
600  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
601  *
602  * Returns the number of hex digits parsed.
603  * If there are no hex digits, returns 0 and stores nothing.
604  */
parse_hex(const char * s,int n,int * uc)605 static int parse_hex(const char *s, int n, int *uc)
606 {
607 	int val = 0;
608 	int k;
609 
610 	for (k = 0; k < n; k++) {
611 		int c = hexdigitval(*s++);
612 		if (c == -1) {
613 			break;
614 		}
615 		val = (val << 4) | c;
616 	}
617 	if (k) {
618 		*uc = val;
619 	}
620 	return k;
621 }
622 
623 /**
624  * Call for chars after a backlash to decode the escape sequence.
625  *
626  * Stores the result in *ch.
627  *
628  * Returns the number of bytes consumed.
629  */
reg_decode_escape(const char * s,int * ch)630 static int reg_decode_escape(const char *s, int *ch)
631 {
632 	int n;
633 	const char *s0 = s;
634 
635 	*ch = *s++;
636 
637 	switch (*ch) {
638 		case 'b': *ch = '\b'; break;
639 		case 'e': *ch = 27; break;
640 		case 'f': *ch = '\f'; break;
641 		case 'n': *ch = '\n'; break;
642 		case 'r': *ch = '\r'; break;
643 		case 't': *ch = '\t'; break;
644 		case 'v': *ch = '\v'; break;
645 		case 'u':
646 			if (*s == '{') {
647 				/* Expect \u{NNNN} */
648 				n = parse_hex(s + 1, 6, ch);
649 				if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
650 					s += n + 2;
651 				}
652 				else {
653 					/* Invalid, so just treat as an escaped 'u' */
654 					*ch = 'u';
655 				}
656 			}
657 			else if ((n = parse_hex(s, 4, ch)) > 0) {
658 				s += n;
659 			}
660 			break;
661 		case 'U':
662 			if ((n = parse_hex(s, 8, ch)) > 0) {
663 				s += n;
664 			}
665 			break;
666 		case 'x':
667 			if ((n = parse_hex(s, 2, ch)) > 0) {
668 				s += n;
669 			}
670 			break;
671 		case '\0':
672 			s--;
673 			*ch = '\\';
674 			break;
675 	}
676 	return s - s0;
677 }
678 
679 /*
680  - regatom - the lowest level
681  *
682  * Optimization:  gobbles an entire sequence of ordinary characters so that
683  * it can turn them into a single node, which is smaller to store and
684  * faster to run.  Backslashed characters are exceptions, each becoming a
685  * separate node; the code is simpler that way and it's not worth fixing.
686  */
regatom(regex_t * preg,int * flagp)687 static int regatom(regex_t *preg, int *flagp)
688 {
689 	int ret;
690 	int flags;
691 	int nocase = (preg->cflags & REG_ICASE);
692 
693 	int ch;
694 	int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
695 
696 	*flagp = WORST;		/* Tentatively. */
697 
698 	preg->regparse += n;
699 	switch (ch) {
700 	/* FIXME: these chars only have meaning at beg/end of pat? */
701 	case '^':
702 		ret = regnode(preg, BOL);
703 		break;
704 	case '$':
705 		ret = regnode(preg, EOL);
706 		break;
707 	case '.':
708 		ret = regnode(preg, ANY);
709 		*flagp |= HASWIDTH|SIMPLE;
710 		break;
711 	case '[': {
712 			const char *pattern = preg->regparse;
713 
714 			if (*pattern == '^') {	/* Complement of range. */
715 				ret = regnode(preg, ANYBUT);
716 				pattern++;
717 			} else
718 				ret = regnode(preg, ANYOF);
719 
720 			/* Special case. If the first char is ']' or '-', it is part of the set */
721 			if (*pattern == ']' || *pattern == '-') {
722 				reg_addrange(preg, *pattern, *pattern);
723 				pattern++;
724 			}
725 
726 			while (*pattern != ']') {
727 				/* Is this a range? a-z */
728 				int start;
729 				int end;
730 
731 				enum {
732 					CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
733 					CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
734 					CC_NUM
735 				};
736 				int cc;
737 
738 				if (!*pattern) {
739 					preg->err = REG_ERR_UNMATCHED_BRACKET;
740 					return 0;
741 				}
742 
743 				pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
744 				if (start == '\\') {
745 					/* First check for class shorthand escapes */
746 					switch (*pattern) {
747 						case 's':
748 							pattern++;
749 							cc = CC_SPACE;
750 							goto cc_switch;
751 						case 'd':
752 							pattern++;
753 							cc = CC_DIGIT;
754 							goto cc_switch;
755 						case 'w':
756 							pattern++;
757 							reg_addrange(preg, '_', '_');
758 							cc = CC_ALNUM;
759 							goto cc_switch;
760 					}
761 					pattern += reg_decode_escape(pattern, &start);
762 					if (start == 0) {
763 						preg->err = REG_ERR_NULL_CHAR;
764 						return 0;
765 					}
766 					if (start == '\\' && *pattern == 0) {
767 						preg->err = REG_ERR_INVALID_ESCAPE;
768 						return 0;
769 					}
770 				}
771 				if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
772 					/* skip '-' */
773 					pattern += utf8_tounicode(pattern, &end);
774 					pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
775 					if (end == '\\') {
776 						pattern += reg_decode_escape(pattern, &end);
777 						if (end == 0) {
778 							preg->err = REG_ERR_NULL_CHAR;
779 							return 0;
780 						}
781 						if (start == '\\' && *pattern == 0) {
782 							preg->err = REG_ERR_INVALID_ESCAPE;
783 							return 0;
784 						}
785 					}
786 
787 					reg_addrange(preg, start, end);
788 					continue;
789 				}
790 				if (start == '[' && pattern[0] == ':') {
791 					static const char *character_class[] = {
792 						":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
793 						":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
794 					};
795 
796 					for (cc = 0; cc < CC_NUM; cc++) {
797 						n = strlen(character_class[cc]);
798 						if (strncmp(pattern, character_class[cc], n) == 0) {
799 							/* Found a character class */
800 							pattern += n + 1;
801 							break;
802 						}
803 					}
804 					if (cc != CC_NUM) {
805 cc_switch:
806 						switch (cc) {
807 							case CC_ALNUM:
808 								reg_addrange(preg, '0', '9');
809 								/* Fall through */
810 							case CC_ALPHA:
811 								if ((preg->cflags & REG_ICASE) == 0) {
812 									reg_addrange(preg, 'a', 'z');
813 								}
814 								reg_addrange(preg, 'A', 'Z');
815 								break;
816 							case CC_SPACE:
817 								reg_addrange_str(preg, " \t\r\n\f\v");
818 								break;
819 							case CC_BLANK:
820 								reg_addrange_str(preg, " \t");
821 								break;
822 							case CC_UPPER:
823 								reg_addrange(preg, 'A', 'Z');
824 								break;
825 							case CC_LOWER:
826 								reg_addrange(preg, 'a', 'z');
827 								break;
828 							case CC_XDIGIT:
829 								reg_addrange(preg, 'a', 'f');
830 								reg_addrange(preg, 'A', 'F');
831 								/* Fall through */
832 							case CC_DIGIT:
833 								reg_addrange(preg, '0', '9');
834 								break;
835 							case CC_CNTRL:
836 								reg_addrange(preg, 0, 31);
837 								reg_addrange(preg, 127, 127);
838 								break;
839 							case CC_PRINT:
840 								reg_addrange(preg, ' ', '~');
841 								break;
842 							case CC_GRAPH:
843 								reg_addrange(preg, '!', '~');
844 								break;
845 							case CC_PUNCT:
846 								reg_addrange(preg, '!', '/');
847 								reg_addrange(preg, ':', '@');
848 								reg_addrange(preg, '[', '`');
849 								reg_addrange(preg, '{', '~');
850 								break;
851 						}
852 						continue;
853 					}
854 				}
855 				/* Not a range, so just add the char */
856 				reg_addrange(preg, start, start);
857 			}
858 			regc(preg, '\0');
859 
860 			if (*pattern) {
861 				pattern++;
862 			}
863 			preg->regparse = pattern;
864 
865 			*flagp |= HASWIDTH|SIMPLE;
866 		}
867 		break;
868 	case '(':
869 		ret = reg(preg, 1, &flags);
870 		if (ret == 0)
871 			return 0;
872 		*flagp |= flags&(HASWIDTH|SPSTART);
873 		break;
874 	case '\0':
875 	case '|':
876 	case ')':
877 		preg->err = REG_ERR_INTERNAL;
878 		return 0;	/* Supposed to be caught earlier. */
879 	case '?':
880 	case '+':
881 	case '*':
882 	case '{':
883 		preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
884 		return 0;
885 	case '\\':
886 		ch = *preg->regparse++;
887 		switch (ch) {
888 		case '\0':
889 			preg->err = REG_ERR_INVALID_ESCAPE;
890 			return 0;
891 		case 'A':
892 			ret = regnode(preg, BOLX);
893 			break;
894 		case 'Z':
895 			ret = regnode(preg, EOLX);
896 			break;
897 		case '<':
898 		case 'm':
899 			ret = regnode(preg, WORDA);
900 			break;
901 		case '>':
902 		case 'M':
903 			ret = regnode(preg, WORDZ);
904 			break;
905 		case 'd':
906 		case 'D':
907 			ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
908 			reg_addrange(preg, '0', '9');
909 			regc(preg, '\0');
910 			*flagp |= HASWIDTH|SIMPLE;
911 			break;
912 		case 'w':
913 		case 'W':
914 			ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
915 			if ((preg->cflags & REG_ICASE) == 0) {
916 				reg_addrange(preg, 'a', 'z');
917 			}
918 			reg_addrange(preg, 'A', 'Z');
919 			reg_addrange(preg, '0', '9');
920 			reg_addrange(preg, '_', '_');
921 			regc(preg, '\0');
922 			*flagp |= HASWIDTH|SIMPLE;
923 			break;
924 		case 's':
925 		case 'S':
926 			ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
927 			reg_addrange_str(preg," \t\r\n\f\v");
928 			regc(preg, '\0');
929 			*flagp |= HASWIDTH|SIMPLE;
930 			break;
931 		/* FIXME: Someday handle \1, \2, ... */
932 		default:
933 			/* Handle general quoted chars in exact-match routine */
934 			/* Back up to include the backslash */
935 			preg->regparse--;
936 			goto de_fault;
937 		}
938 		break;
939 	de_fault:
940 	default: {
941 			/*
942 			 * Encode a string of characters to be matched exactly.
943 			 */
944 			int added = 0;
945 
946 			/* Back up to pick up the first char of interest */
947 			preg->regparse -= n;
948 
949 			ret = regnode(preg, EXACTLY);
950 
951 			/* Note that a META operator such as ? or * consumes the
952 			 * preceding char.
953 			 * Thus we must be careful to look ahead by 2 and add the
954 			 * last char as it's own EXACTLY if necessary
955 			 */
956 
957 			/* Until end of string or a META char is reached */
958 			while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
959 				n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
960 				if (ch == '\\' && preg->regparse[n]) {
961 					/* Non-trailing backslash.
962 					 * Is this a special escape, or a regular escape?
963 					 */
964 					if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
965 						/* A special escape. All done with EXACTLY */
966 						break;
967 					}
968 					/* Decode it. Note that we add the length for the escape
969 					 * sequence to the length for the backlash so we can skip
970 					 * the entire sequence, or not as required.
971 					 */
972 					n += reg_decode_escape(preg->regparse + n, &ch);
973 					if (ch == 0) {
974 						preg->err = REG_ERR_NULL_CHAR;
975 						return 0;
976 					}
977 				}
978 
979 				/* Now we have one char 'ch' of length 'n'.
980 				 * Check to see if the following char is a MULT
981 				 */
982 
983 				if (ISMULT(preg->regparse[n])) {
984 					/* Yes. But do we already have some EXACTLY chars? */
985 					if (added) {
986 						/* Yes, so return what we have and pick up the current char next time around */
987 						break;
988 					}
989 					/* No, so add this single char and finish */
990 					regc(preg, ch);
991 					added++;
992 					preg->regparse += n;
993 					break;
994 				}
995 
996 				/* No, so just add this char normally */
997 				regc(preg, ch);
998 				added++;
999 				preg->regparse += n;
1000 			}
1001 			regc(preg, '\0');
1002 
1003 			*flagp |= HASWIDTH;
1004 			if (added == 1)
1005 				*flagp |= SIMPLE;
1006 			break;
1007 		}
1008 		break;
1009 	}
1010 
1011 	return(ret);
1012 }
1013 
reg_grow(regex_t * preg,int n)1014 static void reg_grow(regex_t *preg, int n)
1015 {
1016 	if (preg->p + n >= preg->proglen) {
1017 		preg->proglen = (preg->p + n) * 2;
1018 		preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1019 	}
1020 }
1021 
1022 /*
1023  - regnode - emit a node
1024  */
1025 /* Location. */
regnode(regex_t * preg,int op)1026 static int regnode(regex_t *preg, int op)
1027 {
1028 	reg_grow(preg, 2);
1029 
1030 	/* The OP followed by a next pointer */
1031 	preg->program[preg->p++] = op;
1032 	preg->program[preg->p++] = 0;
1033 
1034 	/* Return the start of the node */
1035 	return preg->p - 2;
1036 }
1037 
1038 /*
1039  - regc - emit (if appropriate) a byte of code
1040  */
regc(regex_t * preg,int b)1041 static void regc(regex_t *preg, int b )
1042 {
1043 	reg_grow(preg, 1);
1044 	preg->program[preg->p++] = b;
1045 }
1046 
1047 /*
1048  - reginsert - insert an operator in front of already-emitted operand
1049  *
1050  * Means relocating the operand.
1051  * Returns the new location of the original operand.
1052  */
reginsert(regex_t * preg,int op,int size,int opnd)1053 static int reginsert(regex_t *preg, int op, int size, int opnd )
1054 {
1055 	reg_grow(preg, size);
1056 
1057 	/* Move everything from opnd up */
1058 	memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1059 	/* Zero out the new space */
1060 	memset(preg->program + opnd, 0, sizeof(int) * size);
1061 
1062 	preg->program[opnd] = op;
1063 
1064 	preg->p += size;
1065 
1066 	return opnd + size;
1067 }
1068 
1069 /*
1070  - regtail - set the next-pointer at the end of a node chain
1071  */
regtail(regex_t * preg,int p,int val)1072 static void regtail(regex_t *preg, int p, int val)
1073 {
1074 	int scan;
1075 	int temp;
1076 	int offset;
1077 
1078 	/* Find last node. */
1079 	scan = p;
1080 	for (;;) {
1081 		temp = regnext(preg, scan);
1082 		if (temp == 0)
1083 			break;
1084 		scan = temp;
1085 	}
1086 
1087 	if (OP(preg, scan) == BACK)
1088 		offset = scan - val;
1089 	else
1090 		offset = val - scan;
1091 
1092 	preg->program[scan + 1] = offset;
1093 }
1094 
1095 /*
1096  - regoptail - regtail on operand of first argument; nop if operandless
1097  */
1098 
regoptail(regex_t * preg,int p,int val)1099 static void regoptail(regex_t *preg, int p, int val )
1100 {
1101 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
1102 	if (p != 0 && OP(preg, p) == BRANCH) {
1103 		regtail(preg, OPERAND(p), val);
1104 	}
1105 }
1106 
1107 /*
1108  * regexec and friends
1109  */
1110 
1111 /*
1112  * Forwards.
1113  */
1114 static int regtry(regex_t *preg, const char *string );
1115 static int regmatch(regex_t *preg, int prog);
1116 static int regrepeat(regex_t *preg, int p, int max);
1117 
1118 /*
1119  - regexec - match a regexp against a string
1120  */
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1121 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1122 {
1123 	const char *s;
1124 	int scan;
1125 
1126 	/* Be paranoid... */
1127 	if (preg == NULL || preg->program == NULL || string == NULL) {
1128 		return REG_ERR_NULL_ARGUMENT;
1129 	}
1130 
1131 	/* Check validity of program. */
1132 	if (*preg->program != REG_MAGIC) {
1133 		return REG_ERR_CORRUPTED;
1134 	}
1135 
1136 #ifdef DEBUG
1137 	fprintf(stderr, "regexec: %s\n", string);
1138 	regdump(preg);
1139 #endif
1140 
1141 	preg->eflags = eflags;
1142 	preg->pmatch = pmatch;
1143 	preg->nmatch = nmatch;
1144 	preg->start = string;	/* All offsets are computed from here */
1145 
1146 	/* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1147 	for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1148 		int op = OP(preg, scan);
1149 		if (op == END)
1150 			break;
1151 		if (op == REPX || op == REPXMIN)
1152 			preg->program[scan + 4] = 0;
1153 	}
1154 
1155 	/* If there is a "must appear" string, look for it. */
1156 	if (preg->regmust != 0) {
1157 		s = string;
1158 		while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1159 			if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1160 				break;
1161 			}
1162 			s++;
1163 		}
1164 		if (s == NULL)	/* Not present. */
1165 			return REG_NOMATCH;
1166 	}
1167 
1168 	/* Mark beginning of line for ^ . */
1169 	preg->regbol = string;
1170 
1171 	/* Simplest case:  anchored match need be tried only once (maybe per line). */
1172 	if (preg->reganch) {
1173 		if (eflags & REG_NOTBOL) {
1174 			/* This is an anchored search, but not an BOL, so possibly skip to the next line */
1175 			goto nextline;
1176 		}
1177 		while (1) {
1178 			if (regtry(preg, string)) {
1179 				return REG_NOERROR;
1180 			}
1181 			if (*string) {
1182 nextline:
1183 				if (preg->cflags & REG_NEWLINE) {
1184 					/* Try the next anchor? */
1185 					string = strchr(string, '\n');
1186 					if (string) {
1187 						preg->regbol = ++string;
1188 						continue;
1189 					}
1190 				}
1191 			}
1192 			return REG_NOMATCH;
1193 		}
1194 	}
1195 
1196 	/* Messy cases:  unanchored match. */
1197 	s = string;
1198 	if (preg->regstart != '\0') {
1199 		/* We know what char it must start with. */
1200 		while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1201 			if (regtry(preg, s))
1202 				return REG_NOERROR;
1203 			s++;
1204 		}
1205 	}
1206 	else
1207 		/* We don't -- general case. */
1208 		while (1) {
1209 			if (regtry(preg, s))
1210 				return REG_NOERROR;
1211 			if (*s == '\0') {
1212 				break;
1213 			}
1214 			else {
1215 				int c;
1216 				s += utf8_tounicode(s, &c);
1217 			}
1218 		}
1219 
1220 	/* Failure. */
1221 	return REG_NOMATCH;
1222 }
1223 
1224 /*
1225  - regtry - try match at specific point
1226  */
1227 			/* 0 failure, 1 success */
regtry(regex_t * preg,const char * string)1228 static int regtry( regex_t *preg, const char *string )
1229 {
1230 	int i;
1231 
1232 	preg->reginput = string;
1233 
1234 	for (i = 0; i < preg->nmatch; i++) {
1235 		if (preg->pmatch) {
1236 			preg->pmatch[i].rm_so = -1;
1237 			preg->pmatch[i].rm_eo = -1;
1238 		}
1239 	}
1240 	if (regmatch(preg, 1)) {
1241 		if (preg->pmatch) {
1242 			preg->pmatch[0].rm_so = string - preg->start;
1243 			preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1244 		}
1245 		return(1);
1246 	} else
1247 		return(0);
1248 }
1249 
1250 /**
1251  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1252  *
1253  * If 'nocase' is non-zero, does a case-insensitive match.
1254  *
1255  * Returns -1 on not found.
1256  */
prefix_cmp(const int * prog,int proglen,const char * string,int nocase)1257 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1258 {
1259 	const char *s = string;
1260 	while (proglen && *s) {
1261 		int ch;
1262 		int n = reg_utf8_tounicode_case(s, &ch, nocase);
1263 		if (ch != *prog) {
1264 			return -1;
1265 		}
1266 		prog++;
1267 		s += n;
1268 		proglen--;
1269 	}
1270 	if (proglen == 0) {
1271 		return s - string;
1272 	}
1273 	return -1;
1274 }
1275 
1276 /**
1277  * Searchs for 'c' in the range 'range'.
1278  *
1279  * Returns 1 if found, or 0 if not.
1280  */
reg_range_find(const int * range,int c)1281 static int reg_range_find(const int *range, int c)
1282 {
1283 	while (*range) {
1284 		/*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1285 		if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1286 			return 1;
1287 		}
1288 		range += 2;
1289 	}
1290 	return 0;
1291 }
1292 
1293 /**
1294  * Search for the character 'c' in the utf-8 string 'string'.
1295  *
1296  * If 'nocase' is set, the 'string' is assumed to be uppercase
1297  * and 'c' is converted to uppercase before matching.
1298  *
1299  * Returns the byte position in the string where the 'c' was found, or
1300  * NULL if not found.
1301  */
str_find(const char * string,int c,int nocase)1302 static const char *str_find(const char *string, int c, int nocase)
1303 {
1304 	if (nocase) {
1305 		/* The "string" should already be converted to uppercase */
1306 		c = utf8_upper(c);
1307 	}
1308 	while (*string) {
1309 		int ch;
1310 		int n = reg_utf8_tounicode_case(string, &ch, nocase);
1311 		if (c == ch) {
1312 			return string;
1313 		}
1314 		string += n;
1315 	}
1316 	return NULL;
1317 }
1318 
1319 /**
1320  * Returns true if 'ch' is an end-of-line char.
1321  *
1322  * In REG_NEWLINE mode, \n is considered EOL in
1323  * addition to \0
1324  */
reg_iseol(regex_t * preg,int ch)1325 static int reg_iseol(regex_t *preg, int ch)
1326 {
1327 	if (preg->cflags & REG_NEWLINE) {
1328 		return ch == '\0' || ch == '\n';
1329 	}
1330 	else {
1331 		return ch == '\0';
1332 	}
1333 }
1334 
regmatchsimplerepeat(regex_t * preg,int scan,int matchmin)1335 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1336 {
1337 	int nextch = '\0';
1338 	const char *save;
1339 	int no;
1340 	int c;
1341 
1342 	int max = preg->program[scan + 2];
1343 	int min = preg->program[scan + 3];
1344 	int next = regnext(preg, scan);
1345 
1346 	/*
1347 	 * Lookahead to avoid useless match attempts
1348 	 * when we know what character comes next.
1349 	 */
1350 	if (OP(preg, next) == EXACTLY) {
1351 		nextch = preg->program[OPERAND(next)];
1352 	}
1353 	save = preg->reginput;
1354 	no = regrepeat(preg, scan + 5, max);
1355 	if (no < min) {
1356 		return 0;
1357 	}
1358 	if (matchmin) {
1359 		/* from min up to no */
1360 		max = no;
1361 		no = min;
1362 	}
1363 	/* else from no down to min */
1364 	while (1) {
1365 		if (matchmin) {
1366 			if (no > max) {
1367 				break;
1368 			}
1369 		}
1370 		else {
1371 			if (no < min) {
1372 				break;
1373 			}
1374 		}
1375 		preg->reginput = save + utf8_index(save, no);
1376 		reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1377 		/* If it could work, try it. */
1378 		if (reg_iseol(preg, nextch) || c == nextch) {
1379 			if (regmatch(preg, next)) {
1380 				return(1);
1381 			}
1382 		}
1383 		if (matchmin) {
1384 			/* Couldn't or didn't, add one more */
1385 			no++;
1386 		}
1387 		else {
1388 			/* Couldn't or didn't -- back up. */
1389 			no--;
1390 		}
1391 	}
1392 	return(0);
1393 }
1394 
regmatchrepeat(regex_t * preg,int scan,int matchmin)1395 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1396 {
1397 	int *scanpt = preg->program + scan;
1398 
1399 	int max = scanpt[2];
1400 	int min = scanpt[3];
1401 
1402 	/* Have we reached min? */
1403 	if (scanpt[4] < min) {
1404 		/* No, so get another one */
1405 		scanpt[4]++;
1406 		if (regmatch(preg, scan + 5)) {
1407 			return 1;
1408 		}
1409 		scanpt[4]--;
1410 		return 0;
1411 	}
1412 	if (scanpt[4] > max) {
1413 		return 0;
1414 	}
1415 
1416 	if (matchmin) {
1417 		/* minimal, so try other branch first */
1418 		if (regmatch(preg, regnext(preg, scan))) {
1419 			return 1;
1420 		}
1421 		/* No, so try one more */
1422 		scanpt[4]++;
1423 		if (regmatch(preg, scan + 5)) {
1424 			return 1;
1425 		}
1426 		scanpt[4]--;
1427 		return 0;
1428 	}
1429 	/* maximal, so try this branch again */
1430 	if (scanpt[4] < max) {
1431 		scanpt[4]++;
1432 		if (regmatch(preg, scan + 5)) {
1433 			return 1;
1434 		}
1435 		scanpt[4]--;
1436 	}
1437 	/* At this point we are at max with no match. Try the other branch */
1438 	return regmatch(preg, regnext(preg, scan));
1439 }
1440 
1441 /*
1442  - regmatch - main matching routine
1443  *
1444  * Conceptually the strategy is simple:  check to see whether the current
1445  * node matches, call self recursively to see whether the rest matches,
1446  * and then act accordingly.  In practice we make some effort to avoid
1447  * recursion, in particular by going through "ordinary" nodes (that don't
1448  * need to know whether the rest of the match failed) by a loop instead of
1449  * by recursion.
1450  */
1451 /* 0 failure, 1 success */
regmatch(regex_t * preg,int prog)1452 static int regmatch(regex_t *preg, int prog)
1453 {
1454 	int scan;	/* Current node. */
1455 	int next;		/* Next node. */
1456 	const char *save;
1457 
1458 	scan = prog;
1459 
1460 #ifdef DEBUG
1461 	if (scan != 0 && regnarrate)
1462 		fprintf(stderr, "%s(\n", regprop(scan));
1463 #endif
1464 	while (scan != 0) {
1465 		int n;
1466 		int c;
1467 #ifdef DEBUG
1468 		if (regnarrate) {
1469 			fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan)));	/* Where, what. */
1470 		}
1471 #endif
1472 		next = regnext(preg, scan);
1473 		n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1474 
1475 		switch (OP(preg, scan)) {
1476 		case BOLX:
1477 			if ((preg->eflags & REG_NOTBOL)) {
1478 				return(0);
1479 			}
1480 			/* Fall through */
1481 		case BOL:
1482 			if (preg->reginput != preg->regbol) {
1483 				return(0);
1484 			}
1485 			break;
1486 		case EOLX:
1487 			if (c != 0) {
1488 				/* For EOLX, only match real end of line, not newline */
1489 				return 0;
1490 			}
1491 			break;
1492 		case EOL:
1493 			if (!reg_iseol(preg, c)) {
1494 				return(0);
1495 			}
1496 			break;
1497 		case WORDA:
1498 			/* Must be looking at a letter, digit, or _ */
1499 			if ((!isalnum(UCHAR(c))) && c != '_')
1500 				return(0);
1501 			/* Prev must be BOL or nonword */
1502 			if (preg->reginput > preg->regbol &&
1503 				(isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1504 				return(0);
1505 			break;
1506 		case WORDZ:
1507 			/* Can't match at BOL */
1508 			if (preg->reginput > preg->regbol) {
1509 				/* Current must be EOL or nonword */
1510 				if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1511 					c = preg->reginput[-1];
1512 					/* Previous must be word */
1513 					if (isalnum(UCHAR(c)) || c == '_') {
1514 						break;
1515 					}
1516 				}
1517 			}
1518 			/* No */
1519 			return(0);
1520 
1521 		case ANY:
1522 			if (reg_iseol(preg, c))
1523 				return 0;
1524 			preg->reginput += n;
1525 			break;
1526 		case EXACTLY: {
1527 				int opnd;
1528 				int len;
1529 				int slen;
1530 
1531 				opnd = OPERAND(scan);
1532 				len = str_int_len(preg->program + opnd);
1533 
1534 				slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1535 				if (slen < 0) {
1536 					return(0);
1537 				}
1538 				preg->reginput += slen;
1539 			}
1540 			break;
1541 		case ANYOF:
1542 			if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1543 				return(0);
1544 			}
1545 			preg->reginput += n;
1546 			break;
1547 		case ANYBUT:
1548 			if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1549 				return(0);
1550 			}
1551 			preg->reginput += n;
1552 			break;
1553 		case NOTHING:
1554 			break;
1555 		case BACK:
1556 			break;
1557 		case BRANCH:
1558 			if (OP(preg, next) != BRANCH)		/* No choice. */
1559 				next = OPERAND(scan);	/* Avoid recursion. */
1560 			else {
1561 				do {
1562 					save = preg->reginput;
1563 					if (regmatch(preg, OPERAND(scan))) {
1564 						return(1);
1565 					}
1566 					preg->reginput = save;
1567 					scan = regnext(preg, scan);
1568 				} while (scan != 0 && OP(preg, scan) == BRANCH);
1569 				return(0);
1570 				/* NOTREACHED */
1571 			}
1572 			break;
1573 		case REP:
1574 		case REPMIN:
1575 			return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1576 
1577 		case REPX:
1578 		case REPXMIN:
1579 			return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1580 
1581 		case END:
1582 			return 1;	/* Success! */
1583 
1584 		case OPENNC:
1585 		case CLOSENC:
1586 			return regmatch(preg, next);
1587 
1588 		default:
1589 			if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1590 				save = preg->reginput;
1591 				if (regmatch(preg, next)) {
1592 					if (OP(preg, scan) < CLOSE) {
1593 						int no = OP(preg, scan) - OPEN;
1594 						if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) {
1595 							preg->pmatch[no].rm_so = save - preg->start;
1596 						}
1597 					}
1598 					else {
1599 						int no = OP(preg, scan) - CLOSE;
1600 						if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) {
1601 							preg->pmatch[no].rm_eo = save - preg->start;
1602 						}
1603 					}
1604 					return(1);
1605 				}
1606 				/* Restore input position after failure */
1607 				preg->reginput = save;
1608 				return(0);
1609 			}
1610 			return REG_ERR_INTERNAL;
1611 		}
1612 
1613 		scan = next;
1614 	}
1615 
1616 	/*
1617 	 * We get here only if there's trouble -- normally "case END" is
1618 	 * the terminating point.
1619 	 */
1620 	return REG_ERR_INTERNAL;
1621 }
1622 
1623 /*
1624  - regrepeat - repeatedly match something simple, report how many
1625  */
regrepeat(regex_t * preg,int p,int max)1626 static int regrepeat(regex_t *preg, int p, int max)
1627 {
1628 	int count = 0;
1629 	const char *scan;
1630 	int opnd;
1631 	int ch;
1632 	int n;
1633 
1634 	scan = preg->reginput;
1635 	opnd = OPERAND(p);
1636 	switch (OP(preg, p)) {
1637 	case ANY:
1638 		while (!reg_iseol(preg, *scan) && count < max) {
1639 			count++;
1640 			scan += utf8_charlen(*scan);
1641 		}
1642 		break;
1643 	case EXACTLY:
1644 		while (count < max) {
1645 			n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1646 			if (preg->program[opnd] != ch) {
1647 				break;
1648 			}
1649 			count++;
1650 			scan += n;
1651 		}
1652 		break;
1653 	case ANYOF:
1654 		while (count < max) {
1655 			n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1656 			if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1657 				break;
1658 			}
1659 			count++;
1660 			scan += n;
1661 		}
1662 		break;
1663 	case ANYBUT:
1664 		while (count < max) {
1665 			n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1666 			if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1667 				break;
1668 			}
1669 			count++;
1670 			scan += n;
1671 		}
1672 		break;
1673 	default:		/* Oh dear.  Called inappropriately. */
1674 		preg->err = REG_ERR_INTERNAL;
1675 		count = 0;	/* Best compromise. */
1676 		break;
1677 	}
1678 	preg->reginput = scan;
1679 
1680 	return(count);
1681 }
1682 
1683 /*
1684  - regnext - dig the "next" pointer out of a node
1685  */
regnext(regex_t * preg,int p)1686 static int regnext(regex_t *preg, int p )
1687 {
1688 	int offset;
1689 
1690 	offset = NEXT(preg, p);
1691 
1692 	if (offset == 0)
1693 		return 0;
1694 
1695 	if (OP(preg, p) == BACK)
1696 		return(p-offset);
1697 	else
1698 		return(p+offset);
1699 }
1700 
1701 /*
1702  - regopsize - returns the size of opcode + operands at 'p' in words
1703  */
regopsize(regex_t * preg,int p)1704 static int regopsize(regex_t *preg, int p )
1705 {
1706 	/* Almost all opcodes are 2 words, but some are more */
1707 	switch (OP(preg, p)) {
1708 		case REP:
1709 		case REPMIN:
1710 		case REPX:
1711 		case REPXMIN:
1712 			return 5;
1713 
1714 		case ANYOF:
1715 		case ANYBUT:
1716 		case EXACTLY: {
1717 			int s = p + 2;
1718 			while (preg->program[s++]) {
1719 			}
1720 			return s - p;
1721 		}
1722 	}
1723 	return 2;
1724 }
1725 
1726 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1727 
1728 /*
1729  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1730  */
regdump(regex_t * preg)1731 static void regdump(regex_t *preg)
1732 {
1733 	int s;
1734 	int op = EXACTLY;	/* Arbitrary non-END op. */
1735 	int next;
1736 	char buf[MAX_UTF8_LEN + 1];
1737 
1738 	int i;
1739 	for (i = 1; i < preg->p; i++) {
1740 		printf("%02x ", (unsigned char)preg->program[i]);
1741 		if (i % 16 == 0) {
1742 			printf("\n");
1743 		}
1744 	}
1745 	printf("\n");
1746 
1747 	s = 1;
1748 	while (op != END && s < preg->p) {	/* While that wasn't END last time... */
1749 		op = OP(preg, s);
1750 		printf("%3d: %s", s, regprop(op));	/* Where, what. */
1751 		next = regnext(preg, s);
1752 		if (next == 0)		/* Next ptr. */
1753 			printf("(0)");
1754 		else
1755 			printf("(%d)", next);
1756 		s += 2;
1757 		if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1758 			int max = preg->program[s];
1759 			int min = preg->program[s + 1];
1760 			if (max == 65535) {
1761 				printf("{%d,*}", min);
1762 			}
1763 			else {
1764 				printf("{%d,%d}", min, max);
1765 			}
1766 			printf(" %d", preg->program[s + 2]);
1767 			s += 3;
1768 		}
1769 		else if (op == ANYOF || op == ANYBUT) {
1770 			/* set of ranges */
1771 
1772 			while (preg->program[s]) {
1773 				int len = preg->program[s++];
1774 				int first = preg->program[s++];
1775 				buf[utf8_getchars(buf, first)] = 0;
1776 				printf("%s", buf);
1777 				if (len > 1) {
1778 					buf[utf8_getchars(buf, first + len - 1)] = 0;
1779 					printf("-%s", buf);
1780 				}
1781 			}
1782 			s++;
1783 		}
1784 		else if (op == EXACTLY) {
1785 			/* Literal string, where present. */
1786 
1787 			while (preg->program[s]) {
1788 				buf[utf8_getchars(buf, preg->program[s])] = 0;
1789 				printf("%s", buf);
1790 				s++;
1791 			}
1792 			s++;
1793 		}
1794 		putchar('\n');
1795 	}
1796 
1797 	if (op == END) {
1798 		/* Header fields of interest. */
1799 		if (preg->regstart) {
1800 			buf[utf8_getchars(buf, preg->regstart)] = 0;
1801 			printf("start '%s' ", buf);
1802 		}
1803 		if (preg->reganch)
1804 			printf("anchored ");
1805 		if (preg->regmust != 0) {
1806 			int i;
1807 			printf("must have:");
1808 			for (i = 0; i < preg->regmlen; i++) {
1809 				putchar(preg->program[preg->regmust + i]);
1810 			}
1811 			putchar('\n');
1812 		}
1813 	}
1814 	printf("\n");
1815 }
1816 
1817 /*
1818  - regprop - printable representation of opcode
1819  */
regprop(int op)1820 static const char *regprop( int op )
1821 {
1822 	static char buf[50];
1823 
1824 	switch (op) {
1825 	case BOL:
1826 		return "BOL";
1827 	case EOL:
1828 		return "EOL";
1829 	case BOLX:
1830 		return "BOLX";
1831 	case EOLX:
1832 		return "EOLX";
1833 	case ANY:
1834 		return "ANY";
1835 	case ANYOF:
1836 		return "ANYOF";
1837 	case ANYBUT:
1838 		return "ANYBUT";
1839 	case BRANCH:
1840 		return "BRANCH";
1841 	case EXACTLY:
1842 		return "EXACTLY";
1843 	case NOTHING:
1844 		return "NOTHING";
1845 	case BACK:
1846 		return "BACK";
1847 	case END:
1848 		return "END";
1849 	case REP:
1850 		return "REP";
1851 	case REPMIN:
1852 		return "REPMIN";
1853 	case REPX:
1854 		return "REPX";
1855 	case REPXMIN:
1856 		return "REPXMIN";
1857 	case WORDA:
1858 		return "WORDA";
1859 	case WORDZ:
1860 		return "WORDZ";
1861 	case OPENNC:
1862 		return "OPEN";
1863 	case CLOSENC:
1864 		return "CLOSE";
1865 	default:
1866 		if (op >= OPEN && op < CLOSE) {
1867 			snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1868 		}
1869 		else if (op >= CLOSE && op < CLOSE_END) {
1870 			snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1871 		}
1872 		else {
1873 			snprintf(buf, sizeof(buf), "?%d?\n", op);
1874 		}
1875 		return(buf);
1876 	}
1877 }
1878 #endif /* JIM_BOOTSTRAP */
1879 
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)1880 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1881 {
1882 	static const char *error_strings[] = {
1883 		"success",
1884 		"no match",
1885 		"bad pattern",
1886 		"null argument",
1887 		"unknown error",
1888 		"too big",
1889 		"out of memory",
1890 		"too many ()",
1891 		"parentheses () not balanced",
1892 		"braces {} not balanced",
1893 		"invalid repetition count(s)",
1894 		"extra characters",
1895 		"*+ of empty atom",
1896 		"nested count",
1897 		"internal error",
1898 		"count follows nothing",
1899 		"invalid escape \\ sequence",
1900 		"corrupted program",
1901 		"contains null char",
1902 		"brackets [] not balanced",
1903 	};
1904 	const char *err;
1905 
1906         (void)preg;
1907 
1908 	if (errcode < 0 || errcode >= REG_ERR_NUM) {
1909 		err = "Bad error code";
1910 	}
1911 	else {
1912 		err = error_strings[errcode];
1913 	}
1914 
1915 	return snprintf(errbuf, errbuf_size, "%s", err);
1916 }
1917 
regfree(regex_t * preg)1918 void regfree(regex_t *preg)
1919 {
1920 	free(preg->program);
1921 }
1922 
1923 #endif
1924