1 /*
2 * vi:se ts=8:
3 *
4 * regcomp and regexec -- regsub and regerror are elsewhere
5 *
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
8 *
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
12 *
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
16 *
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
19 *
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
43 *
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
49 *
50 *** THIS IS AN ALTERED VERSION. It was altered to offer POSIX-like
51 *** regular expression routines of regcomp/regexec/regerror/regfree,
52 *** with UTF-8 support, by NIIBE Yutaka <gniibe@fsij.org> on
53 *** 2020-02-14.
54 *
55 * Beware that some of this code is subtly aware of the way operator
56 * precedence is structured in regular expressions. Serious changes in
57 * regular-expression syntax might require a total rethink.
58 */
59
60 #if defined(JIM_REGEXP)
61 #include <stdio.h>
62 #include <ctype.h>
63 #include <stdlib.h>
64 #include <string.h>
65
66 #include "jimregexp.h"
67 #include "utf8.h"
68
69 #define UCHAR(c) ((unsigned char)(c))
70
71 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
72 #define REG_MAX_PAREN 100
73
74 /*
75 * Structure for regexp "program". This is essentially a linear encoding
76 * of a nondeterministic finite-state machine (aka syntax charts or
77 * "railroad normal form" in parsing technology). Each node is an opcode
78 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
79 * all nodes except BRANCH implement concatenation; a "next" pointer with
80 * a BRANCH on both ends of it is connecting two alternatives. (Here we
81 * have one of the subtle syntax dependencies: an individual BRANCH (as
82 * opposed to a collection of them) is never concatenated with anything
83 * because of operator precedence.) The operand of some types of node is
84 * a literal string; for others, it is a node leading into a sub-FSM. In
85 * particular, the operand of a BRANCH node is the first node of the branch.
86 * (NB this is *not* a tree structure: the tail of the branch connects
87 * to the thing following the set of BRANCHes.) The opcodes are:
88 */
89
90 /* definition number opnd? meaning */
91 #define END 0 /* no End of program. */
92 #define BOL 1 /* no Match "" at beginning of line. */
93 #define EOL 2 /* no Match "" at end of line. */
94 #define ANY 3 /* no Match any one character. */
95 #define ANYOF 4 /* str Match any character in this string. */
96 #define ANYBUT 5 /* str Match any character not in this string. */
97 #define BRANCH 6 /* node Match this alternative, or the next... */
98 #define BACK 7 /* no Match "", "next" ptr points backward. */
99 #define EXACTLY 8 /* str Match this string. */
100 #define NOTHING 9 /* no Match empty string. */
101 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
102 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
103 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
104 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
105 #define BOLX 14 /* no Match "" at beginning of input. */
106 #define EOLX 15 /* no Match "" at end of input. */
107 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
108 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
109
110 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
111 #define OPEN 1001 /* no Mark this point in input as start of #n. */
112 /* OPEN+1 is number 1, etc. */
113
114 /* must not be any other opts between OPEN and CLOSE */
115
116 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
117 #define CLOSE 2001 /* no Analogous to OPEN. */
118 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
119
120 /*
121 * The first word of the regexp internal "program" is actually this magic
122 * number; the start node begins in the second word.
123 */
124 #define REG_MAGIC 0xFADED00D
125
126 /*
127 * Opcode notes:
128 *
129 * BRANCH The set of branches constituting a single choice are hooked
130 * together with their "next" pointers, since precedence prevents
131 * anything being concatenated to any individual branch. The
132 * "next" pointer of the last BRANCH in a choice points to the
133 * thing following the whole choice. This is also where the
134 * final "next" pointer of each individual branch points; each
135 * branch starts with the operand node of a BRANCH node.
136 *
137 * BACK Normal "next" pointers all implicitly point forward; BACK
138 * exists to make loop structures possible.
139 *
140 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
141 * as either simple repeats (REP) or complex repeats (REPX).
142 * These opcodes include a "min" and "max" count after the opcode.
143 * This is followed by a fourth "current count" word that is
144 * only used by REPX, as it implements a recursive match.
145 * REPMIN and REPXMIN are identical except they implement minimal repeats.
146 *
147 * OPEN,CLOSE ...are numbered at compile time.
148 */
149
150 /*
151 * A node is one word of opcode followed by one word of "next" pointer.
152 * The "next" pointer value is a positive offset from the opcode of the node
153 * containing it.
154 * An operand, if any, simply follows the node. (Note that much of the
155 * code generation knows about this implicit relationship.)
156 */
157 #define OP(preg, p) (preg->program[p])
158 #define NEXT(preg, p) (preg->program[p + 1])
159 #define OPERAND(p) ((p) + 2)
160
161 /*
162 * See regmagic.h for one further detail of program structure.
163 */
164
165
166 /*
167 * Utility definitions.
168 */
169
170 #define FAIL(R,M) { (R)->err = (M); return (M); }
171 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
172 #define META "^$.[()|?{+*"
173
174 /*
175 * Flags to be passed up and down.
176 */
177 #define HASWIDTH 1 /* Known never to match null string. */
178 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 4 /* Starts with * or +. */
180 #define WORST 0 /* Worst case. */
181
182 #define MAX_REP_COUNT 1000000
183
184 /*
185 * Forward declarations for regcomp()'s friends.
186 */
187 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
188 static int regpiece(regex_t *preg, int *flagp );
189 static int regbranch(regex_t *preg, int *flagp );
190 static int regatom(regex_t *preg, int *flagp );
191 static int regnode(regex_t *preg, int op );
192 static int regnext(regex_t *preg, int p );
193 static void regc(regex_t *preg, int b );
194 static int reginsert(regex_t *preg, int op, int size, int opnd );
195 static void regtail(regex_t *preg, int p, int val);
196 static void regoptail(regex_t *preg, int p, int val );
197 static int regopsize(regex_t *preg, int p );
198
199 static int reg_range_find(const int *string, int c);
200 static const char *str_find(const char *string, int c, int nocase);
201 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
202
203 /*#define DEBUG*/
204 #ifdef DEBUG
205 static int regnarrate = 0;
206 static void regdump(regex_t *preg);
207 static const char *regprop( int op );
208 #endif
209
210
211 /**
212 * Returns the length of the null-terminated integer sequence.
213 */
str_int_len(const int * seq)214 static int str_int_len(const int *seq)
215 {
216 int n = 0;
217 while (*seq++) {
218 n++;
219 }
220 return n;
221 }
222
223 /*
224 - regcomp - compile a regular expression into internal code
225 *
226 * We can't allocate space until we know how big the compiled form will be,
227 * but we can't compile it (and thus know how big it is) until we've got a
228 * place to put the code. So we cheat: we compile it twice, once with code
229 * generation turned off and size counting turned on, and once "for real".
230 * This also means that we don't allocate space until we are sure that the
231 * thing really will compile successfully, and we never have to move the
232 * code and thus invalidate pointers into it. (Note that it has to be in
233 * one piece because free() must be able to free it all.)
234 *
235 * Beware that the optimization-preparation code in here knows about some
236 * of the structure of the compiled regexp.
237 */
regcomp(regex_t * preg,const char * exp,int cflags)238 int regcomp(regex_t *preg, const char *exp, int cflags)
239 {
240 int scan;
241 int longest;
242 unsigned len;
243 int flags;
244
245 #ifdef DEBUG
246 fprintf(stderr, "Compiling: '%s'\n", exp);
247 #endif
248 memset(preg, 0, sizeof(*preg));
249
250 if (exp == NULL)
251 FAIL(preg, REG_ERR_NULL_ARGUMENT);
252
253 /* First pass: determine size, legality. */
254 preg->cflags = cflags;
255 preg->regparse = exp;
256
257 /* Allocate space. */
258 preg->proglen = (strlen(exp) + 1) * 5;
259 preg->program = malloc(preg->proglen * sizeof(int));
260 if (preg->program == NULL)
261 FAIL(preg, REG_ERR_NOMEM);
262
263 /* Note that since we store a magic value as the first item in the program,
264 * program offsets will never be 0
265 */
266 regc(preg, REG_MAGIC);
267 if (reg(preg, 0, &flags) == 0) {
268 return preg->err;
269 }
270
271 /* Small enough for pointer-storage convention? */
272 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
273 FAIL(preg,REG_ERR_TOO_BIG);
274
275 /* Dig out information for optimizations. */
276 preg->regstart = 0; /* Worst-case defaults. */
277 preg->reganch = 0;
278 preg->regmust = 0;
279 preg->regmlen = 0;
280 scan = 1; /* First BRANCH. */
281 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
282 scan = OPERAND(scan);
283
284 /* Starting-point info. */
285 if (OP(preg, scan) == EXACTLY) {
286 preg->regstart = preg->program[OPERAND(scan)];
287 }
288 else if (OP(preg, scan) == BOL)
289 preg->reganch++;
290
291 /*
292 * If there's something expensive in the r.e., find the
293 * longest literal string that must appear and make it the
294 * regmust. Resolve ties in favor of later strings, since
295 * the regstart check works with the beginning of the r.e.
296 * and avoiding duplication strengthens checking. Not a
297 * strong reason, but sufficient in the absence of others.
298 */
299 if (flags&SPSTART) {
300 longest = 0;
301 len = 0;
302 for (; scan != 0; scan = regnext(preg, scan)) {
303 if (OP(preg, scan) == EXACTLY) {
304 int plen = str_int_len(preg->program + OPERAND(scan));
305 if (plen >= len) {
306 longest = OPERAND(scan);
307 len = plen;
308 }
309 }
310 }
311 preg->regmust = longest;
312 preg->regmlen = len;
313 }
314 }
315
316 #ifdef DEBUG
317 regdump(preg);
318 #endif
319
320 return 0;
321 }
322
323 /*
324 - reg - regular expression, i.e. main body or parenthesized thing
325 *
326 * Caller must absorb opening parenthesis.
327 *
328 * Combining parenthesis handling with the base level of regular expression
329 * is a trifle forced, but the need to tie the tails of the branches to what
330 * follows makes it hard to avoid.
331 */
reg(regex_t * preg,int paren,int * flagp)332 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
333 {
334 int ret;
335 int br;
336 int ender;
337 int parno = 0;
338 int flags;
339
340 *flagp = HASWIDTH; /* Tentatively. */
341
342 /* Make an OPEN node, if parenthesized. */
343 if (paren) {
344 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
345 /* non-capturing paren */
346 preg->regparse += 2;
347 parno = -1;
348 }
349 else {
350 parno = ++preg->re_nsub;
351 }
352 ret = regnode(preg, OPEN+parno);
353 } else
354 ret = 0;
355
356 /* Pick up the branches, linking them together. */
357 br = regbranch(preg, &flags);
358 if (br == 0)
359 return 0;
360 if (ret != 0)
361 regtail(preg, ret, br); /* OPEN -> first. */
362 else
363 ret = br;
364 if (!(flags&HASWIDTH))
365 *flagp &= ~HASWIDTH;
366 *flagp |= flags&SPSTART;
367 while (*preg->regparse == '|') {
368 preg->regparse++;
369 br = regbranch(preg, &flags);
370 if (br == 0)
371 return 0;
372 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
373 if (!(flags&HASWIDTH))
374 *flagp &= ~HASWIDTH;
375 *flagp |= flags&SPSTART;
376 }
377
378 /* Make a closing node, and hook it on the end. */
379 ender = regnode(preg, (paren) ? CLOSE+parno : END);
380 regtail(preg, ret, ender);
381
382 /* Hook the tails of the branches to the closing node. */
383 for (br = ret; br != 0; br = regnext(preg, br))
384 regoptail(preg, br, ender);
385
386 /* Check for proper termination. */
387 if (paren && *preg->regparse++ != ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else if (!paren && *preg->regparse != '\0') {
391 if (*preg->regparse == ')') {
392 preg->err = REG_ERR_UNMATCHED_PAREN;
393 return 0;
394 } else {
395 preg->err = REG_ERR_JUNK_ON_END;
396 return 0;
397 }
398 }
399
400 return(ret);
401 }
402
403 /*
404 - regbranch - one alternative of an | operator
405 *
406 * Implements the concatenation operator.
407 */
regbranch(regex_t * preg,int * flagp)408 static int regbranch(regex_t *preg, int *flagp )
409 {
410 int ret;
411 int chain;
412 int latest;
413 int flags;
414
415 *flagp = WORST; /* Tentatively. */
416
417 ret = regnode(preg, BRANCH);
418 chain = 0;
419 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
420 *preg->regparse != '|') {
421 latest = regpiece(preg, &flags);
422 if (latest == 0)
423 return 0;
424 *flagp |= flags&HASWIDTH;
425 if (chain == 0) {/* First piece. */
426 *flagp |= flags&SPSTART;
427 }
428 else {
429 regtail(preg, chain, latest);
430 }
431 chain = latest;
432 }
433 if (chain == 0) /* Loop ran zero times. */
434 (void) regnode(preg, NOTHING);
435
436 return(ret);
437 }
438
439 /*
440 - regpiece - something followed by possible [*+?]
441 *
442 * Note that the branching code sequences used for ? and the general cases
443 * of * and + are somewhat optimized: they use the same NOTHING node as
444 * both the endmarker for their branch list and the body of the last branch.
445 * It might seem that this node could be dispensed with entirely, but the
446 * endmarker role is not redundant.
447 */
regpiece(regex_t * preg,int * flagp)448 static int regpiece(regex_t *preg, int *flagp)
449 {
450 int ret;
451 char op;
452 int next;
453 int flags;
454 int min;
455 int max;
456
457 ret = regatom(preg, &flags);
458 if (ret == 0)
459 return 0;
460
461 op = *preg->regparse;
462 if (!ISMULT(op)) {
463 *flagp = flags;
464 return(ret);
465 }
466
467 if (!(flags&HASWIDTH) && op != '?') {
468 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
469 return 0;
470 }
471
472 /* Handle braces (counted repetition) by expansion */
473 if (op == '{') {
474 char *end;
475
476 min = strtoul(preg->regparse + 1, &end, 10);
477 if (end == preg->regparse + 1) {
478 preg->err = REG_ERR_BAD_COUNT;
479 return 0;
480 }
481 if (*end == '}') {
482 max = min;
483 }
484 else if (*end == '\0') {
485 preg->err = REG_ERR_UNMATCHED_BRACES;
486 return 0;
487 }
488 else {
489 preg->regparse = end;
490 max = strtoul(preg->regparse + 1, &end, 10);
491 if (*end != '}') {
492 preg->err = REG_ERR_UNMATCHED_BRACES;
493 return 0;
494 }
495 }
496 if (end == preg->regparse + 1) {
497 max = MAX_REP_COUNT;
498 }
499 else if (max < min || max >= 100) {
500 preg->err = REG_ERR_BAD_COUNT;
501 return 0;
502 }
503 if (min >= 100) {
504 preg->err = REG_ERR_BAD_COUNT;
505 return 0;
506 }
507
508 preg->regparse = strchr(preg->regparse, '}');
509 }
510 else {
511 min = (op == '+');
512 max = (op == '?' ? 1 : MAX_REP_COUNT);
513 }
514
515 if (preg->regparse[1] == '?') {
516 preg->regparse++;
517 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
518 }
519 else {
520 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
521 }
522 preg->program[ret + 2] = max;
523 preg->program[ret + 3] = min;
524 preg->program[ret + 4] = 0;
525
526 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
527
528 if (!(flags & SIMPLE)) {
529 int back = regnode(preg, BACK);
530 regtail(preg, back, ret);
531 regtail(preg, next, back);
532 }
533
534 preg->regparse++;
535 if (ISMULT(*preg->regparse)) {
536 preg->err = REG_ERR_NESTED_COUNT;
537 return 0;
538 }
539
540 return ret;
541 }
542
543 /**
544 * Add all characters in the inclusive range between lower and upper.
545 *
546 * Handles a swapped range (upper < lower).
547 */
reg_addrange(regex_t * preg,int lower,int upper)548 static void reg_addrange(regex_t *preg, int lower, int upper)
549 {
550 if (lower > upper) {
551 reg_addrange(preg, upper, lower);
552 }
553 /* Add a range as length, start */
554 regc(preg, upper - lower + 1);
555 regc(preg, lower);
556 }
557
558 /**
559 * Add a null-terminated literal string as a set of ranges.
560 */
reg_addrange_str(regex_t * preg,const char * str)561 static void reg_addrange_str(regex_t *preg, const char *str)
562 {
563 while (*str) {
564 reg_addrange(preg, *str, *str);
565 str++;
566 }
567 }
568
569 /**
570 * Extracts the next unicode char from utf8.
571 *
572 * If 'upper' is set, converts the char to uppercase.
573 */
reg_utf8_tounicode_case(const char * s,int * uc,int upper)574 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
575 {
576 int l = utf8_tounicode(s, uc);
577 if (upper) {
578 *uc = utf8_upper(*uc);
579 }
580 return l;
581 }
582
583 /**
584 * Converts a hex digit to decimal.
585 *
586 * Returns -1 for an invalid hex digit.
587 */
hexdigitval(int c)588 static int hexdigitval(int c)
589 {
590 if (c >= '0' && c <= '9')
591 return c - '0';
592 if (c >= 'a' && c <= 'f')
593 return c - 'a' + 10;
594 if (c >= 'A' && c <= 'F')
595 return c - 'A' + 10;
596 return -1;
597 }
598
599 /**
600 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
601 *
602 * Returns the number of hex digits parsed.
603 * If there are no hex digits, returns 0 and stores nothing.
604 */
parse_hex(const char * s,int n,int * uc)605 static int parse_hex(const char *s, int n, int *uc)
606 {
607 int val = 0;
608 int k;
609
610 for (k = 0; k < n; k++) {
611 int c = hexdigitval(*s++);
612 if (c == -1) {
613 break;
614 }
615 val = (val << 4) | c;
616 }
617 if (k) {
618 *uc = val;
619 }
620 return k;
621 }
622
623 /**
624 * Call for chars after a backlash to decode the escape sequence.
625 *
626 * Stores the result in *ch.
627 *
628 * Returns the number of bytes consumed.
629 */
reg_decode_escape(const char * s,int * ch)630 static int reg_decode_escape(const char *s, int *ch)
631 {
632 int n;
633 const char *s0 = s;
634
635 *ch = *s++;
636
637 switch (*ch) {
638 case 'b': *ch = '\b'; break;
639 case 'e': *ch = 27; break;
640 case 'f': *ch = '\f'; break;
641 case 'n': *ch = '\n'; break;
642 case 'r': *ch = '\r'; break;
643 case 't': *ch = '\t'; break;
644 case 'v': *ch = '\v'; break;
645 case 'u':
646 if (*s == '{') {
647 /* Expect \u{NNNN} */
648 n = parse_hex(s + 1, 6, ch);
649 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
650 s += n + 2;
651 }
652 else {
653 /* Invalid, so just treat as an escaped 'u' */
654 *ch = 'u';
655 }
656 }
657 else if ((n = parse_hex(s, 4, ch)) > 0) {
658 s += n;
659 }
660 break;
661 case 'U':
662 if ((n = parse_hex(s, 8, ch)) > 0) {
663 s += n;
664 }
665 break;
666 case 'x':
667 if ((n = parse_hex(s, 2, ch)) > 0) {
668 s += n;
669 }
670 break;
671 case '\0':
672 s--;
673 *ch = '\\';
674 break;
675 }
676 return s - s0;
677 }
678
679 /*
680 - regatom - the lowest level
681 *
682 * Optimization: gobbles an entire sequence of ordinary characters so that
683 * it can turn them into a single node, which is smaller to store and
684 * faster to run. Backslashed characters are exceptions, each becoming a
685 * separate node; the code is simpler that way and it's not worth fixing.
686 */
regatom(regex_t * preg,int * flagp)687 static int regatom(regex_t *preg, int *flagp)
688 {
689 int ret;
690 int flags;
691 int nocase = (preg->cflags & REG_ICASE);
692
693 int ch;
694 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
695
696 *flagp = WORST; /* Tentatively. */
697
698 preg->regparse += n;
699 switch (ch) {
700 /* FIXME: these chars only have meaning at beg/end of pat? */
701 case '^':
702 ret = regnode(preg, BOL);
703 break;
704 case '$':
705 ret = regnode(preg, EOL);
706 break;
707 case '.':
708 ret = regnode(preg, ANY);
709 *flagp |= HASWIDTH|SIMPLE;
710 break;
711 case '[': {
712 const char *pattern = preg->regparse;
713
714 if (*pattern == '^') { /* Complement of range. */
715 ret = regnode(preg, ANYBUT);
716 pattern++;
717 } else
718 ret = regnode(preg, ANYOF);
719
720 /* Special case. If the first char is ']' or '-', it is part of the set */
721 if (*pattern == ']' || *pattern == '-') {
722 reg_addrange(preg, *pattern, *pattern);
723 pattern++;
724 }
725
726 while (*pattern != ']') {
727 /* Is this a range? a-z */
728 int start;
729 int end;
730
731 enum {
732 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
733 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
734 CC_NUM
735 };
736 int cc;
737
738 if (!*pattern) {
739 preg->err = REG_ERR_UNMATCHED_BRACKET;
740 return 0;
741 }
742
743 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
744 if (start == '\\') {
745 /* First check for class shorthand escapes */
746 switch (*pattern) {
747 case 's':
748 pattern++;
749 cc = CC_SPACE;
750 goto cc_switch;
751 case 'd':
752 pattern++;
753 cc = CC_DIGIT;
754 goto cc_switch;
755 case 'w':
756 pattern++;
757 reg_addrange(preg, '_', '_');
758 cc = CC_ALNUM;
759 goto cc_switch;
760 }
761 pattern += reg_decode_escape(pattern, &start);
762 if (start == 0) {
763 preg->err = REG_ERR_NULL_CHAR;
764 return 0;
765 }
766 if (start == '\\' && *pattern == 0) {
767 preg->err = REG_ERR_INVALID_ESCAPE;
768 return 0;
769 }
770 }
771 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
772 /* skip '-' */
773 pattern += utf8_tounicode(pattern, &end);
774 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
775 if (end == '\\') {
776 pattern += reg_decode_escape(pattern, &end);
777 if (end == 0) {
778 preg->err = REG_ERR_NULL_CHAR;
779 return 0;
780 }
781 if (start == '\\' && *pattern == 0) {
782 preg->err = REG_ERR_INVALID_ESCAPE;
783 return 0;
784 }
785 }
786
787 reg_addrange(preg, start, end);
788 continue;
789 }
790 if (start == '[' && pattern[0] == ':') {
791 static const char *character_class[] = {
792 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
793 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
794 };
795
796 for (cc = 0; cc < CC_NUM; cc++) {
797 n = strlen(character_class[cc]);
798 if (strncmp(pattern, character_class[cc], n) == 0) {
799 /* Found a character class */
800 pattern += n + 1;
801 break;
802 }
803 }
804 if (cc != CC_NUM) {
805 cc_switch:
806 switch (cc) {
807 case CC_ALNUM:
808 reg_addrange(preg, '0', '9');
809 /* Fall through */
810 case CC_ALPHA:
811 if ((preg->cflags & REG_ICASE) == 0) {
812 reg_addrange(preg, 'a', 'z');
813 }
814 reg_addrange(preg, 'A', 'Z');
815 break;
816 case CC_SPACE:
817 reg_addrange_str(preg, " \t\r\n\f\v");
818 break;
819 case CC_BLANK:
820 reg_addrange_str(preg, " \t");
821 break;
822 case CC_UPPER:
823 reg_addrange(preg, 'A', 'Z');
824 break;
825 case CC_LOWER:
826 reg_addrange(preg, 'a', 'z');
827 break;
828 case CC_XDIGIT:
829 reg_addrange(preg, 'a', 'f');
830 reg_addrange(preg, 'A', 'F');
831 /* Fall through */
832 case CC_DIGIT:
833 reg_addrange(preg, '0', '9');
834 break;
835 case CC_CNTRL:
836 reg_addrange(preg, 0, 31);
837 reg_addrange(preg, 127, 127);
838 break;
839 case CC_PRINT:
840 reg_addrange(preg, ' ', '~');
841 break;
842 case CC_GRAPH:
843 reg_addrange(preg, '!', '~');
844 break;
845 case CC_PUNCT:
846 reg_addrange(preg, '!', '/');
847 reg_addrange(preg, ':', '@');
848 reg_addrange(preg, '[', '`');
849 reg_addrange(preg, '{', '~');
850 break;
851 }
852 continue;
853 }
854 }
855 /* Not a range, so just add the char */
856 reg_addrange(preg, start, start);
857 }
858 regc(preg, '\0');
859
860 if (*pattern) {
861 pattern++;
862 }
863 preg->regparse = pattern;
864
865 *flagp |= HASWIDTH|SIMPLE;
866 }
867 break;
868 case '(':
869 ret = reg(preg, 1, &flags);
870 if (ret == 0)
871 return 0;
872 *flagp |= flags&(HASWIDTH|SPSTART);
873 break;
874 case '\0':
875 case '|':
876 case ')':
877 preg->err = REG_ERR_INTERNAL;
878 return 0; /* Supposed to be caught earlier. */
879 case '?':
880 case '+':
881 case '*':
882 case '{':
883 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
884 return 0;
885 case '\\':
886 ch = *preg->regparse++;
887 switch (ch) {
888 case '\0':
889 preg->err = REG_ERR_INVALID_ESCAPE;
890 return 0;
891 case 'A':
892 ret = regnode(preg, BOLX);
893 break;
894 case 'Z':
895 ret = regnode(preg, EOLX);
896 break;
897 case '<':
898 case 'm':
899 ret = regnode(preg, WORDA);
900 break;
901 case '>':
902 case 'M':
903 ret = regnode(preg, WORDZ);
904 break;
905 case 'd':
906 case 'D':
907 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
908 reg_addrange(preg, '0', '9');
909 regc(preg, '\0');
910 *flagp |= HASWIDTH|SIMPLE;
911 break;
912 case 'w':
913 case 'W':
914 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
915 if ((preg->cflags & REG_ICASE) == 0) {
916 reg_addrange(preg, 'a', 'z');
917 }
918 reg_addrange(preg, 'A', 'Z');
919 reg_addrange(preg, '0', '9');
920 reg_addrange(preg, '_', '_');
921 regc(preg, '\0');
922 *flagp |= HASWIDTH|SIMPLE;
923 break;
924 case 's':
925 case 'S':
926 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
927 reg_addrange_str(preg," \t\r\n\f\v");
928 regc(preg, '\0');
929 *flagp |= HASWIDTH|SIMPLE;
930 break;
931 /* FIXME: Someday handle \1, \2, ... */
932 default:
933 /* Handle general quoted chars in exact-match routine */
934 /* Back up to include the backslash */
935 preg->regparse--;
936 goto de_fault;
937 }
938 break;
939 de_fault:
940 default: {
941 /*
942 * Encode a string of characters to be matched exactly.
943 */
944 int added = 0;
945
946 /* Back up to pick up the first char of interest */
947 preg->regparse -= n;
948
949 ret = regnode(preg, EXACTLY);
950
951 /* Note that a META operator such as ? or * consumes the
952 * preceding char.
953 * Thus we must be careful to look ahead by 2 and add the
954 * last char as it's own EXACTLY if necessary
955 */
956
957 /* Until end of string or a META char is reached */
958 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
959 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
960 if (ch == '\\' && preg->regparse[n]) {
961 /* Non-trailing backslash.
962 * Is this a special escape, or a regular escape?
963 */
964 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
965 /* A special escape. All done with EXACTLY */
966 break;
967 }
968 /* Decode it. Note that we add the length for the escape
969 * sequence to the length for the backlash so we can skip
970 * the entire sequence, or not as required.
971 */
972 n += reg_decode_escape(preg->regparse + n, &ch);
973 if (ch == 0) {
974 preg->err = REG_ERR_NULL_CHAR;
975 return 0;
976 }
977 }
978
979 /* Now we have one char 'ch' of length 'n'.
980 * Check to see if the following char is a MULT
981 */
982
983 if (ISMULT(preg->regparse[n])) {
984 /* Yes. But do we already have some EXACTLY chars? */
985 if (added) {
986 /* Yes, so return what we have and pick up the current char next time around */
987 break;
988 }
989 /* No, so add this single char and finish */
990 regc(preg, ch);
991 added++;
992 preg->regparse += n;
993 break;
994 }
995
996 /* No, so just add this char normally */
997 regc(preg, ch);
998 added++;
999 preg->regparse += n;
1000 }
1001 regc(preg, '\0');
1002
1003 *flagp |= HASWIDTH;
1004 if (added == 1)
1005 *flagp |= SIMPLE;
1006 break;
1007 }
1008 break;
1009 }
1010
1011 return(ret);
1012 }
1013
reg_grow(regex_t * preg,int n)1014 static void reg_grow(regex_t *preg, int n)
1015 {
1016 if (preg->p + n >= preg->proglen) {
1017 preg->proglen = (preg->p + n) * 2;
1018 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1019 }
1020 }
1021
1022 /*
1023 - regnode - emit a node
1024 */
1025 /* Location. */
regnode(regex_t * preg,int op)1026 static int regnode(regex_t *preg, int op)
1027 {
1028 reg_grow(preg, 2);
1029
1030 /* The OP followed by a next pointer */
1031 preg->program[preg->p++] = op;
1032 preg->program[preg->p++] = 0;
1033
1034 /* Return the start of the node */
1035 return preg->p - 2;
1036 }
1037
1038 /*
1039 - regc - emit (if appropriate) a byte of code
1040 */
regc(regex_t * preg,int b)1041 static void regc(regex_t *preg, int b )
1042 {
1043 reg_grow(preg, 1);
1044 preg->program[preg->p++] = b;
1045 }
1046
1047 /*
1048 - reginsert - insert an operator in front of already-emitted operand
1049 *
1050 * Means relocating the operand.
1051 * Returns the new location of the original operand.
1052 */
reginsert(regex_t * preg,int op,int size,int opnd)1053 static int reginsert(regex_t *preg, int op, int size, int opnd )
1054 {
1055 reg_grow(preg, size);
1056
1057 /* Move everything from opnd up */
1058 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1059 /* Zero out the new space */
1060 memset(preg->program + opnd, 0, sizeof(int) * size);
1061
1062 preg->program[opnd] = op;
1063
1064 preg->p += size;
1065
1066 return opnd + size;
1067 }
1068
1069 /*
1070 - regtail - set the next-pointer at the end of a node chain
1071 */
regtail(regex_t * preg,int p,int val)1072 static void regtail(regex_t *preg, int p, int val)
1073 {
1074 int scan;
1075 int temp;
1076 int offset;
1077
1078 /* Find last node. */
1079 scan = p;
1080 for (;;) {
1081 temp = regnext(preg, scan);
1082 if (temp == 0)
1083 break;
1084 scan = temp;
1085 }
1086
1087 if (OP(preg, scan) == BACK)
1088 offset = scan - val;
1089 else
1090 offset = val - scan;
1091
1092 preg->program[scan + 1] = offset;
1093 }
1094
1095 /*
1096 - regoptail - regtail on operand of first argument; nop if operandless
1097 */
1098
regoptail(regex_t * preg,int p,int val)1099 static void regoptail(regex_t *preg, int p, int val )
1100 {
1101 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1102 if (p != 0 && OP(preg, p) == BRANCH) {
1103 regtail(preg, OPERAND(p), val);
1104 }
1105 }
1106
1107 /*
1108 * regexec and friends
1109 */
1110
1111 /*
1112 * Forwards.
1113 */
1114 static int regtry(regex_t *preg, const char *string );
1115 static int regmatch(regex_t *preg, int prog);
1116 static int regrepeat(regex_t *preg, int p, int max);
1117
1118 /*
1119 - regexec - match a regexp against a string
1120 */
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1121 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1122 {
1123 const char *s;
1124 int scan;
1125
1126 /* Be paranoid... */
1127 if (preg == NULL || preg->program == NULL || string == NULL) {
1128 return REG_ERR_NULL_ARGUMENT;
1129 }
1130
1131 /* Check validity of program. */
1132 if (*preg->program != REG_MAGIC) {
1133 return REG_ERR_CORRUPTED;
1134 }
1135
1136 #ifdef DEBUG
1137 fprintf(stderr, "regexec: %s\n", string);
1138 regdump(preg);
1139 #endif
1140
1141 preg->eflags = eflags;
1142 preg->pmatch = pmatch;
1143 preg->nmatch = nmatch;
1144 preg->start = string; /* All offsets are computed from here */
1145
1146 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1147 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1148 int op = OP(preg, scan);
1149 if (op == END)
1150 break;
1151 if (op == REPX || op == REPXMIN)
1152 preg->program[scan + 4] = 0;
1153 }
1154
1155 /* If there is a "must appear" string, look for it. */
1156 if (preg->regmust != 0) {
1157 s = string;
1158 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1159 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1160 break;
1161 }
1162 s++;
1163 }
1164 if (s == NULL) /* Not present. */
1165 return REG_NOMATCH;
1166 }
1167
1168 /* Mark beginning of line for ^ . */
1169 preg->regbol = string;
1170
1171 /* Simplest case: anchored match need be tried only once (maybe per line). */
1172 if (preg->reganch) {
1173 if (eflags & REG_NOTBOL) {
1174 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1175 goto nextline;
1176 }
1177 while (1) {
1178 if (regtry(preg, string)) {
1179 return REG_NOERROR;
1180 }
1181 if (*string) {
1182 nextline:
1183 if (preg->cflags & REG_NEWLINE) {
1184 /* Try the next anchor? */
1185 string = strchr(string, '\n');
1186 if (string) {
1187 preg->regbol = ++string;
1188 continue;
1189 }
1190 }
1191 }
1192 return REG_NOMATCH;
1193 }
1194 }
1195
1196 /* Messy cases: unanchored match. */
1197 s = string;
1198 if (preg->regstart != '\0') {
1199 /* We know what char it must start with. */
1200 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1201 if (regtry(preg, s))
1202 return REG_NOERROR;
1203 s++;
1204 }
1205 }
1206 else
1207 /* We don't -- general case. */
1208 while (1) {
1209 if (regtry(preg, s))
1210 return REG_NOERROR;
1211 if (*s == '\0') {
1212 break;
1213 }
1214 else {
1215 int c;
1216 s += utf8_tounicode(s, &c);
1217 }
1218 }
1219
1220 /* Failure. */
1221 return REG_NOMATCH;
1222 }
1223
1224 /*
1225 - regtry - try match at specific point
1226 */
1227 /* 0 failure, 1 success */
regtry(regex_t * preg,const char * string)1228 static int regtry( regex_t *preg, const char *string )
1229 {
1230 int i;
1231
1232 preg->reginput = string;
1233
1234 for (i = 0; i < preg->nmatch; i++) {
1235 if (preg->pmatch) {
1236 preg->pmatch[i].rm_so = -1;
1237 preg->pmatch[i].rm_eo = -1;
1238 }
1239 }
1240 if (regmatch(preg, 1)) {
1241 if (preg->pmatch) {
1242 preg->pmatch[0].rm_so = string - preg->start;
1243 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1244 }
1245 return(1);
1246 } else
1247 return(0);
1248 }
1249
1250 /**
1251 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1252 *
1253 * If 'nocase' is non-zero, does a case-insensitive match.
1254 *
1255 * Returns -1 on not found.
1256 */
prefix_cmp(const int * prog,int proglen,const char * string,int nocase)1257 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1258 {
1259 const char *s = string;
1260 while (proglen && *s) {
1261 int ch;
1262 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1263 if (ch != *prog) {
1264 return -1;
1265 }
1266 prog++;
1267 s += n;
1268 proglen--;
1269 }
1270 if (proglen == 0) {
1271 return s - string;
1272 }
1273 return -1;
1274 }
1275
1276 /**
1277 * Searchs for 'c' in the range 'range'.
1278 *
1279 * Returns 1 if found, or 0 if not.
1280 */
reg_range_find(const int * range,int c)1281 static int reg_range_find(const int *range, int c)
1282 {
1283 while (*range) {
1284 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1285 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1286 return 1;
1287 }
1288 range += 2;
1289 }
1290 return 0;
1291 }
1292
1293 /**
1294 * Search for the character 'c' in the utf-8 string 'string'.
1295 *
1296 * If 'nocase' is set, the 'string' is assumed to be uppercase
1297 * and 'c' is converted to uppercase before matching.
1298 *
1299 * Returns the byte position in the string where the 'c' was found, or
1300 * NULL if not found.
1301 */
str_find(const char * string,int c,int nocase)1302 static const char *str_find(const char *string, int c, int nocase)
1303 {
1304 if (nocase) {
1305 /* The "string" should already be converted to uppercase */
1306 c = utf8_upper(c);
1307 }
1308 while (*string) {
1309 int ch;
1310 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1311 if (c == ch) {
1312 return string;
1313 }
1314 string += n;
1315 }
1316 return NULL;
1317 }
1318
1319 /**
1320 * Returns true if 'ch' is an end-of-line char.
1321 *
1322 * In REG_NEWLINE mode, \n is considered EOL in
1323 * addition to \0
1324 */
reg_iseol(regex_t * preg,int ch)1325 static int reg_iseol(regex_t *preg, int ch)
1326 {
1327 if (preg->cflags & REG_NEWLINE) {
1328 return ch == '\0' || ch == '\n';
1329 }
1330 else {
1331 return ch == '\0';
1332 }
1333 }
1334
regmatchsimplerepeat(regex_t * preg,int scan,int matchmin)1335 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1336 {
1337 int nextch = '\0';
1338 const char *save;
1339 int no;
1340 int c;
1341
1342 int max = preg->program[scan + 2];
1343 int min = preg->program[scan + 3];
1344 int next = regnext(preg, scan);
1345
1346 /*
1347 * Lookahead to avoid useless match attempts
1348 * when we know what character comes next.
1349 */
1350 if (OP(preg, next) == EXACTLY) {
1351 nextch = preg->program[OPERAND(next)];
1352 }
1353 save = preg->reginput;
1354 no = regrepeat(preg, scan + 5, max);
1355 if (no < min) {
1356 return 0;
1357 }
1358 if (matchmin) {
1359 /* from min up to no */
1360 max = no;
1361 no = min;
1362 }
1363 /* else from no down to min */
1364 while (1) {
1365 if (matchmin) {
1366 if (no > max) {
1367 break;
1368 }
1369 }
1370 else {
1371 if (no < min) {
1372 break;
1373 }
1374 }
1375 preg->reginput = save + utf8_index(save, no);
1376 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1377 /* If it could work, try it. */
1378 if (reg_iseol(preg, nextch) || c == nextch) {
1379 if (regmatch(preg, next)) {
1380 return(1);
1381 }
1382 }
1383 if (matchmin) {
1384 /* Couldn't or didn't, add one more */
1385 no++;
1386 }
1387 else {
1388 /* Couldn't or didn't -- back up. */
1389 no--;
1390 }
1391 }
1392 return(0);
1393 }
1394
regmatchrepeat(regex_t * preg,int scan,int matchmin)1395 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1396 {
1397 int *scanpt = preg->program + scan;
1398
1399 int max = scanpt[2];
1400 int min = scanpt[3];
1401
1402 /* Have we reached min? */
1403 if (scanpt[4] < min) {
1404 /* No, so get another one */
1405 scanpt[4]++;
1406 if (regmatch(preg, scan + 5)) {
1407 return 1;
1408 }
1409 scanpt[4]--;
1410 return 0;
1411 }
1412 if (scanpt[4] > max) {
1413 return 0;
1414 }
1415
1416 if (matchmin) {
1417 /* minimal, so try other branch first */
1418 if (regmatch(preg, regnext(preg, scan))) {
1419 return 1;
1420 }
1421 /* No, so try one more */
1422 scanpt[4]++;
1423 if (regmatch(preg, scan + 5)) {
1424 return 1;
1425 }
1426 scanpt[4]--;
1427 return 0;
1428 }
1429 /* maximal, so try this branch again */
1430 if (scanpt[4] < max) {
1431 scanpt[4]++;
1432 if (regmatch(preg, scan + 5)) {
1433 return 1;
1434 }
1435 scanpt[4]--;
1436 }
1437 /* At this point we are at max with no match. Try the other branch */
1438 return regmatch(preg, regnext(preg, scan));
1439 }
1440
1441 /*
1442 - regmatch - main matching routine
1443 *
1444 * Conceptually the strategy is simple: check to see whether the current
1445 * node matches, call self recursively to see whether the rest matches,
1446 * and then act accordingly. In practice we make some effort to avoid
1447 * recursion, in particular by going through "ordinary" nodes (that don't
1448 * need to know whether the rest of the match failed) by a loop instead of
1449 * by recursion.
1450 */
1451 /* 0 failure, 1 success */
regmatch(regex_t * preg,int prog)1452 static int regmatch(regex_t *preg, int prog)
1453 {
1454 int scan; /* Current node. */
1455 int next; /* Next node. */
1456 const char *save;
1457
1458 scan = prog;
1459
1460 #ifdef DEBUG
1461 if (scan != 0 && regnarrate)
1462 fprintf(stderr, "%s(\n", regprop(scan));
1463 #endif
1464 while (scan != 0) {
1465 int n;
1466 int c;
1467 #ifdef DEBUG
1468 if (regnarrate) {
1469 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1470 }
1471 #endif
1472 next = regnext(preg, scan);
1473 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1474
1475 switch (OP(preg, scan)) {
1476 case BOLX:
1477 if ((preg->eflags & REG_NOTBOL)) {
1478 return(0);
1479 }
1480 /* Fall through */
1481 case BOL:
1482 if (preg->reginput != preg->regbol) {
1483 return(0);
1484 }
1485 break;
1486 case EOLX:
1487 if (c != 0) {
1488 /* For EOLX, only match real end of line, not newline */
1489 return 0;
1490 }
1491 break;
1492 case EOL:
1493 if (!reg_iseol(preg, c)) {
1494 return(0);
1495 }
1496 break;
1497 case WORDA:
1498 /* Must be looking at a letter, digit, or _ */
1499 if ((!isalnum(UCHAR(c))) && c != '_')
1500 return(0);
1501 /* Prev must be BOL or nonword */
1502 if (preg->reginput > preg->regbol &&
1503 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1504 return(0);
1505 break;
1506 case WORDZ:
1507 /* Can't match at BOL */
1508 if (preg->reginput > preg->regbol) {
1509 /* Current must be EOL or nonword */
1510 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1511 c = preg->reginput[-1];
1512 /* Previous must be word */
1513 if (isalnum(UCHAR(c)) || c == '_') {
1514 break;
1515 }
1516 }
1517 }
1518 /* No */
1519 return(0);
1520
1521 case ANY:
1522 if (reg_iseol(preg, c))
1523 return 0;
1524 preg->reginput += n;
1525 break;
1526 case EXACTLY: {
1527 int opnd;
1528 int len;
1529 int slen;
1530
1531 opnd = OPERAND(scan);
1532 len = str_int_len(preg->program + opnd);
1533
1534 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1535 if (slen < 0) {
1536 return(0);
1537 }
1538 preg->reginput += slen;
1539 }
1540 break;
1541 case ANYOF:
1542 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1543 return(0);
1544 }
1545 preg->reginput += n;
1546 break;
1547 case ANYBUT:
1548 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1549 return(0);
1550 }
1551 preg->reginput += n;
1552 break;
1553 case NOTHING:
1554 break;
1555 case BACK:
1556 break;
1557 case BRANCH:
1558 if (OP(preg, next) != BRANCH) /* No choice. */
1559 next = OPERAND(scan); /* Avoid recursion. */
1560 else {
1561 do {
1562 save = preg->reginput;
1563 if (regmatch(preg, OPERAND(scan))) {
1564 return(1);
1565 }
1566 preg->reginput = save;
1567 scan = regnext(preg, scan);
1568 } while (scan != 0 && OP(preg, scan) == BRANCH);
1569 return(0);
1570 /* NOTREACHED */
1571 }
1572 break;
1573 case REP:
1574 case REPMIN:
1575 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1576
1577 case REPX:
1578 case REPXMIN:
1579 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1580
1581 case END:
1582 return 1; /* Success! */
1583
1584 case OPENNC:
1585 case CLOSENC:
1586 return regmatch(preg, next);
1587
1588 default:
1589 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1590 save = preg->reginput;
1591 if (regmatch(preg, next)) {
1592 if (OP(preg, scan) < CLOSE) {
1593 int no = OP(preg, scan) - OPEN;
1594 if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) {
1595 preg->pmatch[no].rm_so = save - preg->start;
1596 }
1597 }
1598 else {
1599 int no = OP(preg, scan) - CLOSE;
1600 if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) {
1601 preg->pmatch[no].rm_eo = save - preg->start;
1602 }
1603 }
1604 return(1);
1605 }
1606 /* Restore input position after failure */
1607 preg->reginput = save;
1608 return(0);
1609 }
1610 return REG_ERR_INTERNAL;
1611 }
1612
1613 scan = next;
1614 }
1615
1616 /*
1617 * We get here only if there's trouble -- normally "case END" is
1618 * the terminating point.
1619 */
1620 return REG_ERR_INTERNAL;
1621 }
1622
1623 /*
1624 - regrepeat - repeatedly match something simple, report how many
1625 */
regrepeat(regex_t * preg,int p,int max)1626 static int regrepeat(regex_t *preg, int p, int max)
1627 {
1628 int count = 0;
1629 const char *scan;
1630 int opnd;
1631 int ch;
1632 int n;
1633
1634 scan = preg->reginput;
1635 opnd = OPERAND(p);
1636 switch (OP(preg, p)) {
1637 case ANY:
1638 while (!reg_iseol(preg, *scan) && count < max) {
1639 count++;
1640 scan += utf8_charlen(*scan);
1641 }
1642 break;
1643 case EXACTLY:
1644 while (count < max) {
1645 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1646 if (preg->program[opnd] != ch) {
1647 break;
1648 }
1649 count++;
1650 scan += n;
1651 }
1652 break;
1653 case ANYOF:
1654 while (count < max) {
1655 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1656 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1657 break;
1658 }
1659 count++;
1660 scan += n;
1661 }
1662 break;
1663 case ANYBUT:
1664 while (count < max) {
1665 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1666 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1667 break;
1668 }
1669 count++;
1670 scan += n;
1671 }
1672 break;
1673 default: /* Oh dear. Called inappropriately. */
1674 preg->err = REG_ERR_INTERNAL;
1675 count = 0; /* Best compromise. */
1676 break;
1677 }
1678 preg->reginput = scan;
1679
1680 return(count);
1681 }
1682
1683 /*
1684 - regnext - dig the "next" pointer out of a node
1685 */
regnext(regex_t * preg,int p)1686 static int regnext(regex_t *preg, int p )
1687 {
1688 int offset;
1689
1690 offset = NEXT(preg, p);
1691
1692 if (offset == 0)
1693 return 0;
1694
1695 if (OP(preg, p) == BACK)
1696 return(p-offset);
1697 else
1698 return(p+offset);
1699 }
1700
1701 /*
1702 - regopsize - returns the size of opcode + operands at 'p' in words
1703 */
regopsize(regex_t * preg,int p)1704 static int regopsize(regex_t *preg, int p )
1705 {
1706 /* Almost all opcodes are 2 words, but some are more */
1707 switch (OP(preg, p)) {
1708 case REP:
1709 case REPMIN:
1710 case REPX:
1711 case REPXMIN:
1712 return 5;
1713
1714 case ANYOF:
1715 case ANYBUT:
1716 case EXACTLY: {
1717 int s = p + 2;
1718 while (preg->program[s++]) {
1719 }
1720 return s - p;
1721 }
1722 }
1723 return 2;
1724 }
1725
1726 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1727
1728 /*
1729 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1730 */
regdump(regex_t * preg)1731 static void regdump(regex_t *preg)
1732 {
1733 int s;
1734 int op = EXACTLY; /* Arbitrary non-END op. */
1735 int next;
1736 char buf[MAX_UTF8_LEN + 1];
1737
1738 int i;
1739 for (i = 1; i < preg->p; i++) {
1740 printf("%02x ", (unsigned char)preg->program[i]);
1741 if (i % 16 == 0) {
1742 printf("\n");
1743 }
1744 }
1745 printf("\n");
1746
1747 s = 1;
1748 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1749 op = OP(preg, s);
1750 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1751 next = regnext(preg, s);
1752 if (next == 0) /* Next ptr. */
1753 printf("(0)");
1754 else
1755 printf("(%d)", next);
1756 s += 2;
1757 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1758 int max = preg->program[s];
1759 int min = preg->program[s + 1];
1760 if (max == 65535) {
1761 printf("{%d,*}", min);
1762 }
1763 else {
1764 printf("{%d,%d}", min, max);
1765 }
1766 printf(" %d", preg->program[s + 2]);
1767 s += 3;
1768 }
1769 else if (op == ANYOF || op == ANYBUT) {
1770 /* set of ranges */
1771
1772 while (preg->program[s]) {
1773 int len = preg->program[s++];
1774 int first = preg->program[s++];
1775 buf[utf8_getchars(buf, first)] = 0;
1776 printf("%s", buf);
1777 if (len > 1) {
1778 buf[utf8_getchars(buf, first + len - 1)] = 0;
1779 printf("-%s", buf);
1780 }
1781 }
1782 s++;
1783 }
1784 else if (op == EXACTLY) {
1785 /* Literal string, where present. */
1786
1787 while (preg->program[s]) {
1788 buf[utf8_getchars(buf, preg->program[s])] = 0;
1789 printf("%s", buf);
1790 s++;
1791 }
1792 s++;
1793 }
1794 putchar('\n');
1795 }
1796
1797 if (op == END) {
1798 /* Header fields of interest. */
1799 if (preg->regstart) {
1800 buf[utf8_getchars(buf, preg->regstart)] = 0;
1801 printf("start '%s' ", buf);
1802 }
1803 if (preg->reganch)
1804 printf("anchored ");
1805 if (preg->regmust != 0) {
1806 int i;
1807 printf("must have:");
1808 for (i = 0; i < preg->regmlen; i++) {
1809 putchar(preg->program[preg->regmust + i]);
1810 }
1811 putchar('\n');
1812 }
1813 }
1814 printf("\n");
1815 }
1816
1817 /*
1818 - regprop - printable representation of opcode
1819 */
regprop(int op)1820 static const char *regprop( int op )
1821 {
1822 static char buf[50];
1823
1824 switch (op) {
1825 case BOL:
1826 return "BOL";
1827 case EOL:
1828 return "EOL";
1829 case BOLX:
1830 return "BOLX";
1831 case EOLX:
1832 return "EOLX";
1833 case ANY:
1834 return "ANY";
1835 case ANYOF:
1836 return "ANYOF";
1837 case ANYBUT:
1838 return "ANYBUT";
1839 case BRANCH:
1840 return "BRANCH";
1841 case EXACTLY:
1842 return "EXACTLY";
1843 case NOTHING:
1844 return "NOTHING";
1845 case BACK:
1846 return "BACK";
1847 case END:
1848 return "END";
1849 case REP:
1850 return "REP";
1851 case REPMIN:
1852 return "REPMIN";
1853 case REPX:
1854 return "REPX";
1855 case REPXMIN:
1856 return "REPXMIN";
1857 case WORDA:
1858 return "WORDA";
1859 case WORDZ:
1860 return "WORDZ";
1861 case OPENNC:
1862 return "OPEN";
1863 case CLOSENC:
1864 return "CLOSE";
1865 default:
1866 if (op >= OPEN && op < CLOSE) {
1867 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1868 }
1869 else if (op >= CLOSE && op < CLOSE_END) {
1870 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1871 }
1872 else {
1873 snprintf(buf, sizeof(buf), "?%d?\n", op);
1874 }
1875 return(buf);
1876 }
1877 }
1878 #endif /* JIM_BOOTSTRAP */
1879
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)1880 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1881 {
1882 static const char *error_strings[] = {
1883 "success",
1884 "no match",
1885 "bad pattern",
1886 "null argument",
1887 "unknown error",
1888 "too big",
1889 "out of memory",
1890 "too many ()",
1891 "parentheses () not balanced",
1892 "braces {} not balanced",
1893 "invalid repetition count(s)",
1894 "extra characters",
1895 "*+ of empty atom",
1896 "nested count",
1897 "internal error",
1898 "count follows nothing",
1899 "invalid escape \\ sequence",
1900 "corrupted program",
1901 "contains null char",
1902 "brackets [] not balanced",
1903 };
1904 const char *err;
1905
1906 (void)preg;
1907
1908 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1909 err = "Bad error code";
1910 }
1911 else {
1912 err = error_strings[errcode];
1913 }
1914
1915 return snprintf(errbuf, errbuf_size, "%s", err);
1916 }
1917
regfree(regex_t * preg)1918 void regfree(regex_t *preg)
1919 {
1920 free(preg->program);
1921 }
1922
1923 #endif
1924