1 /* $OpenBSD: regcomp.c,v 1.20 2010/11/21 00:02:30 tedu Exp $ */
2 /*-
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
35 */
36
37 #include <sys/types.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <stdlib.h>
43 #include "r_regex.h"
44 #include "r_util/r_str.h"
45 #include "r_util/r_assert.h"
46
47 #include "utils.h"
48 #include "regex2.h"
49
50 #include "cclass.h"
51 #include "cname.h"
52
53 /*
54 * parse structure, passed up and down to avoid global variables and
55 * other clumsinesses
56 */
57 struct parse {
58 char *next; /* next character in RE */
59 char *end; /* end of string (-> NUL normally) */
60 int error; /* has an error been seen? */
61 sop *strip; /* malloced strip */
62 sopno ssize; /* malloced strip size (allocated) */
63 sopno slen; /* malloced strip length (used) */
64 int ncsalloc; /* number of csets allocated */
65 struct re_guts *g;
66 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
67 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
68 sopno pend[NPAREN]; /* -> ) ([0] unused) */
69 };
70
71 static void p_ere(struct parse *, int);
72 static void p_ere_exp(struct parse *);
73 static void p_str(struct parse *);
74 static void p_bre(struct parse *, int, int);
75 static int p_simp_re(struct parse *, int);
76 static int p_count(struct parse *);
77 static void p_bracket(struct parse *);
78 static void p_b_term(struct parse *, cset *);
79 static void p_b_cclass(struct parse *, cset *);
80 static void p_b_eclass(struct parse *, cset *);
81 static char p_b_symbol(struct parse *);
82 static char p_b_coll_elem(struct parse *, int);
83 static char othercase(int);
84 static void bothcases(struct parse *, int);
85 static void ordinary(struct parse *, int);
86 static void special(struct parse *, int);
87 static void nonnewline(struct parse *);
88 static void repeat(struct parse *, sopno, int, int);
89 static int seterr(struct parse *, int);
90 static cset *allocset(struct parse *);
91 static void freeset(struct parse *, cset *);
92 static int freezeset(struct parse *, cset *);
93 static int firstch(struct parse *, cset *);
94 static int nch(struct parse *, cset *);
95 static void mcadd(struct parse *, cset *, char *);
96 static void mcinvert(struct parse *, cset *);
97 static void mccase(struct parse *, cset *);
98 static int isinsets(struct re_guts *, int);
99 static int samesets(struct re_guts *, int, int);
100 static void categorize(struct parse *, struct re_guts *);
101 static sopno dupl(struct parse *, sopno, sopno);
102 static void doemit(struct parse *, sop, size_t);
103 static void doinsert(struct parse *, sop, size_t, sopno);
104 static void dofwd(struct parse *, sopno, sop);
105 static void enlarge(struct parse *, sopno);
106 static void stripsnug(struct parse *, struct re_guts *);
107 static void findmust(struct parse *, struct re_guts *);
108 static sopno pluscount(struct parse *, struct re_guts *);
109
110 static char nuls[10]; /* place to point scanner in event of error */
111
112 /*
113 * macros for use with parse structure
114 * BEWARE: these know that the parse structure is named `p' !!!
115 */
116 #define PEEK() (*p->next)
117 #define PEEK2() (*(p->next+1))
118 #define MORE() (p->next < p->end)
119 #define MORE2() (p->next+1 < p->end)
120 #define SEE(c) (MORE() && PEEK() == (c))
121 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
122 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
123 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
124 #define NEXT() (p->next++)
125 #define NEXT2() (p->next += 2)
126 #define NEXTn(n) (p->next += (n))
127 #define GETNEXT() (*p->next++)
128 #define SETERROR(e) seterr(p, (e))
129 #define REQUIRE(co, e) (void)((co) || SETERROR(e))
130 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
131 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
132 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
133 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
134 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
135 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
136 #define ASTERN(sop, pos) EMIT(sop, HERE()-(pos))
137 #define HERE() (p->slen)
138 #define THERE() (p->slen - 1)
139 #define THERETHERE() (p->slen - 2)
140 #define DROP(n) (p->slen -= (n))
141
142
r_regex_match(const char * pattern,const char * flags,const char * text)143 R_API bool r_regex_match(const char *pattern, const char *flags, const char *text) {
144 RRegex rx;
145 int re_flags = r_regex_flags (flags);
146 if (r_regex_init (&rx, pattern, re_flags)) {
147 eprintf ("r_regex_match: /%s/ fails to compile.\n", pattern);
148 return false;
149 }
150 int rc = r_regex_exec (&rx, text, 0, 0, re_flags);
151 r_regex_fini (&rx);
152 return rc == 0;
153 }
154
r_regex_match_list(RRegex * rx,const char * text)155 R_API RList *r_regex_match_list(RRegex *rx, const char *text) {
156 RList *list = r_list_newf (free);
157 RRegexMatch match;
158
159 /* Initialize the boundaries for R_REGEX_STARTEND */
160 match.rm_so = 0;
161 match.rm_eo = strlen (text);
162 while (!r_regex_exec (rx, text, 1, &match, rx->re_flags | R_REGEX_STARTEND)) {
163 size_t entry_len = match.rm_eo - match.rm_so + 1;
164 char *entry = r_str_ndup (text + match.rm_so, entry_len);
165 r_list_append (list, entry);
166 /* Update the boundaries for R_REGEX_STARTEND */
167 match.rm_so = match.rm_eo;
168 match.rm_eo = strlen (text);
169 }
170 return list;
171 }
172
r_regex_new(const char * pattern,const char * flags)173 R_API RRegex *r_regex_new(const char *pattern, const char *flags) {
174 r_return_val_if_fail (pattern, NULL);
175 RRegex *r, rx = {0};
176 if (r_regex_init (&rx, pattern, r_regex_flags (flags))) {
177 return NULL;
178 }
179 r = R_NEW (RRegex);
180 if (!r) {
181 return NULL;
182 }
183 memcpy (r, &rx, sizeof (RRegex));
184 return r;
185 }
186
r_regex_flags(const char * f)187 R_API int r_regex_flags(const char *f) {
188 int flags = 0;
189 if (!f || !*f) {
190 return 0;
191 }
192 if (strchr (f, 'e')) {
193 flags |= R_REGEX_EXTENDED;
194 }
195 if (strchr (f, 'i')) {
196 flags |= R_REGEX_ICASE;
197 }
198 if (strchr (f, 's')) {
199 flags |= R_REGEX_NOSUB;
200 }
201 if (strchr (f, 'n')) {
202 flags |= R_REGEX_NEWLINE;
203 }
204 if (strchr (f, 'N')) {
205 flags |= R_REGEX_NOSPEC;
206 }
207 if (strchr (f, 'p')) {
208 flags |= R_REGEX_PEND;
209 }
210 if (strchr (f, 'd')) {
211 flags |= R_REGEX_DUMP;
212 }
213 return flags;
214 }
215
r_regex_fini(RRegex * preg)216 R_API void r_regex_fini(RRegex *preg) {
217 struct re_guts *g;
218 if (!preg) {
219 return;
220 }
221 if (preg->re_magic != MAGIC1) { /* oops */
222 return; /* nice to complain, but hard */
223 }
224
225 g = preg->re_g;
226 if (!g || g->magic != MAGIC2) { /* oops again */
227 return;
228 }
229 preg->re_magic = 0; /* mark it invalid */
230 g->magic = 0; /* mark it invalid */
231
232 free (g->strip);
233 free (g->sets);
234 free (g->setbits);
235 free (g->must);
236 free (g);
237 }
238
r_regex_free(RRegex * preg)239 R_API void r_regex_free(RRegex *preg) {
240 r_regex_fini (preg);
241 free (preg);
242 }
243
244 /*
245 - regcomp - interface for parser and compilation
246 - 0 success, otherwise R_REGEX_something
247 */
r_regex_init(RRegex * preg,const char * pattern,int cflags)248 R_API int r_regex_init(RRegex *preg, const char *pattern, int cflags) {
249 struct parse pa;
250 struct parse *p = &pa;
251 int i;
252 size_t len;
253 cflags &= ~R_REGEX_DUMP;
254 if (!preg || ((cflags & R_REGEX_EXTENDED) && (cflags & R_REGEX_NOSPEC))) {
255 return R_REGEX_INVARG;
256 }
257 if (cflags & R_REGEX_PEND) {
258 if (preg->re_endp < pattern) {
259 return R_REGEX_INVARG;
260 }
261 len = preg->re_endp - pattern;
262 } else {
263 len = strlen ((char *)pattern);
264 }
265 /* do the mallocs early so failure handling is easy */
266 struct re_guts *g = (struct re_guts *)calloc (sizeof (struct re_guts) + (NC - 1), sizeof (cat_t));
267 if (!g) {
268 return R_REGEX_ESPACE;
269 }
270 /*
271 * Limit the pattern space to avoid a 32-bit overflow on buffer
272 * extension. Also avoid any signed overflow in case of conversion
273 * so make the real limit based on a 31-bit overflow.
274 *
275 * Likely not applicable on 64-bit systems but handle the case
276 * generically (who are we to stop people from using ~715MB+
277 * patterns?).
278 */
279 size_t maxlen = ((size_t) - 1 >> 1) / sizeof (sop) * 2 / 3;
280 if (len >= maxlen) {
281 free (g);
282 return R_REGEX_ESPACE;
283 }
284 preg->re_flags = cflags;
285 p->ssize = len / (size_t)2 * (size_t)3 + (size_t)1; /* ugh */
286 if (p->ssize < len) {
287 free (g);
288 return R_REGEX_ESPACE;
289 }
290
291 p->strip = (sop *)calloc (p->ssize, sizeof (sop));
292 if (!p->strip) {
293 free (g);
294 return R_REGEX_ESPACE;
295 }
296 p->slen = 0;
297 if (!p->strip) {
298 free (g);
299 return R_REGEX_ESPACE;
300 }
301
302 /* set things up */
303 p->g = g;
304 p->next = (char *)pattern; /* convenience; we do not modify it */
305 p->end = p->next + len;
306 p->error = 0;
307 p->ncsalloc = 0;
308 for (i = 0; i < NPAREN; i++) {
309 p->pbegin[i] = 0;
310 p->pend[i] = 0;
311 }
312 g->csetsize = NC;
313 g->sets = NULL;
314 g->setbits = NULL;
315 g->ncsets = 0;
316 g->cflags = cflags;
317 g->iflags = 0;
318 g->nbol = 0;
319 g->neol = 0;
320 g->must = NULL;
321 g->mlen = 0;
322 g->nsub = 0;
323 g->ncategories = 1; /* category 0 is "everything else" */
324 g->categories = &g->catspace[-(CHAR_MIN)]; // WTF
325 (void) memset ((char *)g->catspace, 0, NC*sizeof(cat_t));
326 g->backrefs = 0;
327
328 /* do it */
329 EMIT (OEND, 0);
330 g->firststate = THERE ();
331 if (cflags & R_REGEX_EXTENDED) {
332 p_ere(p, OUT);
333 } else if (cflags & R_REGEX_NOSPEC) {
334 p_str (p);
335 } else {
336 p_bre (p, OUT, OUT);
337 }
338 EMIT (OEND, 0);
339 g->laststate = THERE ();
340
341 /* tidy up loose ends and fill things in */
342 categorize (p, g);
343 stripsnug (p, g);
344 findmust (p, g);
345 g->nplus = pluscount (p, g);
346 g->magic = MAGIC2;
347 preg->re_nsub = g->nsub;
348 preg->re_g = g;
349 preg->re_magic = MAGIC1;
350 /* not debugging, so can't rely on the asssert() in regexec() */
351 if (g->iflags & BAD) {
352 SETERROR (R_REGEX_ASSERT);
353 }
354 if (p->error) {
355 r_regex_fini (preg);
356 }
357 return p->error;
358 }
359
360 /*
361 - p_ere - ERE parser top level, concatenation and alternation
362 */
p_ere(struct parse * p,int stop)363 static void p_ere(struct parse *p, int stop) { /* character this ERE should end at */
364 bool isFirst = true;
365 sopno prevback = 0;
366 sopno prevfwd = 0;
367 sopno conc = 0;
368 char c;
369
370 for (;;) {
371 /* do a bunch of concatenated expressions */
372 conc = HERE();
373 while (MORE() && (c = PEEK()) != '|' && c != stop) {
374 p_ere_exp (p);
375 }
376 REQUIRE (HERE () != conc, R_REGEX_EMPTY); /* require nonempty */
377
378 if (!EAT('|')) {
379 break; /* NOTE BREAK OUT */
380 }
381 if (isFirst) {
382 INSERT (OCH_, conc); /* offset is wrong */
383 prevfwd = conc;
384 prevback = conc;
385 isFirst = false;
386 }
387 ASTERN(OOR1, prevback);
388 prevback = THERE();
389 AHEAD(prevfwd); /* fix previous offset */
390 prevfwd = HERE();
391 EMIT(OOR2, 0); /* offset is very wrong */
392 }
393
394 if (!isFirst) { /* tail-end fixups */
395 AHEAD(prevfwd);
396 ASTERN(O_CH, prevback);
397 }
398 //asert(!MORE() || SEE(stop));
399 }
400
401 /*
402 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
403 */
p_ere_exp(struct parse * p)404 static void p_ere_exp(struct parse *p) {
405 char c;
406 sopno pos;
407 int count;
408 int count2;
409 sopno subno;
410 int wascaret = 0;
411
412 if (!MORE()) { /* caller should have ensured this */
413 return;
414 }
415 c = GETNEXT();
416
417 pos = HERE();
418 switch (c) {
419 case '(':
420 REQUIRE(MORE(), R_REGEX_EPAREN);
421 p->g->nsub++;
422 subno = p->g->nsub;
423 if (subno < NPAREN) {
424 p->pbegin[subno] = HERE ();
425 }
426 EMIT(OLPAREN, subno);
427 if (!SEE (')')) {
428 p_ere (p, ')');
429 }
430 if (subno < NPAREN) {
431 p->pend[subno] = HERE();
432 if (!p->pend[subno]) {
433 break;
434 }
435 }
436 EMIT(ORPAREN, subno);
437 MUSTEAT(')', R_REGEX_EPAREN);
438 break;
439 case '^':
440 EMIT(OBOL, 0);
441 p->g->iflags |= USEBOL;
442 p->g->nbol++;
443 wascaret = 1;
444 break;
445 case '$':
446 EMIT(OEOL, 0);
447 p->g->iflags |= USEEOL;
448 p->g->neol++;
449 break;
450 case '|':
451 SETERROR(R_REGEX_EMPTY);
452 break;
453 case '*':
454 case '+':
455 case '?':
456 SETERROR(R_REGEX_BADRPT);
457 break;
458 case '.':
459 if (p->g->cflags & R_REGEX_NEWLINE) {
460 nonnewline(p);
461 } else {
462 EMIT (OANY, 0);
463 }
464 break;
465 case '[':
466 p_bracket(p);
467 break;
468 case '\\':
469 REQUIRE(MORE(), R_REGEX_EESCAPE);
470 c = GETNEXT();
471 if (!isalpha(c)) {
472 ordinary(p, c);
473 } else {
474 special(p, c);
475 }
476 break;
477 case '{': /* okay as ordinary except if digit follows */
478 REQUIRE(!MORE() || !isdigit((ut8)PEEK()), R_REGEX_BADRPT);
479 /* FALLTHROUGH */
480 default:
481 ordinary(p, c);
482 break;
483 }
484
485 if (!MORE ()) {
486 return;
487 }
488 c = PEEK();
489 /* we call { a repetition if followed by a digit */
490 if (!(c == '*' || c == '+' || c == '?' ||
491 (c == '{' && MORE2 () && isdigit ((ut8)PEEK2 ())))) {
492 return; /* no repetition, we're done */
493 }
494 NEXT();
495
496 REQUIRE(!wascaret, R_REGEX_BADRPT);
497 switch (c) {
498 case '*': /* implemented as +? */
499 /* this case does not require the (y|) trick, noKLUDGE */
500 INSERT(OPLUS_, pos);
501 ASTERN(O_PLUS, pos);
502 INSERT(OQUEST_, pos);
503 ASTERN(O_QUEST, pos);
504 break;
505 case '+':
506 INSERT(OPLUS_, pos);
507 ASTERN(O_PLUS, pos);
508 break;
509 case '?':
510 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
511 INSERT(OCH_, pos); /* offset slightly wrong */
512 ASTERN(OOR1, pos); /* this one's right */
513 AHEAD(pos); /* fix the OCH_ */
514 EMIT(OOR2, 0); /* offset very wrong... */
515 AHEAD(THERE()); /* ...so fix it */
516 ASTERN(O_CH, THERETHERE());
517 break;
518 case '{':
519 count = p_count(p);
520 if (EAT(',')) {
521 if (isdigit((ut8)PEEK())) {
522 count2 = p_count(p);
523 REQUIRE(count <= count2, R_REGEX_BADBR);
524 } else { /* single number with comma */
525 count2 = INTFINITY;
526 }
527 } else { /* just a single number */
528 count2 = count;
529 }
530 repeat(p, pos, count, count2);
531 if (!EAT('}')) { /* error heuristics */
532 while (MORE () && PEEK () != '}') {
533 NEXT ();
534 }
535 REQUIRE(MORE(), R_REGEX_EBRACE);
536 SETERROR(R_REGEX_BADBR);
537 }
538 break;
539 }
540
541 if (!MORE ()) {
542 return;
543 }
544 c = PEEK();
545 if (!(c == '*' || c == '+' || c == '?' ||
546 (c == '{' && MORE2 () && isdigit ((ut8)PEEK2 ())))) {
547 return;
548 }
549 SETERROR(R_REGEX_BADRPT);
550 }
551
552 /*
553 - p_str - string (no metacharacters) "parser"
554 */
p_str(struct parse * p)555 static void p_str(struct parse *p) {
556 REQUIRE(MORE(), R_REGEX_EMPTY);
557 while (MORE()) {
558 ordinary(p, GETNEXT());
559 }
560 }
561
562 /*
563 - p_bre - BRE parser top level, anchoring and concatenation
564 * Giving end1 as OUT essentially eliminates the end1/end2 check.
565 *
566 * This implementation is a bit of a kludge, in that a trailing $ is first
567 * taken as an ordinary character and then revised to be an anchor. The
568 * only undesirable side effect is that '$' gets included as a character
569 * category in such cases. This is fairly harmless; not worth fixing.
570 * The amount of lookahead needed to avoid this kludge is excessive.
571 */
p_bre(struct parse * p,int end1,int end2)572 static void p_bre(struct parse *p,
573 int end1, /* first terminating character */
574 int end2) /* second terminating character */
575 {
576 sopno start = HERE();
577 int first = 1; /* first subexpression? */
578 int wasdollar = 0;
579
580 if (EAT('^')) {
581 EMIT(OBOL, 0);
582 p->g->iflags |= USEBOL;
583 p->g->nbol++;
584 }
585 while (MORE() && !SEETWO(end1, end2)) {
586 wasdollar = p_simp_re(p, first);
587 first = 0;
588 }
589 if (wasdollar) { /* oops, that was a trailing anchor */
590 DROP(1);
591 EMIT(OEOL, 0);
592 p->g->iflags |= USEEOL;
593 p->g->neol++;
594 }
595
596 REQUIRE(HERE() != start, R_REGEX_EMPTY); /* require nonempty */
597 }
598
599 /*
600 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
601 */
602 static int /* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,int starordinary)603 p_simp_re(struct parse *p,
604 int starordinary) /* is a leading * an ordinary character? */
605 {
606 int c;
607 int count;
608 int count2;
609 sopno pos;
610 int i;
611 sopno subno;
612 # define BACKSL (1<<CHAR_BIT)
613
614 pos = HERE(); /* repetion op, if any, covers from here */
615
616 if (!MORE()) { /* caller should have ensured this */
617 return 0;
618 }
619 c = GETNEXT();
620 if (c == '\\') {
621 REQUIRE(MORE(), R_REGEX_EESCAPE);
622 c = BACKSL | GETNEXT();
623 }
624 switch (c) {
625 case '.':
626 if (p->g->cflags & R_REGEX_NEWLINE) {
627 nonnewline(p);
628 } else {
629 EMIT (OANY, 0);
630 }
631 break;
632 case '[':
633 p_bracket(p);
634 break;
635 case BACKSL|'{':
636 SETERROR(R_REGEX_BADRPT);
637 break;
638 case BACKSL|'(':
639 p->g->nsub++;
640 subno = p->g->nsub;
641 if (subno < NPAREN) {
642 p->pbegin[subno] = HERE ();
643 }
644 EMIT(OLPAREN, subno);
645 /* the MORE here is an error heuristic */
646 if (MORE () && !SEETWO ('\\', ')')) {
647 p_bre (p, '\\', ')');
648 }
649 if (subno < NPAREN) {
650 p->pend[subno] = HERE();
651 if (!p->pend[subno]) {
652 break;
653 }
654 }
655 EMIT(ORPAREN, subno);
656 REQUIRE(EATTWO('\\', ')'), R_REGEX_EPAREN);
657 break;
658 case BACKSL|')': /* should not get here -- must be user */
659 case BACKSL|'}':
660 SETERROR(R_REGEX_EPAREN);
661 break;
662 case BACKSL|'1':
663 case BACKSL|'2':
664 case BACKSL|'3':
665 case BACKSL|'4':
666 case BACKSL|'5':
667 case BACKSL|'6':
668 case BACKSL|'7':
669 case BACKSL|'8':
670 case BACKSL|'9':
671 i = (c&~BACKSL) - '0';
672 if (p->pend[i] != 0) {
673 if (i <= p->g->nsub) {
674 EMIT(OBACK_, i);
675 if (p->pbegin[i] != 0 && OP(p->strip[p->pbegin[i]]) == OLPAREN &&
676 OP(p->strip[p->pend[i]]) == ORPAREN) {
677 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
678 EMIT(O_BACK, i);
679 }
680 }
681 } else {
682 SETERROR(R_REGEX_ESUBREG);
683 }
684 p->g->backrefs = 1;
685 break;
686 case '*':
687 REQUIRE(starordinary, R_REGEX_BADRPT);
688 /* FALLTHROUGH */
689 default:
690 ordinary(p, (char)c);
691 break;
692 }
693
694 if (EAT('*')) { /* implemented as +? */
695 /* this case does not require the (y|) trick, noKLUDGE */
696 INSERT(OPLUS_, pos);
697 ASTERN(O_PLUS, pos);
698 INSERT(OQUEST_, pos);
699 ASTERN(O_QUEST, pos);
700 } else if (EATTWO('\\', '{')) {
701 count = p_count(p);
702 if (EAT(',')) {
703 if (MORE() && isdigit((ut8)PEEK())) {
704 count2 = p_count(p);
705 REQUIRE(count <= count2, R_REGEX_BADBR);
706 } else { /* single number with comma */
707 count2 = INTFINITY;
708 }
709 } else { /* just a single number */
710 count2 = count;
711 }
712 repeat(p, pos, count, count2);
713 if (!EATTWO('\\', '}')) { /* error heuristics */
714 while (MORE () && !SEETWO ('\\', '}')) {
715 NEXT ();
716 }
717 REQUIRE(MORE(), R_REGEX_EBRACE);
718 SETERROR(R_REGEX_BADBR);
719 }
720 } else if (c == '$') { /* $ (but not \$) ends it */
721 return (1);
722 }
723
724 return(0);
725 }
726
727 /*
728 - p_count - parse a repetition count
729 */
730 static int /* the value */
p_count(struct parse * p)731 p_count(struct parse *p)
732 {
733 int count = 0;
734 int ndigits = 0;
735
736 while (MORE() && isdigit((ut8)PEEK()) && count <= DUPMAX) {
737 count = count*10 + (GETNEXT() - '0');
738 ndigits++;
739 }
740
741 REQUIRE(ndigits > 0 && count <= DUPMAX, R_REGEX_BADBR);
742 return(count);
743 }
744
745 /*
746 - p_bracket - parse a bracketed character list
747 *
748 * Note a significant property of this code: if the allocset() did SETERROR,
749 * no set operations are done.
750 */
p_bracket(struct parse * p)751 static void p_bracket(struct parse *p) {
752 cset *cs;
753 int invert = 0;
754
755 /* Dept of Truly Sickening Special-Case Kludges */
756 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
757 EMIT(OBOW, 0);
758 NEXTn(6);
759 return;
760 }
761 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
762 EMIT(OEOW, 0);
763 NEXTn(6);
764 return;
765 }
766
767 if (!(cs = allocset(p))) {
768 /* allocset did set error status in p */
769 return;
770 }
771
772 if (EAT ('^')) {
773 invert++; /* make note to invert set at end */
774 }
775 if (EAT (']')) {
776 CHadd(cs, ']');
777 } else if (EAT ('-')) {
778 CHadd (cs, '-');
779 }
780 while (MORE () && PEEK () != ']' && !SEETWO ('-', ']')) {
781 p_b_term (p, cs);
782 }
783 if (EAT ('-')) {
784 CHadd (cs, '-');
785 }
786 MUSTEAT(']', R_REGEX_EBRACK);
787
788 if (p->error != 0) { /* don't mess things up further */
789 freeset(p, cs);
790 return;
791 }
792
793 if (p->g->cflags&R_REGEX_ICASE) {
794 int i;
795 int ci;
796
797 for (i = p->g->csetsize - 1; i >= 0; i--) {
798 if (CHIN(cs, i) && isalpha(i)) {
799 ci = othercase(i);
800 if (ci != i) {
801 CHadd (cs, ci);
802 }
803 }
804 }
805 if (cs->multis != NULL) {
806 mccase (p, cs);
807 }
808 }
809 if (invert) {
810 int i;
811
812 for (i = p->g->csetsize - 1; i >= 0; i--) {
813 if (CHIN (cs, i)) {
814 CHsub(cs, i);
815 } else {
816 CHadd (cs, i);
817 }
818 }
819 if (p->g->cflags & R_REGEX_NEWLINE) {
820 CHsub (cs, '\n');
821 }
822 if (cs->multis != NULL) {
823 mcinvert (p, cs);
824 }
825 }
826
827 if (cs->multis) { /* xxx */
828 return;
829 }
830
831 if (nch(p, cs) == 1) { /* optimize singleton sets */
832 ordinary(p, firstch(p, cs));
833 freeset(p, cs);
834 } else {
835 EMIT(OANYOF, freezeset(p, cs));
836 }
837 }
838
839 /*
840 - p_b_term - parse one term of a bracketed character list
841 */
p_b_term(struct parse * p,cset * cs)842 static void p_b_term(struct parse *p, cset *cs) {
843 char c;
844 char start = 0, finish;
845 int i;
846
847 /* classify what we've got */
848 switch ((MORE()) ? PEEK() : '\0') {
849 case '[':
850 c = (MORE2()) ? PEEK2() : '\0';
851 break;
852 case '-':
853 SETERROR(R_REGEX_ERANGE);
854 return; /* NOTE RETURN */
855 break;
856 default:
857 c = '\0';
858 break;
859 }
860
861 switch (c) {
862 case ':': /* character class */
863 NEXT2();
864 REQUIRE(MORE(), R_REGEX_EBRACK);
865 c = PEEK();
866 REQUIRE(c != '-' && c != ']', R_REGEX_ECTYPE);
867 p_b_cclass(p, cs);
868 REQUIRE(MORE(), R_REGEX_EBRACK);
869 REQUIRE(EATTWO(':', ']'), R_REGEX_ECTYPE);
870 break;
871 case '=': /* equivalence class */
872 NEXT2();
873 REQUIRE(MORE(), R_REGEX_EBRACK);
874 c = PEEK();
875 REQUIRE(c != '-' && c != ']', R_REGEX_ECOLLATE);
876 p_b_eclass(p, cs);
877 REQUIRE(MORE(), R_REGEX_EBRACK);
878 REQUIRE(EATTWO('=', ']'), R_REGEX_ECOLLATE);
879 break;
880 default: /* symbol, ordinary character, or range */
881 /* xxx revision needed for multichar stuff */
882 start = p_b_symbol(p);
883 if (SEE('-') && MORE2() && PEEK2() != ']') {
884 /* range */
885 NEXT();
886 if (EAT ('-')) {
887 finish = '-';
888 } else {
889 finish = p_b_symbol (p);
890 }
891 } else {
892 finish = start;
893 }
894 /* xxx what about signed chars here... */
895 REQUIRE(start <= finish, R_REGEX_ERANGE);
896 for (i = start; i <= finish; i++) {
897 CHadd (cs, i);
898 }
899 break;
900 }
901 }
902
903 /*
904 - p_b_cclass - parse a character-class name and deal with it
905 */
p_b_cclass(struct parse * p,cset * cs)906 static void p_b_cclass(struct parse *p, cset *cs) {
907 char *sp = p->next;
908 struct cclass *cp;
909 size_t len;
910 char *u;
911 char c;
912
913 while (MORE () && isalpha ((unsigned char)PEEK ())) {
914 NEXT ();
915 }
916 len = p->next - sp;
917 for (cp = cclasses; cp->name != NULL; cp++) {
918 if (strncmp (cp->name, sp, len) == 0 && cp->name[len] == '\0') {
919 break;
920 }
921 }
922 if (!cp->name) {
923 /* oops, didn't find it */
924 SETERROR(R_REGEX_ECTYPE);
925 return;
926 }
927
928 u = cp->chars;
929 while ((c = *u++) != '\0') {
930 CHadd (cs, c);
931 }
932 for (u = cp->multis; *u != '\0'; u += strlen (u) + 1) {
933 MCadd (p, cs, u);
934 }
935 }
936
937 /*
938 - p_b_eclass - parse an equivalence-class name and deal with it
939 *
940 * This implementation is incomplete. xxx
941 */
p_b_eclass(struct parse * p,cset * cs)942 static void p_b_eclass(struct parse *p, cset *cs) {
943 char c;
944
945 c = p_b_coll_elem(p, '=');
946 CHadd(cs, c);
947 }
948
949 /*
950 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
951 */
952 static char /* value of symbol */
p_b_symbol(struct parse * p)953 p_b_symbol(struct parse *p)
954 {
955 char value;
956
957 REQUIRE(MORE(), R_REGEX_EBRACK);
958 if (!EATTWO ('[', '.')) {
959 return (GETNEXT ());
960 }
961
962 /* collating symbol */
963 value = p_b_coll_elem(p, '.');
964 REQUIRE(EATTWO('.', ']'), R_REGEX_ECOLLATE);
965 return(value);
966 }
967
968 /*
969 - p_b_coll_elem - parse a collating-element name and look it up
970 */
971 static char /* value of collating element */
p_b_coll_elem(struct parse * p,int endc)972 p_b_coll_elem(struct parse *p,
973 int endc) /* name ended by endc,']' */
974 {
975 char *sp = p->next;
976 struct cname *cp;
977 int len;
978
979 while (MORE () && !SEETWO (endc, ']')) {
980 NEXT ();
981 }
982 if (!MORE()) {
983 SETERROR(R_REGEX_EBRACK);
984 return(0);
985 }
986 len = p->next - sp;
987 for (cp = cnames; cp->name != NULL; cp++) {
988 if (strncmp (cp->name, sp, len) == 0 && cp->name[len] == '\0') {
989 return (cp->code); /* known name */
990 }
991 }
992 if (len == 1) {
993 return (*sp); /* single character */
994 }
995 SETERROR(R_REGEX_ECOLLATE); /* neither */
996 return(0);
997 }
998
999 /*
1000 - othercase - return the case counterpart of an alphabetic
1001 */
1002 static char /* if no counterpart, return ch */
othercase(int ch)1003 othercase(int ch)
1004 {
1005 ch = (ut8)ch;
1006 if (isalpha(ch)) {
1007 if (isupper (ch)) {
1008 return ((ut8)tolower(ch));
1009 } else if (islower (ch)) {
1010 return ((ut8)toupper(ch));
1011 } else { /* peculiar, but could happen */
1012 return (ch);
1013 }
1014 }
1015 return ch;
1016 }
1017
1018 /*
1019 - bothcases - emit a dualcase version of a two-case character
1020 *
1021 * Boy, is this implementation ever a kludge...
1022 */
bothcases(struct parse * p,int ch)1023 static void bothcases(struct parse *p, int ch) {
1024 char *oldnext = p->next;
1025 char *oldend = p->end;
1026 char bracket[3];
1027
1028 ch = (ut8)ch;
1029 if (othercase(ch) != ch) { /* p_bracket() would recurse */
1030 p->next = bracket;
1031 p->end = bracket+2;
1032 bracket[0] = ch;
1033 bracket[1] = ']';
1034 bracket[2] = '\0';
1035 p_bracket(p);
1036 if (p->next == bracket+2) {
1037 p->next = oldnext;
1038 p->end = oldend;
1039 }
1040 }
1041 }
1042
1043 /*
1044 - ordinary - emit an ordinary character
1045 */
1046 static void
ordinary(struct parse * p,int ch)1047 ordinary(struct parse *p, int ch)
1048 {
1049 cat_t *cap = p->g->categories;
1050
1051 if ((p->g->cflags & R_REGEX_ICASE) && isalpha ((ut8)ch) && othercase (ch) != ch) {
1052 bothcases(p, ch);
1053 } else {
1054 EMIT(OCHAR, (ut8)ch);
1055 if (cap[ch] == 0) {
1056 cap[ch] = p->g->ncategories++;
1057 }
1058 }
1059 }
1060
1061 static void
special(struct parse * p,int ch)1062 special(struct parse *p, int ch) {
1063 char *oldnext = p->next;
1064 char *oldend = p->end;
1065 char bracket[16] = {0};
1066 char digits[3] = {0};
1067 char c;
1068 int num = 0;
1069 switch (ch) {
1070 case 'x':
1071 digits[0] = GETNEXT ();
1072 digits[1] = GETNEXT ();
1073 c = (char)strtol (digits, NULL, 16);
1074 ordinary (p, c);
1075 return;
1076 case 'n':
1077 ordinary (p, '\n');
1078 return;
1079 case 't':
1080 ordinary (p, '\t');
1081 return;
1082 case 'r':
1083 ordinary (p, '\r');
1084 return;
1085 case 's':
1086 num = 5;
1087 memcpy (bracket, "\t\r\n ]", num);
1088 break;
1089 case 'd':
1090 num = 4;
1091 memcpy (bracket, "0-9]", num);
1092 break;
1093 case 'w':
1094 num = 4;
1095 memcpy (bracket, "a-z]", num);
1096 break;
1097 default:
1098 SETERROR (R_REGEX_INVARG);
1099 return;
1100 }
1101
1102 p->next = bracket;
1103 p->end = bracket + num;
1104
1105 p_bracket (p);
1106
1107 if (p->next == bracket + num) {
1108 p->next = oldnext;
1109 p->end = oldend;
1110 }
1111 }
1112
1113 /*
1114 - nonnewline - emit R_REGEX_NEWLINE version of OANY
1115 *
1116 * Boy, is this implementation ever a kludge...
1117 */
1118 static void
nonnewline(struct parse * p)1119 nonnewline(struct parse *p)
1120 {
1121 char *oldnext = p->next;
1122 char *oldend = p->end;
1123 char bracket[4];
1124
1125 p->next = bracket;
1126 p->end = bracket+3;
1127 bracket[0] = '^';
1128 bracket[1] = '\n';
1129 bracket[2] = ']';
1130 bracket[3] = '\0';
1131 p_bracket(p);
1132 if (p->next == bracket+3) {
1133 p->next = oldnext;
1134 p->end = oldend;
1135 }
1136 }
1137
1138 /*
1139 - repeat - generate code for a bounded repetition, recursively if needed
1140 */
1141 static void
repeat(struct parse * p,sopno start,int from,int to)1142 repeat(struct parse *p,
1143 sopno start, /* operand from here to end of strip */
1144 int from, /* repeated from this number */
1145 int to) /* to this number of times (maybe INTFINITY) */
1146 {
1147 sopno finish = HERE();
1148 # define N 2
1149 # define INF 3
1150 # define REP(f, t) ((f)*8 + (t))
1151 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INTFINITY) ? INF : N)
1152 sopno copy;
1153
1154 if (p->error != 0) { /* head off possible runaway recursion */
1155 return;
1156 }
1157
1158 if (from > to) {
1159 return;
1160 }
1161
1162 switch (REP(MAP(from), MAP(to))) {
1163 case REP(0, 0): /* must be user doing this */
1164 DROP(finish-start); /* drop the operand */
1165 break;
1166 case REP(0, 1): /* as x{1,1}? */
1167 case REP(0, N): /* as x{1,n}? */
1168 case REP(0, INF): /* as x{1,}? */
1169 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1170 INSERT(OCH_, start); /* offset is wrong... */
1171 repeat(p, start+1, 1, to);
1172 ASTERN(OOR1, start);
1173 AHEAD(start); /* ... fix it */
1174 EMIT(OOR2, 0);
1175 AHEAD(THERE());
1176 ASTERN(O_CH, THERETHERE());
1177 break;
1178 case REP(1, 1): /* trivial case */
1179 /* done */
1180 break;
1181 case REP(1, N): /* as x?x{1,n-1} */
1182 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1183 INSERT(OCH_, start);
1184 ASTERN(OOR1, start);
1185 AHEAD(start);
1186 EMIT(OOR2, 0); /* offset very wrong... */
1187 AHEAD(THERE()); /* ...so fix it */
1188 ASTERN(O_CH, THERETHERE());
1189 copy = dupl(p, start+1, finish+1);
1190 if (copy == finish+4) {
1191 repeat(p, copy, 1, to-1);
1192 }
1193 break;
1194 case REP(1, INF): /* as x+ */
1195 INSERT(OPLUS_, start);
1196 ASTERN(O_PLUS, start);
1197 break;
1198 case REP(N, N): /* as xx{m-1,n-1} */
1199 copy = dupl(p, start, finish);
1200 repeat(p, copy, from-1, to-1);
1201 break;
1202 case REP(N, INF): /* as xx{n-1,INF} */
1203 copy = dupl(p, start, finish);
1204 repeat(p, copy, from-1, to);
1205 break;
1206 default: /* "can't happen" */
1207 SETERROR(R_REGEX_ASSERT); /* just in case */
1208 break;
1209 }
1210 }
1211
1212 /*
1213 - seterr - set an error condition
1214 */
1215 static int /* useless but makes type checking happy */
seterr(struct parse * p,int e)1216 seterr(struct parse *p, int e)
1217 {
1218 if (p->error == 0) { /* keep earliest error condition */
1219 p->error = e;
1220 }
1221 p->next = nuls; /* try to bring things to a halt */
1222 p->end = nuls;
1223 return(0); /* make the return value well-defined */
1224 }
1225
1226 /*
1227 - allocset - allocate a set of characters for []
1228 */
allocset(struct parse * p)1229 static cset * allocset(struct parse *p) {
1230 int no = p->g->ncsets++;
1231 size_t nc;
1232 size_t nbytes;
1233 cset *cs;
1234 size_t css = (size_t)p->g->csetsize;
1235 int i;
1236
1237 if (no >= p->ncsalloc) { /* need another column of space */
1238 void *ptr;
1239
1240 p->ncsalloc += CHAR_BIT;
1241 nc = p->ncsalloc;
1242 if (nc % CHAR_BIT) {
1243 goto nomem;
1244 }
1245 nbytes = nc / CHAR_BIT * css;
1246
1247 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1248 if (!ptr) {
1249 goto nomem;
1250 }
1251 p->g->sets = ptr;
1252
1253 ptr = (ut8 *)realloc((char *)p->g->setbits, nbytes);
1254 if (!ptr) {
1255 goto nomem;
1256 }
1257 p->g->setbits = ptr;
1258
1259 for (i = 0; i < no; i++) {
1260 p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
1261 }
1262
1263 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1264 }
1265 /* XXX should not happen */
1266 if (!p->g->sets || !p->g->setbits) {
1267 goto nomem;
1268 }
1269
1270 cs = &p->g->sets[no];
1271 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1272 cs->mask = 1 << ((no) % CHAR_BIT);
1273 cs->hash = 0;
1274 cs->smultis = 0;
1275 cs->multis = NULL;
1276
1277 return(cs);
1278 nomem:
1279 R_FREE(p->g->sets);
1280 R_FREE(p->g->setbits);
1281
1282 SETERROR(R_REGEX_ESPACE);
1283 /* caller's responsibility not to do set ops */
1284 return(NULL);
1285 }
1286
1287 /*
1288 - freeset - free a now-unused set
1289 */
freeset(struct parse * p,cset * cs)1290 static void freeset(struct parse *p, cset *cs) {
1291 int i;
1292 cset *top = &p->g->sets[p->g->ncsets];
1293 size_t css = (size_t)p->g->csetsize;
1294
1295 for (i = 0; i < css; i++) {
1296 CHsub (cs, i);
1297 }
1298 if (cs == top - 1) { /* recover only the easy case */
1299 p->g->ncsets--;
1300 }
1301 }
1302
1303 /*
1304 - freezeset - final processing on a set of characters
1305 *
1306 * The main task here is merging identical sets. This is usually a waste
1307 * of time (although the hash code minimizes the overhead), but can win
1308 * big if R_REGEX_ICASE is being used. R_REGEX_ICASE, by the way, is why the hash
1309 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1310 * the same value!
1311 */
1312 static int /* set number */
freezeset(struct parse * p,cset * cs)1313 freezeset(struct parse *p, cset *cs)
1314 {
1315 ut8 h = cs->hash;
1316 int i;
1317 cset *top = &p->g->sets[p->g->ncsets];
1318 cset *cs2;
1319 size_t css = (size_t)p->g->csetsize;
1320
1321 /* look for an earlier one which is the same */
1322 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) {
1323 if (cs2->hash == h && cs2 != cs) {
1324 /* maybe */
1325 for (i = 0; i < css; i++) {
1326 if (!!CHIN (cs2, i) != !!CHIN (cs, i)) {
1327 break; /* no */
1328 }
1329 }
1330 if (i == css) {
1331 break; /* yes */
1332 }
1333 }
1334 }
1335
1336 if (cs2 < top) { /* found one */
1337 freeset(p, cs);
1338 cs = cs2;
1339 }
1340
1341 return((int)(cs - p->g->sets));
1342 }
1343
1344 /*
1345 - firstch - return first character in a set (which must have at least one)
1346 */
1347 static int /* character; there is no "none" value */
firstch(struct parse * p,cset * cs)1348 firstch(struct parse *p, cset *cs)
1349 {
1350 int i;
1351 size_t css = (size_t)p->g->csetsize;
1352
1353 for (i = 0; i < css; i++) {
1354 if (CHIN (cs, i)) {
1355 return ((char)i);
1356 }
1357 }
1358 return(0); /* arbitrary */
1359 }
1360
1361 /*
1362 - nch - number of characters in a set
1363 */
nch(struct parse * p,cset * cs)1364 static int nch(struct parse *p, cset *cs) {
1365 int i;
1366 size_t css = (size_t)p->g->csetsize;
1367 int n = 0;
1368
1369 for (i = 0; i < css; i++) {
1370 if (CHIN (cs, i)) {
1371 n++;
1372 }
1373 }
1374 return(n);
1375 }
1376
1377 /*
1378 - mcadd - add a collating element to a cset
1379 */
mcadd(struct parse * p,cset * cs,char * cp)1380 static void mcadd( struct parse *p, cset *cs, char *cp) {
1381 size_t oldend = cs->smultis;
1382 void *np;
1383
1384 cs->smultis += strlen(cp) + 1;
1385 np = realloc(cs->multis, cs->smultis);
1386 if (!np) {
1387 if (cs->multis) {
1388 free (cs->multis);
1389 }
1390 cs->multis = NULL;
1391 SETERROR(R_REGEX_ESPACE);
1392 return;
1393 }
1394 cs->multis = np;
1395
1396 r_str_ncpy (cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1397 }
1398
1399 /*
1400 - mcinvert - invert the list of collating elements in a cset
1401 *
1402 * This would have to know the set of possibilities. Implementation
1403 * is deferred.
1404 */
1405 /* ARGSUSED */
mcinvert(struct parse * p,cset * cs)1406 static void mcinvert(struct parse *p, cset *cs) {
1407 //asert(!cs->multis); /* xxx */
1408 return;
1409 }
1410
1411 /*
1412 - mccase - add case counterparts of the list of collating elements in a cset
1413 *
1414 * This would have to know the set of possibilities. Implementation
1415 * is deferred.
1416 */
1417 /* ARGSUSED */
mccase(struct parse * p,cset * cs)1418 static void mccase(struct parse *p, cset *cs) {
1419 //asert(!cs->multis); /* xxx */
1420 return;
1421 }
1422
1423 /*
1424 - isinsets - is this character in any sets?
1425 */
1426 static int /* predicate */
isinsets(struct re_guts * g,int c)1427 isinsets(struct re_guts *g, int c)
1428 {
1429 ut8 *col;
1430 int i;
1431 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1432 unsigned uc = (ut8)c;
1433
1434 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) {
1435 if (col[uc] != 0) {
1436 return (1);
1437 }
1438 }
1439 return(0);
1440 }
1441
1442 /*
1443 - samesets - are these two characters in exactly the same sets?
1444 */
1445 static int /* predicate */
samesets(struct re_guts * g,int c1,int c2)1446 samesets(struct re_guts *g, int c1, int c2)
1447 {
1448 ut8 *col;
1449 int i;
1450 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1451 unsigned uc1 = (ut8)c1;
1452 unsigned uc2 = (ut8)c2;
1453
1454 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) {
1455 if (col[uc1] != col[uc2]) {
1456 return (0);
1457 }
1458 }
1459 return(1);
1460 }
1461
1462 /*
1463 - categorize - sort out character categories
1464 */
1465 static void
categorize(struct parse * p,struct re_guts * g)1466 categorize(struct parse *p, struct re_guts *g)
1467 {
1468 cat_t *cats = g? g->categories : NULL;
1469 unsigned int c;
1470 unsigned int c2;
1471 cat_t cat;
1472
1473 /* avoid making error situations worse */
1474 if (!p || p->error != 0 || !cats) {
1475 return;
1476 }
1477
1478 for (c = CHAR_MIN; c <= CHAR_MAX; c++) {
1479 if ( *(cats+c) && isinsets(g, c)) {
1480 cat = g->ncategories++;
1481 cats[c] = cat;
1482 for (c2 = c + 1; c2 <= CHAR_MAX; c2++) {
1483 if (cats[c2] == 0 && samesets (g, c, c2)) {
1484 cats[c2] = cat;
1485 }
1486 }
1487 }
1488 }
1489 }
1490
1491 /*
1492 - dupl - emit a duplicate of a bunch of sops
1493 */
1494 static sopno /* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1495 dupl(struct parse *p,
1496 sopno start, /* from here */
1497 sopno finish) /* to this less one */
1498 {
1499 sopno ret = HERE();
1500 sopno len = finish - start;
1501
1502 if (finish >= start) {
1503 if (len == 0) {
1504 return (ret);
1505 }
1506 enlarge(p, p->ssize + len); /* this many unexpected additions */
1507 if (p->ssize >= p->slen + len) {
1508 (void) memcpy((char *)(p->strip + p->slen),
1509 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1510 p->slen += len;
1511 return(ret);
1512 }
1513 }
1514 return ret;
1515 }
1516
1517 /*
1518 - doemit - emit a strip operator
1519 *
1520 * It might seem better to implement this as a macro with a function as
1521 * hard-case backup, but it's just too big and messy unless there are
1522 * some changes to the data structures. Maybe later.
1523 */
1524 static void
doemit(struct parse * p,sop op,size_t opnd)1525 doemit(struct parse *p, sop op, size_t opnd)
1526 {
1527 /* avoid making error situations worse */
1528 if (p->error != 0) {
1529 return;
1530 }
1531
1532 /* deal with oversize operands ("can't happen", more or less) */
1533 if (opnd < 1<<OPSHIFT) {
1534
1535 /* deal with undersized strip */
1536 if (p->slen >= p->ssize) {
1537 enlarge (p, (p->ssize + 1) / 2 * 3); /* +50% */
1538 }
1539 if (p->slen < p->ssize) {
1540 /* finally, it's all reduced to the easy case */
1541 p->strip[p->slen++] = SOP(op, opnd);
1542 }
1543 }
1544 }
1545
1546 /*
1547 - doinsert - insert a sop into the strip
1548 */
1549 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1550 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1551 {
1552 sopno sn;
1553 sop s;
1554 int i;
1555
1556 /* avoid making error situations worse */
1557 if (p->error != 0) {
1558 return;
1559 }
1560
1561 sn = HERE();
1562 EMIT(op, opnd); /* do checks, ensure space */
1563 if (HERE() != sn+1) {
1564 return;
1565 }
1566 s = p->strip[sn];
1567
1568 /* adjust paren pointers */
1569 if (pos > 0) {
1570 for (i = 1; i < NPAREN; i++) {
1571 if (p->pbegin[i] >= pos) {
1572 p->pbegin[i]++;
1573 }
1574 if (p->pend[i] >= pos) {
1575 p->pend[i]++;
1576 }
1577 }
1578 }
1579
1580 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1581 (HERE()-pos-1)*sizeof(sop));
1582 p->strip[pos] = s;
1583 }
1584
1585 /*
1586 - dofwd - complete a forward reference
1587 */
1588 static void
dofwd(struct parse * p,sopno pos,sop value)1589 dofwd(struct parse *p, sopno pos, sop value)
1590 {
1591 /* avoid making error situations worse */
1592 if (p->error != 0) {
1593 return;
1594 }
1595
1596 if (value < 1<<OPSHIFT) {
1597 p->strip[pos] = OP(p->strip[pos]) | value;
1598 }
1599 }
1600
1601 /*
1602 - enlarge - enlarge the strip
1603 */
1604 static void
enlarge(struct parse * p,sopno size)1605 enlarge(struct parse *p, sopno size)
1606 {
1607 sop *sp;
1608
1609 if (p->ssize >= size) {
1610 return;
1611 }
1612
1613 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1614 if (!sp) {
1615 SETERROR(R_REGEX_ESPACE);
1616 return;
1617 }
1618 p->strip = sp;
1619 p->ssize = size;
1620 }
1621
1622 /*
1623 - stripsnug - compact the strip
1624 */
1625 static void
stripsnug(struct parse * p,struct re_guts * g)1626 stripsnug(struct parse *p, struct re_guts *g)
1627 {
1628 g->nstates = p->slen;
1629 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1630 if (!g->strip) {
1631 SETERROR(R_REGEX_ESPACE);
1632 g->strip = p->strip;
1633 }
1634 }
1635
1636 /*
1637 - findmust - fill in must and mlen with longest mandatory literal string
1638 *
1639 * This algorithm could do fancy things like analyzing the operands of |
1640 * for common subsequences. Someday. This code is simple and finds most
1641 * of the interesting cases.
1642 *
1643 * Note that must and mlen got initialized during setup.
1644 */
1645 static void
findmust(struct parse * p,struct re_guts * g)1646 findmust(struct parse *p, struct re_guts *g)
1647 {
1648 sop *scan;
1649 sop *start = NULL; /* start initialized in the default case, after that */
1650 sop *newstart = NULL; /* newstart was initialized in the OCHAR case */
1651 sopno newlen;
1652 sop s;
1653 char *cp;
1654 sopno i;
1655
1656 /* avoid making error situations worse */
1657 if (p->error != 0) {
1658 return;
1659 }
1660
1661 /* find the longest OCHAR sequence in strip */
1662 newlen = 0;
1663 start = scan = g->strip + 1;
1664 do {
1665 s = *scan++;
1666 switch (OP(s)) {
1667 case OCHAR: /* sequence member */
1668 if (newlen == 0) { /* new sequence */
1669 newstart = scan - 1;
1670 }
1671 newlen++;
1672 break;
1673 case OPLUS_: /* things that don't break one */
1674 case OLPAREN:
1675 case ORPAREN:
1676 break;
1677 case OQUEST_: /* things that must be skipped */
1678 case OCH_:
1679 scan--;
1680 do {
1681 scan += OPND(s);
1682 s = *scan;
1683 /* asert() interferes w debug printouts */
1684 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1685 OP(s) != OOR2) {
1686 g->iflags |= BAD;
1687 return;
1688 }
1689 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1690 /* fallthrough */
1691 default: /* things that break a sequence */
1692 if (newlen > g->mlen) { /* ends one */
1693 start = newstart;
1694 g->mlen = newlen;
1695 }
1696 newlen = 0;
1697 break;
1698 }
1699 } while (OP(s) != OEND);
1700
1701 if (g->mlen == 0) { /* there isn't one */
1702 return;
1703 }
1704
1705 /* turn it into a character string */
1706 g->must = malloc((size_t)g->mlen + 1);
1707 if (!g->must) { /* argh; just forget it */
1708 g->mlen = 0;
1709 return;
1710 }
1711 cp = g->must;
1712 scan = start;
1713 for (i = g->mlen; i > 0; i--) {
1714 while (OP (s = *scan++) != OCHAR) {
1715 continue;
1716 }
1717 if (cp < g->must + g->mlen) {
1718 *cp++ = (char)OPND(s);
1719 }
1720 }
1721 if (cp == g->must + g->mlen) {
1722 *cp++ = '\0'; /* just on general principles */
1723 }
1724 }
1725
1726 /*
1727 - pluscount - count + nesting
1728 */
1729 static sopno /* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1730 pluscount(struct parse *p, struct re_guts *g)
1731 {
1732 sop *scan;
1733 sop s;
1734 sopno plusnest = 0;
1735 sopno maxnest = 0;
1736
1737 if (p->error != 0) {
1738 return (0); /* there may not be an OEND */
1739 }
1740
1741 scan = g->strip + 1;
1742 do {
1743 s = *scan++;
1744 switch (OP(s)) {
1745 case OPLUS_:
1746 plusnest++;
1747 break;
1748 case O_PLUS:
1749 if (plusnest > maxnest) {
1750 maxnest = plusnest;
1751 }
1752 plusnest--;
1753 break;
1754 }
1755 } while (OP(s) != OEND);
1756 if (plusnest != 0) {
1757 g->iflags |= BAD;
1758 }
1759 return(maxnest);
1760 }
1761