1 #include <sys/types.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <limits.h>
6 #include <ctype.h>
7 #include <assert.h>
8
9 #include "regex.h"
10
11 /* utility definitions */
12 #ifdef _POSIX2_RE_DUP_MAX
13 #define DUPMAX _POSIX2_RE_DUP_MAX
14 #else
15 #define DUPMAX 255
16 #endif
17 #define INFINITY (DUPMAX + 1)
18 #define NC (CHAR_MAX - CHAR_MIN + 1)
19 typedef unsigned char uch;
20
21 /* for old systems with bcopy() but no memmove() */
22 #ifdef USEBCOPY
23 #define memmove(d, s, c) bcopy(s, d, c)
24 #endif
25
26 #define MAGIC1 ((('r'^0200)<<8) | 'e')
27
28 /*
29 * The internal representation is a *strip*, a sequence of
30 * operators ending with an endmarker. (Some terminology etc. is a
31 * historical relic of earlier versions which used multiple strips.)
32 * Certain oddities in the representation are there to permit running
33 * the machinery backwards; in particular, any deviation from sequential
34 * flow must be marked at both its source and its destination. Some
35 * fine points:
36 *
37 * - OPLUS_ and O_PLUS are *inside* the loop they create.
38 * - OQUEST_ and O_QUEST are *outside* the bypass they create.
39 * - OCH_ and O_CH are *outside* the multi-way branch they create, while
40 * OOR1 and OOR2 are respectively the end and the beginning of one of
41 * the branches. Note that there is an implicit OOR2 following OCH_
42 * and an implicit OOR1 preceding O_CH.
43 *
44 * In state representations, an operator's bit is on to signify a state
45 * immediately *preceding* "execution" of that operator.
46 */
47 typedef long sop; /* strip operator */
48 typedef long sopno;
49 #define OPRMASK 0x7c000000
50 #define OPDMASK 0x03ffffff
51 #define OPSHIFT (26)
52 #define OP(n) ((n)&OPRMASK)
53 #define OPND(n) ((n)&OPDMASK)
54 #define SOP(op, opnd) ((op)|(opnd))
55 /* operators meaning operand */
56 /* (back, fwd are offsets) */
57 #define OEND (1<<OPSHIFT) /* endmarker - */
58 #define OCHAR (2<<OPSHIFT) /* character unsigned char */
59 #define OBOL (3<<OPSHIFT) /* left anchor - */
60 #define OEOL (4<<OPSHIFT) /* right anchor - */
61 #define OANY (5<<OPSHIFT) /* . - */
62 #define OANYOF (6<<OPSHIFT) /* [...] set number */
63 #define OBACK_ (7<<OPSHIFT) /* begin \d paren number */
64 #define O_BACK (8<<OPSHIFT) /* end \d paren number */
65 #define OPLUS_ (9<<OPSHIFT) /* + prefix fwd to suffix */
66 #define O_PLUS (10<<OPSHIFT) /* + suffix back to prefix */
67 #define OQUEST_ (11<<OPSHIFT) /* ? prefix fwd to suffix */
68 #define O_QUEST (12<<OPSHIFT) /* ? suffix back to prefix */
69 #define OLPAREN (13<<OPSHIFT) /* ( fwd to ) */
70 #define ORPAREN (14<<OPSHIFT) /* ) back to ( */
71 #define OCH_ (15<<OPSHIFT) /* begin choice fwd to OOR2 */
72 #define OOR1 (16<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
73 #define OOR2 (17<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
74 #define O_CH (18<<OPSHIFT) /* end choice back to OOR1 */
75 #define OBOW (19<<OPSHIFT) /* begin word - */
76 #define OEOW (20<<OPSHIFT) /* end word - */
77
78 /*
79 * Structure for [] character-set representation. Character sets are
80 * done as bit vectors, grouped 8 to a byte vector for compactness.
81 * The individual set therefore has both a pointer to the byte vector
82 * and a mask to pick out the relevant bit of each byte. A hash code
83 * simplifies testing whether two sets could be identical.
84 *
85 * This will get trickier for multicharacter collating elements. As
86 * preliminary hooks for dealing with such things, we also carry along
87 * a string of multi-character elements, and decide the size of the
88 * vectors at run time.
89 */
90 typedef struct {
91 uch *ptr; /* -> uch [csetsize] */
92 uch mask; /* bit within array */
93 uch hash; /* hash code */
94 size_t smultis;
95 char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */
96 } cset;
97 /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
98 #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
99 #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
100 #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
101 #define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */
102
103 /* stuff for character categories */
104 typedef unsigned char cat_t;
105
106 /*
107 * main compiled-expression structure
108 */
109 struct re_guts {
110 int magic;
111 # define MAGIC2 ((('R'^0200)<<8)|'E')
112 sop *strip; /* malloced area for strip */
113 int csetsize; /* number of bits in a cset vector */
114 int ncsets; /* number of csets in use */
115 cset *sets; /* -> cset [ncsets] */
116 uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
117 int cflags; /* copy of regcomp() cflags argument */
118 sopno nstates; /* = number of sops */
119 sopno firststate; /* the initial OEND (normally 0) */
120 sopno laststate; /* the final OEND */
121 int iflags; /* internal flags */
122 # define USEBOL 01 /* used ^ */
123 # define USEEOL 02 /* used $ */
124 # define BAD 04 /* something wrong */
125 int nbol; /* number of ^ used */
126 int neol; /* number of $ used */
127 int ncategories; /* how many character categories */
128 cat_t *categories; /* ->catspace[-CHAR_MIN] */
129 char *must; /* match must contain this string */
130 int mlen; /* length of must */
131 size_t nsub; /* copy of re_nsub */
132 int backrefs; /* does it use back references? */
133 sopno nplus; /* how deep does it nest +s? */
134 /* catspace must be last */
135 cat_t catspace[1]; /* actually [NC] */
136 };
137
138 /* misc utilities */
139 #define OUT (CHAR_MAX+1) /* a non-character value */
140 #define ISWORD(c) (isalnum(c) || (c) == '_')
141
142
143 /* character-class table */
144 static struct cclass {
145 char *name;
146 char *chars;
147 char *multis;
148 } cclasses[] = {
149 {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",""},
150 {"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
151 {"blank"," \t", ""},
152 {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177",""},
153 {"digit","0123456789",""},
154 {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",""},
155 {"lower","abcdefghijklmnopqrstuvwxyz",""},
156 {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",""},
157 {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",""},
158 {"space","\t\n\v\f\r ", ""},
159 {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ",""},
160 {"xdigit","0123456789ABCDEFabcdef",""},
161 {NULL,NULL,""}
162 };
163
164 /* character-name table */
165 static struct cname {
166 char *name;
167 char code;
168 } cnames[] = {
169 {"NUL", '\0'},
170 {"SOH", '\001'},
171 {"STX", '\002'},
172 {"ETX", '\003'},
173 {"EOT", '\004'},
174 {"ENQ", '\005'},
175 {"ACK", '\006'},
176 {"BEL", '\007'},
177 {"alert", '\007'},
178 {"BS", '\010'},
179 {"backspace", '\b'},
180 {"HT", '\011'},
181 {"tab", '\t'},
182 {"LF", '\012'},
183 {"newline", '\n'},
184 {"VT", '\013'},
185 {"vertical-tab", '\v'},
186 {"FF", '\014'},
187 {"form-feed", '\f'},
188 {"CR", '\015'},
189 {"carriage-return", '\r'},
190 {"SO", '\016'},
191 {"SI", '\017'},
192 {"DLE", '\020'},
193 {"DC1", '\021'},
194 {"DC2", '\022'},
195 {"DC3", '\023'},
196 {"DC4", '\024'},
197 {"NAK", '\025'},
198 {"SYN", '\026'},
199 {"ETB", '\027'},
200 {"CAN", '\030'},
201 {"EM", '\031'},
202 {"SUB", '\032'},
203 {"ESC", '\033'},
204 {"IS4", '\034'},
205 {"FS", '\034'},
206 {"IS3", '\035'},
207 {"GS", '\035'},
208 {"IS2", '\036'},
209 {"RS", '\036'},
210 {"IS1", '\037'},
211 {"US", '\037'},
212 {"space", ' '},
213 {"exclamation-mark", '!'},
214 {"quotation-mark", '"'},
215 {"number-sign", '#'},
216 {"dollar-sign", '$'},
217 {"percent-sign", '%'},
218 {"ampersand", '&'},
219 {"apostrophe", '\''},
220 {"left-parenthesis", '('},
221 {"right-parenthesis", ')'},
222 {"asterisk", '*'},
223 {"plus-sign", '+'},
224 {"comma", ','},
225 {"hyphen", '-'},
226 {"hyphen-minus", '-'},
227 {"period", '.'},
228 {"full-stop", '.'},
229 {"slash", '/'},
230 {"solidus", '/'},
231 {"zero", '0'},
232 {"one", '1'},
233 {"two", '2'},
234 {"three", '3'},
235 {"four", '4'},
236 {"five", '5'},
237 {"six", '6'},
238 {"seven", '7'},
239 {"eight", '8'},
240 {"nine", '9'},
241 {"colon", ':'},
242 {"semicolon", ';'},
243 {"less-than-sign", '<'},
244 {"equals-sign", '='},
245 {"greater-than-sign", '>'},
246 {"question-mark", '?'},
247 {"commercial-at", '@'},
248 {"left-square-bracket", '['},
249 {"backslash", '\\'},
250 {"reverse-solidus", '\\'},
251 {"right-square-bracket", ']'},
252 {"circumflex", '^'},
253 {"circumflex-accent", '^'},
254 {"underscore", '_'},
255 {"low-line", '_'},
256 {"grave-accent", '`'},
257 {"left-brace", '{'},
258 {"left-curly-bracket", '{'},
259 {"vertical-line", '|'},
260 {"right-brace", '}'},
261 {"right-curly-bracket", '}'},
262 {"tilde", '~'},
263 {"DEL", '\177'},
264 {NULL, 0}
265 };
266
267
268 /*
269 * parse structure, passed up and down to avoid global variables and
270 * other clumsinesses
271 */
272 struct parse {
273 char *next; /* next character in RE */
274 char *end; /* end of string (-> NUL normally) */
275 int error; /* has an error been seen? */
276 sop *strip; /* malloced strip */
277 sopno ssize; /* malloced strip size (allocated) */
278 sopno slen; /* malloced strip length (used) */
279 int ncsalloc; /* number of csets allocated */
280 struct re_guts *g;
281 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
282 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
283 sopno pend[NPAREN]; /* -> ) ([0] unused) */
284 };
285
286 #ifdef __cplusplus
287 extern "C" {
288 #endif
289
290 static void p_ere(register struct parse *p, int stop);
291 static void p_ere_exp(register struct parse *p);
292 static void p_str(register struct parse *p);
293 static void p_bre(register struct parse *p, register int end1, register int end2);
294 static int p_simp_re(register struct parse *p, int starordinary);
295 static int p_count(register struct parse *p);
296 static void p_bracket(register struct parse *p);
297 static void p_b_term(register struct parse *p, register cset *cs);
298 static void p_b_cclass(register struct parse *p, register cset *cs);
299 static void p_b_eclass(register struct parse *p, register cset *cs);
300 static char p_b_symbol(register struct parse *p);
301 static char p_b_coll_elem(register struct parse *p, int endc);
302 static char othercase(int ch);
303 static void bothcases(register struct parse *p, int ch);
304 static void ordinary(register struct parse *p, register int ch);
305 static void nonnewline(register struct parse *p);
306 static void repeat(register struct parse *p, sopno start, int from, int to);
307 static int seterr(register struct parse *p, int e);
308 static cset *allocset(register struct parse *p);
309 static void freeset(register struct parse *p, register cset *cs);
310 static int freezeset(register struct parse *p, register cset *cs);
311 static int firstch(register struct parse *p, register cset *cs);
312 static int nch(register struct parse *p, register cset *cs);
313 static void mcadd(register struct parse *p, register cset *cs, register char *cp);
314 static void mcinvert(register struct parse *p, register cset *cs);
315 static void mccase(register struct parse *p, register cset *cs);
316 static int isinsets(register struct re_guts *g, int c);
317 static int samesets(register struct re_guts *g, int c1, int c2);
318 static void categorize(struct parse *p, register struct re_guts *g);
319 static sopno dupl(register struct parse *p, sopno start, sopno finish);
320 static void doemit(register struct parse *p, sop op, size_t opnd);
321 static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
322 static void dofwd(register struct parse *p, sopno pos, sop value);
323 static void enlarge(register struct parse *p, sopno size);
324 static void stripsnug(register struct parse *p, register struct re_guts *g);
325 static void findmust(register struct parse *p, register struct re_guts *g);
326 static sopno pluscount(register struct parse *p, register struct re_guts *g);
327
328 #ifdef __cplusplus
329 }
330 #endif
331
332 static char nuls[10]; /* place to point scanner in event of error */
333
334 /*
335 * macros for use with parse structure
336 * BEWARE: these know that the parse structure is named `p' !!!
337 */
338 #define PEEK() (*p->next)
339 #define PEEK2() (*(p->next+1))
340 #define MORE() (p->next < p->end)
341 #define MORE2() (p->next+1 < p->end)
342 #define SEE(c) (MORE() && PEEK() == (c))
343 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
344 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
345 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
346 #define NEXT() (p->next++)
347 #define NEXT2() (p->next += 2)
348 #define NEXTn(n) (p->next += (n))
349 #define GETNEXT() (*p->next++)
350 #define SETERROR(e) seterr(p, (e))
351 #define REQUIRE(co, e) ((co) || SETERROR(e))
352 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
353 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
354 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
355 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
356 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
357 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
358 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
359 #define HERE() (p->slen)
360 #define THERE() (p->slen - 1)
361 #define THERETHERE() (p->slen - 2)
362 #define DROP(n) (p->slen -= (n))
363
364 #define never 0 /* some <assert.h>s have bugs too */
365
366 int /* 0 success, otherwise REG_something */
regcomp(preg,pattern,cflags)367 regcomp(preg, pattern, cflags)
368 regex_t *preg;
369 const char *pattern;
370 int cflags;
371 {
372 struct parse pa;
373 register struct re_guts *g;
374 register struct parse *p = &pa;
375 register int i;
376 register size_t len;
377 #define GOODFLAGS(f) ((f)&~REG_DUMP)
378
379 cflags = GOODFLAGS(cflags);
380 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
381 return(REG_INVARG);
382
383 if (cflags®_PEND) {
384 if (preg->re_endp < pattern)
385 return(REG_INVARG);
386 len = preg->re_endp - pattern;
387 } else
388 len = strlen((char *)pattern);
389
390 /* do the mallocs early so failure handling is easy */
391 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
392 (NC-1)*sizeof(cat_t));
393 if (g == NULL)
394 return(REG_ESPACE);
395 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
396 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
397 p->slen = 0;
398 if (p->strip == NULL) {
399 free((char *)g);
400 return(REG_ESPACE);
401 }
402
403 /* set things up */
404 p->g = g;
405 p->next = (char *)pattern; /* convenience; we do not modify it */
406 p->end = p->next + len;
407 p->error = 0;
408 p->ncsalloc = 0;
409 for (i = 0; i < NPAREN; i++) {
410 p->pbegin[i] = 0;
411 p->pend[i] = 0;
412 }
413 g->csetsize = NC;
414 g->sets = NULL;
415 g->setbits = NULL;
416 g->ncsets = 0;
417 g->cflags = cflags;
418 g->iflags = 0;
419 g->nbol = 0;
420 g->neol = 0;
421 g->must = NULL;
422 g->mlen = 0;
423 g->nsub = 0;
424 g->ncategories = 1; /* category 0 is "everything else" */
425 g->categories = &g->catspace[-(CHAR_MIN)];
426 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
427 g->backrefs = 0;
428
429 /* do it */
430 EMIT(OEND, 0);
431 g->firststate = THERE();
432 if (cflags®_EXTENDED)
433 p_ere(p, OUT);
434 else if (cflags®_NOSPEC)
435 p_str(p);
436 else
437 p_bre(p, OUT, OUT);
438 EMIT(OEND, 0);
439 g->laststate = THERE();
440
441 /* tidy up loose ends and fill things in */
442 categorize(p, g);
443 stripsnug(p, g);
444 findmust(p, g);
445 g->nplus = pluscount(p, g);
446 g->magic = MAGIC2;
447 preg->re_nsub = g->nsub;
448 preg->re_g = g;
449 preg->re_magic = MAGIC1;
450 /* not debugging, so can't rely on the assert() in regexec() */
451 if (g->iflags&BAD)
452 SETERROR(REG_ASSERT);
453
454 /* win or lose, we're done */
455 if (p->error != 0) /* lose */
456 regfree(preg);
457 return(p->error);
458 #undef GOODFLAGS
459 }
460
461 /*
462 - p_ere - ERE parser top level, concatenation and alternation
463 == static void p_ere(register struct parse *p, int stop);
464 */
465 static void
p_ere(p,stop)466 p_ere(p, stop)
467 register struct parse *p;
468 int stop; /* character this ERE should end at */
469 {
470 register char c;
471 register sopno prevback;
472 register sopno prevfwd;
473 register sopno conc;
474 register int first = 1; /* is this the first alternative? */
475
476 for (;;) {
477 /* do a bunch of concatenated expressions */
478 conc = HERE();
479 while (MORE() && (c = PEEK()) != '|' && c != stop)
480 p_ere_exp(p);
481 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
482
483 if (!EAT('|'))
484 break; /* NOTE BREAK OUT */
485
486 if (first) {
487 INSERT(OCH_, conc); /* offset is wrong */
488 prevfwd = conc;
489 prevback = conc;
490 first = 0;
491 }
492 ASTERN(OOR1, prevback);
493 prevback = THERE();
494 AHEAD(prevfwd); /* fix previous offset */
495 prevfwd = HERE();
496 EMIT(OOR2, 0); /* offset is very wrong */
497 }
498
499 if (!first) { /* tail-end fixups */
500 AHEAD(prevfwd);
501 ASTERN(O_CH, prevback);
502 }
503
504 assert(!MORE() || SEE(stop));
505 }
506
507 /*
508 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
509 == static void p_ere_exp(register struct parse *p);
510 */
511 static void
p_ere_exp(p)512 p_ere_exp(p)
513 register struct parse *p;
514 {
515 register char c;
516 register sopno pos;
517 register int count;
518 register int count2;
519 register sopno subno;
520 int wascaret = 0;
521
522 assert(MORE()); /* caller should have ensured this */
523 c = GETNEXT();
524
525 pos = HERE();
526 switch (c) {
527 case '(':
528 REQUIRE(MORE(), REG_EPAREN);
529 p->g->nsub++;
530 subno = p->g->nsub;
531 if (subno < NPAREN)
532 p->pbegin[subno] = HERE();
533 EMIT(OLPAREN, subno);
534 if (!SEE(')'))
535 p_ere(p, ')');
536 if (subno < NPAREN) {
537 p->pend[subno] = HERE();
538 assert(p->pend[subno] != 0);
539 }
540 EMIT(ORPAREN, subno);
541 MUSTEAT(')', REG_EPAREN);
542 break;
543 case '^':
544 EMIT(OBOL, 0);
545 p->g->iflags |= USEBOL;
546 p->g->nbol++;
547 wascaret = 1;
548 break;
549 case '$':
550 EMIT(OEOL, 0);
551 p->g->iflags |= USEEOL;
552 p->g->neol++;
553 break;
554 case '|':
555 SETERROR(REG_EMPTY);
556 break;
557 case '*':
558 case '+':
559 case '?':
560 SETERROR(REG_BADRPT);
561 break;
562 case '.':
563 if (p->g->cflags®_NEWLINE)
564 nonnewline(p);
565 else
566 EMIT(OANY, 0);
567 break;
568 case '[':
569 p_bracket(p);
570 break;
571 case '\\':
572 REQUIRE(MORE(), REG_EESCAPE);
573 c = GETNEXT();
574 ordinary(p, c);
575 break;
576 case '{': /* okay as ordinary except if digit follows */
577 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
578 /* FALLTHROUGH */
579 default:
580 ordinary(p, c);
581 break;
582 }
583
584 if (!MORE())
585 return;
586 c = PEEK();
587 /* we call { a repetition if followed by a digit */
588 if (!( c == '*' || c == '+' || c == '?' ||
589 (c == '{' && MORE2() && isdigit(PEEK2())) ))
590 return; /* no repetition, we're done */
591 NEXT();
592
593 REQUIRE(!wascaret, REG_BADRPT);
594 switch (c) {
595 case '*': /* implemented as +? */
596 /* this case does not require the (y|) trick, noKLUDGE */
597 INSERT(OPLUS_, pos);
598 ASTERN(O_PLUS, pos);
599 INSERT(OQUEST_, pos);
600 ASTERN(O_QUEST, pos);
601 break;
602 case '+':
603 INSERT(OPLUS_, pos);
604 ASTERN(O_PLUS, pos);
605 break;
606 case '?':
607 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
608 INSERT(OCH_, pos); /* offset slightly wrong */
609 ASTERN(OOR1, pos); /* this one's right */
610 AHEAD(pos); /* fix the OCH_ */
611 EMIT(OOR2, 0); /* offset very wrong... */
612 AHEAD(THERE()); /* ...so fix it */
613 ASTERN(O_CH, THERETHERE());
614 break;
615 case '{':
616 count = p_count(p);
617 if (EAT(',')) {
618 if (isdigit(PEEK())) {
619 count2 = p_count(p);
620 REQUIRE(count <= count2, REG_BADBR);
621 } else /* single number with comma */
622 count2 = INFINITY;
623 } else /* just a single number */
624 count2 = count;
625 repeat(p, pos, count, count2);
626 if (!EAT('}')) { /* error heuristics */
627 while (MORE() && PEEK() != '}')
628 NEXT();
629 REQUIRE(MORE(), REG_EBRACE);
630 SETERROR(REG_BADBR);
631 }
632 break;
633 }
634
635 if (!MORE())
636 return;
637 c = PEEK();
638 if (!( c == '*' || c == '+' || c == '?' ||
639 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
640 return;
641 SETERROR(REG_BADRPT);
642 }
643
644 /*
645 - p_str - string (no metacharacters) "parser"
646 == static void p_str(register struct parse *p);
647 */
648 static void
p_str(p)649 p_str(p)
650 register struct parse *p;
651 {
652 REQUIRE(MORE(), REG_EMPTY);
653 while (MORE())
654 ordinary(p, GETNEXT());
655 }
656
657 /*
658 - p_bre - BRE parser top level, anchoring and concatenation
659 == static void p_bre(register struct parse *p, register int end1, \
660 == register int end2);
661 * Giving end1 as OUT essentially eliminates the end1/end2 check.
662 *
663 * This implementation is a bit of a kludge, in that a trailing $ is first
664 * taken as an ordinary character and then revised to be an anchor. The
665 * only undesirable side effect is that '$' gets included as a character
666 * category in such cases. This is fairly harmless; not worth fixing.
667 * The amount of lookahead needed to avoid this kludge is excessive.
668 */
669 static void
p_bre(p,end1,end2)670 p_bre(p, end1, end2)
671 register struct parse *p;
672 register int end1; /* first terminating character */
673 register int end2; /* second terminating character */
674 {
675 register sopno start = HERE();
676 register int first = 1; /* first subexpression? */
677 register int wasdollar = 0;
678
679 if (EAT('^')) {
680 EMIT(OBOL, 0);
681 p->g->iflags |= USEBOL;
682 p->g->nbol++;
683 }
684 while (MORE() && !SEETWO(end1, end2)) {
685 wasdollar = p_simp_re(p, first);
686 first = 0;
687 }
688 if (wasdollar) { /* oops, that was a trailing anchor */
689 DROP(1);
690 EMIT(OEOL, 0);
691 p->g->iflags |= USEEOL;
692 p->g->neol++;
693 }
694
695 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
696 }
697
698 /*
699 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
700 == static int p_simp_re(register struct parse *p, int starordinary);
701 */
702 static int /* was the simple RE an unbackslashed $? */
p_simp_re(p,starordinary)703 p_simp_re(p, starordinary)
704 register struct parse *p;
705 int starordinary; /* is a leading * an ordinary character? */
706 {
707 register int c;
708 register int count;
709 register int count2;
710 register sopno pos;
711 register int i;
712 register sopno subno;
713 # define BACKSL (1<<CHAR_BIT)
714
715 pos = HERE(); /* repetion op, if any, covers from here */
716
717 assert(MORE()); /* caller should have ensured this */
718 c = GETNEXT();
719 if (c == '\\') {
720 REQUIRE(MORE(), REG_EESCAPE);
721 c = BACKSL | (unsigned char)GETNEXT();
722 }
723 switch (c) {
724 case '.':
725 if (p->g->cflags®_NEWLINE)
726 nonnewline(p);
727 else
728 EMIT(OANY, 0);
729 break;
730 case '[':
731 p_bracket(p);
732 break;
733 case BACKSL|'{':
734 SETERROR(REG_BADRPT);
735 break;
736 case BACKSL|'(':
737 p->g->nsub++;
738 subno = p->g->nsub;
739 if (subno < NPAREN)
740 p->pbegin[subno] = HERE();
741 EMIT(OLPAREN, subno);
742 /* the MORE here is an error heuristic */
743 if (MORE() && !SEETWO('\\', ')'))
744 p_bre(p, '\\', ')');
745 if (subno < NPAREN) {
746 p->pend[subno] = HERE();
747 assert(p->pend[subno] != 0);
748 }
749 EMIT(ORPAREN, subno);
750 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
751 break;
752 case BACKSL|')': /* should not get here -- must be user */
753 case BACKSL|'}':
754 SETERROR(REG_EPAREN);
755 break;
756 case BACKSL|'1':
757 case BACKSL|'2':
758 case BACKSL|'3':
759 case BACKSL|'4':
760 case BACKSL|'5':
761 case BACKSL|'6':
762 case BACKSL|'7':
763 case BACKSL|'8':
764 case BACKSL|'9':
765 i = (c&~BACKSL) - '0';
766 assert(i < NPAREN);
767 if (p->pend[i] != 0) {
768 assert(i <= p->g->nsub);
769 EMIT(OBACK_, i);
770 assert(p->pbegin[i] != 0);
771 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
772 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
773 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
774 EMIT(O_BACK, i);
775 } else
776 SETERROR(REG_ESUBREG);
777 p->g->backrefs = 1;
778 break;
779 case '*':
780 REQUIRE(starordinary, REG_BADRPT);
781 /* FALLTHROUGH */
782 default:
783 ordinary(p, (char)c); /* takes off BACKSL, if any */
784 break;
785 }
786
787 if (EAT('*')) { /* implemented as +? */
788 /* this case does not require the (y|) trick, noKLUDGE */
789 INSERT(OPLUS_, pos);
790 ASTERN(O_PLUS, pos);
791 INSERT(OQUEST_, pos);
792 ASTERN(O_QUEST, pos);
793 } else if (EATTWO('\\', '{')) {
794 count = p_count(p);
795 if (EAT(',')) {
796 if (MORE() && isdigit(PEEK())) {
797 count2 = p_count(p);
798 REQUIRE(count <= count2, REG_BADBR);
799 } else /* single number with comma */
800 count2 = INFINITY;
801 } else /* just a single number */
802 count2 = count;
803 repeat(p, pos, count, count2);
804 if (!EATTWO('\\', '}')) { /* error heuristics */
805 while (MORE() && !SEETWO('\\', '}'))
806 NEXT();
807 REQUIRE(MORE(), REG_EBRACE);
808 SETERROR(REG_BADBR);
809 }
810 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
811 return(1);
812
813 return(0);
814 }
815
816 /*
817 - p_count - parse a repetition count
818 == static int p_count(register struct parse *p);
819 */
820 static int /* the value */
p_count(p)821 p_count(p)
822 register struct parse *p;
823 {
824 register int count = 0;
825 register int ndigits = 0;
826
827 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
828 count = count*10 + (GETNEXT() - '0');
829 ndigits++;
830 }
831
832 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
833 return(count);
834 }
835
836 /*
837 - p_bracket - parse a bracketed character list
838 == static void p_bracket(register struct parse *p);
839 *
840 * Note a significant property of this code: if the allocset() did SETERROR,
841 * no set operations are done.
842 */
843 static void
p_bracket(p)844 p_bracket(p)
845 register struct parse *p;
846 {
847 register cset *cs = allocset(p);
848 register int invert = 0;
849
850 /* Dept of Truly Sickening Special-Case Kludges */
851 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
852 EMIT(OBOW, 0);
853 NEXTn(6);
854 return;
855 }
856 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
857 EMIT(OEOW, 0);
858 NEXTn(6);
859 return;
860 }
861
862 if (EAT('^'))
863 invert++; /* make note to invert set at end */
864 if (EAT(']'))
865 CHadd(cs, ']');
866 else if (EAT('-'))
867 CHadd(cs, '-');
868 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
869 p_b_term(p, cs);
870 if (EAT('-'))
871 CHadd(cs, '-');
872 MUSTEAT(']', REG_EBRACK);
873
874 if (p->error != 0) /* don't mess things up further */
875 return;
876
877 if (p->g->cflags®_ICASE) {
878 register int i;
879 register int ci;
880
881 for (i = p->g->csetsize - 1; i >= 0; i--)
882 if (CHIN(cs, i) && isalpha(i)) {
883 ci = othercase(i);
884 if (ci != i)
885 CHadd(cs, ci);
886 }
887 if (cs->multis != NULL)
888 mccase(p, cs);
889 }
890 if (invert) {
891 register int i;
892
893 for (i = p->g->csetsize - 1; i >= 0; i--)
894 if (CHIN(cs, i))
895 CHsub(cs, i);
896 else
897 CHadd(cs, i);
898 if (p->g->cflags®_NEWLINE)
899 CHsub(cs, '\n');
900 if (cs->multis != NULL)
901 mcinvert(p, cs);
902 }
903
904 assert(cs->multis == NULL); /* xxx */
905
906 if (nch(p, cs) == 1) { /* optimize singleton sets */
907 ordinary(p, firstch(p, cs));
908 freeset(p, cs);
909 } else
910 EMIT(OANYOF, freezeset(p, cs));
911 }
912
913 /*
914 - p_b_term - parse one term of a bracketed character list
915 == static void p_b_term(register struct parse *p, register cset *cs);
916 */
917 static void
p_b_term(p,cs)918 p_b_term(p, cs)
919 register struct parse *p;
920 register cset *cs;
921 {
922 register char c;
923 register char start, finish;
924 register int i;
925
926 /* classify what we've got */
927 switch ((MORE()) ? PEEK() : '\0') {
928 case '[':
929 c = (MORE2()) ? PEEK2() : '\0';
930 break;
931 case '-':
932 SETERROR(REG_ERANGE);
933 return; /* NOTE RETURN */
934 break;
935 default:
936 c = '\0';
937 break;
938 }
939
940 switch (c) {
941 case ':': /* character class */
942 NEXT2();
943 REQUIRE(MORE(), REG_EBRACK);
944 c = PEEK();
945 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
946 p_b_cclass(p, cs);
947 REQUIRE(MORE(), REG_EBRACK);
948 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
949 break;
950 case '=': /* equivalence class */
951 NEXT2();
952 REQUIRE(MORE(), REG_EBRACK);
953 c = PEEK();
954 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
955 p_b_eclass(p, cs);
956 REQUIRE(MORE(), REG_EBRACK);
957 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
958 break;
959 default: /* symbol, ordinary character, or range */
960 /* xxx revision needed for multichar stuff */
961 start = p_b_symbol(p);
962 if (SEE('-') && MORE2() && PEEK2() != ']') {
963 /* range */
964 NEXT();
965 if (EAT('-'))
966 finish = '-';
967 else
968 finish = p_b_symbol(p);
969 } else
970 finish = start;
971 /* xxx what about signed chars here... */
972 REQUIRE(start <= finish, REG_ERANGE);
973 for (i = start; i <= finish; i++)
974 CHadd(cs, i);
975 break;
976 }
977 }
978
979 /*
980 - p_b_cclass - parse a character-class name and deal with it
981 == static void p_b_cclass(register struct parse *p, register cset *cs);
982 */
983 static void
p_b_cclass(p,cs)984 p_b_cclass(p, cs)
985 register struct parse *p;
986 register cset *cs;
987 {
988 register char *sp = p->next;
989 register struct cclass *cp;
990 register size_t len;
991 register char *u;
992 register char c;
993
994 while (MORE() && isalpha(PEEK()))
995 NEXT();
996 len = p->next - sp;
997 for (cp = cclasses; cp->name != NULL; cp++)
998 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
999 break;
1000 if (cp->name == NULL) {
1001 /* oops, didn't find it */
1002 SETERROR(REG_ECTYPE);
1003 return;
1004 }
1005
1006 u = cp->chars;
1007 while ((c = *u++) != '\0')
1008 CHadd(cs, c);
1009 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
1010 MCadd(p, cs, u);
1011 }
1012
1013 /*
1014 - p_b_eclass - parse an equivalence-class name and deal with it
1015 == static void p_b_eclass(register struct parse *p, register cset *cs);
1016 *
1017 * This implementation is incomplete. xxx
1018 */
1019 static void
p_b_eclass(p,cs)1020 p_b_eclass(p, cs)
1021 register struct parse *p;
1022 register cset *cs;
1023 {
1024 register char c;
1025
1026 c = p_b_coll_elem(p, '=');
1027 CHadd(cs, c);
1028 }
1029
1030 /*
1031 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1032 == static char p_b_symbol(register struct parse *p);
1033 */
1034 static char /* value of symbol */
p_b_symbol(p)1035 p_b_symbol(p)
1036 register struct parse *p;
1037 {
1038 register char value;
1039
1040 REQUIRE(MORE(), REG_EBRACK);
1041 if (!EATTWO('[', '.'))
1042 return(GETNEXT());
1043
1044 /* collating symbol */
1045 value = p_b_coll_elem(p, '.');
1046 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1047 return(value);
1048 }
1049
1050 /*
1051 - p_b_coll_elem - parse a collating-element name and look it up
1052 == static char p_b_coll_elem(register struct parse *p, int endc);
1053 */
1054 static char /* value of collating element */
p_b_coll_elem(p,endc)1055 p_b_coll_elem(p, endc)
1056 register struct parse *p;
1057 int endc; /* name ended by endc,']' */
1058 {
1059 register char *sp = p->next;
1060 register struct cname *cp;
1061 register int len;
1062
1063 while (MORE() && !SEETWO(endc, ']'))
1064 NEXT();
1065 if (!MORE()) {
1066 SETERROR(REG_EBRACK);
1067 return(0);
1068 }
1069 len = p->next - sp;
1070 for (cp = cnames; cp->name != NULL; cp++)
1071 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1072 return(cp->code); /* known name */
1073 if (len == 1)
1074 return(*sp); /* single character */
1075 SETERROR(REG_ECOLLATE); /* neither */
1076 return(0);
1077 }
1078
1079 /*
1080 - othercase - return the case counterpart of an alphabetic
1081 == static char othercase(int ch);
1082 */
1083 static char /* if no counterpart, return ch */
othercase(ch)1084 othercase(ch)
1085 int ch;
1086 {
1087 assert(isalpha(ch));
1088 if (isupper(ch))
1089 return(tolower(ch));
1090 else if (islower(ch))
1091 return(toupper(ch));
1092 else /* peculiar, but could happen */
1093 return(ch);
1094 }
1095
1096 /*
1097 - bothcases - emit a dualcase version of a two-case character
1098 == static void bothcases(register struct parse *p, int ch);
1099 *
1100 * Boy, is this implementation ever a kludge...
1101 */
1102 static void
bothcases(p,ch)1103 bothcases(p, ch)
1104 register struct parse *p;
1105 int ch;
1106 {
1107 register char *oldnext = p->next;
1108 register char *oldend = p->end;
1109 char bracket[3];
1110
1111 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1112 p->next = bracket;
1113 p->end = bracket+2;
1114 bracket[0] = ch;
1115 bracket[1] = ']';
1116 bracket[2] = '\0';
1117 p_bracket(p);
1118 assert(p->next == bracket+2);
1119 p->next = oldnext;
1120 p->end = oldend;
1121 }
1122
1123 /*
1124 - ordinary - emit an ordinary character
1125 == static void ordinary(register struct parse *p, register int ch);
1126 */
1127 static void
ordinary(p,ch)1128 ordinary(p, ch)
1129 register struct parse *p;
1130 register int ch;
1131 {
1132 register cat_t *cap = p->g->categories;
1133
1134 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
1135 bothcases(p, ch);
1136 else {
1137 EMIT(OCHAR, (unsigned char)ch);
1138 if (cap[ch] == 0)
1139 cap[ch] = p->g->ncategories++;
1140 }
1141 }
1142
1143 /*
1144 - nonnewline - emit REG_NEWLINE version of OANY
1145 == static void nonnewline(register struct parse *p);
1146 *
1147 * Boy, is this implementation ever a kludge...
1148 */
1149 static void
nonnewline(p)1150 nonnewline(p)
1151 register struct parse *p;
1152 {
1153 register char *oldnext = p->next;
1154 register char *oldend = p->end;
1155 char bracket[4];
1156
1157 p->next = bracket;
1158 p->end = bracket+3;
1159 bracket[0] = '^';
1160 bracket[1] = '\n';
1161 bracket[2] = ']';
1162 bracket[3] = '\0';
1163 p_bracket(p);
1164 assert(p->next == bracket+3);
1165 p->next = oldnext;
1166 p->end = oldend;
1167 }
1168
1169 /*
1170 - repeat - generate code for a bounded repetition, recursively if needed
1171 == static void repeat(register struct parse *p, sopno start, int from, int to);
1172 */
1173 static void
repeat(p,start,from,to)1174 repeat(p, start, from, to)
1175 register struct parse *p;
1176 sopno start; /* operand from here to end of strip */
1177 int from; /* repeated from this number */
1178 int to; /* to this number of times (maybe INFINITY) */
1179 {
1180 register sopno finish = HERE();
1181 # define N 2
1182 # define INF 3
1183 # define REP(f, t) ((f)*8 + (t))
1184 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1185 register sopno copy;
1186
1187 if (p->error != 0) /* head off possible runaway recursion */
1188 return;
1189
1190 assert(from <= to);
1191
1192 switch (REP(MAP(from), MAP(to))) {
1193 case REP(0, 0): /* must be user doing this */
1194 DROP(finish-start); /* drop the operand */
1195 break;
1196 case REP(0, 1): /* as x{1,1}? */
1197 case REP(0, N): /* as x{1,n}? */
1198 case REP(0, INF): /* as x{1,}? */
1199 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1200 INSERT(OCH_, start); /* offset is wrong... */
1201 repeat(p, start+1, 1, to);
1202 ASTERN(OOR1, start);
1203 AHEAD(start); /* ... fix it */
1204 EMIT(OOR2, 0);
1205 AHEAD(THERE());
1206 ASTERN(O_CH, THERETHERE());
1207 break;
1208 case REP(1, 1): /* trivial case */
1209 /* done */
1210 break;
1211 case REP(1, N): /* as x?x{1,n-1} */
1212 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1213 INSERT(OCH_, start);
1214 ASTERN(OOR1, start);
1215 AHEAD(start);
1216 EMIT(OOR2, 0); /* offset very wrong... */
1217 AHEAD(THERE()); /* ...so fix it */
1218 ASTERN(O_CH, THERETHERE());
1219 copy = dupl(p, start+1, finish+1);
1220 assert(copy == finish+4);
1221 repeat(p, copy, 1, to-1);
1222 break;
1223 case REP(1, INF): /* as x+ */
1224 INSERT(OPLUS_, start);
1225 ASTERN(O_PLUS, start);
1226 break;
1227 case REP(N, N): /* as xx{m-1,n-1} */
1228 copy = dupl(p, start, finish);
1229 repeat(p, copy, from-1, to-1);
1230 break;
1231 case REP(N, INF): /* as xx{n-1,INF} */
1232 copy = dupl(p, start, finish);
1233 repeat(p, copy, from-1, to);
1234 break;
1235 default: /* "can't happen" */
1236 SETERROR(REG_ASSERT); /* just in case */
1237 break;
1238 }
1239 }
1240
1241 /*
1242 - seterr - set an error condition
1243 == static int seterr(register struct parse *p, int e);
1244 */
1245 static int /* useless but makes type checking happy */
seterr(p,e)1246 seterr(p, e)
1247 register struct parse *p;
1248 int e;
1249 {
1250 if (p->error == 0) /* keep earliest error condition */
1251 p->error = e;
1252 p->next = nuls; /* try to bring things to a halt */
1253 p->end = nuls;
1254 return(0); /* make the return value well-defined */
1255 }
1256
1257 /*
1258 - allocset - allocate a set of characters for []
1259 == static cset *allocset(register struct parse *p);
1260 */
1261 static cset *
allocset(p)1262 allocset(p)
1263 register struct parse *p;
1264 {
1265 register int no = p->g->ncsets++;
1266 register size_t nc;
1267 register size_t nbytes;
1268 register cset *cs;
1269 register size_t css = (size_t)p->g->csetsize;
1270 register int i;
1271
1272 if (no >= p->ncsalloc) { /* need another column of space */
1273 p->ncsalloc += CHAR_BIT;
1274 nc = p->ncsalloc;
1275 assert(nc % CHAR_BIT == 0);
1276 nbytes = nc / CHAR_BIT * css;
1277 if (p->g->sets == NULL)
1278 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1279 else
1280 p->g->sets = (cset *)realloc((char *)p->g->sets,
1281 nc * sizeof(cset));
1282 if (p->g->setbits == NULL)
1283 p->g->setbits = (uch *)malloc(nbytes);
1284 else {
1285 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1286 nbytes);
1287 /* xxx this isn't right if setbits is now NULL */
1288 for (i = 0; i < no; i++)
1289 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1290 }
1291 if (p->g->sets != NULL && p->g->setbits != NULL)
1292 (void) memset((char *)p->g->setbits + (nbytes - css),
1293 0, css);
1294 else {
1295 no = 0;
1296 SETERROR(REG_ESPACE);
1297 /* caller's responsibility not to do set ops */
1298 }
1299 }
1300
1301 assert(p->g->sets != NULL); /* xxx */
1302 cs = &p->g->sets[no];
1303 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1304 cs->mask = 1 << ((no) % CHAR_BIT);
1305 cs->hash = 0;
1306 cs->smultis = 0;
1307 cs->multis = NULL;
1308
1309 return(cs);
1310 }
1311
1312 /*
1313 - freeset - free a now-unused set
1314 == static void freeset(register struct parse *p, register cset *cs);
1315 */
1316 static void
freeset(p,cs)1317 freeset(p, cs)
1318 register struct parse *p;
1319 register cset *cs;
1320 {
1321 register int i;
1322 register cset *top = &p->g->sets[p->g->ncsets];
1323 register size_t css = (size_t)p->g->csetsize;
1324
1325 for (i = 0; i < css; i++)
1326 CHsub(cs, i);
1327 if (cs == top-1) /* recover only the easy case */
1328 p->g->ncsets--;
1329 }
1330
1331 /*
1332 - freezeset - final processing on a set of characters
1333 == static int freezeset(register struct parse *p, register cset *cs);
1334 *
1335 * The main task here is merging identical sets. This is usually a waste
1336 * of time (although the hash code minimizes the overhead), but can win
1337 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1338 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1339 * the same value!
1340 */
1341 static int /* set number */
freezeset(p,cs)1342 freezeset(p, cs)
1343 register struct parse *p;
1344 register cset *cs;
1345 {
1346 register uch h = cs->hash;
1347 register int i;
1348 register cset *top = &p->g->sets[p->g->ncsets];
1349 register cset *cs2;
1350 register size_t css = (size_t)p->g->csetsize;
1351
1352 /* look for an earlier one which is the same */
1353 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1354 if (cs2->hash == h && cs2 != cs) {
1355 /* maybe */
1356 for (i = 0; i < css; i++)
1357 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1358 break; /* no */
1359 if (i == css)
1360 break; /* yes */
1361 }
1362
1363 if (cs2 < top) { /* found one */
1364 freeset(p, cs);
1365 cs = cs2;
1366 }
1367
1368 return((int)(cs - p->g->sets));
1369 }
1370
1371 /*
1372 - firstch - return first character in a set (which must have at least one)
1373 == static int firstch(register struct parse *p, register cset *cs);
1374 */
1375 static int /* character; there is no "none" value */
firstch(p,cs)1376 firstch(p, cs)
1377 register struct parse *p;
1378 register cset *cs;
1379 {
1380 register int i;
1381 register size_t css = (size_t)p->g->csetsize;
1382
1383 for (i = 0; i < css; i++)
1384 if (CHIN(cs, i))
1385 return((char)i);
1386 assert(never);
1387 return(0); /* arbitrary */
1388 }
1389
1390 /*
1391 - nch - number of characters in a set
1392 == static int nch(register struct parse *p, register cset *cs);
1393 */
1394 static int
nch(p,cs)1395 nch(p, cs)
1396 register struct parse *p;
1397 register cset *cs;
1398 {
1399 register int i;
1400 register size_t css = (size_t)p->g->csetsize;
1401 register int n = 0;
1402
1403 for (i = 0; i < css; i++)
1404 if (CHIN(cs, i))
1405 n++;
1406 return(n);
1407 }
1408
1409 /*
1410 - mcadd - add a collating element to a cset
1411 == static void mcadd(register struct parse *p, register cset *cs, \
1412 == register char *cp);
1413 */
1414 static void
mcadd(p,cs,cp)1415 mcadd(p, cs, cp)
1416 register struct parse *p;
1417 register cset *cs;
1418 register char *cp;
1419 {
1420 register size_t oldend = cs->smultis;
1421
1422 cs->smultis += strlen(cp) + 1;
1423 if (cs->multis == NULL)
1424 cs->multis = malloc(cs->smultis);
1425 else
1426 cs->multis = realloc(cs->multis, cs->smultis);
1427 if (cs->multis == NULL) {
1428 SETERROR(REG_ESPACE);
1429 return;
1430 }
1431
1432 (void) strcpy(cs->multis + oldend - 1, cp);
1433 cs->multis[cs->smultis - 1] = '\0';
1434 }
1435
1436 /*
1437 - mcinvert - invert the list of collating elements in a cset
1438 == static void mcinvert(register struct parse *p, register cset *cs);
1439 *
1440 * This would have to know the set of possibilities. Implementation
1441 * is deferred.
1442 */
1443 static void
mcinvert(p,cs)1444 mcinvert(p, cs)
1445 register struct parse *p;
1446 register cset *cs;
1447 {
1448 assert(cs->multis == NULL); /* xxx */
1449 }
1450
1451 /*
1452 - mccase - add case counterparts of the list of collating elements in a cset
1453 == static void mccase(register struct parse *p, register cset *cs);
1454 *
1455 * This would have to know the set of possibilities. Implementation
1456 * is deferred.
1457 */
1458 static void
mccase(p,cs)1459 mccase(p, cs)
1460 register struct parse *p;
1461 register cset *cs;
1462 {
1463 assert(cs->multis == NULL); /* xxx */
1464 }
1465
1466 /*
1467 - isinsets - is this character in any sets?
1468 == static int isinsets(register struct re_guts *g, int c);
1469 */
1470 static int /* predicate */
isinsets(g,c)1471 isinsets(g, c)
1472 register struct re_guts *g;
1473 int c;
1474 {
1475 register uch *col;
1476 register int i;
1477 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1478 register unsigned uc = (unsigned char)c;
1479
1480 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1481 if (col[uc] != 0)
1482 return(1);
1483 return(0);
1484 }
1485
1486 /*
1487 - samesets - are these two characters in exactly the same sets?
1488 == static int samesets(register struct re_guts *g, int c1, int c2);
1489 */
1490 static int /* predicate */
samesets(g,c1,c2)1491 samesets(g, c1, c2)
1492 register struct re_guts *g;
1493 int c1;
1494 int c2;
1495 {
1496 register uch *col;
1497 register int i;
1498 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1499 register unsigned uc1 = (unsigned char)c1;
1500 register unsigned uc2 = (unsigned char)c2;
1501
1502 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1503 if (col[uc1] != col[uc2])
1504 return(0);
1505 return(1);
1506 }
1507
1508 /*
1509 - categorize - sort out character categories
1510 == static void categorize(struct parse *p, register struct re_guts *g);
1511 */
1512 static void
categorize(p,g)1513 categorize(p, g)
1514 struct parse *p;
1515 register struct re_guts *g;
1516 {
1517 register cat_t *cats = g->categories;
1518 register int c;
1519 register int c2;
1520 register cat_t cat;
1521
1522 /* avoid making error situations worse */
1523 if (p->error != 0)
1524 return;
1525
1526 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1527 if (cats[c] == 0 && isinsets(g, c)) {
1528 cat = g->ncategories++;
1529 cats[c] = cat;
1530 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1531 if (cats[c2] == 0 && samesets(g, c, c2))
1532 cats[c2] = cat;
1533 }
1534 }
1535
1536 /*
1537 - dupl - emit a duplicate of a bunch of sops
1538 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1539 */
1540 static sopno /* start of duplicate */
dupl(p,start,finish)1541 dupl(p, start, finish)
1542 register struct parse *p;
1543 sopno start; /* from here */
1544 sopno finish; /* to this less one */
1545 {
1546 register sopno ret = HERE();
1547 register sopno len = finish - start;
1548
1549 assert(finish >= start);
1550 if (len == 0)
1551 return(ret);
1552 enlarge(p, p->ssize + len); /* this many unexpected additions */
1553 assert(p->ssize >= p->slen + len);
1554 (void) memcpy((char *)(p->strip + p->slen),
1555 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1556 p->slen += len;
1557 return(ret);
1558 }
1559
1560 /*
1561 - doemit - emit a strip operator
1562 == static void doemit(register struct parse *p, sop op, size_t opnd);
1563 *
1564 * It might seem better to implement this as a macro with a function as
1565 * hard-case backup, but it's just too big and messy unless there are
1566 * some changes to the data structures. Maybe later.
1567 */
1568 static void
doemit(p,op,opnd)1569 doemit(p, op, opnd)
1570 register struct parse *p;
1571 sop op;
1572 size_t opnd;
1573 {
1574 /* avoid making error situations worse */
1575 if (p->error != 0)
1576 return;
1577
1578 /* deal with oversize operands ("can't happen", more or less) */
1579 assert(opnd < 1<<OPSHIFT);
1580
1581 /* deal with undersized strip */
1582 if (p->slen >= p->ssize)
1583 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1584 assert(p->slen < p->ssize);
1585
1586 /* finally, it's all reduced to the easy case */
1587 p->strip[p->slen++] = SOP(op, opnd);
1588 }
1589
1590 /*
1591 - doinsert - insert a sop into the strip
1592 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1593 */
1594 static void
doinsert(p,op,opnd,pos)1595 doinsert(p, op, opnd, pos)
1596 register struct parse *p;
1597 sop op;
1598 size_t opnd;
1599 sopno pos;
1600 {
1601 register sopno sn;
1602 register sop s;
1603 register int i;
1604
1605 /* avoid making error situations worse */
1606 if (p->error != 0)
1607 return;
1608
1609 sn = HERE();
1610 EMIT(op, opnd); /* do checks, ensure space */
1611 assert(HERE() == sn+1);
1612 s = p->strip[sn];
1613
1614 /* adjust paren pointers */
1615 assert(pos > 0);
1616 for (i = 1; i < NPAREN; i++) {
1617 if (p->pbegin[i] >= pos) {
1618 p->pbegin[i]++;
1619 }
1620 if (p->pend[i] >= pos) {
1621 p->pend[i]++;
1622 }
1623 }
1624
1625 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1626 (HERE()-pos-1)*sizeof(sop));
1627 p->strip[pos] = s;
1628 }
1629
1630 /*
1631 - dofwd - complete a forward reference
1632 == static void dofwd(register struct parse *p, sopno pos, sop value);
1633 */
1634 static void
dofwd(p,pos,value)1635 dofwd(p, pos, value)
1636 register struct parse *p;
1637 register sopno pos;
1638 sop value;
1639 {
1640 /* avoid making error situations worse */
1641 if (p->error != 0)
1642 return;
1643
1644 assert(value < 1<<OPSHIFT);
1645 p->strip[pos] = OP(p->strip[pos]) | value;
1646 }
1647
1648 /*
1649 - enlarge - enlarge the strip
1650 == static void enlarge(register struct parse *p, sopno size);
1651 */
1652 static void
enlarge(p,size)1653 enlarge(p, size)
1654 register struct parse *p;
1655 register sopno size;
1656 {
1657 register sop *sp;
1658
1659 if (p->ssize >= size)
1660 return;
1661
1662 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1663 if (sp == NULL) {
1664 SETERROR(REG_ESPACE);
1665 return;
1666 }
1667 p->strip = sp;
1668 p->ssize = size;
1669 }
1670
1671 /*
1672 - stripsnug - compact the strip
1673 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1674 */
1675 static void
stripsnug(p,g)1676 stripsnug(p, g)
1677 register struct parse *p;
1678 register struct re_guts *g;
1679 {
1680 g->nstates = p->slen;
1681 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1682 if (g->strip == NULL) {
1683 SETERROR(REG_ESPACE);
1684 g->strip = p->strip;
1685 }
1686 }
1687
1688 /*
1689 - findmust - fill in must and mlen with longest mandatory literal string
1690 == static void findmust(register struct parse *p, register struct re_guts *g);
1691 *
1692 * This algorithm could do fancy things like analyzing the operands of |
1693 * for common subsequences. Someday. This code is simple and finds most
1694 * of the interesting cases.
1695 *
1696 * Note that must and mlen got initialized during setup.
1697 */
1698 static void
findmust(p,g)1699 findmust(p, g)
1700 struct parse *p;
1701 register struct re_guts *g;
1702 {
1703 register sop *scan;
1704 sop *start;
1705 register sop *newstart;
1706 register sopno newlen;
1707 register sop s;
1708 register char *cp;
1709 register sopno i;
1710
1711 /* avoid making error situations worse */
1712 if (p->error != 0)
1713 return;
1714
1715 /* find the longest OCHAR sequence in strip */
1716 newlen = 0;
1717 scan = g->strip + 1;
1718 do {
1719 s = *scan++;
1720 switch (OP(s)) {
1721 case OCHAR: /* sequence member */
1722 if (newlen == 0) /* new sequence */
1723 newstart = scan - 1;
1724 newlen++;
1725 break;
1726 case OPLUS_: /* things that don't break one */
1727 case OLPAREN:
1728 case ORPAREN:
1729 break;
1730 case OQUEST_: /* things that must be skipped */
1731 case OCH_:
1732 scan--;
1733 do {
1734 scan += OPND(s);
1735 s = *scan;
1736 /* assert() interferes w debug printouts */
1737 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1738 OP(s) != OOR2) {
1739 g->iflags |= BAD;
1740 return;
1741 }
1742 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1743 /* fallthrough */
1744 default: /* things that break a sequence */
1745 if (newlen > g->mlen) { /* ends one */
1746 start = newstart;
1747 g->mlen = newlen;
1748 }
1749 newlen = 0;
1750 break;
1751 }
1752 } while (OP(s) != OEND);
1753
1754 if (g->mlen == 0) /* there isn't one */
1755 return;
1756
1757 /* turn it into a character string */
1758 g->must = malloc((size_t)g->mlen + 1);
1759 if (g->must == NULL) { /* argh; just forget it */
1760 g->mlen = 0;
1761 return;
1762 }
1763 cp = g->must;
1764 scan = start;
1765 for (i = g->mlen; i > 0; i--) {
1766 while (OP(s = *scan++) != OCHAR)
1767 continue;
1768 assert(cp < g->must + g->mlen);
1769 *cp++ = (char)OPND(s);
1770 }
1771 assert(cp == g->must + g->mlen);
1772 *cp++ = '\0'; /* just on general principles */
1773 }
1774
1775 /*
1776 - pluscount - count + nesting
1777 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1778 */
1779 static sopno /* nesting depth */
pluscount(p,g)1780 pluscount(p, g)
1781 struct parse *p;
1782 register struct re_guts *g;
1783 {
1784 register sop *scan;
1785 register sop s;
1786 register sopno plusnest = 0;
1787 register sopno maxnest = 0;
1788
1789 if (p->error != 0)
1790 return(0); /* there may not be an OEND */
1791
1792 scan = g->strip + 1;
1793 do {
1794 s = *scan++;
1795 switch (OP(s)) {
1796 case OPLUS_:
1797 plusnest++;
1798 break;
1799 case O_PLUS:
1800 if (plusnest > maxnest)
1801 maxnest = plusnest;
1802 plusnest--;
1803 break;
1804 }
1805 } while (OP(s) != OEND);
1806 if (plusnest != 0)
1807 g->iflags |= BAD;
1808 return(maxnest);
1809 }
1810
1811 static int nope = 0; /* for use in asserts; shuts lint up */
1812
1813 /* macros for manipulating states, small version */
1814 #define states unsigned
1815 #define states1 unsigned /* for later use in regexec() decision */
1816 #define CLEAR(v) ((v) = 0)
1817 #define SET0(v, n) ((v) &= ~((unsigned)1 << (n)))
1818 #define SET1(v, n) ((v) |= (unsigned)1 << (n))
1819 #define ISSET(v, n) ((v) & ((unsigned)1 << (n)))
1820 #define ASSIGN(d, s) ((d) = (s))
1821 #define EQ(a, b) ((a) == (b))
1822 #define STATEVARS int dummy /* dummy version */
1823 #define STATESETUP(m, n) /* nothing */
1824 #define STATETEARDOWN(m) /* nothing */
1825 #define SETUP(v) ((v) = 0)
1826 #define onestate unsigned
1827 #define INIT(o, n) ((o) = (unsigned)1 << (n))
1828 #define INC(o) ((o) <<= 1)
1829 #define ISSTATEIN(v, o) ((v) & (o))
1830 /* some abbreviations; note that some of these know variable names! */
1831 /* do "if I'm here, I can also be there" etc without branches */
1832 #define FWD(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) << (n))
1833 #define BACK(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) >> (n))
1834 #define ISSETBACK(v, n) ((v) & ((unsigned)here >> (n)))
1835 /* function names */
1836 #define SNAMES /* engine.c looks after details */
1837
1838 /*
1839 * The matching engine and friends. This file is #included by regexec.c
1840 * after suitable #defines of a variety of macros used herein, so that
1841 * different state representations can be used without duplicating masses
1842 * of code.
1843 */
1844
1845 #ifdef SNAMES
1846 #define matcher smatcher
1847 #define fast sfast
1848 #define slow sslow
1849 #define dissect sdissect
1850 #define backref sbackref
1851 #define step sstep
1852 #define print sprint
1853 #define at sat
1854 #define match smat
1855 #endif
1856 #ifdef LNAMES
1857 #define matcher lmatcher
1858 #define fast lfast
1859 #define slow lslow
1860 #define dissect ldissect
1861 #define backref lbackref
1862 #define step lstep
1863 #define print lprint
1864 #define at lat
1865 #define match lmat
1866 #endif
1867
1868 /* another structure passed up and down to avoid zillions of parameters */
1869 struct match {
1870 struct re_guts *g;
1871 int eflags;
1872 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
1873 char *offp; /* offsets work from here */
1874 char *beginp; /* start of string -- virtual NUL precedes */
1875 char *endp; /* end of string -- virtual NUL here */
1876 char *coldp; /* can be no match starting before here */
1877 char **lastpos; /* [nplus+1] */
1878 STATEVARS;
1879 states st; /* current states */
1880 states fresh; /* states for a fresh start */
1881 states tmp; /* temporary */
1882 states empty; /* empty set of states */
1883 };
1884
1885 static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
1886 static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1887 static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
1888 static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1889 static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1890 static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft);
1891 #define BOL (OUT+1)
1892 #define EOL (BOL+1)
1893 #define BOLEOL (BOL+2)
1894 #define NOTHING (BOL+3)
1895 #define BOW (BOL+4)
1896 #define EOW (BOL+5)
1897 #define CODEMAX (BOL+5) /* highest code used */
1898 #define NONCHAR(c) ((c) > CHAR_MAX)
1899 #define NNONCHAR (CODEMAX-CHAR_MAX)
1900 #define SP(t, s, c) /* nothing */
1901 #define AT(t, p1, p2, s1, s2) /* nothing */
1902 #define NOTE(s) /* nothing */
1903
1904 /*
1905 - matcher - the actual matching engine
1906 == static int matcher(register struct re_guts *g, char *string, \
1907 == size_t nmatch, regmatch_t pmatch[], int eflags);
1908 */
1909 static int /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)1910 matcher(g, string, nmatch, pmatch, eflags)
1911 register struct re_guts *g;
1912 char *string;
1913 size_t nmatch;
1914 regmatch_t pmatch[];
1915 int eflags;
1916 {
1917 register char *endp;
1918 register int i;
1919 struct match mv;
1920 register struct match *m = &mv;
1921 register char *dp;
1922 const register sopno gf = g->firststate+1; /* +1 for OEND */
1923 const register sopno gl = g->laststate;
1924 char *start;
1925 char *stop;
1926
1927 /* simplify the situation where possible */
1928 if (g->cflags®_NOSUB)
1929 nmatch = 0;
1930 if (eflags®_STARTEND) {
1931 start = string + pmatch[0].rm_so;
1932 stop = string + pmatch[0].rm_eo;
1933 } else {
1934 start = string;
1935 stop = start + strlen(start);
1936 }
1937 if (stop < start)
1938 return(REG_INVARG);
1939
1940 /* prescreening; this does wonders for this rather slow code */
1941 if (g->must != NULL) {
1942 for (dp = start; dp < stop; dp++)
1943 if (*dp == g->must[0] && stop - dp >= g->mlen &&
1944 memcmp(dp, g->must, (size_t)g->mlen) == 0)
1945 break;
1946 if (dp == stop) /* we didn't find g->must */
1947 return(REG_NOMATCH);
1948 }
1949
1950 /* match struct setup */
1951 m->g = g;
1952 m->eflags = eflags;
1953 m->pmatch = NULL;
1954 m->lastpos = NULL;
1955 m->offp = string;
1956 m->beginp = start;
1957 m->endp = stop;
1958 STATESETUP(m, 4);
1959 SETUP(m->st);
1960 SETUP(m->fresh);
1961 SETUP(m->tmp);
1962 SETUP(m->empty);
1963 CLEAR(m->empty);
1964
1965 /* this loop does only one repetition except for backrefs */
1966 for (;;) {
1967 endp = fast(m, start, stop, gf, gl);
1968 if (endp == NULL) { /* a miss */
1969 STATETEARDOWN(m);
1970 return(REG_NOMATCH);
1971 }
1972 if (nmatch == 0 && !g->backrefs)
1973 break; /* no further info needed */
1974
1975 /* where? */
1976 assert(m->coldp != NULL);
1977 for (;;) {
1978 NOTE("finding start");
1979 endp = slow(m, m->coldp, stop, gf, gl);
1980 if (endp != NULL)
1981 break;
1982 assert(m->coldp < m->endp);
1983 m->coldp++;
1984 }
1985 if (nmatch == 1 && !g->backrefs)
1986 break; /* no further info needed */
1987
1988 /* oh my, he wants the subexpressions... */
1989 if (m->pmatch == NULL)
1990 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
1991 sizeof(regmatch_t));
1992 if (m->pmatch == NULL) {
1993 STATETEARDOWN(m);
1994 return(REG_ESPACE);
1995 }
1996 for (i = 1; i <= m->g->nsub; i++)
1997 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
1998 if (!g->backrefs && !(m->eflags®_BACKR)) {
1999 NOTE("dissecting");
2000 dp = dissect(m, m->coldp, endp, gf, gl);
2001 } else {
2002 if (g->nplus > 0 && m->lastpos == NULL)
2003 m->lastpos = (char **)malloc((g->nplus+1) *
2004 sizeof(char *));
2005 if (g->nplus > 0 && m->lastpos == NULL) {
2006 free(m->pmatch);
2007 STATETEARDOWN(m);
2008 return(REG_ESPACE);
2009 }
2010 NOTE("backref dissect");
2011 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2012 }
2013 if (dp != NULL)
2014 break;
2015
2016 /* uh-oh... we couldn't find a subexpression-level match */
2017 assert(g->backrefs); /* must be back references doing it */
2018 assert(g->nplus == 0 || m->lastpos != NULL);
2019 for (;;) {
2020 if (dp != NULL || endp <= m->coldp)
2021 break; /* defeat */
2022 NOTE("backoff");
2023 endp = slow(m, m->coldp, endp-1, gf, gl);
2024 if (endp == NULL)
2025 break; /* defeat */
2026 /* try it on a shorter possibility */
2027 NOTE("backoff dissect");
2028 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2029 }
2030 assert(dp == NULL || dp == endp);
2031 if (dp != NULL) /* found a shorter one */
2032 break;
2033
2034 /* despite initial appearances, there is no match here */
2035 NOTE("false alarm");
2036 start = m->coldp + 1; /* recycle starting later */
2037 assert(start <= stop);
2038 }
2039
2040 /* fill in the details if requested */
2041 if (nmatch > 0) {
2042 pmatch[0].rm_so = m->coldp - m->offp;
2043 pmatch[0].rm_eo = endp - m->offp;
2044 }
2045 if (nmatch > 1) {
2046 assert(m->pmatch != NULL);
2047 for (i = 1; i < nmatch; i++)
2048 if (i <= m->g->nsub)
2049 pmatch[i] = m->pmatch[i];
2050 else {
2051 pmatch[i].rm_so = -1;
2052 pmatch[i].rm_eo = -1;
2053 }
2054 }
2055
2056 if (m->pmatch != NULL)
2057 free((char *)m->pmatch);
2058 if (m->lastpos != NULL)
2059 free((char *)m->lastpos);
2060 STATETEARDOWN(m);
2061 return(0);
2062 }
2063
2064 /*
2065 - dissect - figure out what matched what, no back references
2066 == static char *dissect(register struct match *m, char *start, \
2067 == char *stop, sopno startst, sopno stopst);
2068 */
2069 static char * /* == stop (success) always */
dissect(m,start,stop,startst,stopst)2070 dissect(m, start, stop, startst, stopst)
2071 register struct match *m;
2072 char *start;
2073 char *stop;
2074 sopno startst;
2075 sopno stopst;
2076 {
2077 register int i;
2078 register sopno ss; /* start sop of current subRE */
2079 register sopno es; /* end sop of current subRE */
2080 register char *sp; /* start of string matched by it */
2081 register char *stp; /* string matched by it cannot pass here */
2082 register char *rest; /* start of rest of string */
2083 register char *tail; /* string unmatched by rest of RE */
2084 register sopno ssub; /* start sop of subsubRE */
2085 register sopno esub; /* end sop of subsubRE */
2086 register char *ssp; /* start of string matched by subsubRE */
2087 register char *sep; /* end of string matched by subsubRE */
2088 register char *oldssp; /* previous ssp */
2089 register char *dp;
2090
2091 AT("diss", start, stop, startst, stopst);
2092 sp = start;
2093 for (ss = startst; ss < stopst; ss = es) {
2094 /* identify end of subRE */
2095 es = ss;
2096 switch (OP(m->g->strip[es])) {
2097 case OPLUS_:
2098 case OQUEST_:
2099 es += OPND(m->g->strip[es]);
2100 break;
2101 case OCH_:
2102 while (OP(m->g->strip[es]) != O_CH)
2103 es += OPND(m->g->strip[es]);
2104 break;
2105 }
2106 es++;
2107
2108 /* figure out what it matched */
2109 switch (OP(m->g->strip[ss])) {
2110 case OEND:
2111 assert(nope);
2112 break;
2113 case OCHAR:
2114 sp++;
2115 break;
2116 case OBOL:
2117 case OEOL:
2118 case OBOW:
2119 case OEOW:
2120 break;
2121 case OANY:
2122 case OANYOF:
2123 sp++;
2124 break;
2125 case OBACK_:
2126 case O_BACK:
2127 assert(nope);
2128 break;
2129 /* cases where length of match is hard to find */
2130 case OQUEST_:
2131 stp = stop;
2132 for (;;) {
2133 /* how long could this one be? */
2134 rest = slow(m, sp, stp, ss, es);
2135 assert(rest != NULL); /* it did match */
2136 /* could the rest match the rest? */
2137 tail = slow(m, rest, stop, es, stopst);
2138 if (tail == stop)
2139 break; /* yes! */
2140 /* no -- try a shorter match for this one */
2141 stp = rest - 1;
2142 assert(stp >= sp); /* it did work */
2143 }
2144 ssub = ss + 1;
2145 esub = es - 1;
2146 /* did innards match? */
2147 if (slow(m, sp, rest, ssub, esub) != NULL) {
2148 dp = dissect(m, sp, rest, ssub, esub);
2149 assert(dp == rest);
2150 } else /* no */
2151 assert(sp == rest);
2152 sp = rest;
2153 break;
2154 case OPLUS_:
2155 stp = stop;
2156 for (;;) {
2157 /* how long could this one be? */
2158 rest = slow(m, sp, stp, ss, es);
2159 assert(rest != NULL); /* it did match */
2160 /* could the rest match the rest? */
2161 tail = slow(m, rest, stop, es, stopst);
2162 if (tail == stop)
2163 break; /* yes! */
2164 /* no -- try a shorter match for this one */
2165 stp = rest - 1;
2166 assert(stp >= sp); /* it did work */
2167 }
2168 ssub = ss + 1;
2169 esub = es - 1;
2170 ssp = sp;
2171 oldssp = ssp;
2172 for (;;) { /* find last match of innards */
2173 sep = slow(m, ssp, rest, ssub, esub);
2174 if (sep == NULL || sep == ssp)
2175 break; /* failed or matched null */
2176 oldssp = ssp; /* on to next try */
2177 ssp = sep;
2178 }
2179 if (sep == NULL) {
2180 /* last successful match */
2181 sep = ssp;
2182 ssp = oldssp;
2183 }
2184 assert(sep == rest); /* must exhaust substring */
2185 assert(slow(m, ssp, sep, ssub, esub) == rest);
2186 dp = dissect(m, ssp, sep, ssub, esub);
2187 assert(dp == sep);
2188 sp = rest;
2189 break;
2190 case OCH_:
2191 stp = stop;
2192 for (;;) {
2193 /* how long could this one be? */
2194 rest = slow(m, sp, stp, ss, es);
2195 assert(rest != NULL); /* it did match */
2196 /* could the rest match the rest? */
2197 tail = slow(m, rest, stop, es, stopst);
2198 if (tail == stop)
2199 break; /* yes! */
2200 /* no -- try a shorter match for this one */
2201 stp = rest - 1;
2202 assert(stp >= sp); /* it did work */
2203 }
2204 ssub = ss + 1;
2205 esub = ss + OPND(m->g->strip[ss]) - 1;
2206 assert(OP(m->g->strip[esub]) == OOR1);
2207 for (;;) { /* find first matching branch */
2208 if (slow(m, sp, rest, ssub, esub) == rest)
2209 break; /* it matched all of it */
2210 /* that one missed, try next one */
2211 assert(OP(m->g->strip[esub]) == OOR1);
2212 esub++;
2213 assert(OP(m->g->strip[esub]) == OOR2);
2214 ssub = esub + 1;
2215 esub += OPND(m->g->strip[esub]);
2216 if (OP(m->g->strip[esub]) == OOR2)
2217 esub--;
2218 else
2219 assert(OP(m->g->strip[esub]) == O_CH);
2220 }
2221 dp = dissect(m, sp, rest, ssub, esub);
2222 assert(dp == rest);
2223 sp = rest;
2224 break;
2225 case O_PLUS:
2226 case O_QUEST:
2227 case OOR1:
2228 case OOR2:
2229 case O_CH:
2230 assert(nope);
2231 break;
2232 case OLPAREN:
2233 i = OPND(m->g->strip[ss]);
2234 assert(0 < i && i <= m->g->nsub);
2235 m->pmatch[i].rm_so = sp - m->offp;
2236 break;
2237 case ORPAREN:
2238 i = OPND(m->g->strip[ss]);
2239 assert(0 < i && i <= m->g->nsub);
2240 m->pmatch[i].rm_eo = sp - m->offp;
2241 break;
2242 default: /* uh oh */
2243 assert(nope);
2244 break;
2245 }
2246 }
2247
2248 assert(sp == stop);
2249 return(sp);
2250 }
2251
2252 /*
2253 - backref - figure out what matched what, figuring in back references
2254 == static char *backref(register struct match *m, char *start, \
2255 == char *stop, sopno startst, sopno stopst, sopno lev);
2256 */
2257 static char * /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)2258 backref(m, start, stop, startst, stopst, lev)
2259 register struct match *m;
2260 char *start;
2261 char *stop;
2262 sopno startst;
2263 sopno stopst;
2264 sopno lev; /* PLUS nesting level */
2265 {
2266 register int i;
2267 register sopno ss; /* start sop of current subRE */
2268 register char *sp; /* start of string matched by it */
2269 register sopno ssub; /* start sop of subsubRE */
2270 register sopno esub; /* end sop of subsubRE */
2271 register char *ssp; /* start of string matched by subsubRE */
2272 register char *dp;
2273 register size_t len;
2274 register int hard;
2275 register sop s;
2276 register regoff_t offsave;
2277 register cset *cs;
2278
2279 AT("back", start, stop, startst, stopst);
2280 sp = start;
2281
2282 /* get as far as we can with easy stuff */
2283 hard = 0;
2284 for (ss = startst; !hard && ss < stopst; ss++)
2285 switch (OP(s = m->g->strip[ss])) {
2286 case OCHAR:
2287 if (sp == stop || *sp++ != (char)OPND(s))
2288 return(NULL);
2289 break;
2290 case OANY:
2291 if (sp == stop)
2292 return(NULL);
2293 sp++;
2294 break;
2295 case OANYOF:
2296 cs = &m->g->sets[OPND(s)];
2297 if (sp == stop || !CHIN(cs, *sp++))
2298 return(NULL);
2299 break;
2300 case OBOL:
2301 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
2302 (sp < m->endp && *(sp-1) == '\n' &&
2303 (m->g->cflags®_NEWLINE)) )
2304 { /* yes */ }
2305 else
2306 return(NULL);
2307 break;
2308 case OEOL:
2309 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
2310 (sp < m->endp && *sp == '\n' &&
2311 (m->g->cflags®_NEWLINE)) )
2312 { /* yes */ }
2313 else
2314 return(NULL);
2315 break;
2316 case OBOW:
2317 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
2318 (sp < m->endp && *(sp-1) == '\n' &&
2319 (m->g->cflags®_NEWLINE)) ||
2320 (sp > m->beginp &&
2321 !ISWORD(*(sp-1))) ) &&
2322 (sp < m->endp && ISWORD(*sp)) )
2323 { /* yes */ }
2324 else
2325 return(NULL);
2326 break;
2327 case OEOW:
2328 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
2329 (sp < m->endp && *sp == '\n' &&
2330 (m->g->cflags®_NEWLINE)) ||
2331 (sp < m->endp && !ISWORD(*sp)) ) &&
2332 (sp > m->beginp && ISWORD(*(sp-1))) )
2333 { /* yes */ }
2334 else
2335 return(NULL);
2336 break;
2337 case O_QUEST:
2338 break;
2339 case OOR1: /* matches null but needs to skip */
2340 ss++;
2341 s = m->g->strip[ss];
2342 do {
2343 assert(OP(s) == OOR2);
2344 ss += OPND(s);
2345 } while (OP(s = m->g->strip[ss]) != O_CH);
2346 /* note that the ss++ gets us past the O_CH */
2347 break;
2348 default: /* have to make a choice */
2349 hard = 1;
2350 break;
2351 }
2352 if (!hard) { /* that was it! */
2353 if (sp != stop)
2354 return(NULL);
2355 return(sp);
2356 }
2357 ss--; /* adjust for the for's final increment */
2358
2359 /* the hard stuff */
2360 AT("hard", sp, stop, ss, stopst);
2361 s = m->g->strip[ss];
2362 switch (OP(s)) {
2363 case OBACK_: /* the vilest depths */
2364 i = OPND(s);
2365 assert(0 < i && i <= m->g->nsub);
2366 if (m->pmatch[i].rm_eo == -1)
2367 return(NULL);
2368 assert(m->pmatch[i].rm_so != -1);
2369 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
2370 assert(stop - m->beginp >= len);
2371 if (sp > stop - len)
2372 return(NULL); /* not enough left to match */
2373 ssp = m->offp + m->pmatch[i].rm_so;
2374 if (memcmp(sp, ssp, len) != 0)
2375 return(NULL);
2376 while (m->g->strip[ss] != SOP(O_BACK, i))
2377 ss++;
2378 return(backref(m, sp+len, stop, ss+1, stopst, lev));
2379 break;
2380 case OQUEST_: /* to null or not */
2381 dp = backref(m, sp, stop, ss+1, stopst, lev);
2382 if (dp != NULL)
2383 return(dp); /* not */
2384 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
2385 break;
2386 case OPLUS_:
2387 assert(m->lastpos != NULL);
2388 assert(lev+1 <= m->g->nplus);
2389 m->lastpos[lev+1] = sp;
2390 return(backref(m, sp, stop, ss+1, stopst, lev+1));
2391 break;
2392 case O_PLUS:
2393 if (sp == m->lastpos[lev]) /* last pass matched null */
2394 return(backref(m, sp, stop, ss+1, stopst, lev-1));
2395 /* try another pass */
2396 m->lastpos[lev] = sp;
2397 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
2398 if (dp == NULL)
2399 return(backref(m, sp, stop, ss+1, stopst, lev-1));
2400 else
2401 return(dp);
2402 break;
2403 case OCH_: /* find the right one, if any */
2404 ssub = ss + 1;
2405 esub = ss + OPND(s) - 1;
2406 assert(OP(m->g->strip[esub]) == OOR1);
2407 for (;;) { /* find first matching branch */
2408 dp = backref(m, sp, stop, ssub, esub, lev);
2409 if (dp != NULL)
2410 return(dp);
2411 /* that one missed, try next one */
2412 if (OP(m->g->strip[esub]) == O_CH)
2413 return(NULL); /* there is none */
2414 esub++;
2415 assert(OP(m->g->strip[esub]) == OOR2);
2416 ssub = esub + 1;
2417 esub += OPND(m->g->strip[esub]);
2418 if (OP(m->g->strip[esub]) == OOR2)
2419 esub--;
2420 else
2421 assert(OP(m->g->strip[esub]) == O_CH);
2422 }
2423 break;
2424 case OLPAREN: /* must undo assignment if rest fails */
2425 i = OPND(s);
2426 assert(0 < i && i <= m->g->nsub);
2427 offsave = m->pmatch[i].rm_so;
2428 m->pmatch[i].rm_so = sp - m->offp;
2429 dp = backref(m, sp, stop, ss+1, stopst, lev);
2430 if (dp != NULL)
2431 return(dp);
2432 m->pmatch[i].rm_so = offsave;
2433 return(NULL);
2434 break;
2435 case ORPAREN: /* must undo assignment if rest fails */
2436 i = OPND(s);
2437 assert(0 < i && i <= m->g->nsub);
2438 offsave = m->pmatch[i].rm_eo;
2439 m->pmatch[i].rm_eo = sp - m->offp;
2440 dp = backref(m, sp, stop, ss+1, stopst, lev);
2441 if (dp != NULL)
2442 return(dp);
2443 m->pmatch[i].rm_eo = offsave;
2444 return(NULL);
2445 break;
2446 default: /* uh oh */
2447 assert(nope);
2448 break;
2449 }
2450
2451 /* "can't happen" */
2452 assert(nope);
2453 /* NOTREACHED */
2454 return((char *)NULL); /* dummy */
2455 }
2456
2457 /*
2458 - fast - step through the string at top speed
2459 == static char *fast(register struct match *m, char *start, \
2460 == char *stop, sopno startst, sopno stopst);
2461 */
2462 static char * /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)2463 fast(m, start, stop, startst, stopst)
2464 register struct match *m;
2465 char *start;
2466 char *stop;
2467 sopno startst;
2468 sopno stopst;
2469 {
2470 register states st = m->st;
2471 register states fresh = m->fresh;
2472 register states tmp = m->tmp;
2473 register char *p = start;
2474 register int c = (start == m->beginp) ? OUT : *(start-1);
2475 register int lastc; /* previous c */
2476 register int flagch;
2477 register int i;
2478 register char *coldp; /* last p after which no match was underway */
2479
2480 CLEAR(st);
2481 SET1(st, startst);
2482 st = step(m->g, startst, stopst, st, NOTHING, st);
2483 ASSIGN(fresh, st);
2484 SP("start", st, *p);
2485 coldp = NULL;
2486 for (;;) {
2487 /* next character */
2488 lastc = c;
2489 c = (p == m->endp) ? OUT : *p;
2490 if (EQ(st, fresh))
2491 coldp = p;
2492
2493 /* is there an EOL and/or BOL between lastc and c? */
2494 flagch = '\0';
2495 i = 0;
2496 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
2497 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
2498 flagch = BOL;
2499 i = m->g->nbol;
2500 }
2501 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
2502 (c == OUT && !(m->eflags®_NOTEOL)) ) {
2503 flagch = (flagch == BOL) ? BOLEOL : EOL;
2504 i += m->g->neol;
2505 }
2506 if (i != 0) {
2507 for (; i > 0; i--)
2508 st = step(m->g, startst, stopst, st, flagch, st);
2509 SP("boleol", st, c);
2510 }
2511
2512 /* how about a word boundary? */
2513 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
2514 (c != OUT && ISWORD(c)) ) {
2515 flagch = BOW;
2516 }
2517 if ( (lastc != OUT && ISWORD(lastc)) &&
2518 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
2519 flagch = EOW;
2520 }
2521 if (flagch == BOW || flagch == EOW) {
2522 st = step(m->g, startst, stopst, st, flagch, st);
2523 SP("boweow", st, c);
2524 }
2525
2526 /* are we done? */
2527 if (ISSET(st, stopst) || p == stop)
2528 break; /* NOTE BREAK OUT */
2529
2530 /* no, we must deal with this character */
2531 ASSIGN(tmp, st);
2532 ASSIGN(st, fresh);
2533 assert(c != OUT);
2534 st = step(m->g, startst, stopst, tmp, c, st);
2535 SP("aft", st, c);
2536 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
2537 p++;
2538 }
2539
2540 assert(coldp != NULL);
2541 m->coldp = coldp;
2542 if (ISSET(st, stopst))
2543 return(p+1);
2544 else
2545 return(NULL);
2546 }
2547
2548 /*
2549 - slow - step through the string more deliberately
2550 == static char *slow(register struct match *m, char *start, \
2551 == char *stop, sopno startst, sopno stopst);
2552 */
2553 static char * /* where it ended */
slow(m,start,stop,startst,stopst)2554 slow(m, start, stop, startst, stopst)
2555 register struct match *m;
2556 char *start;
2557 char *stop;
2558 sopno startst;
2559 sopno stopst;
2560 {
2561 register states st = m->st;
2562 register states empty = m->empty;
2563 register states tmp = m->tmp;
2564 register char *p = start;
2565 register int c = (start == m->beginp) ? OUT : *(start-1);
2566 register int lastc; /* previous c */
2567 register int flagch;
2568 register int i;
2569 register char *matchp; /* last p at which a match ended */
2570
2571 AT("slow", start, stop, startst, stopst);
2572 CLEAR(st);
2573 SET1(st, startst);
2574 SP("sstart", st, *p);
2575 st = step(m->g, startst, stopst, st, NOTHING, st);
2576 matchp = NULL;
2577 for (;;) {
2578 /* next character */
2579 lastc = c;
2580 c = (p == m->endp) ? OUT : *p;
2581
2582 /* is there an EOL and/or BOL between lastc and c? */
2583 flagch = '\0';
2584 i = 0;
2585 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
2586 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
2587 flagch = BOL;
2588 i = m->g->nbol;
2589 }
2590 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
2591 (c == OUT && !(m->eflags®_NOTEOL)) ) {
2592 flagch = (flagch == BOL) ? BOLEOL : EOL;
2593 i += m->g->neol;
2594 }
2595 if (i != 0) {
2596 for (; i > 0; i--)
2597 st = step(m->g, startst, stopst, st, flagch, st);
2598 SP("sboleol", st, c);
2599 }
2600
2601 /* how about a word boundary? */
2602 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
2603 (c != OUT && ISWORD(c)) ) {
2604 flagch = BOW;
2605 }
2606 if ( (lastc != OUT && ISWORD(lastc)) &&
2607 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
2608 flagch = EOW;
2609 }
2610 if (flagch == BOW || flagch == EOW) {
2611 st = step(m->g, startst, stopst, st, flagch, st);
2612 SP("sboweow", st, c);
2613 }
2614
2615 /* are we done? */
2616 if (ISSET(st, stopst))
2617 matchp = p;
2618 if (EQ(st, empty) || p == stop)
2619 break; /* NOTE BREAK OUT */
2620
2621 /* no, we must deal with this character */
2622 ASSIGN(tmp, st);
2623 ASSIGN(st, empty);
2624 assert(c != OUT);
2625 st = step(m->g, startst, stopst, tmp, c, st);
2626 SP("saft", st, c);
2627 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
2628 p++;
2629 }
2630
2631 return(matchp);
2632 }
2633
2634
2635 /*
2636 - step - map set of states reachable before char to set reachable after
2637 == static states step(register struct re_guts *g, sopno start, sopno stop, \
2638 == register states bef, int ch, register states aft);
2639 == #define BOL (OUT+1)
2640 == #define EOL (BOL+1)
2641 == #define BOLEOL (BOL+2)
2642 == #define NOTHING (BOL+3)
2643 == #define BOW (BOL+4)
2644 == #define EOW (BOL+5)
2645 == #define CODEMAX (BOL+5) // highest code used
2646 == #define NONCHAR(c) ((c) > CHAR_MAX)
2647 == #define NNONCHAR (CODEMAX-CHAR_MAX)
2648 */
2649 static states
step(g,start,stop,bef,ch,aft)2650 step(g, start, stop, bef, ch, aft)
2651 register struct re_guts *g;
2652 sopno start; /* start state within strip */
2653 sopno stop; /* state after stop state within strip */
2654 register states bef; /* states reachable before */
2655 int ch; /* character or NONCHAR code */
2656 register states aft; /* states already known reachable after */
2657 {
2658 register cset *cs;
2659 register sop s;
2660 register sopno pc;
2661 register onestate here; /* note, macros know this name */
2662 register sopno look;
2663 register long i;
2664
2665 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
2666 s = g->strip[pc];
2667 switch (OP(s)) {
2668 case OEND:
2669 assert(pc == stop-1);
2670 break;
2671 case OCHAR:
2672 /* only characters can match */
2673 assert(!NONCHAR(ch) || ch != (char)OPND(s));
2674 if (ch == (char)OPND(s))
2675 FWD(aft, bef, 1);
2676 break;
2677 case OBOL:
2678 if (ch == BOL || ch == BOLEOL)
2679 FWD(aft, bef, 1);
2680 break;
2681 case OEOL:
2682 if (ch == EOL || ch == BOLEOL)
2683 FWD(aft, bef, 1);
2684 break;
2685 case OBOW:
2686 if (ch == BOW)
2687 FWD(aft, bef, 1);
2688 break;
2689 case OEOW:
2690 if (ch == EOW)
2691 FWD(aft, bef, 1);
2692 break;
2693 case OANY:
2694 if (!NONCHAR(ch))
2695 FWD(aft, bef, 1);
2696 break;
2697 case OANYOF:
2698 cs = &g->sets[OPND(s)];
2699 if (!NONCHAR(ch) && CHIN(cs, ch))
2700 FWD(aft, bef, 1);
2701 break;
2702 case OBACK_: /* ignored here */
2703 case O_BACK:
2704 FWD(aft, aft, 1);
2705 break;
2706 case OPLUS_: /* forward, this is just an empty */
2707 FWD(aft, aft, 1);
2708 break;
2709 case O_PLUS: /* both forward and back */
2710 FWD(aft, aft, 1);
2711 i = ISSETBACK(aft, OPND(s));
2712 BACK(aft, aft, OPND(s));
2713 if (!i && ISSETBACK(aft, OPND(s))) {
2714 /* oho, must reconsider loop body */
2715 pc -= OPND(s) + 1;
2716 INIT(here, pc);
2717 }
2718 break;
2719 case OQUEST_: /* two branches, both forward */
2720 FWD(aft, aft, 1);
2721 FWD(aft, aft, OPND(s));
2722 break;
2723 case O_QUEST: /* just an empty */
2724 FWD(aft, aft, 1);
2725 break;
2726 case OLPAREN: /* not significant here */
2727 case ORPAREN:
2728 FWD(aft, aft, 1);
2729 break;
2730 case OCH_: /* mark the first two branches */
2731 FWD(aft, aft, 1);
2732 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
2733 FWD(aft, aft, OPND(s));
2734 break;
2735 case OOR1: /* done a branch, find the O_CH */
2736 if (ISSTATEIN(aft, here)) {
2737 for (look = 1;
2738 OP(s = g->strip[pc+look]) != O_CH;
2739 look += OPND(s))
2740 assert(OP(s) == OOR2);
2741 FWD(aft, aft, look);
2742 }
2743 break;
2744 case OOR2: /* propagate OCH_'s marking */
2745 FWD(aft, aft, 1);
2746 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
2747 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
2748 FWD(aft, aft, OPND(s));
2749 }
2750 break;
2751 case O_CH: /* just empty */
2752 FWD(aft, aft, 1);
2753 break;
2754 default: /* ooooops... */
2755 assert(nope);
2756 break;
2757 }
2758 }
2759
2760 return(aft);
2761 }
2762
2763 #undef matcher
2764 #undef fast
2765 #undef slow
2766 #undef dissect
2767 #undef backref
2768 #undef step
2769 #undef print
2770 #undef at
2771 #undef match
2772
2773 /* now undo things */
2774 #undef states
2775 #undef CLEAR
2776 #undef SET0
2777 #undef SET1
2778 #undef ISSET
2779 #undef ASSIGN
2780 #undef EQ
2781 #undef STATEVARS
2782 #undef STATESETUP
2783 #undef STATETEARDOWN
2784 #undef SETUP
2785 #undef onestate
2786 #undef INIT
2787 #undef INC
2788 #undef ISSTATEIN
2789 #undef FWD
2790 #undef BACK
2791 #undef ISSETBACK
2792 #undef SNAMES
2793
2794 /* macros for manipulating states, large version */
2795 #define states char *
2796 #define CLEAR(v) memset(v, 0, m->g->nstates)
2797 #define SET0(v, n) ((v)[n] = 0)
2798 #define SET1(v, n) ((v)[n] = 1)
2799 #define ISSET(v, n) ((v)[n])
2800 #define ASSIGN(d, s) memcpy(d, s, m->g->nstates)
2801 #define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0)
2802 #define STATEVARS int vn; char *space
2803 #define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \
2804 if ((m)->space == NULL) return(REG_ESPACE); \
2805 (m)->vn = 0; }
2806 #define STATETEARDOWN(m) { free((m)->space); }
2807 #define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates])
2808 #define onestate int
2809 #define INIT(o, n) ((o) = (n))
2810 #define INC(o) ((o)++)
2811 #define ISSTATEIN(v, o) ((v)[o])
2812 /* some abbreviations; note that some of these know variable names! */
2813 /* do "if I'm here, I can also be there" etc without branches */
2814 #define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
2815 #define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
2816 #define ISSETBACK(v, n) ((v)[here - (n)])
2817 /* function names */
2818 #define LNAMES /* flag */
2819
2820 /*
2821 * The matching engine and friends. This file is #included by regexec.c
2822 * after suitable #defines of a variety of macros used herein, so that
2823 * different state representations can be used without duplicating masses
2824 * of code.
2825 */
2826
2827 #ifdef SNAMES
2828 #define matcher smatcher
2829 #define fast sfast
2830 #define slow sslow
2831 #define dissect sdissect
2832 #define backref sbackref
2833 #define step sstep
2834 #define print sprint
2835 #define at sat
2836 #define match smat
2837 #endif
2838 #ifdef LNAMES
2839 #define matcher lmatcher
2840 #define fast lfast
2841 #define slow lslow
2842 #define dissect ldissect
2843 #define backref lbackref
2844 #define step lstep
2845 #define print lprint
2846 #define at lat
2847 #define match lmat
2848 #endif
2849
2850 /* another structure passed up and down to avoid zillions of parameters */
2851 struct match {
2852 struct re_guts *g;
2853 int eflags;
2854 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
2855 char *offp; /* offsets work from here */
2856 char *beginp; /* start of string -- virtual NUL precedes */
2857 char *endp; /* end of string -- virtual NUL here */
2858 char *coldp; /* can be no match starting before here */
2859 char **lastpos; /* [nplus+1] */
2860 STATEVARS;
2861 states st; /* current states */
2862 states fresh; /* states for a fresh start */
2863 states tmp; /* temporary */
2864 states empty; /* empty set of states */
2865 };
2866
2867 static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
2868 static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2869 static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
2870 static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2871 static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2872 static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft);
2873 #define BOL (OUT+1)
2874 #define EOL (BOL+1)
2875 #define BOLEOL (BOL+2)
2876 #define NOTHING (BOL+3)
2877 #define BOW (BOL+4)
2878 #define EOW (BOL+5)
2879 #define CODEMAX (BOL+5) /* highest code used */
2880 #define NONCHAR(c) ((c) > CHAR_MAX)
2881 #define NNONCHAR (CODEMAX-CHAR_MAX)
2882 #define SP(t, s, c) /* nothing */
2883 #define AT(t, p1, p2, s1, s2) /* nothing */
2884 #define NOTE(s) /* nothing */
2885
2886 /*
2887 - matcher - the actual matching engine
2888 == static int matcher(register struct re_guts *g, char *string, \
2889 == size_t nmatch, regmatch_t pmatch[], int eflags);
2890 */
2891 static int /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)2892 matcher(g, string, nmatch, pmatch, eflags)
2893 register struct re_guts *g;
2894 char *string;
2895 size_t nmatch;
2896 regmatch_t pmatch[];
2897 int eflags;
2898 {
2899 register char *endp;
2900 register int i;
2901 struct match mv;
2902 register struct match *m = &mv;
2903 register char *dp;
2904 const register sopno gf = g->firststate+1; /* +1 for OEND */
2905 const register sopno gl = g->laststate;
2906 char *start;
2907 char *stop;
2908
2909 /* simplify the situation where possible */
2910 if (g->cflags®_NOSUB)
2911 nmatch = 0;
2912 if (eflags®_STARTEND) {
2913 start = string + pmatch[0].rm_so;
2914 stop = string + pmatch[0].rm_eo;
2915 } else {
2916 start = string;
2917 stop = start + strlen(start);
2918 }
2919 if (stop < start)
2920 return(REG_INVARG);
2921
2922 /* prescreening; this does wonders for this rather slow code */
2923 if (g->must != NULL) {
2924 for (dp = start; dp < stop; dp++)
2925 if (*dp == g->must[0] && stop - dp >= g->mlen &&
2926 memcmp(dp, g->must, (size_t)g->mlen) == 0)
2927 break;
2928 if (dp == stop) /* we didn't find g->must */
2929 return(REG_NOMATCH);
2930 }
2931
2932 /* match struct setup */
2933 m->g = g;
2934 m->eflags = eflags;
2935 m->pmatch = NULL;
2936 m->lastpos = NULL;
2937 m->offp = string;
2938 m->beginp = start;
2939 m->endp = stop;
2940 STATESETUP(m, 4);
2941 SETUP(m->st);
2942 SETUP(m->fresh);
2943 SETUP(m->tmp);
2944 SETUP(m->empty);
2945 CLEAR(m->empty);
2946
2947 /* this loop does only one repetition except for backrefs */
2948 for (;;) {
2949 endp = fast(m, start, stop, gf, gl);
2950 if (endp == NULL) { /* a miss */
2951 STATETEARDOWN(m);
2952 return(REG_NOMATCH);
2953 }
2954 if (nmatch == 0 && !g->backrefs)
2955 break; /* no further info needed */
2956
2957 /* where? */
2958 assert(m->coldp != NULL);
2959 for (;;) {
2960 NOTE("finding start");
2961 endp = slow(m, m->coldp, stop, gf, gl);
2962 if (endp != NULL)
2963 break;
2964 assert(m->coldp < m->endp);
2965 m->coldp++;
2966 }
2967 if (nmatch == 1 && !g->backrefs)
2968 break; /* no further info needed */
2969
2970 /* oh my, he wants the subexpressions... */
2971 if (m->pmatch == NULL)
2972 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
2973 sizeof(regmatch_t));
2974 if (m->pmatch == NULL) {
2975 STATETEARDOWN(m);
2976 return(REG_ESPACE);
2977 }
2978 for (i = 1; i <= m->g->nsub; i++)
2979 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
2980 if (!g->backrefs && !(m->eflags®_BACKR)) {
2981 NOTE("dissecting");
2982 dp = dissect(m, m->coldp, endp, gf, gl);
2983 } else {
2984 if (g->nplus > 0 && m->lastpos == NULL)
2985 m->lastpos = (char **)malloc((g->nplus+1) *
2986 sizeof(char *));
2987 if (g->nplus > 0 && m->lastpos == NULL) {
2988 free(m->pmatch);
2989 STATETEARDOWN(m);
2990 return(REG_ESPACE);
2991 }
2992 NOTE("backref dissect");
2993 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2994 }
2995 if (dp != NULL)
2996 break;
2997
2998 /* uh-oh... we couldn't find a subexpression-level match */
2999 assert(g->backrefs); /* must be back references doing it */
3000 assert(g->nplus == 0 || m->lastpos != NULL);
3001 for (;;) {
3002 if (dp != NULL || endp <= m->coldp)
3003 break; /* defeat */
3004 NOTE("backoff");
3005 endp = slow(m, m->coldp, endp-1, gf, gl);
3006 if (endp == NULL)
3007 break; /* defeat */
3008 /* try it on a shorter possibility */
3009 NOTE("backoff dissect");
3010 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
3011 }
3012 assert(dp == NULL || dp == endp);
3013 if (dp != NULL) /* found a shorter one */
3014 break;
3015
3016 /* despite initial appearances, there is no match here */
3017 NOTE("false alarm");
3018 start = m->coldp + 1; /* recycle starting later */
3019 assert(start <= stop);
3020 }
3021
3022 /* fill in the details if requested */
3023 if (nmatch > 0) {
3024 pmatch[0].rm_so = m->coldp - m->offp;
3025 pmatch[0].rm_eo = endp - m->offp;
3026 }
3027 if (nmatch > 1) {
3028 assert(m->pmatch != NULL);
3029 for (i = 1; i < nmatch; i++)
3030 if (i <= m->g->nsub)
3031 pmatch[i] = m->pmatch[i];
3032 else {
3033 pmatch[i].rm_so = -1;
3034 pmatch[i].rm_eo = -1;
3035 }
3036 }
3037
3038 if (m->pmatch != NULL)
3039 free((char *)m->pmatch);
3040 if (m->lastpos != NULL)
3041 free((char *)m->lastpos);
3042 STATETEARDOWN(m);
3043 return(0);
3044 }
3045
3046 /*
3047 - dissect - figure out what matched what, no back references
3048 == static char *dissect(register struct match *m, char *start, \
3049 == char *stop, sopno startst, sopno stopst);
3050 */
3051 static char * /* == stop (success) always */
dissect(m,start,stop,startst,stopst)3052 dissect(m, start, stop, startst, stopst)
3053 register struct match *m;
3054 char *start;
3055 char *stop;
3056 sopno startst;
3057 sopno stopst;
3058 {
3059 register int i;
3060 register sopno ss; /* start sop of current subRE */
3061 register sopno es; /* end sop of current subRE */
3062 register char *sp; /* start of string matched by it */
3063 register char *stp; /* string matched by it cannot pass here */
3064 register char *rest; /* start of rest of string */
3065 register char *tail; /* string unmatched by rest of RE */
3066 register sopno ssub; /* start sop of subsubRE */
3067 register sopno esub; /* end sop of subsubRE */
3068 register char *ssp; /* start of string matched by subsubRE */
3069 register char *sep; /* end of string matched by subsubRE */
3070 register char *oldssp; /* previous ssp */
3071 register char *dp;
3072
3073 AT("diss", start, stop, startst, stopst);
3074 sp = start;
3075 for (ss = startst; ss < stopst; ss = es) {
3076 /* identify end of subRE */
3077 es = ss;
3078 switch (OP(m->g->strip[es])) {
3079 case OPLUS_:
3080 case OQUEST_:
3081 es += OPND(m->g->strip[es]);
3082 break;
3083 case OCH_:
3084 while (OP(m->g->strip[es]) != O_CH)
3085 es += OPND(m->g->strip[es]);
3086 break;
3087 }
3088 es++;
3089
3090 /* figure out what it matched */
3091 switch (OP(m->g->strip[ss])) {
3092 case OEND:
3093 assert(nope);
3094 break;
3095 case OCHAR:
3096 sp++;
3097 break;
3098 case OBOL:
3099 case OEOL:
3100 case OBOW:
3101 case OEOW:
3102 break;
3103 case OANY:
3104 case OANYOF:
3105 sp++;
3106 break;
3107 case OBACK_:
3108 case O_BACK:
3109 assert(nope);
3110 break;
3111 /* cases where length of match is hard to find */
3112 case OQUEST_:
3113 stp = stop;
3114 for (;;) {
3115 /* how long could this one be? */
3116 rest = slow(m, sp, stp, ss, es);
3117 assert(rest != NULL); /* it did match */
3118 /* could the rest match the rest? */
3119 tail = slow(m, rest, stop, es, stopst);
3120 if (tail == stop)
3121 break; /* yes! */
3122 /* no -- try a shorter match for this one */
3123 stp = rest - 1;
3124 assert(stp >= sp); /* it did work */
3125 }
3126 ssub = ss + 1;
3127 esub = es - 1;
3128 /* did innards match? */
3129 if (slow(m, sp, rest, ssub, esub) != NULL) {
3130 dp = dissect(m, sp, rest, ssub, esub);
3131 assert(dp == rest);
3132 } else /* no */
3133 assert(sp == rest);
3134 sp = rest;
3135 break;
3136 case OPLUS_:
3137 stp = stop;
3138 for (;;) {
3139 /* how long could this one be? */
3140 rest = slow(m, sp, stp, ss, es);
3141 assert(rest != NULL); /* it did match */
3142 /* could the rest match the rest? */
3143 tail = slow(m, rest, stop, es, stopst);
3144 if (tail == stop)
3145 break; /* yes! */
3146 /* no -- try a shorter match for this one */
3147 stp = rest - 1;
3148 assert(stp >= sp); /* it did work */
3149 }
3150 ssub = ss + 1;
3151 esub = es - 1;
3152 ssp = sp;
3153 oldssp = ssp;
3154 for (;;) { /* find last match of innards */
3155 sep = slow(m, ssp, rest, ssub, esub);
3156 if (sep == NULL || sep == ssp)
3157 break; /* failed or matched null */
3158 oldssp = ssp; /* on to next try */
3159 ssp = sep;
3160 }
3161 if (sep == NULL) {
3162 /* last successful match */
3163 sep = ssp;
3164 ssp = oldssp;
3165 }
3166 assert(sep == rest); /* must exhaust substring */
3167 assert(slow(m, ssp, sep, ssub, esub) == rest);
3168 dp = dissect(m, ssp, sep, ssub, esub);
3169 assert(dp == sep);
3170 sp = rest;
3171 break;
3172 case OCH_:
3173 stp = stop;
3174 for (;;) {
3175 /* how long could this one be? */
3176 rest = slow(m, sp, stp, ss, es);
3177 assert(rest != NULL); /* it did match */
3178 /* could the rest match the rest? */
3179 tail = slow(m, rest, stop, es, stopst);
3180 if (tail == stop)
3181 break; /* yes! */
3182 /* no -- try a shorter match for this one */
3183 stp = rest - 1;
3184 assert(stp >= sp); /* it did work */
3185 }
3186 ssub = ss + 1;
3187 esub = ss + OPND(m->g->strip[ss]) - 1;
3188 assert(OP(m->g->strip[esub]) == OOR1);
3189 for (;;) { /* find first matching branch */
3190 if (slow(m, sp, rest, ssub, esub) == rest)
3191 break; /* it matched all of it */
3192 /* that one missed, try next one */
3193 assert(OP(m->g->strip[esub]) == OOR1);
3194 esub++;
3195 assert(OP(m->g->strip[esub]) == OOR2);
3196 ssub = esub + 1;
3197 esub += OPND(m->g->strip[esub]);
3198 if (OP(m->g->strip[esub]) == OOR2)
3199 esub--;
3200 else
3201 assert(OP(m->g->strip[esub]) == O_CH);
3202 }
3203 dp = dissect(m, sp, rest, ssub, esub);
3204 assert(dp == rest);
3205 sp = rest;
3206 break;
3207 case O_PLUS:
3208 case O_QUEST:
3209 case OOR1:
3210 case OOR2:
3211 case O_CH:
3212 assert(nope);
3213 break;
3214 case OLPAREN:
3215 i = OPND(m->g->strip[ss]);
3216 assert(0 < i && i <= m->g->nsub);
3217 m->pmatch[i].rm_so = sp - m->offp;
3218 break;
3219 case ORPAREN:
3220 i = OPND(m->g->strip[ss]);
3221 assert(0 < i && i <= m->g->nsub);
3222 m->pmatch[i].rm_eo = sp - m->offp;
3223 break;
3224 default: /* uh oh */
3225 assert(nope);
3226 break;
3227 }
3228 }
3229
3230 assert(sp == stop);
3231 return(sp);
3232 }
3233
3234 /*
3235 - backref - figure out what matched what, figuring in back references
3236 == static char *backref(register struct match *m, char *start, \
3237 == char *stop, sopno startst, sopno stopst, sopno lev);
3238 */
3239 static char * /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)3240 backref(m, start, stop, startst, stopst, lev)
3241 register struct match *m;
3242 char *start;
3243 char *stop;
3244 sopno startst;
3245 sopno stopst;
3246 sopno lev; /* PLUS nesting level */
3247 {
3248 register int i;
3249 register sopno ss; /* start sop of current subRE */
3250 register char *sp; /* start of string matched by it */
3251 register sopno ssub; /* start sop of subsubRE */
3252 register sopno esub; /* end sop of subsubRE */
3253 register char *ssp; /* start of string matched by subsubRE */
3254 register char *dp;
3255 register size_t len;
3256 register int hard;
3257 register sop s;
3258 register regoff_t offsave;
3259 register cset *cs;
3260
3261 AT("back", start, stop, startst, stopst);
3262 sp = start;
3263
3264 /* get as far as we can with easy stuff */
3265 hard = 0;
3266 for (ss = startst; !hard && ss < stopst; ss++)
3267 switch (OP(s = m->g->strip[ss])) {
3268 case OCHAR:
3269 if (sp == stop || *sp++ != (char)OPND(s))
3270 return(NULL);
3271 break;
3272 case OANY:
3273 if (sp == stop)
3274 return(NULL);
3275 sp++;
3276 break;
3277 case OANYOF:
3278 cs = &m->g->sets[OPND(s)];
3279 if (sp == stop || !CHIN(cs, *sp++))
3280 return(NULL);
3281 break;
3282 case OBOL:
3283 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
3284 (sp < m->endp && *(sp-1) == '\n' &&
3285 (m->g->cflags®_NEWLINE)) )
3286 { /* yes */ }
3287 else
3288 return(NULL);
3289 break;
3290 case OEOL:
3291 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
3292 (sp < m->endp && *sp == '\n' &&
3293 (m->g->cflags®_NEWLINE)) )
3294 { /* yes */ }
3295 else
3296 return(NULL);
3297 break;
3298 case OBOW:
3299 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
3300 (sp < m->endp && *(sp-1) == '\n' &&
3301 (m->g->cflags®_NEWLINE)) ||
3302 (sp > m->beginp &&
3303 !ISWORD(*(sp-1))) ) &&
3304 (sp < m->endp && ISWORD(*sp)) )
3305 { /* yes */ }
3306 else
3307 return(NULL);
3308 break;
3309 case OEOW:
3310 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
3311 (sp < m->endp && *sp == '\n' &&
3312 (m->g->cflags®_NEWLINE)) ||
3313 (sp < m->endp && !ISWORD(*sp)) ) &&
3314 (sp > m->beginp && ISWORD(*(sp-1))) )
3315 { /* yes */ }
3316 else
3317 return(NULL);
3318 break;
3319 case O_QUEST:
3320 break;
3321 case OOR1: /* matches null but needs to skip */
3322 ss++;
3323 s = m->g->strip[ss];
3324 do {
3325 assert(OP(s) == OOR2);
3326 ss += OPND(s);
3327 } while (OP(s = m->g->strip[ss]) != O_CH);
3328 /* note that the ss++ gets us past the O_CH */
3329 break;
3330 default: /* have to make a choice */
3331 hard = 1;
3332 break;
3333 }
3334 if (!hard) { /* that was it! */
3335 if (sp != stop)
3336 return(NULL);
3337 return(sp);
3338 }
3339 ss--; /* adjust for the for's final increment */
3340
3341 /* the hard stuff */
3342 AT("hard", sp, stop, ss, stopst);
3343 s = m->g->strip[ss];
3344 switch (OP(s)) {
3345 case OBACK_: /* the vilest depths */
3346 i = OPND(s);
3347 assert(0 < i && i <= m->g->nsub);
3348 if (m->pmatch[i].rm_eo == -1)
3349 return(NULL);
3350 assert(m->pmatch[i].rm_so != -1);
3351 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
3352 assert(stop - m->beginp >= len);
3353 if (sp > stop - len)
3354 return(NULL); /* not enough left to match */
3355 ssp = m->offp + m->pmatch[i].rm_so;
3356 if (memcmp(sp, ssp, len) != 0)
3357 return(NULL);
3358 while (m->g->strip[ss] != SOP(O_BACK, i))
3359 ss++;
3360 return(backref(m, sp+len, stop, ss+1, stopst, lev));
3361 break;
3362 case OQUEST_: /* to null or not */
3363 dp = backref(m, sp, stop, ss+1, stopst, lev);
3364 if (dp != NULL)
3365 return(dp); /* not */
3366 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
3367 break;
3368 case OPLUS_:
3369 assert(m->lastpos != NULL);
3370 assert(lev+1 <= m->g->nplus);
3371 m->lastpos[lev+1] = sp;
3372 return(backref(m, sp, stop, ss+1, stopst, lev+1));
3373 break;
3374 case O_PLUS:
3375 if (sp == m->lastpos[lev]) /* last pass matched null */
3376 return(backref(m, sp, stop, ss+1, stopst, lev-1));
3377 /* try another pass */
3378 m->lastpos[lev] = sp;
3379 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
3380 if (dp == NULL)
3381 return(backref(m, sp, stop, ss+1, stopst, lev-1));
3382 else
3383 return(dp);
3384 break;
3385 case OCH_: /* find the right one, if any */
3386 ssub = ss + 1;
3387 esub = ss + OPND(s) - 1;
3388 assert(OP(m->g->strip[esub]) == OOR1);
3389 for (;;) { /* find first matching branch */
3390 dp = backref(m, sp, stop, ssub, esub, lev);
3391 if (dp != NULL)
3392 return(dp);
3393 /* that one missed, try next one */
3394 if (OP(m->g->strip[esub]) == O_CH)
3395 return(NULL); /* there is none */
3396 esub++;
3397 assert(OP(m->g->strip[esub]) == OOR2);
3398 ssub = esub + 1;
3399 esub += OPND(m->g->strip[esub]);
3400 if (OP(m->g->strip[esub]) == OOR2)
3401 esub--;
3402 else
3403 assert(OP(m->g->strip[esub]) == O_CH);
3404 }
3405 break;
3406 case OLPAREN: /* must undo assignment if rest fails */
3407 i = OPND(s);
3408 assert(0 < i && i <= m->g->nsub);
3409 offsave = m->pmatch[i].rm_so;
3410 m->pmatch[i].rm_so = sp - m->offp;
3411 dp = backref(m, sp, stop, ss+1, stopst, lev);
3412 if (dp != NULL)
3413 return(dp);
3414 m->pmatch[i].rm_so = offsave;
3415 return(NULL);
3416 break;
3417 case ORPAREN: /* must undo assignment if rest fails */
3418 i = OPND(s);
3419 assert(0 < i && i <= m->g->nsub);
3420 offsave = m->pmatch[i].rm_eo;
3421 m->pmatch[i].rm_eo = sp - m->offp;
3422 dp = backref(m, sp, stop, ss+1, stopst, lev);
3423 if (dp != NULL)
3424 return(dp);
3425 m->pmatch[i].rm_eo = offsave;
3426 return(NULL);
3427 break;
3428 default: /* uh oh */
3429 assert(nope);
3430 break;
3431 }
3432
3433 /* "can't happen" */
3434 assert(nope);
3435 /* NOTREACHED */
3436 return((char *)NULL); /* dummy */
3437 }
3438
3439 /*
3440 - fast - step through the string at top speed
3441 == static char *fast(register struct match *m, char *start, \
3442 == char *stop, sopno startst, sopno stopst);
3443 */
3444 static char * /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)3445 fast(m, start, stop, startst, stopst)
3446 register struct match *m;
3447 char *start;
3448 char *stop;
3449 sopno startst;
3450 sopno stopst;
3451 {
3452 register states st = m->st;
3453 register states fresh = m->fresh;
3454 register states tmp = m->tmp;
3455 register char *p = start;
3456 register int c = (start == m->beginp) ? OUT : *(start-1);
3457 register int lastc; /* previous c */
3458 register int flagch;
3459 register int i;
3460 register char *coldp; /* last p after which no match was underway */
3461
3462 CLEAR(st);
3463 SET1(st, startst);
3464 st = step(m->g, startst, stopst, st, NOTHING, st);
3465 ASSIGN(fresh, st);
3466 SP("start", st, *p);
3467 coldp = NULL;
3468 for (;;) {
3469 /* next character */
3470 lastc = c;
3471 c = (p == m->endp) ? OUT : *p;
3472 if (EQ(st, fresh))
3473 coldp = p;
3474
3475 /* is there an EOL and/or BOL between lastc and c? */
3476 flagch = '\0';
3477 i = 0;
3478 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
3479 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
3480 flagch = BOL;
3481 i = m->g->nbol;
3482 }
3483 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
3484 (c == OUT && !(m->eflags®_NOTEOL)) ) {
3485 flagch = (flagch == BOL) ? BOLEOL : EOL;
3486 i += m->g->neol;
3487 }
3488 if (i != 0) {
3489 for (; i > 0; i--)
3490 st = step(m->g, startst, stopst, st, flagch, st);
3491 SP("boleol", st, c);
3492 }
3493
3494 /* how about a word boundary? */
3495 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
3496 (c != OUT && ISWORD(c)) ) {
3497 flagch = BOW;
3498 }
3499 if ( (lastc != OUT && ISWORD(lastc)) &&
3500 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
3501 flagch = EOW;
3502 }
3503 if (flagch == BOW || flagch == EOW) {
3504 st = step(m->g, startst, stopst, st, flagch, st);
3505 SP("boweow", st, c);
3506 }
3507
3508 /* are we done? */
3509 if (ISSET(st, stopst) || p == stop)
3510 break; /* NOTE BREAK OUT */
3511
3512 /* no, we must deal with this character */
3513 ASSIGN(tmp, st);
3514 ASSIGN(st, fresh);
3515 assert(c != OUT);
3516 st = step(m->g, startst, stopst, tmp, c, st);
3517 SP("aft", st, c);
3518 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
3519 p++;
3520 }
3521
3522 assert(coldp != NULL);
3523 m->coldp = coldp;
3524 if (ISSET(st, stopst))
3525 return(p+1);
3526 else
3527 return(NULL);
3528 }
3529
3530 /*
3531 - slow - step through the string more deliberately
3532 == static char *slow(register struct match *m, char *start, \
3533 == char *stop, sopno startst, sopno stopst);
3534 */
3535 static char * /* where it ended */
slow(m,start,stop,startst,stopst)3536 slow(m, start, stop, startst, stopst)
3537 register struct match *m;
3538 char *start;
3539 char *stop;
3540 sopno startst;
3541 sopno stopst;
3542 {
3543 register states st = m->st;
3544 register states empty = m->empty;
3545 register states tmp = m->tmp;
3546 register char *p = start;
3547 register int c = (start == m->beginp) ? OUT : *(start-1);
3548 register int lastc; /* previous c */
3549 register int flagch;
3550 register int i;
3551 register char *matchp; /* last p at which a match ended */
3552
3553 AT("slow", start, stop, startst, stopst);
3554 CLEAR(st);
3555 SET1(st, startst);
3556 SP("sstart", st, *p);
3557 st = step(m->g, startst, stopst, st, NOTHING, st);
3558 matchp = NULL;
3559 for (;;) {
3560 /* next character */
3561 lastc = c;
3562 c = (p == m->endp) ? OUT : *p;
3563
3564 /* is there an EOL and/or BOL between lastc and c? */
3565 flagch = '\0';
3566 i = 0;
3567 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
3568 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
3569 flagch = BOL;
3570 i = m->g->nbol;
3571 }
3572 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
3573 (c == OUT && !(m->eflags®_NOTEOL)) ) {
3574 flagch = (flagch == BOL) ? BOLEOL : EOL;
3575 i += m->g->neol;
3576 }
3577 if (i != 0) {
3578 for (; i > 0; i--)
3579 st = step(m->g, startst, stopst, st, flagch, st);
3580 SP("sboleol", st, c);
3581 }
3582
3583 /* how about a word boundary? */
3584 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
3585 (c != OUT && ISWORD(c)) ) {
3586 flagch = BOW;
3587 }
3588 if ( (lastc != OUT && ISWORD(lastc)) &&
3589 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
3590 flagch = EOW;
3591 }
3592 if (flagch == BOW || flagch == EOW) {
3593 st = step(m->g, startst, stopst, st, flagch, st);
3594 SP("sboweow", st, c);
3595 }
3596
3597 /* are we done? */
3598 if (ISSET(st, stopst))
3599 matchp = p;
3600 if (EQ(st, empty) || p == stop)
3601 break; /* NOTE BREAK OUT */
3602
3603 /* no, we must deal with this character */
3604 ASSIGN(tmp, st);
3605 ASSIGN(st, empty);
3606 assert(c != OUT);
3607 st = step(m->g, startst, stopst, tmp, c, st);
3608 SP("saft", st, c);
3609 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
3610 p++;
3611 }
3612
3613 return(matchp);
3614 }
3615
3616
3617 static states
step(g,start,stop,bef,ch,aft)3618 step(g, start, stop, bef, ch, aft)
3619 register struct re_guts *g;
3620 sopno start; /* start state within strip */
3621 sopno stop; /* state after stop state within strip */
3622 register states bef; /* states reachable before */
3623 int ch; /* character or NONCHAR code */
3624 register states aft; /* states already known reachable after */
3625 {
3626 register cset *cs;
3627 register sop s;
3628 register sopno pc;
3629 register onestate here; /* note, macros know this name */
3630 register sopno look;
3631 register long i;
3632
3633 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
3634 s = g->strip[pc];
3635 switch (OP(s)) {
3636 case OEND:
3637 assert(pc == stop-1);
3638 break;
3639 case OCHAR:
3640 /* only characters can match */
3641 assert(!NONCHAR(ch) || ch != (char)OPND(s));
3642 if (ch == (char)OPND(s))
3643 FWD(aft, bef, 1);
3644 break;
3645 case OBOL:
3646 if (ch == BOL || ch == BOLEOL)
3647 FWD(aft, bef, 1);
3648 break;
3649 case OEOL:
3650 if (ch == EOL || ch == BOLEOL)
3651 FWD(aft, bef, 1);
3652 break;
3653 case OBOW:
3654 if (ch == BOW)
3655 FWD(aft, bef, 1);
3656 break;
3657 case OEOW:
3658 if (ch == EOW)
3659 FWD(aft, bef, 1);
3660 break;
3661 case OANY:
3662 if (!NONCHAR(ch))
3663 FWD(aft, bef, 1);
3664 break;
3665 case OANYOF:
3666 cs = &g->sets[OPND(s)];
3667 if (!NONCHAR(ch) && CHIN(cs, ch))
3668 FWD(aft, bef, 1);
3669 break;
3670 case OBACK_: /* ignored here */
3671 case O_BACK:
3672 FWD(aft, aft, 1);
3673 break;
3674 case OPLUS_: /* forward, this is just an empty */
3675 FWD(aft, aft, 1);
3676 break;
3677 case O_PLUS: /* both forward and back */
3678 FWD(aft, aft, 1);
3679 i = ISSETBACK(aft, OPND(s));
3680 BACK(aft, aft, OPND(s));
3681 if (!i && ISSETBACK(aft, OPND(s))) {
3682 /* oho, must reconsider loop body */
3683 pc -= OPND(s) + 1;
3684 INIT(here, pc);
3685 }
3686 break;
3687 case OQUEST_: /* two branches, both forward */
3688 FWD(aft, aft, 1);
3689 FWD(aft, aft, OPND(s));
3690 break;
3691 case O_QUEST: /* just an empty */
3692 FWD(aft, aft, 1);
3693 break;
3694 case OLPAREN: /* not significant here */
3695 case ORPAREN:
3696 FWD(aft, aft, 1);
3697 break;
3698 case OCH_: /* mark the first two branches */
3699 FWD(aft, aft, 1);
3700 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
3701 FWD(aft, aft, OPND(s));
3702 break;
3703 case OOR1: /* done a branch, find the O_CH */
3704 if (ISSTATEIN(aft, here)) {
3705 for (look = 1;
3706 OP(s = g->strip[pc+look]) != O_CH;
3707 look += OPND(s))
3708 assert(OP(s) == OOR2);
3709 FWD(aft, aft, look);
3710 }
3711 break;
3712 case OOR2: /* propagate OCH_'s marking */
3713 FWD(aft, aft, 1);
3714 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
3715 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
3716 FWD(aft, aft, OPND(s));
3717 }
3718 break;
3719 case O_CH: /* just empty */
3720 FWD(aft, aft, 1);
3721 break;
3722 default: /* ooooops... */
3723 assert(nope);
3724 break;
3725 }
3726 }
3727
3728 return(aft);
3729 }
3730
3731 #undef matcher
3732 #undef fast
3733 #undef slow
3734 #undef dissect
3735 #undef backref
3736 #undef step
3737 #undef print
3738 #undef at
3739 #undef match
3740
3741 int /* 0 success, REG_NOMATCH failure */
regexec(preg,string,nmatch,pmatch,eflags)3742 regexec(preg, string, nmatch, pmatch, eflags)
3743 const regex_t *preg;
3744 const char *string;
3745 size_t nmatch;
3746 regmatch_t pmatch[];
3747 int eflags;
3748 {
3749 register struct re_guts *g = preg->re_g;
3750 #define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
3751
3752 if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
3753 return(REG_BADPAT);
3754 assert(!(g->iflags&BAD));
3755 if (g->iflags&BAD) /* backstop for no-debug case */
3756 return(REG_BADPAT);
3757 eflags = GOODFLAGS(eflags);
3758
3759 if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags®_LARGE))
3760 return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
3761 else
3762 return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));
3763 #undef GOODFLAGS
3764 }
3765 /*
3766 - regfree - free everything
3767 = extern void regfree(regex_t *);
3768 */
3769 void
regfree(preg)3770 regfree(preg)
3771 regex_t *preg;
3772 {
3773 register struct re_guts *g;
3774
3775 if (preg->re_magic != MAGIC1) /* oops */
3776 return; /* nice to complain, but hard */
3777
3778 g = preg->re_g;
3779 if (g == NULL || g->magic != MAGIC2) /* oops again */
3780 return;
3781 preg->re_magic = 0; /* mark it invalid */
3782 g->magic = 0; /* mark it invalid */
3783
3784 if (g->strip != NULL)
3785 free((char *)g->strip);
3786 if (g->sets != NULL)
3787 free((char *)g->sets);
3788 if (g->setbits != NULL)
3789 free((char *)g->setbits);
3790 if (g->must != NULL)
3791 free(g->must);
3792 free((char *)g);
3793 }
3794
3795 #ifdef WITH_MAIN
main(int argc,char * argv[])3796 int main(int argc, char* argv[]){
3797 regex_t preg;
3798 int i, s;
3799 if(argc<2) return 1;
3800 if (regcomp(&preg, argv[1], REG_NOSUB)) {
3801 fprintf(stderr, "Failed to compile regex\n");
3802 return 2;
3803 }
3804 for (i = 2; i<argc; i++){
3805 s = regexec(&preg, argv[i], 0, NULL, 0);
3806 fprintf(stdout, "String[%d]: ", i-1);
3807 switch(s){
3808 case 0:
3809 printf("match\n");
3810 break;
3811 case REG_NOMATCH:
3812 printf("not match\n");
3813 break;
3814 default:
3815 printf("failed (%d)\n", s);
3816 }
3817 }
3818 regfree(&preg);
3819 return 0;
3820 }
3821 #endif
3822