1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
38 */
39
40 #include <sys/types.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <ctype.h>
44 #include <limits.h>
45 #include <stdlib.h>
46 #include "regex.h"
47
48 #include "utils.h"
49 #include "regex2.h"
50
51 #include "cclass.h"
52 #include "cname.h"
53
54 /*
55 * parse structure, passed up and down to avoid global variables and
56 * other clumsinesses
57 */
58 struct parse {
59 char *next; /* next character in RE */
60 char *end; /* end of string (-> NUL normally) */
61 int error; /* has an error been seen? */
62 sop *strip; /* malloced strip */
63 sopno ssize; /* malloced strip size (allocated) */
64 sopno slen; /* malloced strip length (used) */
65 int ncsalloc; /* number of csets allocated */
66 struct re_guts *g;
67 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
68 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
69 sopno pend[NPAREN]; /* -> ) ([0] unused) */
70 };
71
72 /* ========= begin header generated by ./mkh ========= */
73 #ifdef __cplusplus
74 extern "C" {
75 #endif
76
77 /* === regcomp.c === */
78 static void p_ere __P((struct parse *p, int stop));
79 static void p_ere_exp __P((struct parse *p));
80 static void p_str __P((struct parse *p));
81 static void p_bre __P((struct parse *p, int end1, int end2));
82 static int p_simp_re __P((struct parse *p, int starordinary));
83 static int p_count __P((struct parse *p));
84 static void p_bracket __P((struct parse *p));
85 static void p_b_term __P((struct parse *p, cset *cs));
86 static void p_b_cclass __P((struct parse *p, cset *cs));
87 static void p_b_eclass __P((struct parse *p, cset *cs));
88 static char p_b_symbol __P((struct parse *p));
89 static char p_b_coll_elem __P((struct parse *p, int endc));
90 static char othercase __P((int ch));
91 static void bothcases __P((struct parse *p, int ch));
92 static void ordinary __P((struct parse *p, int ch));
93 static void nonnewline __P((struct parse *p));
94 static void repeat __P((struct parse *p, sopno start, int from, int to));
95 static int seterr __P((struct parse *p, int e));
96 static cset *allocset __P((struct parse *p));
97 static void freeset __P((struct parse *p, cset *cs));
98 static int freezeset __P((struct parse *p, cset *cs));
99 static int firstch __P((struct parse *p, cset *cs));
100 static int nch __P((struct parse *p, cset *cs));
101 static void mcadd __P((struct parse *p, cset *cs, char *cp));
102 static void mcinvert __P((struct parse *p, cset *cs));
103 static void mccase __P((struct parse *p, cset *cs));
104 static int isinsets __P((struct re_guts *g, int c));
105 static int samesets __P((struct re_guts *g, int c1, int c2));
106 static void categorize __P((struct parse *p, struct re_guts *g));
107 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
108 static void doemit __P((struct parse *p, sop op, size_t opnd));
109 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
110 static void dofwd __P((struct parse *p, sopno pos, sop value));
111 static void enlarge __P((struct parse *p, sopno size));
112 static void stripsnug __P((struct parse *p, struct re_guts *g));
113 static void findmust __P((struct parse *p, struct re_guts *g));
114 static sopno pluscount __P((struct parse *p, struct re_guts *g));
115
116 #ifdef __cplusplus
117 }
118 #endif
119 /* ========= end header generated by ./mkh ========= */
120
121 static char nuls[10]; /* place to point scanner in event of error */
122
123 /*
124 * macros for use with parse structure
125 * BEWARE: these know that the parse structure is named `p' !!!
126 */
127 #define PEEK() (*p->next)
128 #define PEEK2() (*(p->next+1))
129 #define MORE() (p->next < p->end)
130 #define MORE2() (p->next+1 < p->end)
131 #define SEE(c) (MORE() && PEEK() == (c))
132 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
133 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
134 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
135 #define NEXT() (p->next++)
136 #define NEXT2() (p->next += 2)
137 #define NEXTn(n) (p->next += (n))
138 #define GETNEXT() (*p->next++)
139 #define SETERROR(e) seterr(p, (e))
140 #define REQUIRE(co, e) ((co) || SETERROR(e))
141 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
142 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
143 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
144 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
145 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
146 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
147 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
148 #define HERE() (p->slen)
149 #define THERE() (p->slen - 1)
150 #define THERETHERE() (p->slen - 2)
151 #define DROP(n) (p->slen -= (n))
152
153 #ifndef NDEBUG
154 static int never = 0; /* for use in asserts; shuts lint up */
155 #else
156 #define never 0 /* some <assert.h>s have bugs too */
157 #endif
158
159 /*
160 - regcomp - interface for parser and compilation
161 = extern int regcomp(regex_t *, const char *, int);
162 = #define REG_BASIC 0000
163 = #define REG_EXTENDED 0001
164 = #define REG_ICASE 0002
165 = #define REG_NOSUB 0004
166 = #define REG_NEWLINE 0010
167 = #define REG_NOSPEC 0020
168 = #define REG_PEND 0040
169 = #define REG_DUMP 0200
170 */
171 int /* 0 success, otherwise REG_something */
regcomp(preg,pattern,cflags)172 regcomp(preg, pattern, cflags)
173 regex_t *preg;
174 const char *pattern;
175 int cflags;
176 {
177 struct parse pa;
178 register struct re_guts *g;
179 register struct parse *p = &pa;
180 register int i;
181 register size_t len;
182 #ifdef REDEBUG
183 # define GOODFLAGS(f) (f)
184 #else
185 # define GOODFLAGS(f) ((f)&~REG_DUMP)
186 #endif
187
188 cflags = GOODFLAGS(cflags);
189 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
190 return(REG_INVARG);
191
192 if (cflags®_PEND) {
193 if (preg->re_endp < pattern)
194 return(REG_INVARG);
195 len = preg->re_endp - pattern;
196 } else
197 len = strlen((char *)pattern);
198
199 /* do the mallocs early so failure handling is easy */
200 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
201 (NC-1)*sizeof(cat_t));
202 if (g == NULL)
203 return(REG_ESPACE);
204 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
205 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
206 p->slen = 0;
207 if (p->strip == NULL) {
208 free((char *)g);
209 return(REG_ESPACE);
210 }
211
212 /* set things up */
213 p->g = g;
214 p->next = (char *)pattern; /* convenience; we do not modify it */
215 p->end = p->next + len;
216 p->error = 0;
217 p->ncsalloc = 0;
218 for (i = 0; i < NPAREN; i++) {
219 p->pbegin[i] = 0;
220 p->pend[i] = 0;
221 }
222 g->csetsize = NC;
223 g->sets = NULL;
224 g->setbits = NULL;
225 g->ncsets = 0;
226 g->cflags = cflags;
227 g->iflags = 0;
228 g->nbol = 0;
229 g->neol = 0;
230 g->must = NULL;
231 g->mlen = 0;
232 g->nsub = 0;
233 g->ncategories = 1; /* category 0 is "everything else" */
234 g->categories = &g->catspace[-(CHAR_MIN)];
235 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
236 g->backrefs = 0;
237
238 /* do it */
239 EMIT(OEND, 0);
240 g->firststate = THERE();
241 if (cflags®_EXTENDED)
242 p_ere(p, OUT);
243 else if (cflags®_NOSPEC)
244 p_str(p);
245 else
246 p_bre(p, OUT, OUT);
247 EMIT(OEND, 0);
248 g->laststate = THERE();
249
250 /* tidy up loose ends and fill things in */
251 categorize(p, g);
252 stripsnug(p, g);
253 findmust(p, g);
254 g->nplus = pluscount(p, g);
255 g->magic = MAGIC2;
256 preg->re_nsub = g->nsub;
257 preg->re_g = g;
258 preg->re_magic = MAGIC1;
259 #ifndef REDEBUG
260 /* not debugging, so can't rely on the assert() in regexec() */
261 if (g->iflags&BAD)
262 SETERROR(REG_ASSERT);
263 #endif
264
265 /* win or lose, we're done */
266 if (p->error != 0) /* lose */
267 regfree(preg);
268 return(p->error);
269 }
270
271 /*
272 - p_ere - ERE parser top level, concatenation and alternation
273 == static void p_ere(register struct parse *p, int stop);
274 */
275 static void
p_ere(p,stop)276 p_ere(p, stop)
277 register struct parse *p;
278 int stop; /* character this ERE should end at */
279 {
280 register char c;
281 register sopno prevback;
282 register sopno prevfwd;
283 register sopno conc;
284 register int first = 1; /* is this the first alternative? */
285
286 for (;;) {
287 /* do a bunch of concatenated expressions */
288 conc = HERE();
289 while (MORE() && (c = PEEK()) != '|' && c != stop)
290 p_ere_exp(p);
291 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
292
293 if (!EAT('|'))
294 break; /* NOTE BREAK OUT */
295
296 if (first) {
297 INSERT(OCH_, conc); /* offset is wrong */
298 prevfwd = conc;
299 prevback = conc;
300 first = 0;
301 }
302 ASTERN(OOR1, prevback);
303 prevback = THERE();
304 AHEAD(prevfwd); /* fix previous offset */
305 prevfwd = HERE();
306 EMIT(OOR2, 0); /* offset is very wrong */
307 }
308
309 if (!first) { /* tail-end fixups */
310 AHEAD(prevfwd);
311 ASTERN(O_CH, prevback);
312 }
313
314 assert(!MORE() || SEE(stop));
315 }
316
317 /*
318 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
319 == static void p_ere_exp(register struct parse *p);
320 */
321 static void
p_ere_exp(p)322 p_ere_exp(p)
323 register struct parse *p;
324 {
325 register char c;
326 register sopno pos;
327 register int count;
328 register int count2;
329 register sopno subno;
330 int wascaret = 0;
331
332 assert(MORE()); /* caller should have ensured this */
333 c = GETNEXT();
334
335 pos = HERE();
336 switch (c) {
337 case '(':
338 REQUIRE(MORE(), REG_EPAREN);
339 p->g->nsub++;
340 subno = p->g->nsub;
341 if (subno < NPAREN)
342 p->pbegin[subno] = HERE();
343 EMIT(OLPAREN, subno);
344 if (!SEE(')'))
345 p_ere(p, ')');
346 if (subno < NPAREN) {
347 p->pend[subno] = HERE();
348 assert(p->pend[subno] != 0);
349 }
350 EMIT(ORPAREN, subno);
351 MUSTEAT(')', REG_EPAREN);
352 break;
353 #ifndef POSIX_MISTAKE
354 case ')': /* happens only if no current unmatched ( */
355 /*
356 * You may ask, why the ifndef? Because I didn't notice
357 * this until slightly too late for 1003.2, and none of the
358 * other 1003.2 regular-expression reviewers noticed it at
359 * all. So an unmatched ) is legal POSIX, at least until
360 * we can get it fixed.
361 */
362 SETERROR(REG_EPAREN);
363 break;
364 #endif
365 case '^':
366 EMIT(OBOL, 0);
367 p->g->iflags |= USEBOL;
368 p->g->nbol++;
369 wascaret = 1;
370 break;
371 case '$':
372 EMIT(OEOL, 0);
373 p->g->iflags |= USEEOL;
374 p->g->neol++;
375 break;
376 case '|':
377 SETERROR(REG_EMPTY);
378 break;
379 case '*':
380 case '+':
381 case '?':
382 SETERROR(REG_BADRPT);
383 break;
384 case '.':
385 if (p->g->cflags®_NEWLINE)
386 nonnewline(p);
387 else
388 EMIT(OANY, 0);
389 break;
390 case '[':
391 p_bracket(p);
392 break;
393 case '\\':
394 REQUIRE(MORE(), REG_EESCAPE);
395 c = GETNEXT();
396 ordinary(p, c);
397 break;
398 case '{': /* okay as ordinary except if digit follows */
399 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
400 /* FALLTHROUGH */
401 default:
402 ordinary(p, c);
403 break;
404 }
405
406 if (!MORE())
407 return;
408 c = PEEK();
409 /* we call { a repetition if followed by a digit */
410 if (!( c == '*' || c == '+' || c == '?' ||
411 (c == '{' && MORE2() && isdigit(PEEK2())) ))
412 return; /* no repetition, we're done */
413 NEXT();
414
415 REQUIRE(!wascaret, REG_BADRPT);
416 switch (c) {
417 case '*': /* implemented as +? */
418 /* this case does not require the (y|) trick, noKLUDGE */
419 INSERT(OPLUS_, pos);
420 ASTERN(O_PLUS, pos);
421 INSERT(OQUEST_, pos);
422 ASTERN(O_QUEST, pos);
423 break;
424 case '+':
425 INSERT(OPLUS_, pos);
426 ASTERN(O_PLUS, pos);
427 break;
428 case '?':
429 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
430 INSERT(OCH_, pos); /* offset slightly wrong */
431 ASTERN(OOR1, pos); /* this one's right */
432 AHEAD(pos); /* fix the OCH_ */
433 EMIT(OOR2, 0); /* offset very wrong... */
434 AHEAD(THERE()); /* ...so fix it */
435 ASTERN(O_CH, THERETHERE());
436 break;
437 case '{':
438 count = p_count(p);
439 if (EAT(',')) {
440 if (isdigit(PEEK())) {
441 count2 = p_count(p);
442 REQUIRE(count <= count2, REG_BADBR);
443 } else /* single number with comma */
444 count2 = INFINITY;
445 } else /* just a single number */
446 count2 = count;
447 repeat(p, pos, count, count2);
448 if (!EAT('}')) { /* error heuristics */
449 while (MORE() && PEEK() != '}')
450 NEXT();
451 REQUIRE(MORE(), REG_EBRACE);
452 SETERROR(REG_BADBR);
453 }
454 break;
455 }
456
457 if (!MORE())
458 return;
459 c = PEEK();
460 if (!( c == '*' || c == '+' || c == '?' ||
461 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
462 return;
463 SETERROR(REG_BADRPT);
464 }
465
466 /*
467 - p_str - string (no metacharacters) "parser"
468 == static void p_str(register struct parse *p);
469 */
470 static void
p_str(p)471 p_str(p)
472 register struct parse *p;
473 {
474 REQUIRE(MORE(), REG_EMPTY);
475 while (MORE())
476 ordinary(p, GETNEXT());
477 }
478
479 /*
480 - p_bre - BRE parser top level, anchoring and concatenation
481 == static void p_bre(register struct parse *p, register int end1, \
482 == register int end2);
483 * Giving end1 as OUT essentially eliminates the end1/end2 check.
484 *
485 * This implementation is a bit of a kludge, in that a trailing $ is first
486 * taken as an ordinary character and then revised to be an anchor. The
487 * only undesirable side effect is that '$' gets included as a character
488 * category in such cases. This is fairly harmless; not worth fixing.
489 * The amount of lookahead needed to avoid this kludge is excessive.
490 */
491 static void
p_bre(p,end1,end2)492 p_bre(p, end1, end2)
493 register struct parse *p;
494 register int end1; /* first terminating character */
495 register int end2; /* second terminating character */
496 {
497 register sopno start = HERE();
498 register int first = 1; /* first subexpression? */
499 register int wasdollar = 0;
500
501 if (EAT('^')) {
502 EMIT(OBOL, 0);
503 p->g->iflags |= USEBOL;
504 p->g->nbol++;
505 }
506 while (MORE() && !SEETWO(end1, end2)) {
507 wasdollar = p_simp_re(p, first);
508 first = 0;
509 }
510 if (wasdollar) { /* oops, that was a trailing anchor */
511 DROP(1);
512 EMIT(OEOL, 0);
513 p->g->iflags |= USEEOL;
514 p->g->neol++;
515 }
516
517 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
518 }
519
520 /*
521 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
522 == static int p_simp_re(register struct parse *p, int starordinary);
523 */
524 static int /* was the simple RE an unbackslashed $? */
p_simp_re(p,starordinary)525 p_simp_re(p, starordinary)
526 register struct parse *p;
527 int starordinary; /* is a leading * an ordinary character? */
528 {
529 register int c;
530 register int count;
531 register int count2;
532 register sopno pos;
533 register int i;
534 register sopno subno;
535 # define BACKSL (1<<CHAR_BIT)
536
537 pos = HERE(); /* repetion op, if any, covers from here */
538
539 assert(MORE()); /* caller should have ensured this */
540 c = GETNEXT();
541 if (c == '\\') {
542 REQUIRE(MORE(), REG_EESCAPE);
543 c = BACKSL | (unsigned char)GETNEXT();
544 }
545 switch (c) {
546 case '.':
547 if (p->g->cflags®_NEWLINE)
548 nonnewline(p);
549 else
550 EMIT(OANY, 0);
551 break;
552 case '[':
553 p_bracket(p);
554 break;
555 case BACKSL|'{':
556 SETERROR(REG_BADRPT);
557 break;
558 case BACKSL|'(':
559 p->g->nsub++;
560 subno = p->g->nsub;
561 if (subno < NPAREN)
562 p->pbegin[subno] = HERE();
563 EMIT(OLPAREN, subno);
564 /* the MORE here is an error heuristic */
565 if (MORE() && !SEETWO('\\', ')'))
566 p_bre(p, '\\', ')');
567 if (subno < NPAREN) {
568 p->pend[subno] = HERE();
569 assert(p->pend[subno] != 0);
570 }
571 EMIT(ORPAREN, subno);
572 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
573 break;
574 case BACKSL|')': /* should not get here -- must be user */
575 case BACKSL|'}':
576 SETERROR(REG_EPAREN);
577 break;
578 case BACKSL|'1':
579 case BACKSL|'2':
580 case BACKSL|'3':
581 case BACKSL|'4':
582 case BACKSL|'5':
583 case BACKSL|'6':
584 case BACKSL|'7':
585 case BACKSL|'8':
586 case BACKSL|'9':
587 i = (c&~BACKSL) - '0';
588 assert(i < NPAREN);
589 if (p->pend[i] != 0) {
590 assert(i <= p->g->nsub);
591 EMIT(OBACK_, i);
592 assert(p->pbegin[i] != 0);
593 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
594 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
595 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
596 EMIT(O_BACK, i);
597 } else
598 SETERROR(REG_ESUBREG);
599 p->g->backrefs = 1;
600 break;
601 case '*':
602 REQUIRE(starordinary, REG_BADRPT);
603 /* FALLTHROUGH */
604 default:
605 ordinary(p, c &~ BACKSL);
606 break;
607 }
608
609 if (EAT('*')) { /* implemented as +? */
610 /* this case does not require the (y|) trick, noKLUDGE */
611 INSERT(OPLUS_, pos);
612 ASTERN(O_PLUS, pos);
613 INSERT(OQUEST_, pos);
614 ASTERN(O_QUEST, pos);
615 } else if (EATTWO('\\', '{')) {
616 count = p_count(p);
617 if (EAT(',')) {
618 if (MORE() && isdigit(PEEK())) {
619 count2 = p_count(p);
620 REQUIRE(count <= count2, REG_BADBR);
621 } else /* single number with comma */
622 count2 = INFINITY;
623 } else /* just a single number */
624 count2 = count;
625 repeat(p, pos, count, count2);
626 if (!EATTWO('\\', '}')) { /* error heuristics */
627 while (MORE() && !SEETWO('\\', '}'))
628 NEXT();
629 REQUIRE(MORE(), REG_EBRACE);
630 SETERROR(REG_BADBR);
631 }
632 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
633 return(1);
634
635 return(0);
636 }
637
638 /*
639 - p_count - parse a repetition count
640 == static int p_count(register struct parse *p);
641 */
642 static int /* the value */
p_count(p)643 p_count(p)
644 register struct parse *p;
645 {
646 register int count = 0;
647 register int ndigits = 0;
648
649 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
650 count = count*10 + (GETNEXT() - '0');
651 ndigits++;
652 }
653
654 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
655 return(count);
656 }
657
658 /*
659 - p_bracket - parse a bracketed character list
660 == static void p_bracket(register struct parse *p);
661 *
662 * Note a significant property of this code: if the allocset() did SETERROR,
663 * no set operations are done.
664 */
665 static void
p_bracket(p)666 p_bracket(p)
667 register struct parse *p;
668 {
669 register cset *cs = allocset(p);
670 register int invert = 0;
671
672 /* Dept of Truly Sickening Special-Case Kludges */
673 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
674 EMIT(OBOW, 0);
675 NEXTn(6);
676 return;
677 }
678 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
679 EMIT(OEOW, 0);
680 NEXTn(6);
681 return;
682 }
683
684 if (EAT('^'))
685 invert++; /* make note to invert set at end */
686 if (EAT(']'))
687 CHadd(cs, ']');
688 else if (EAT('-'))
689 CHadd(cs, '-');
690 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
691 p_b_term(p, cs);
692 if (EAT('-'))
693 CHadd(cs, '-');
694 MUSTEAT(']', REG_EBRACK);
695
696 if (p->error != 0) /* don't mess things up further */
697 return;
698
699 if (p->g->cflags®_ICASE) {
700 register int i;
701 register int ci;
702
703 for (i = p->g->csetsize - 1; i >= 0; i--)
704 if (CHIN(cs, i) && isalpha(i)) {
705 ci = othercase(i);
706 if (ci != i)
707 CHadd(cs, ci);
708 }
709 if (cs->multis != NULL)
710 mccase(p, cs);
711 }
712 if (invert) {
713 register int i;
714
715 for (i = p->g->csetsize - 1; i >= 0; i--)
716 if (CHIN(cs, i))
717 CHsub(cs, i);
718 else
719 CHadd(cs, i);
720 if (p->g->cflags®_NEWLINE)
721 CHsub(cs, '\n');
722 if (cs->multis != NULL)
723 mcinvert(p, cs);
724 }
725
726 assert(cs->multis == NULL); /* xxx */
727
728 if (nch(p, cs) == 1) { /* optimize singleton sets */
729 ordinary(p, firstch(p, cs));
730 freeset(p, cs);
731 } else
732 EMIT(OANYOF, freezeset(p, cs));
733 }
734
735 /*
736 - p_b_term - parse one term of a bracketed character list
737 == static void p_b_term(register struct parse *p, register cset *cs);
738 */
739 static void
p_b_term(p,cs)740 p_b_term(p, cs)
741 register struct parse *p;
742 register cset *cs;
743 {
744 register char c;
745 register char start, finish;
746 register int i;
747
748 /* classify what we've got */
749 switch ((MORE()) ? PEEK() : '\0') {
750 case '[':
751 c = (MORE2()) ? PEEK2() : '\0';
752 break;
753 case '-':
754 SETERROR(REG_ERANGE);
755 return; /* NOTE RETURN */
756 break;
757 default:
758 c = '\0';
759 break;
760 }
761
762 switch (c) {
763 case ':': /* character class */
764 NEXT2();
765 REQUIRE(MORE(), REG_EBRACK);
766 c = PEEK();
767 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
768 p_b_cclass(p, cs);
769 REQUIRE(MORE(), REG_EBRACK);
770 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
771 break;
772 case '=': /* equivalence class */
773 NEXT2();
774 REQUIRE(MORE(), REG_EBRACK);
775 c = PEEK();
776 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
777 p_b_eclass(p, cs);
778 REQUIRE(MORE(), REG_EBRACK);
779 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
780 break;
781 default: /* symbol, ordinary character, or range */
782 /* xxx revision needed for multichar stuff */
783 start = p_b_symbol(p);
784 if (SEE('-') && MORE2() && PEEK2() != ']') {
785 /* range */
786 NEXT();
787 if (EAT('-'))
788 finish = '-';
789 else
790 finish = p_b_symbol(p);
791 } else
792 finish = start;
793 /* xxx what about signed chars here... */
794 REQUIRE(start <= finish, REG_ERANGE);
795 for (i = start; i <= finish; i++)
796 CHadd(cs, i);
797 break;
798 }
799 }
800
801 /*
802 - p_b_cclass - parse a character-class name and deal with it
803 == static void p_b_cclass(register struct parse *p, register cset *cs);
804 */
805 static void
p_b_cclass(p,cs)806 p_b_cclass(p, cs)
807 register struct parse *p;
808 register cset *cs;
809 {
810 register char *sp = p->next;
811 register struct cclass *cp;
812 register size_t len;
813 register char *u;
814 register char c;
815
816 while (MORE() && isalpha(PEEK()))
817 NEXT();
818 len = p->next - sp;
819 for (cp = cclasses; cp->name != NULL; cp++)
820 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
821 break;
822 if (cp->name == NULL) {
823 /* oops, didn't find it */
824 SETERROR(REG_ECTYPE);
825 return;
826 }
827
828 u = cp->chars;
829 while ((c = *u++) != '\0')
830 CHadd(cs, c);
831 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
832 MCadd(p, cs, u);
833 }
834
835 /*
836 - p_b_eclass - parse an equivalence-class name and deal with it
837 == static void p_b_eclass(register struct parse *p, register cset *cs);
838 *
839 * This implementation is incomplete. xxx
840 */
841 static void
p_b_eclass(p,cs)842 p_b_eclass(p, cs)
843 register struct parse *p;
844 register cset *cs;
845 {
846 register char c;
847
848 c = p_b_coll_elem(p, '=');
849 CHadd(cs, c);
850 }
851
852 /*
853 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
854 == static char p_b_symbol(register struct parse *p);
855 */
856 static char /* value of symbol */
p_b_symbol(p)857 p_b_symbol(p)
858 register struct parse *p;
859 {
860 register char value;
861
862 REQUIRE(MORE(), REG_EBRACK);
863 if (!EATTWO('[', '.'))
864 return(GETNEXT());
865
866 /* collating symbol */
867 value = p_b_coll_elem(p, '.');
868 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
869 return(value);
870 }
871
872 /*
873 - p_b_coll_elem - parse a collating-element name and look it up
874 == static char p_b_coll_elem(register struct parse *p, int endc);
875 */
876 static char /* value of collating element */
p_b_coll_elem(p,endc)877 p_b_coll_elem(p, endc)
878 register struct parse *p;
879 int endc; /* name ended by endc,']' */
880 {
881 register char *sp = p->next;
882 register struct cname *cp;
883 register int len;
884
885 while (MORE() && !SEETWO(endc, ']'))
886 NEXT();
887 if (!MORE()) {
888 SETERROR(REG_EBRACK);
889 return(0);
890 }
891 len = p->next - sp;
892 for (cp = cnames; cp->name != NULL; cp++)
893 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
894 return(cp->code); /* known name */
895 if (len == 1)
896 return(*sp); /* single character */
897 SETERROR(REG_ECOLLATE); /* neither */
898 return(0);
899 }
900
901 /*
902 - othercase - return the case counterpart of an alphabetic
903 == static char othercase(int ch);
904 */
905 static char /* if no counterpart, return ch */
othercase(ch)906 othercase(ch)
907 int ch;
908 {
909 assert(isalpha(ch));
910 if (isupper(ch))
911 return(tolower(ch));
912 else if (islower(ch))
913 return(toupper(ch));
914 else /* peculiar, but could happen */
915 return(ch);
916 }
917
918 /*
919 - bothcases - emit a dualcase version of a two-case character
920 == static void bothcases(register struct parse *p, int ch);
921 *
922 * Boy, is this implementation ever a kludge...
923 */
924 static void
bothcases(p,ch)925 bothcases(p, ch)
926 register struct parse *p;
927 int ch;
928 {
929 register char *oldnext = p->next;
930 register char *oldend = p->end;
931 char bracket[3];
932
933 assert(othercase(ch) != ch); /* p_bracket() would recurse */
934 p->next = bracket;
935 p->end = bracket+2;
936 bracket[0] = ch;
937 bracket[1] = ']';
938 bracket[2] = '\0';
939 p_bracket(p);
940 assert(p->next == bracket+2);
941 p->next = oldnext;
942 p->end = oldend;
943 }
944
945 /*
946 - ordinary - emit an ordinary character
947 == static void ordinary(register struct parse *p, register int ch);
948 */
949 static void
ordinary(p,ch)950 ordinary(p, ch)
951 register struct parse *p;
952 register int ch;
953 {
954 register cat_t *cap = p->g->categories;
955
956 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
957 bothcases(p, ch);
958 else {
959 EMIT(OCHAR, (unsigned char)ch);
960 if (cap[ch] == 0)
961 cap[ch] = p->g->ncategories++;
962 }
963 }
964
965 /*
966 - nonnewline - emit REG_NEWLINE version of OANY
967 == static void nonnewline(register struct parse *p);
968 *
969 * Boy, is this implementation ever a kludge...
970 */
971 static void
nonnewline(p)972 nonnewline(p)
973 register struct parse *p;
974 {
975 register char *oldnext = p->next;
976 register char *oldend = p->end;
977 char bracket[4];
978
979 p->next = bracket;
980 p->end = bracket+3;
981 bracket[0] = '^';
982 bracket[1] = '\n';
983 bracket[2] = ']';
984 bracket[3] = '\0';
985 p_bracket(p);
986 assert(p->next == bracket+3);
987 p->next = oldnext;
988 p->end = oldend;
989 }
990
991 /*
992 - repeat - generate code for a bounded repetition, recursively if needed
993 == static void repeat(register struct parse *p, sopno start, int from, int to);
994 */
995 static void
repeat(p,start,from,to)996 repeat(p, start, from, to)
997 register struct parse *p;
998 sopno start; /* operand from here to end of strip */
999 int from; /* repeated from this number */
1000 int to; /* to this number of times (maybe INFINITY) */
1001 {
1002 register sopno finish = HERE();
1003 # define N 2
1004 # define INF 3
1005 # define REP(f, t) ((f)*8 + (t))
1006 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1007 register sopno copy;
1008
1009 if (p->error != 0) /* head off possible runaway recursion */
1010 return;
1011
1012 assert(from <= to);
1013
1014 switch (REP(MAP(from), MAP(to))) {
1015 case REP(0, 0): /* must be user doing this */
1016 DROP(finish-start); /* drop the operand */
1017 break;
1018 case REP(0, 1): /* as x{1,1}? */
1019 case REP(0, N): /* as x{1,n}? */
1020 case REP(0, INF): /* as x{1,}? */
1021 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1022 INSERT(OCH_, start); /* offset is wrong... */
1023 repeat(p, start+1, 1, to);
1024 ASTERN(OOR1, start);
1025 AHEAD(start); /* ... fix it */
1026 EMIT(OOR2, 0);
1027 AHEAD(THERE());
1028 ASTERN(O_CH, THERETHERE());
1029 break;
1030 case REP(1, 1): /* trivial case */
1031 /* done */
1032 break;
1033 case REP(1, N): /* as x?x{1,n-1} */
1034 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1035 INSERT(OCH_, start);
1036 ASTERN(OOR1, start);
1037 AHEAD(start);
1038 EMIT(OOR2, 0); /* offset very wrong... */
1039 AHEAD(THERE()); /* ...so fix it */
1040 ASTERN(O_CH, THERETHERE());
1041 copy = dupl(p, start+1, finish+1);
1042 assert(copy == finish+4);
1043 repeat(p, copy, 1, to-1);
1044 break;
1045 case REP(1, INF): /* as x+ */
1046 INSERT(OPLUS_, start);
1047 ASTERN(O_PLUS, start);
1048 break;
1049 case REP(N, N): /* as xx{m-1,n-1} */
1050 copy = dupl(p, start, finish);
1051 repeat(p, copy, from-1, to-1);
1052 break;
1053 case REP(N, INF): /* as xx{n-1,INF} */
1054 copy = dupl(p, start, finish);
1055 repeat(p, copy, from-1, to);
1056 break;
1057 default: /* "can't happen" */
1058 SETERROR(REG_ASSERT); /* just in case */
1059 break;
1060 }
1061 }
1062
1063 /*
1064 - seterr - set an error condition
1065 == static int seterr(register struct parse *p, int e);
1066 */
1067 static int /* useless but makes type checking happy */
seterr(p,e)1068 seterr(p, e)
1069 register struct parse *p;
1070 int e;
1071 {
1072 if (p->error == 0) /* keep earliest error condition */
1073 p->error = e;
1074 p->next = nuls; /* try to bring things to a halt */
1075 p->end = nuls;
1076 return(0); /* make the return value well-defined */
1077 }
1078
1079 /*
1080 - allocset - allocate a set of characters for []
1081 == static cset *allocset(register struct parse *p);
1082 */
1083 static cset *
allocset(p)1084 allocset(p)
1085 register struct parse *p;
1086 {
1087 register int no = p->g->ncsets++;
1088 register size_t nc;
1089 register size_t nbytes;
1090 register cset *cs;
1091 register size_t css = (size_t)p->g->csetsize;
1092 register int i;
1093
1094 if (no >= p->ncsalloc) { /* need another column of space */
1095 p->ncsalloc += CHAR_BIT;
1096 nc = p->ncsalloc;
1097 assert(nc % CHAR_BIT == 0);
1098 nbytes = nc / CHAR_BIT * css;
1099 if (p->g->sets == NULL)
1100 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1101 else
1102 p->g->sets = (cset *)realloc((char *)p->g->sets,
1103 nc * sizeof(cset));
1104 if (p->g->setbits == NULL)
1105 p->g->setbits = (uch *)malloc(nbytes);
1106 else {
1107 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1108 nbytes);
1109 /* xxx this isn't right if setbits is now NULL */
1110 for (i = 0; i < no; i++)
1111 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1112 }
1113 if (p->g->sets != NULL && p->g->setbits != NULL)
1114 (void) memset((char *)p->g->setbits + (nbytes - css),
1115 0, css);
1116 else {
1117 no = 0;
1118 SETERROR(REG_ESPACE);
1119 /* caller's responsibility not to do set ops */
1120 }
1121 }
1122
1123 assert(p->g->sets != NULL); /* xxx */
1124 cs = &p->g->sets[no];
1125 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1126 cs->mask = 1 << ((no) % CHAR_BIT);
1127 cs->hash = 0;
1128 cs->smultis = 0;
1129 cs->multis = NULL;
1130
1131 return(cs);
1132 }
1133
1134 /*
1135 - freeset - free a now-unused set
1136 == static void freeset(register struct parse *p, register cset *cs);
1137 */
1138 static void
freeset(p,cs)1139 freeset(p, cs)
1140 register struct parse *p;
1141 register cset *cs;
1142 {
1143 register int i;
1144 register cset *top = &p->g->sets[p->g->ncsets];
1145 register size_t css = (size_t)p->g->csetsize;
1146
1147 for (i = 0; i < css; i++)
1148 CHsub(cs, i);
1149 if (cs == top-1) /* recover only the easy case */
1150 p->g->ncsets--;
1151 }
1152
1153 /*
1154 - freezeset - final processing on a set of characters
1155 == static int freezeset(register struct parse *p, register cset *cs);
1156 *
1157 * The main task here is merging identical sets. This is usually a waste
1158 * of time (although the hash code minimizes the overhead), but can win
1159 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1160 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1161 * the same value!
1162 */
1163 static int /* set number */
freezeset(p,cs)1164 freezeset(p, cs)
1165 register struct parse *p;
1166 register cset *cs;
1167 {
1168 register uch h = cs->hash;
1169 register int i;
1170 register cset *top = &p->g->sets[p->g->ncsets];
1171 register cset *cs2;
1172 register size_t css = (size_t)p->g->csetsize;
1173
1174 /* look for an earlier one which is the same */
1175 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1176 if (cs2->hash == h && cs2 != cs) {
1177 /* maybe */
1178 for (i = 0; i < css; i++)
1179 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1180 break; /* no */
1181 if (i == css)
1182 break; /* yes */
1183 }
1184
1185 if (cs2 < top) { /* found one */
1186 freeset(p, cs);
1187 cs = cs2;
1188 }
1189
1190 return((int)(cs - p->g->sets));
1191 }
1192
1193 /*
1194 - firstch - return first character in a set (which must have at least one)
1195 == static int firstch(register struct parse *p, register cset *cs);
1196 */
1197 static int /* character; there is no "none" value */
firstch(p,cs)1198 firstch(p, cs)
1199 register struct parse *p;
1200 register cset *cs;
1201 {
1202 register int i;
1203 register size_t css = (size_t)p->g->csetsize;
1204
1205 for (i = 0; i < css; i++)
1206 if (CHIN(cs, i))
1207 return((char)i);
1208 assert(never);
1209 return(0); /* arbitrary */
1210 }
1211
1212 /*
1213 - nch - number of characters in a set
1214 == static int nch(register struct parse *p, register cset *cs);
1215 */
1216 static int
nch(p,cs)1217 nch(p, cs)
1218 register struct parse *p;
1219 register cset *cs;
1220 {
1221 register int i;
1222 register size_t css = (size_t)p->g->csetsize;
1223 register int n = 0;
1224
1225 for (i = 0; i < css; i++)
1226 if (CHIN(cs, i))
1227 n++;
1228 return(n);
1229 }
1230
1231 /*
1232 - mcadd - add a collating element to a cset
1233 == static void mcadd(register struct parse *p, register cset *cs, \
1234 == register char *cp);
1235 */
1236 static void
mcadd(p,cs,cp)1237 mcadd(p, cs, cp)
1238 register struct parse *p;
1239 register cset *cs;
1240 register char *cp;
1241 {
1242 register size_t oldend = cs->smultis;
1243 void *np;
1244
1245 cs->smultis += strlen(cp) + 1;
1246 if (cs->multis == NULL)
1247 np = malloc(cs->smultis);
1248 else
1249 np = realloc(cs->multis, cs->smultis);
1250 if (np == NULL) {
1251 if (cs->multis)
1252 free(cs->multis);
1253 cs->multis = NULL;
1254 SETERROR(REG_ESPACE);
1255 return;
1256 }
1257 cs->multis = np;
1258
1259 (void) strcpy(cs->multis + oldend - 1, cp);
1260 cs->multis[cs->smultis - 1] = '\0';
1261 }
1262
1263 /*
1264 - mcinvert - invert the list of collating elements in a cset
1265 == static void mcinvert(register struct parse *p, register cset *cs);
1266 *
1267 * This would have to know the set of possibilities. Implementation
1268 * is deferred.
1269 */
1270 /* ARGSUSED */
1271 static void
mcinvert(p,cs)1272 mcinvert(p, cs)
1273 register struct parse *p;
1274 register cset *cs;
1275 {
1276 assert(cs->multis == NULL); /* xxx */
1277 }
1278
1279 /*
1280 - mccase - add case counterparts of the list of collating elements in a cset
1281 == static void mccase(register struct parse *p, register cset *cs);
1282 *
1283 * This would have to know the set of possibilities. Implementation
1284 * is deferred.
1285 */
1286 /* ARGSUSED */
1287 static void
mccase(p,cs)1288 mccase(p, cs)
1289 register struct parse *p;
1290 register cset *cs;
1291 {
1292 assert(cs->multis == NULL); /* xxx */
1293 }
1294
1295 /*
1296 - isinsets - is this character in any sets?
1297 == static int isinsets(register struct re_guts *g, int c);
1298 */
1299 static int /* predicate */
isinsets(g,c)1300 isinsets(g, c)
1301 register struct re_guts *g;
1302 int c;
1303 {
1304 register uch *col;
1305 register int i;
1306 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1307 register unsigned uc = (unsigned char)c;
1308
1309 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1310 if (col[uc] != 0)
1311 return(1);
1312 return(0);
1313 }
1314
1315 /*
1316 - samesets - are these two characters in exactly the same sets?
1317 == static int samesets(register struct re_guts *g, int c1, int c2);
1318 */
1319 static int /* predicate */
samesets(g,c1,c2)1320 samesets(g, c1, c2)
1321 register struct re_guts *g;
1322 int c1;
1323 int c2;
1324 {
1325 register uch *col;
1326 register int i;
1327 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1328 register unsigned uc1 = (unsigned char)c1;
1329 register unsigned uc2 = (unsigned char)c2;
1330
1331 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1332 if (col[uc1] != col[uc2])
1333 return(0);
1334 return(1);
1335 }
1336
1337 /*
1338 - categorize - sort out character categories
1339 == static void categorize(struct parse *p, register struct re_guts *g);
1340 */
1341 static void
categorize(p,g)1342 categorize(p, g)
1343 struct parse *p;
1344 register struct re_guts *g;
1345 {
1346 register cat_t *cats = g->categories;
1347 register int c;
1348 register int c2;
1349 register cat_t cat;
1350
1351 /* avoid making error situations worse */
1352 if (p->error != 0)
1353 return;
1354
1355 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1356 if (cats[c] == 0 && isinsets(g, c)) {
1357 cat = g->ncategories++;
1358 cats[c] = cat;
1359 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1360 if (cats[c2] == 0 && samesets(g, c, c2))
1361 cats[c2] = cat;
1362 }
1363 }
1364
1365 /*
1366 - dupl - emit a duplicate of a bunch of sops
1367 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1368 */
1369 static sopno /* start of duplicate */
dupl(p,start,finish)1370 dupl(p, start, finish)
1371 register struct parse *p;
1372 sopno start; /* from here */
1373 sopno finish; /* to this less one */
1374 {
1375 register sopno ret = HERE();
1376 register sopno len = finish - start;
1377
1378 assert(finish >= start);
1379 if (len == 0)
1380 return(ret);
1381 enlarge(p, p->ssize + len); /* this many unexpected additions */
1382 assert(p->ssize >= p->slen + len);
1383 (void) memcpy((char *)(p->strip + p->slen),
1384 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1385 p->slen += len;
1386 return(ret);
1387 }
1388
1389 /*
1390 - doemit - emit a strip operator
1391 == static void doemit(register struct parse *p, sop op, size_t opnd);
1392 *
1393 * It might seem better to implement this as a macro with a function as
1394 * hard-case backup, but it's just too big and messy unless there are
1395 * some changes to the data structures. Maybe later.
1396 */
1397 static void
doemit(p,op,opnd)1398 doemit(p, op, opnd)
1399 register struct parse *p;
1400 sop op;
1401 size_t opnd;
1402 {
1403 /* avoid making error situations worse */
1404 if (p->error != 0)
1405 return;
1406
1407 /* deal with oversize operands ("can't happen", more or less) */
1408 assert(opnd < 1<<OPSHIFT);
1409
1410 /* deal with undersized strip */
1411 if (p->slen >= p->ssize)
1412 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1413 assert(p->slen < p->ssize);
1414
1415 /* finally, it's all reduced to the easy case */
1416 p->strip[p->slen++] = SOP(op, opnd);
1417 }
1418
1419 /*
1420 - doinsert - insert a sop into the strip
1421 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1422 */
1423 static void
doinsert(p,op,opnd,pos)1424 doinsert(p, op, opnd, pos)
1425 register struct parse *p;
1426 sop op;
1427 size_t opnd;
1428 sopno pos;
1429 {
1430 register sopno sn;
1431 register sop s;
1432 register int i;
1433
1434 /* avoid making error situations worse */
1435 if (p->error != 0)
1436 return;
1437
1438 sn = HERE();
1439 EMIT(op, opnd); /* do checks, ensure space */
1440 assert(HERE() == sn+1);
1441 s = p->strip[sn];
1442
1443 /* adjust paren pointers */
1444 assert(pos > 0);
1445 for (i = 1; i < NPAREN; i++) {
1446 if (p->pbegin[i] >= pos) {
1447 p->pbegin[i]++;
1448 }
1449 if (p->pend[i] >= pos) {
1450 p->pend[i]++;
1451 }
1452 }
1453
1454 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1455 (HERE()-pos-1)*sizeof(sop));
1456 p->strip[pos] = s;
1457 }
1458
1459 /*
1460 - dofwd - complete a forward reference
1461 == static void dofwd(register struct parse *p, sopno pos, sop value);
1462 */
1463 static void
dofwd(p,pos,value)1464 dofwd(p, pos, value)
1465 register struct parse *p;
1466 register sopno pos;
1467 sop value;
1468 {
1469 /* avoid making error situations worse */
1470 if (p->error != 0)
1471 return;
1472
1473 assert(value < 1<<OPSHIFT);
1474 p->strip[pos] = OP(p->strip[pos]) | value;
1475 }
1476
1477 /*
1478 - enlarge - enlarge the strip
1479 == static void enlarge(register struct parse *p, sopno size);
1480 */
1481 static void
enlarge(p,size)1482 enlarge(p, size)
1483 register struct parse *p;
1484 register sopno size;
1485 {
1486 register sop *sp;
1487
1488 if (p->ssize >= size)
1489 return;
1490
1491 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1492 if (sp == NULL) {
1493 SETERROR(REG_ESPACE);
1494 return;
1495 }
1496 p->strip = sp;
1497 p->ssize = size;
1498 }
1499
1500 /*
1501 - stripsnug - compact the strip
1502 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1503 */
1504 static void
stripsnug(p,g)1505 stripsnug(p, g)
1506 register struct parse *p;
1507 register struct re_guts *g;
1508 {
1509 g->nstates = p->slen;
1510 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1511 if (g->strip == NULL) {
1512 SETERROR(REG_ESPACE);
1513 g->strip = p->strip;
1514 }
1515 }
1516
1517 /*
1518 - findmust - fill in must and mlen with longest mandatory literal string
1519 == static void findmust(register struct parse *p, register struct re_guts *g);
1520 *
1521 * This algorithm could do fancy things like analyzing the operands of |
1522 * for common subsequences. Someday. This code is simple and finds most
1523 * of the interesting cases.
1524 *
1525 * Note that must and mlen got initialized during setup.
1526 */
1527 static void
findmust(p,g)1528 findmust(p, g)
1529 struct parse *p;
1530 register struct re_guts *g;
1531 {
1532 register sop *scan;
1533 sop *start;
1534 register sop *newstart;
1535 register sopno newlen;
1536 register sop s;
1537 register char *cp;
1538 register sopno i;
1539
1540 /* avoid making error situations worse */
1541 if (p->error != 0)
1542 return;
1543
1544 /* find the longest OCHAR sequence in strip */
1545 newlen = 0;
1546 scan = g->strip + 1;
1547 do {
1548 s = *scan++;
1549 switch (OP(s)) {
1550 case OCHAR: /* sequence member */
1551 if (newlen == 0) /* new sequence */
1552 newstart = scan - 1;
1553 newlen++;
1554 break;
1555 case OPLUS_: /* things that don't break one */
1556 case OLPAREN:
1557 case ORPAREN:
1558 break;
1559 case OQUEST_: /* things that must be skipped */
1560 case OCH_:
1561 scan--;
1562 do {
1563 scan += OPND(s);
1564 s = *scan;
1565 /* assert() interferes w debug printouts */
1566 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1567 OP(s) != OOR2) {
1568 g->iflags |= BAD;
1569 return;
1570 }
1571 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1572 /* fallthrough */
1573 default: /* things that break a sequence */
1574 if (newlen > g->mlen) { /* ends one */
1575 start = newstart;
1576 g->mlen = newlen;
1577 }
1578 newlen = 0;
1579 break;
1580 }
1581 } while (OP(s) != OEND);
1582
1583 if (g->mlen == 0) /* there isn't one */
1584 return;
1585
1586 /* turn it into a character string */
1587 g->must = malloc((size_t)g->mlen + 1);
1588 if (g->must == NULL) { /* argh; just forget it */
1589 g->mlen = 0;
1590 return;
1591 }
1592 cp = g->must;
1593 scan = start;
1594 for (i = g->mlen; i > 0; i--) {
1595 while (OP(s = *scan++) != OCHAR)
1596 continue;
1597 assert(cp < g->must + g->mlen);
1598 *cp++ = (char)OPND(s);
1599 }
1600 assert(cp == g->must + g->mlen);
1601 *cp++ = '\0'; /* just on general principles */
1602 }
1603
1604 /*
1605 - pluscount - count + nesting
1606 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1607 */
1608 static sopno /* nesting depth */
pluscount(p,g)1609 pluscount(p, g)
1610 struct parse *p;
1611 register struct re_guts *g;
1612 {
1613 register sop *scan;
1614 register sop s;
1615 register sopno plusnest = 0;
1616 register sopno maxnest = 0;
1617
1618 if (p->error != 0)
1619 return(0); /* there may not be an OEND */
1620
1621 scan = g->strip + 1;
1622 do {
1623 s = *scan++;
1624 switch (OP(s)) {
1625 case OPLUS_:
1626 plusnest++;
1627 break;
1628 case O_PLUS:
1629 if (plusnest > maxnest)
1630 maxnest = plusnest;
1631 plusnest--;
1632 break;
1633 }
1634 } while (OP(s) != OEND);
1635 if (plusnest != 0)
1636 g->iflags |= BAD;
1637 return(maxnest);
1638 }
1639