1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * %sccs.include.redist.c%
10 *
11 * @(#)regcomp.c 8.5 (Berkeley) 03/20/94
12 */
13
14 #if defined(LIBC_SCCS) && !defined(lint)
15 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 03/20/94";
16 #endif /* LIBC_SCCS and not lint */
17
18 #include <sys/types.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <regex.h>
25
26 #include "utils.h"
27 #include "regex2.h"
28
29 #include "cclass.h"
30 #include "cname.h"
31
32 /*
33 * parse structure, passed up and down to avoid global variables and
34 * other clumsinesses
35 */
36 struct parse {
37 char *next; /* next character in RE */
38 char *end; /* end of string (-> NUL normally) */
39 int error; /* has an error been seen? */
40 sop *strip; /* malloced strip */
41 sopno ssize; /* malloced strip size (allocated) */
42 sopno slen; /* malloced strip length (used) */
43 int ncsalloc; /* number of csets allocated */
44 struct re_guts *g;
45 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
46 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
47 sopno pend[NPAREN]; /* -> ) ([0] unused) */
48 };
49
50 /* ========= begin header generated by ./mkh ========= */
51 #ifdef __cplusplus
52 extern "C" {
53 #endif
54
55 /* === regcomp.c === */
56 static void p_ere __P((struct parse *p, int stop));
57 static void p_ere_exp __P((struct parse *p));
58 static void p_str __P((struct parse *p));
59 static void p_bre __P((struct parse *p, int end1, int end2));
60 static int p_simp_re __P((struct parse *p, int starordinary));
61 static int p_count __P((struct parse *p));
62 static void p_bracket __P((struct parse *p));
63 static void p_b_term __P((struct parse *p, cset *cs));
64 static void p_b_cclass __P((struct parse *p, cset *cs));
65 static void p_b_eclass __P((struct parse *p, cset *cs));
66 static char p_b_symbol __P((struct parse *p));
67 static char p_b_coll_elem __P((struct parse *p, int endc));
68 static char othercase __P((int ch));
69 static void bothcases __P((struct parse *p, int ch));
70 static void ordinary __P((struct parse *p, int ch));
71 static void nonnewline __P((struct parse *p));
72 static void repeat __P((struct parse *p, sopno start, int from, int to));
73 static int seterr __P((struct parse *p, int e));
74 static cset *allocset __P((struct parse *p));
75 static void freeset __P((struct parse *p, cset *cs));
76 static int freezeset __P((struct parse *p, cset *cs));
77 static int firstch __P((struct parse *p, cset *cs));
78 static int nch __P((struct parse *p, cset *cs));
79 static void mcadd __P((struct parse *p, cset *cs, char *cp));
80 static void mcsub __P((cset *cs, char *cp));
81 static int mcin __P((cset *cs, char *cp));
82 static char *mcfind __P((cset *cs, char *cp));
83 static void mcinvert __P((struct parse *p, cset *cs));
84 static void mccase __P((struct parse *p, cset *cs));
85 static int isinsets __P((struct re_guts *g, int c));
86 static int samesets __P((struct re_guts *g, int c1, int c2));
87 static void categorize __P((struct parse *p, struct re_guts *g));
88 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
89 static void doemit __P((struct parse *p, sop op, size_t opnd));
90 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
91 static void dofwd __P((struct parse *p, sopno pos, sop value));
92 static void enlarge __P((struct parse *p, sopno size));
93 static void stripsnug __P((struct parse *p, struct re_guts *g));
94 static void findmust __P((struct parse *p, struct re_guts *g));
95 static sopno pluscount __P((struct parse *p, struct re_guts *g));
96
97 #ifdef __cplusplus
98 }
99 #endif
100 /* ========= end header generated by ./mkh ========= */
101
102 static char nuls[10]; /* place to point scanner in event of error */
103
104 /*
105 * macros for use with parse structure
106 * BEWARE: these know that the parse structure is named `p' !!!
107 */
108 #define PEEK() (*p->next)
109 #define PEEK2() (*(p->next+1))
110 #define MORE() (p->next < p->end)
111 #define MORE2() (p->next+1 < p->end)
112 #define SEE(c) (MORE() && PEEK() == (c))
113 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
114 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
115 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
116 #define NEXT() (p->next++)
117 #define NEXT2() (p->next += 2)
118 #define NEXTn(n) (p->next += (n))
119 #define GETNEXT() (*p->next++)
120 #define SETERROR(e) seterr(p, (e))
121 #define REQUIRE(co, e) ((co) || SETERROR(e))
122 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
123 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
124 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
125 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
126 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
127 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
128 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
129 #define HERE() (p->slen)
130 #define THERE() (p->slen - 1)
131 #define THERETHERE() (p->slen - 2)
132 #define DROP(n) (p->slen -= (n))
133
134 #ifndef NDEBUG
135 static int never = 0; /* for use in asserts; shuts lint up */
136 #else
137 #define never 0 /* some <assert.h>s have bugs too */
138 #endif
139
140 /*
141 - regcomp - interface for parser and compilation
142 = extern int regcomp(regex_t *, const char *, int);
143 = #define REG_BASIC 0000
144 = #define REG_EXTENDED 0001
145 = #define REG_ICASE 0002
146 = #define REG_NOSUB 0004
147 = #define REG_NEWLINE 0010
148 = #define REG_NOSPEC 0020
149 = #define REG_PEND 0040
150 = #define REG_DUMP 0200
151 */
152 int /* 0 success, otherwise REG_something */
regcomp(preg,pattern,cflags)153 regcomp(preg, pattern, cflags)
154 regex_t *preg;
155 const char *pattern;
156 int cflags;
157 {
158 struct parse pa;
159 register struct re_guts *g;
160 register struct parse *p = &pa;
161 register int i;
162 register size_t len;
163 #ifdef REDEBUG
164 # define GOODFLAGS(f) (f)
165 #else
166 # define GOODFLAGS(f) ((f)&~REG_DUMP)
167 #endif
168
169 cflags = GOODFLAGS(cflags);
170 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
171 return(REG_INVARG);
172
173 if (cflags®_PEND) {
174 if (preg->re_endp < pattern)
175 return(REG_INVARG);
176 len = preg->re_endp - pattern;
177 } else
178 len = strlen((char *)pattern);
179
180 /* do the mallocs early so failure handling is easy */
181 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
182 (NC-1)*sizeof(cat_t));
183 if (g == NULL)
184 return(REG_ESPACE);
185 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
186 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
187 p->slen = 0;
188 if (p->strip == NULL) {
189 free((char *)g);
190 return(REG_ESPACE);
191 }
192
193 /* set things up */
194 p->g = g;
195 p->next = (char *)pattern; /* convenience; we do not modify it */
196 p->end = p->next + len;
197 p->error = 0;
198 p->ncsalloc = 0;
199 for (i = 0; i < NPAREN; i++) {
200 p->pbegin[i] = 0;
201 p->pend[i] = 0;
202 }
203 g->csetsize = NC;
204 g->sets = NULL;
205 g->setbits = NULL;
206 g->ncsets = 0;
207 g->cflags = cflags;
208 g->iflags = 0;
209 g->nbol = 0;
210 g->neol = 0;
211 g->must = NULL;
212 g->mlen = 0;
213 g->nsub = 0;
214 g->ncategories = 1; /* category 0 is "everything else" */
215 g->categories = &g->catspace[-(CHAR_MIN)];
216 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
217 g->backrefs = 0;
218
219 /* do it */
220 EMIT(OEND, 0);
221 g->firststate = THERE();
222 if (cflags®_EXTENDED)
223 p_ere(p, OUT);
224 else if (cflags®_NOSPEC)
225 p_str(p);
226 else
227 p_bre(p, OUT, OUT);
228 EMIT(OEND, 0);
229 g->laststate = THERE();
230
231 /* tidy up loose ends and fill things in */
232 categorize(p, g);
233 stripsnug(p, g);
234 findmust(p, g);
235 g->nplus = pluscount(p, g);
236 g->magic = MAGIC2;
237 preg->re_nsub = g->nsub;
238 preg->re_g = g;
239 preg->re_magic = MAGIC1;
240 #ifndef REDEBUG
241 /* not debugging, so can't rely on the assert() in regexec() */
242 if (g->iflags&BAD)
243 SETERROR(REG_ASSERT);
244 #endif
245
246 /* win or lose, we're done */
247 if (p->error != 0) /* lose */
248 regfree(preg);
249 return(p->error);
250 }
251
252 /*
253 - p_ere - ERE parser top level, concatenation and alternation
254 == static void p_ere(register struct parse *p, int stop);
255 */
256 static void
p_ere(p,stop)257 p_ere(p, stop)
258 register struct parse *p;
259 int stop; /* character this ERE should end at */
260 {
261 register char c;
262 register sopno prevback;
263 register sopno prevfwd;
264 register sopno conc;
265 register int first = 1; /* is this the first alternative? */
266
267 for (;;) {
268 /* do a bunch of concatenated expressions */
269 conc = HERE();
270 while (MORE() && (c = PEEK()) != '|' && c != stop)
271 p_ere_exp(p);
272 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
273
274 if (!EAT('|'))
275 break; /* NOTE BREAK OUT */
276
277 if (first) {
278 INSERT(OCH_, conc); /* offset is wrong */
279 prevfwd = conc;
280 prevback = conc;
281 first = 0;
282 }
283 ASTERN(OOR1, prevback);
284 prevback = THERE();
285 AHEAD(prevfwd); /* fix previous offset */
286 prevfwd = HERE();
287 EMIT(OOR2, 0); /* offset is very wrong */
288 }
289
290 if (!first) { /* tail-end fixups */
291 AHEAD(prevfwd);
292 ASTERN(O_CH, prevback);
293 }
294
295 assert(!MORE() || SEE(stop));
296 }
297
298 /*
299 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
300 == static void p_ere_exp(register struct parse *p);
301 */
302 static void
p_ere_exp(p)303 p_ere_exp(p)
304 register struct parse *p;
305 {
306 register char c;
307 register sopno pos;
308 register int count;
309 register int count2;
310 register sopno subno;
311 int wascaret = 0;
312
313 assert(MORE()); /* caller should have ensured this */
314 c = GETNEXT();
315
316 pos = HERE();
317 switch (c) {
318 case '(':
319 REQUIRE(MORE(), REG_EPAREN);
320 p->g->nsub++;
321 subno = p->g->nsub;
322 if (subno < NPAREN)
323 p->pbegin[subno] = HERE();
324 EMIT(OLPAREN, subno);
325 if (!SEE(')'))
326 p_ere(p, ')');
327 if (subno < NPAREN) {
328 p->pend[subno] = HERE();
329 assert(p->pend[subno] != 0);
330 }
331 EMIT(ORPAREN, subno);
332 MUSTEAT(')', REG_EPAREN);
333 break;
334 #ifndef POSIX_MISTAKE
335 case ')': /* happens only if no current unmatched ( */
336 /*
337 * You may ask, why the ifndef? Because I didn't notice
338 * this until slightly too late for 1003.2, and none of the
339 * other 1003.2 regular-expression reviewers noticed it at
340 * all. So an unmatched ) is legal POSIX, at least until
341 * we can get it fixed.
342 */
343 SETERROR(REG_EPAREN);
344 break;
345 #endif
346 case '^':
347 EMIT(OBOL, 0);
348 p->g->iflags |= USEBOL;
349 p->g->nbol++;
350 wascaret = 1;
351 break;
352 case '$':
353 EMIT(OEOL, 0);
354 p->g->iflags |= USEEOL;
355 p->g->neol++;
356 break;
357 case '|':
358 SETERROR(REG_EMPTY);
359 break;
360 case '*':
361 case '+':
362 case '?':
363 SETERROR(REG_BADRPT);
364 break;
365 case '.':
366 if (p->g->cflags®_NEWLINE)
367 nonnewline(p);
368 else
369 EMIT(OANY, 0);
370 break;
371 case '[':
372 p_bracket(p);
373 break;
374 case '\\':
375 REQUIRE(MORE(), REG_EESCAPE);
376 c = GETNEXT();
377 ordinary(p, c);
378 break;
379 case '{': /* okay as ordinary except if digit follows */
380 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
381 /* FALLTHROUGH */
382 default:
383 ordinary(p, c);
384 break;
385 }
386
387 if (!MORE())
388 return;
389 c = PEEK();
390 /* we call { a repetition if followed by a digit */
391 if (!( c == '*' || c == '+' || c == '?' ||
392 (c == '{' && MORE2() && isdigit(PEEK2())) ))
393 return; /* no repetition, we're done */
394 NEXT();
395
396 REQUIRE(!wascaret, REG_BADRPT);
397 switch (c) {
398 case '*': /* implemented as +? */
399 /* this case does not require the (y|) trick, noKLUDGE */
400 INSERT(OPLUS_, pos);
401 ASTERN(O_PLUS, pos);
402 INSERT(OQUEST_, pos);
403 ASTERN(O_QUEST, pos);
404 break;
405 case '+':
406 INSERT(OPLUS_, pos);
407 ASTERN(O_PLUS, pos);
408 break;
409 case '?':
410 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
411 INSERT(OCH_, pos); /* offset slightly wrong */
412 ASTERN(OOR1, pos); /* this one's right */
413 AHEAD(pos); /* fix the OCH_ */
414 EMIT(OOR2, 0); /* offset very wrong... */
415 AHEAD(THERE()); /* ...so fix it */
416 ASTERN(O_CH, THERETHERE());
417 break;
418 case '{':
419 count = p_count(p);
420 if (EAT(',')) {
421 if (isdigit(PEEK())) {
422 count2 = p_count(p);
423 REQUIRE(count <= count2, REG_BADBR);
424 } else /* single number with comma */
425 count2 = INFINITY;
426 } else /* just a single number */
427 count2 = count;
428 repeat(p, pos, count, count2);
429 if (!EAT('}')) { /* error heuristics */
430 while (MORE() && PEEK() != '}')
431 NEXT();
432 REQUIRE(MORE(), REG_EBRACE);
433 SETERROR(REG_BADBR);
434 }
435 break;
436 }
437
438 if (!MORE())
439 return;
440 c = PEEK();
441 if (!( c == '*' || c == '+' || c == '?' ||
442 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
443 return;
444 SETERROR(REG_BADRPT);
445 }
446
447 /*
448 - p_str - string (no metacharacters) "parser"
449 == static void p_str(register struct parse *p);
450 */
451 static void
p_str(p)452 p_str(p)
453 register struct parse *p;
454 {
455 REQUIRE(MORE(), REG_EMPTY);
456 while (MORE())
457 ordinary(p, GETNEXT());
458 }
459
460 /*
461 - p_bre - BRE parser top level, anchoring and concatenation
462 == static void p_bre(register struct parse *p, register int end1, \
463 == register int end2);
464 * Giving end1 as OUT essentially eliminates the end1/end2 check.
465 *
466 * This implementation is a bit of a kludge, in that a trailing $ is first
467 * taken as an ordinary character and then revised to be an anchor. The
468 * only undesirable side effect is that '$' gets included as a character
469 * category in such cases. This is fairly harmless; not worth fixing.
470 * The amount of lookahead needed to avoid this kludge is excessive.
471 */
472 static void
p_bre(p,end1,end2)473 p_bre(p, end1, end2)
474 register struct parse *p;
475 register int end1; /* first terminating character */
476 register int end2; /* second terminating character */
477 {
478 register sopno start = HERE();
479 register int first = 1; /* first subexpression? */
480 register int wasdollar = 0;
481
482 if (EAT('^')) {
483 EMIT(OBOL, 0);
484 p->g->iflags |= USEBOL;
485 p->g->nbol++;
486 }
487 while (MORE() && !SEETWO(end1, end2)) {
488 wasdollar = p_simp_re(p, first);
489 first = 0;
490 }
491 if (wasdollar) { /* oops, that was a trailing anchor */
492 DROP(1);
493 EMIT(OEOL, 0);
494 p->g->iflags |= USEEOL;
495 p->g->neol++;
496 }
497
498 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
499 }
500
501 /*
502 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
503 == static int p_simp_re(register struct parse *p, int starordinary);
504 */
505 static int /* was the simple RE an unbackslashed $? */
p_simp_re(p,starordinary)506 p_simp_re(p, starordinary)
507 register struct parse *p;
508 int starordinary; /* is a leading * an ordinary character? */
509 {
510 register int c;
511 register int count;
512 register int count2;
513 register sopno pos;
514 register int i;
515 register sopno subno;
516 # define BACKSL (1<<CHAR_BIT)
517
518 pos = HERE(); /* repetion op, if any, covers from here */
519
520 assert(MORE()); /* caller should have ensured this */
521 c = GETNEXT();
522 if (c == '\\') {
523 REQUIRE(MORE(), REG_EESCAPE);
524 c = BACKSL | (unsigned char)GETNEXT();
525 }
526 switch (c) {
527 case '.':
528 if (p->g->cflags®_NEWLINE)
529 nonnewline(p);
530 else
531 EMIT(OANY, 0);
532 break;
533 case '[':
534 p_bracket(p);
535 break;
536 case BACKSL|'{':
537 SETERROR(REG_BADRPT);
538 break;
539 case BACKSL|'(':
540 p->g->nsub++;
541 subno = p->g->nsub;
542 if (subno < NPAREN)
543 p->pbegin[subno] = HERE();
544 EMIT(OLPAREN, subno);
545 /* the MORE here is an error heuristic */
546 if (MORE() && !SEETWO('\\', ')'))
547 p_bre(p, '\\', ')');
548 if (subno < NPAREN) {
549 p->pend[subno] = HERE();
550 assert(p->pend[subno] != 0);
551 }
552 EMIT(ORPAREN, subno);
553 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
554 break;
555 case BACKSL|')': /* should not get here -- must be user */
556 case BACKSL|'}':
557 SETERROR(REG_EPAREN);
558 break;
559 case BACKSL|'1':
560 case BACKSL|'2':
561 case BACKSL|'3':
562 case BACKSL|'4':
563 case BACKSL|'5':
564 case BACKSL|'6':
565 case BACKSL|'7':
566 case BACKSL|'8':
567 case BACKSL|'9':
568 i = (c&~BACKSL) - '0';
569 assert(i < NPAREN);
570 if (p->pend[i] != 0) {
571 assert(i <= p->g->nsub);
572 EMIT(OBACK_, i);
573 assert(p->pbegin[i] != 0);
574 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
575 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
576 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
577 EMIT(O_BACK, i);
578 } else
579 SETERROR(REG_ESUBREG);
580 p->g->backrefs = 1;
581 break;
582 case '*':
583 REQUIRE(starordinary, REG_BADRPT);
584 /* FALLTHROUGH */
585 default:
586 ordinary(p, c &~ BACKSL);
587 break;
588 }
589
590 if (EAT('*')) { /* implemented as +? */
591 /* this case does not require the (y|) trick, noKLUDGE */
592 INSERT(OPLUS_, pos);
593 ASTERN(O_PLUS, pos);
594 INSERT(OQUEST_, pos);
595 ASTERN(O_QUEST, pos);
596 } else if (EATTWO('\\', '{')) {
597 count = p_count(p);
598 if (EAT(',')) {
599 if (MORE() && isdigit(PEEK())) {
600 count2 = p_count(p);
601 REQUIRE(count <= count2, REG_BADBR);
602 } else /* single number with comma */
603 count2 = INFINITY;
604 } else /* just a single number */
605 count2 = count;
606 repeat(p, pos, count, count2);
607 if (!EATTWO('\\', '}')) { /* error heuristics */
608 while (MORE() && !SEETWO('\\', '}'))
609 NEXT();
610 REQUIRE(MORE(), REG_EBRACE);
611 SETERROR(REG_BADBR);
612 }
613 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
614 return(1);
615
616 return(0);
617 }
618
619 /*
620 - p_count - parse a repetition count
621 == static int p_count(register struct parse *p);
622 */
623 static int /* the value */
p_count(p)624 p_count(p)
625 register struct parse *p;
626 {
627 register int count = 0;
628 register int ndigits = 0;
629
630 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
631 count = count*10 + (GETNEXT() - '0');
632 ndigits++;
633 }
634
635 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
636 return(count);
637 }
638
639 /*
640 - p_bracket - parse a bracketed character list
641 == static void p_bracket(register struct parse *p);
642 *
643 * Note a significant property of this code: if the allocset() did SETERROR,
644 * no set operations are done.
645 */
646 static void
p_bracket(p)647 p_bracket(p)
648 register struct parse *p;
649 {
650 register char c;
651 register cset *cs = allocset(p);
652 register int invert = 0;
653
654 /* Dept of Truly Sickening Special-Case Kludges */
655 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
656 EMIT(OBOW, 0);
657 NEXTn(6);
658 return;
659 }
660 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
661 EMIT(OEOW, 0);
662 NEXTn(6);
663 return;
664 }
665
666 if (EAT('^'))
667 invert++; /* make note to invert set at end */
668 if (EAT(']'))
669 CHadd(cs, ']');
670 else if (EAT('-'))
671 CHadd(cs, '-');
672 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
673 p_b_term(p, cs);
674 if (EAT('-'))
675 CHadd(cs, '-');
676 MUSTEAT(']', REG_EBRACK);
677
678 if (p->error != 0) /* don't mess things up further */
679 return;
680
681 if (p->g->cflags®_ICASE) {
682 register int i;
683 register int ci;
684
685 for (i = p->g->csetsize - 1; i >= 0; i--)
686 if (CHIN(cs, i) && isalpha(i)) {
687 ci = othercase(i);
688 if (ci != i)
689 CHadd(cs, ci);
690 }
691 if (cs->multis != NULL)
692 mccase(p, cs);
693 }
694 if (invert) {
695 register int i;
696
697 for (i = p->g->csetsize - 1; i >= 0; i--)
698 if (CHIN(cs, i))
699 CHsub(cs, i);
700 else
701 CHadd(cs, i);
702 if (p->g->cflags®_NEWLINE)
703 CHsub(cs, '\n');
704 if (cs->multis != NULL)
705 mcinvert(p, cs);
706 }
707
708 assert(cs->multis == NULL); /* xxx */
709
710 if (nch(p, cs) == 1) { /* optimize singleton sets */
711 ordinary(p, firstch(p, cs));
712 freeset(p, cs);
713 } else
714 EMIT(OANYOF, freezeset(p, cs));
715 }
716
717 /*
718 - p_b_term - parse one term of a bracketed character list
719 == static void p_b_term(register struct parse *p, register cset *cs);
720 */
721 static void
p_b_term(p,cs)722 p_b_term(p, cs)
723 register struct parse *p;
724 register cset *cs;
725 {
726 register char c;
727 register char start, finish;
728 register int i;
729
730 /* classify what we've got */
731 switch ((MORE()) ? PEEK() : '\0') {
732 case '[':
733 c = (MORE2()) ? PEEK2() : '\0';
734 break;
735 case '-':
736 SETERROR(REG_ERANGE);
737 return; /* NOTE RETURN */
738 break;
739 default:
740 c = '\0';
741 break;
742 }
743
744 switch (c) {
745 case ':': /* character class */
746 NEXT2();
747 REQUIRE(MORE(), REG_EBRACK);
748 c = PEEK();
749 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
750 p_b_cclass(p, cs);
751 REQUIRE(MORE(), REG_EBRACK);
752 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
753 break;
754 case '=': /* equivalence class */
755 NEXT2();
756 REQUIRE(MORE(), REG_EBRACK);
757 c = PEEK();
758 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
759 p_b_eclass(p, cs);
760 REQUIRE(MORE(), REG_EBRACK);
761 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
762 break;
763 default: /* symbol, ordinary character, or range */
764 /* xxx revision needed for multichar stuff */
765 start = p_b_symbol(p);
766 if (SEE('-') && MORE2() && PEEK2() != ']') {
767 /* range */
768 NEXT();
769 if (EAT('-'))
770 finish = '-';
771 else
772 finish = p_b_symbol(p);
773 } else
774 finish = start;
775 /* xxx what about signed chars here... */
776 REQUIRE(start <= finish, REG_ERANGE);
777 for (i = start; i <= finish; i++)
778 CHadd(cs, i);
779 break;
780 }
781 }
782
783 /*
784 - p_b_cclass - parse a character-class name and deal with it
785 == static void p_b_cclass(register struct parse *p, register cset *cs);
786 */
787 static void
p_b_cclass(p,cs)788 p_b_cclass(p, cs)
789 register struct parse *p;
790 register cset *cs;
791 {
792 register char *sp = p->next;
793 register struct cclass *cp;
794 register size_t len;
795 register char *u;
796 register char c;
797
798 while (MORE() && isalpha(PEEK()))
799 NEXT();
800 len = p->next - sp;
801 for (cp = cclasses; cp->name != NULL; cp++)
802 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
803 break;
804 if (cp->name == NULL) {
805 /* oops, didn't find it */
806 SETERROR(REG_ECTYPE);
807 return;
808 }
809
810 u = cp->chars;
811 while ((c = *u++) != '\0')
812 CHadd(cs, c);
813 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
814 MCadd(p, cs, u);
815 }
816
817 /*
818 - p_b_eclass - parse an equivalence-class name and deal with it
819 == static void p_b_eclass(register struct parse *p, register cset *cs);
820 *
821 * This implementation is incomplete. xxx
822 */
823 static void
p_b_eclass(p,cs)824 p_b_eclass(p, cs)
825 register struct parse *p;
826 register cset *cs;
827 {
828 register char c;
829
830 c = p_b_coll_elem(p, '=');
831 CHadd(cs, c);
832 }
833
834 /*
835 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
836 == static char p_b_symbol(register struct parse *p);
837 */
838 static char /* value of symbol */
p_b_symbol(p)839 p_b_symbol(p)
840 register struct parse *p;
841 {
842 register char value;
843
844 REQUIRE(MORE(), REG_EBRACK);
845 if (!EATTWO('[', '.'))
846 return(GETNEXT());
847
848 /* collating symbol */
849 value = p_b_coll_elem(p, '.');
850 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
851 return(value);
852 }
853
854 /*
855 - p_b_coll_elem - parse a collating-element name and look it up
856 == static char p_b_coll_elem(register struct parse *p, int endc);
857 */
858 static char /* value of collating element */
p_b_coll_elem(p,endc)859 p_b_coll_elem(p, endc)
860 register struct parse *p;
861 int endc; /* name ended by endc,']' */
862 {
863 register char *sp = p->next;
864 register struct cname *cp;
865 register int len;
866 register char c;
867
868 while (MORE() && !SEETWO(endc, ']'))
869 NEXT();
870 if (!MORE()) {
871 SETERROR(REG_EBRACK);
872 return(0);
873 }
874 len = p->next - sp;
875 for (cp = cnames; cp->name != NULL; cp++)
876 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
877 return(cp->code); /* known name */
878 if (len == 1)
879 return(*sp); /* single character */
880 SETERROR(REG_ECOLLATE); /* neither */
881 return(0);
882 }
883
884 /*
885 - othercase - return the case counterpart of an alphabetic
886 == static char othercase(int ch);
887 */
888 static char /* if no counterpart, return ch */
othercase(ch)889 othercase(ch)
890 int ch;
891 {
892 assert(isalpha(ch));
893 if (isupper(ch))
894 return(tolower(ch));
895 else if (islower(ch))
896 return(toupper(ch));
897 else /* peculiar, but could happen */
898 return(ch);
899 }
900
901 /*
902 - bothcases - emit a dualcase version of a two-case character
903 == static void bothcases(register struct parse *p, int ch);
904 *
905 * Boy, is this implementation ever a kludge...
906 */
907 static void
bothcases(p,ch)908 bothcases(p, ch)
909 register struct parse *p;
910 int ch;
911 {
912 register char *oldnext = p->next;
913 register char *oldend = p->end;
914 char bracket[3];
915
916 assert(othercase(ch) != ch); /* p_bracket() would recurse */
917 p->next = bracket;
918 p->end = bracket+2;
919 bracket[0] = ch;
920 bracket[1] = ']';
921 bracket[2] = '\0';
922 p_bracket(p);
923 assert(p->next == bracket+2);
924 p->next = oldnext;
925 p->end = oldend;
926 }
927
928 /*
929 - ordinary - emit an ordinary character
930 == static void ordinary(register struct parse *p, register int ch);
931 */
932 static void
ordinary(p,ch)933 ordinary(p, ch)
934 register struct parse *p;
935 register int ch;
936 {
937 register cat_t *cap = p->g->categories;
938
939 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
940 bothcases(p, ch);
941 else {
942 EMIT(OCHAR, (unsigned char)ch);
943 if (cap[ch] == 0)
944 cap[ch] = p->g->ncategories++;
945 }
946 }
947
948 /*
949 - nonnewline - emit REG_NEWLINE version of OANY
950 == static void nonnewline(register struct parse *p);
951 *
952 * Boy, is this implementation ever a kludge...
953 */
954 static void
nonnewline(p)955 nonnewline(p)
956 register struct parse *p;
957 {
958 register char *oldnext = p->next;
959 register char *oldend = p->end;
960 char bracket[4];
961
962 p->next = bracket;
963 p->end = bracket+3;
964 bracket[0] = '^';
965 bracket[1] = '\n';
966 bracket[2] = ']';
967 bracket[3] = '\0';
968 p_bracket(p);
969 assert(p->next == bracket+3);
970 p->next = oldnext;
971 p->end = oldend;
972 }
973
974 /*
975 - repeat - generate code for a bounded repetition, recursively if needed
976 == static void repeat(register struct parse *p, sopno start, int from, int to);
977 */
978 static void
repeat(p,start,from,to)979 repeat(p, start, from, to)
980 register struct parse *p;
981 sopno start; /* operand from here to end of strip */
982 int from; /* repeated from this number */
983 int to; /* to this number of times (maybe INFINITY) */
984 {
985 register sopno finish = HERE();
986 # define N 2
987 # define INF 3
988 # define REP(f, t) ((f)*8 + (t))
989 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
990 register sopno copy;
991
992 if (p->error != 0) /* head off possible runaway recursion */
993 return;
994
995 assert(from <= to);
996
997 switch (REP(MAP(from), MAP(to))) {
998 case REP(0, 0): /* must be user doing this */
999 DROP(finish-start); /* drop the operand */
1000 break;
1001 case REP(0, 1): /* as x{1,1}? */
1002 case REP(0, N): /* as x{1,n}? */
1003 case REP(0, INF): /* as x{1,}? */
1004 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1005 INSERT(OCH_, start); /* offset is wrong... */
1006 repeat(p, start+1, 1, to);
1007 ASTERN(OOR1, start);
1008 AHEAD(start); /* ... fix it */
1009 EMIT(OOR2, 0);
1010 AHEAD(THERE());
1011 ASTERN(O_CH, THERETHERE());
1012 break;
1013 case REP(1, 1): /* trivial case */
1014 /* done */
1015 break;
1016 case REP(1, N): /* as x?x{1,n-1} */
1017 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1018 INSERT(OCH_, start);
1019 ASTERN(OOR1, start);
1020 AHEAD(start);
1021 EMIT(OOR2, 0); /* offset very wrong... */
1022 AHEAD(THERE()); /* ...so fix it */
1023 ASTERN(O_CH, THERETHERE());
1024 copy = dupl(p, start+1, finish+1);
1025 assert(copy == finish+4);
1026 repeat(p, copy, 1, to-1);
1027 break;
1028 case REP(1, INF): /* as x+ */
1029 INSERT(OPLUS_, start);
1030 ASTERN(O_PLUS, start);
1031 break;
1032 case REP(N, N): /* as xx{m-1,n-1} */
1033 copy = dupl(p, start, finish);
1034 repeat(p, copy, from-1, to-1);
1035 break;
1036 case REP(N, INF): /* as xx{n-1,INF} */
1037 copy = dupl(p, start, finish);
1038 repeat(p, copy, from-1, to);
1039 break;
1040 default: /* "can't happen" */
1041 SETERROR(REG_ASSERT); /* just in case */
1042 break;
1043 }
1044 }
1045
1046 /*
1047 - seterr - set an error condition
1048 == static int seterr(register struct parse *p, int e);
1049 */
1050 static int /* useless but makes type checking happy */
seterr(p,e)1051 seterr(p, e)
1052 register struct parse *p;
1053 int e;
1054 {
1055 if (p->error == 0) /* keep earliest error condition */
1056 p->error = e;
1057 p->next = nuls; /* try to bring things to a halt */
1058 p->end = nuls;
1059 return(0); /* make the return value well-defined */
1060 }
1061
1062 /*
1063 - allocset - allocate a set of characters for []
1064 == static cset *allocset(register struct parse *p);
1065 */
1066 static cset *
allocset(p)1067 allocset(p)
1068 register struct parse *p;
1069 {
1070 register int no = p->g->ncsets++;
1071 register size_t nc;
1072 register size_t nbytes;
1073 register cset *cs;
1074 register size_t css = (size_t)p->g->csetsize;
1075 register int i;
1076
1077 if (no >= p->ncsalloc) { /* need another column of space */
1078 p->ncsalloc += CHAR_BIT;
1079 nc = p->ncsalloc;
1080 assert(nc % CHAR_BIT == 0);
1081 nbytes = nc / CHAR_BIT * css;
1082 if (p->g->sets == NULL)
1083 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1084 else
1085 p->g->sets = (cset *)realloc((char *)p->g->sets,
1086 nc * sizeof(cset));
1087 if (p->g->setbits == NULL)
1088 p->g->setbits = (uch *)malloc(nbytes);
1089 else {
1090 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1091 nbytes);
1092 /* xxx this isn't right if setbits is now NULL */
1093 for (i = 0; i < no; i++)
1094 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1095 }
1096 if (p->g->sets != NULL && p->g->setbits != NULL)
1097 (void) memset((char *)p->g->setbits + (nbytes - css),
1098 0, css);
1099 else {
1100 no = 0;
1101 SETERROR(REG_ESPACE);
1102 /* caller's responsibility not to do set ops */
1103 }
1104 }
1105
1106 assert(p->g->sets != NULL); /* xxx */
1107 cs = &p->g->sets[no];
1108 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1109 cs->mask = 1 << ((no) % CHAR_BIT);
1110 cs->hash = 0;
1111 cs->smultis = 0;
1112 cs->multis = NULL;
1113
1114 return(cs);
1115 }
1116
1117 /*
1118 - freeset - free a now-unused set
1119 == static void freeset(register struct parse *p, register cset *cs);
1120 */
1121 static void
freeset(p,cs)1122 freeset(p, cs)
1123 register struct parse *p;
1124 register cset *cs;
1125 {
1126 register int i;
1127 register cset *top = &p->g->sets[p->g->ncsets];
1128 register size_t css = (size_t)p->g->csetsize;
1129
1130 for (i = 0; i < css; i++)
1131 CHsub(cs, i);
1132 if (cs == top-1) /* recover only the easy case */
1133 p->g->ncsets--;
1134 }
1135
1136 /*
1137 - freezeset - final processing on a set of characters
1138 == static int freezeset(register struct parse *p, register cset *cs);
1139 *
1140 * The main task here is merging identical sets. This is usually a waste
1141 * of time (although the hash code minimizes the overhead), but can win
1142 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1143 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1144 * the same value!
1145 */
1146 static int /* set number */
freezeset(p,cs)1147 freezeset(p, cs)
1148 register struct parse *p;
1149 register cset *cs;
1150 {
1151 register uch h = cs->hash;
1152 register int i;
1153 register cset *top = &p->g->sets[p->g->ncsets];
1154 register cset *cs2;
1155 register size_t css = (size_t)p->g->csetsize;
1156
1157 /* look for an earlier one which is the same */
1158 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1159 if (cs2->hash == h && cs2 != cs) {
1160 /* maybe */
1161 for (i = 0; i < css; i++)
1162 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1163 break; /* no */
1164 if (i == css)
1165 break; /* yes */
1166 }
1167
1168 if (cs2 < top) { /* found one */
1169 freeset(p, cs);
1170 cs = cs2;
1171 }
1172
1173 return((int)(cs - p->g->sets));
1174 }
1175
1176 /*
1177 - firstch - return first character in a set (which must have at least one)
1178 == static int firstch(register struct parse *p, register cset *cs);
1179 */
1180 static int /* character; there is no "none" value */
firstch(p,cs)1181 firstch(p, cs)
1182 register struct parse *p;
1183 register cset *cs;
1184 {
1185 register int i;
1186 register size_t css = (size_t)p->g->csetsize;
1187
1188 for (i = 0; i < css; i++)
1189 if (CHIN(cs, i))
1190 return((char)i);
1191 assert(never);
1192 return(0); /* arbitrary */
1193 }
1194
1195 /*
1196 - nch - number of characters in a set
1197 == static int nch(register struct parse *p, register cset *cs);
1198 */
1199 static int
nch(p,cs)1200 nch(p, cs)
1201 register struct parse *p;
1202 register cset *cs;
1203 {
1204 register int i;
1205 register size_t css = (size_t)p->g->csetsize;
1206 register int n = 0;
1207
1208 for (i = 0; i < css; i++)
1209 if (CHIN(cs, i))
1210 n++;
1211 return(n);
1212 }
1213
1214 /*
1215 - mcadd - add a collating element to a cset
1216 == static void mcadd(register struct parse *p, register cset *cs, \
1217 == register char *cp);
1218 */
1219 static void
mcadd(p,cs,cp)1220 mcadd(p, cs, cp)
1221 register struct parse *p;
1222 register cset *cs;
1223 register char *cp;
1224 {
1225 register size_t oldend = cs->smultis;
1226
1227 cs->smultis += strlen(cp) + 1;
1228 if (cs->multis == NULL)
1229 cs->multis = malloc(cs->smultis);
1230 else
1231 cs->multis = realloc(cs->multis, cs->smultis);
1232 if (cs->multis == NULL) {
1233 SETERROR(REG_ESPACE);
1234 return;
1235 }
1236
1237 (void) strcpy(cs->multis + oldend - 1, cp);
1238 cs->multis[cs->smultis - 1] = '\0';
1239 }
1240
1241 /*
1242 - mcsub - subtract a collating element from a cset
1243 == static void mcsub(register cset *cs, register char *cp);
1244 */
1245 static void
mcsub(cs,cp)1246 mcsub(cs, cp)
1247 register cset *cs;
1248 register char *cp;
1249 {
1250 register char *fp = mcfind(cs, cp);
1251 register size_t len = strlen(fp);
1252
1253 assert(fp != NULL);
1254 (void) memmove(fp, fp + len + 1,
1255 cs->smultis - (fp + len + 1 - cs->multis));
1256 cs->smultis -= len;
1257
1258 if (cs->smultis == 0) {
1259 free(cs->multis);
1260 cs->multis = NULL;
1261 return;
1262 }
1263
1264 cs->multis = realloc(cs->multis, cs->smultis);
1265 assert(cs->multis != NULL);
1266 }
1267
1268 /*
1269 - mcin - is a collating element in a cset?
1270 == static int mcin(register cset *cs, register char *cp);
1271 */
1272 static int
mcin(cs,cp)1273 mcin(cs, cp)
1274 register cset *cs;
1275 register char *cp;
1276 {
1277 return(mcfind(cs, cp) != NULL);
1278 }
1279
1280 /*
1281 - mcfind - find a collating element in a cset
1282 == static char *mcfind(register cset *cs, register char *cp);
1283 */
1284 static char *
mcfind(cs,cp)1285 mcfind(cs, cp)
1286 register cset *cs;
1287 register char *cp;
1288 {
1289 register char *p;
1290
1291 if (cs->multis == NULL)
1292 return(NULL);
1293 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1294 if (strcmp(cp, p) == 0)
1295 return(p);
1296 return(NULL);
1297 }
1298
1299 /*
1300 - mcinvert - invert the list of collating elements in a cset
1301 == static void mcinvert(register struct parse *p, register cset *cs);
1302 *
1303 * This would have to know the set of possibilities. Implementation
1304 * is deferred.
1305 */
1306 static void
mcinvert(p,cs)1307 mcinvert(p, cs)
1308 register struct parse *p;
1309 register cset *cs;
1310 {
1311 assert(cs->multis == NULL); /* xxx */
1312 }
1313
1314 /*
1315 - mccase - add case counterparts of the list of collating elements in a cset
1316 == static void mccase(register struct parse *p, register cset *cs);
1317 *
1318 * This would have to know the set of possibilities. Implementation
1319 * is deferred.
1320 */
1321 static void
mccase(p,cs)1322 mccase(p, cs)
1323 register struct parse *p;
1324 register cset *cs;
1325 {
1326 assert(cs->multis == NULL); /* xxx */
1327 }
1328
1329 /*
1330 - isinsets - is this character in any sets?
1331 == static int isinsets(register struct re_guts *g, int c);
1332 */
1333 static int /* predicate */
isinsets(g,c)1334 isinsets(g, c)
1335 register struct re_guts *g;
1336 int c;
1337 {
1338 register uch *col;
1339 register int i;
1340 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1341 register unsigned uc = (unsigned char)c;
1342
1343 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1344 if (col[uc] != 0)
1345 return(1);
1346 return(0);
1347 }
1348
1349 /*
1350 - samesets - are these two characters in exactly the same sets?
1351 == static int samesets(register struct re_guts *g, int c1, int c2);
1352 */
1353 static int /* predicate */
samesets(g,c1,c2)1354 samesets(g, c1, c2)
1355 register struct re_guts *g;
1356 int c1;
1357 int c2;
1358 {
1359 register uch *col;
1360 register int i;
1361 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1362 register unsigned uc1 = (unsigned char)c1;
1363 register unsigned uc2 = (unsigned char)c2;
1364
1365 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1366 if (col[uc1] != col[uc2])
1367 return(0);
1368 return(1);
1369 }
1370
1371 /*
1372 - categorize - sort out character categories
1373 == static void categorize(struct parse *p, register struct re_guts *g);
1374 */
1375 static void
categorize(p,g)1376 categorize(p, g)
1377 struct parse *p;
1378 register struct re_guts *g;
1379 {
1380 register cat_t *cats = g->categories;
1381 register int c;
1382 register int c2;
1383 register cat_t cat;
1384
1385 /* avoid making error situations worse */
1386 if (p->error != 0)
1387 return;
1388
1389 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1390 if (cats[c] == 0 && isinsets(g, c)) {
1391 cat = g->ncategories++;
1392 cats[c] = cat;
1393 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1394 if (cats[c2] == 0 && samesets(g, c, c2))
1395 cats[c2] = cat;
1396 }
1397 }
1398
1399 /*
1400 - dupl - emit a duplicate of a bunch of sops
1401 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1402 */
1403 static sopno /* start of duplicate */
dupl(p,start,finish)1404 dupl(p, start, finish)
1405 register struct parse *p;
1406 sopno start; /* from here */
1407 sopno finish; /* to this less one */
1408 {
1409 register sopno ret = HERE();
1410 register sopno len = finish - start;
1411
1412 assert(finish >= start);
1413 if (len == 0)
1414 return(ret);
1415 enlarge(p, p->ssize + len); /* this many unexpected additions */
1416 assert(p->ssize >= p->slen + len);
1417 (void) memcpy((char *)(p->strip + p->slen),
1418 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1419 p->slen += len;
1420 return(ret);
1421 }
1422
1423 /*
1424 - doemit - emit a strip operator
1425 == static void doemit(register struct parse *p, sop op, size_t opnd);
1426 *
1427 * It might seem better to implement this as a macro with a function as
1428 * hard-case backup, but it's just too big and messy unless there are
1429 * some changes to the data structures. Maybe later.
1430 */
1431 static void
doemit(p,op,opnd)1432 doemit(p, op, opnd)
1433 register struct parse *p;
1434 sop op;
1435 size_t opnd;
1436 {
1437 /* avoid making error situations worse */
1438 if (p->error != 0)
1439 return;
1440
1441 /* deal with oversize operands ("can't happen", more or less) */
1442 assert(opnd < 1<<OPSHIFT);
1443
1444 /* deal with undersized strip */
1445 if (p->slen >= p->ssize)
1446 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1447 assert(p->slen < p->ssize);
1448
1449 /* finally, it's all reduced to the easy case */
1450 p->strip[p->slen++] = SOP(op, opnd);
1451 }
1452
1453 /*
1454 - doinsert - insert a sop into the strip
1455 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1456 */
1457 static void
doinsert(p,op,opnd,pos)1458 doinsert(p, op, opnd, pos)
1459 register struct parse *p;
1460 sop op;
1461 size_t opnd;
1462 sopno pos;
1463 {
1464 register sopno sn;
1465 register sop s;
1466 register int i;
1467
1468 /* avoid making error situations worse */
1469 if (p->error != 0)
1470 return;
1471
1472 sn = HERE();
1473 EMIT(op, opnd); /* do checks, ensure space */
1474 assert(HERE() == sn+1);
1475 s = p->strip[sn];
1476
1477 /* adjust paren pointers */
1478 assert(pos > 0);
1479 for (i = 1; i < NPAREN; i++) {
1480 if (p->pbegin[i] >= pos) {
1481 p->pbegin[i]++;
1482 }
1483 if (p->pend[i] >= pos) {
1484 p->pend[i]++;
1485 }
1486 }
1487
1488 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1489 (HERE()-pos-1)*sizeof(sop));
1490 p->strip[pos] = s;
1491 }
1492
1493 /*
1494 - dofwd - complete a forward reference
1495 == static void dofwd(register struct parse *p, sopno pos, sop value);
1496 */
1497 static void
dofwd(p,pos,value)1498 dofwd(p, pos, value)
1499 register struct parse *p;
1500 register sopno pos;
1501 sop value;
1502 {
1503 /* avoid making error situations worse */
1504 if (p->error != 0)
1505 return;
1506
1507 assert(value < 1<<OPSHIFT);
1508 p->strip[pos] = OP(p->strip[pos]) | value;
1509 }
1510
1511 /*
1512 - enlarge - enlarge the strip
1513 == static void enlarge(register struct parse *p, sopno size);
1514 */
1515 static void
enlarge(p,size)1516 enlarge(p, size)
1517 register struct parse *p;
1518 register sopno size;
1519 {
1520 register sop *sp;
1521
1522 if (p->ssize >= size)
1523 return;
1524
1525 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1526 if (sp == NULL) {
1527 SETERROR(REG_ESPACE);
1528 return;
1529 }
1530 p->strip = sp;
1531 p->ssize = size;
1532 }
1533
1534 /*
1535 - stripsnug - compact the strip
1536 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1537 */
1538 static void
stripsnug(p,g)1539 stripsnug(p, g)
1540 register struct parse *p;
1541 register struct re_guts *g;
1542 {
1543 g->nstates = p->slen;
1544 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1545 if (g->strip == NULL) {
1546 SETERROR(REG_ESPACE);
1547 g->strip = p->strip;
1548 }
1549 }
1550
1551 /*
1552 - findmust - fill in must and mlen with longest mandatory literal string
1553 == static void findmust(register struct parse *p, register struct re_guts *g);
1554 *
1555 * This algorithm could do fancy things like analyzing the operands of |
1556 * for common subsequences. Someday. This code is simple and finds most
1557 * of the interesting cases.
1558 *
1559 * Note that must and mlen got initialized during setup.
1560 */
1561 static void
findmust(p,g)1562 findmust(p, g)
1563 struct parse *p;
1564 register struct re_guts *g;
1565 {
1566 register sop *scan;
1567 sop *start;
1568 register sop *newstart;
1569 register sopno newlen;
1570 register sop s;
1571 register char *cp;
1572 register sopno i;
1573
1574 /* avoid making error situations worse */
1575 if (p->error != 0)
1576 return;
1577
1578 /* find the longest OCHAR sequence in strip */
1579 newlen = 0;
1580 scan = g->strip + 1;
1581 do {
1582 s = *scan++;
1583 switch (OP(s)) {
1584 case OCHAR: /* sequence member */
1585 if (newlen == 0) /* new sequence */
1586 newstart = scan - 1;
1587 newlen++;
1588 break;
1589 case OPLUS_: /* things that don't break one */
1590 case OLPAREN:
1591 case ORPAREN:
1592 break;
1593 case OQUEST_: /* things that must be skipped */
1594 case OCH_:
1595 scan--;
1596 do {
1597 scan += OPND(s);
1598 s = *scan;
1599 /* assert() interferes w debug printouts */
1600 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1601 OP(s) != OOR2) {
1602 g->iflags |= BAD;
1603 return;
1604 }
1605 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1606 /* fallthrough */
1607 default: /* things that break a sequence */
1608 if (newlen > g->mlen) { /* ends one */
1609 start = newstart;
1610 g->mlen = newlen;
1611 }
1612 newlen = 0;
1613 break;
1614 }
1615 } while (OP(s) != OEND);
1616
1617 if (g->mlen == 0) /* there isn't one */
1618 return;
1619
1620 /* turn it into a character string */
1621 g->must = malloc((size_t)g->mlen + 1);
1622 if (g->must == NULL) { /* argh; just forget it */
1623 g->mlen = 0;
1624 return;
1625 }
1626 cp = g->must;
1627 scan = start;
1628 for (i = g->mlen; i > 0; i--) {
1629 while (OP(s = *scan++) != OCHAR)
1630 continue;
1631 assert(cp < g->must + g->mlen);
1632 *cp++ = (char)OPND(s);
1633 }
1634 assert(cp == g->must + g->mlen);
1635 *cp++ = '\0'; /* just on general principles */
1636 }
1637
1638 /*
1639 - pluscount - count + nesting
1640 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1641 */
1642 static sopno /* nesting depth */
pluscount(p,g)1643 pluscount(p, g)
1644 struct parse *p;
1645 register struct re_guts *g;
1646 {
1647 register sop *scan;
1648 register sop s;
1649 register sopno plusnest = 0;
1650 register sopno maxnest = 0;
1651
1652 if (p->error != 0)
1653 return(0); /* there may not be an OEND */
1654
1655 scan = g->strip + 1;
1656 do {
1657 s = *scan++;
1658 switch (OP(s)) {
1659 case OPLUS_:
1660 plusnest++;
1661 break;
1662 case O_PLUS:
1663 if (plusnest > maxnest)
1664 maxnest = plusnest;
1665 plusnest--;
1666 break;
1667 }
1668 } while (OP(s) != OEND);
1669 if (plusnest != 0)
1670 g->iflags |= BAD;
1671 return(maxnest);
1672 }
1673