1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 8.2 (Berkeley) 03/16/94 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 8.2 (Berkeley) 03/16/94"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <ctype.h> 22 #include <limits.h> 23 #include <stdlib.h> 24 #include <regex.h> 25 26 #include "utils.h" 27 #include "regex2.h" 28 29 #include "cclass.h" 30 #include "cname.h" 31 32 /* 33 * parse structure, passed up and down to avoid global variables and 34 * other clumsinesses 35 */ 36 struct parse { 37 char *next; /* next character in RE */ 38 char *end; /* end of string (-> NUL normally) */ 39 int error; /* has an error been seen? */ 40 sop *strip; /* malloced strip */ 41 sopno ssize; /* malloced strip size (allocated) */ 42 sopno slen; /* malloced strip length (used) */ 43 int ncsalloc; /* number of csets allocated */ 44 struct re_guts *g; 45 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 46 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 47 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 48 }; 49 50 /* ========= begin header generated by ./mkh ========= */ 51 #ifdef __cplusplus 52 extern "C" { 53 #endif 54 55 /* === regcomp.c === */ 56 static void p_ere(register struct parse *p, int stop); 57 static void p_ere_exp(register struct parse *p); 58 static void p_str(register struct parse *p); 59 static void p_bre(register struct parse *p, register int end1, register int end2); 60 static int p_simp_re(register struct parse *p, int starordinary); 61 static int p_count(register struct parse *p); 62 static void p_bracket(register struct parse *p); 63 static void p_b_term(register struct parse *p, register cset *cs); 64 static void p_b_cclass(register struct parse *p, register cset *cs); 65 static void p_b_eclass(register struct parse *p, register cset *cs); 66 static char p_b_symbol(register struct parse *p); 67 static char p_b_coll_elem(register struct parse *p, int endc); 68 static char othercase(int ch); 69 static void bothcases(register struct parse *p, int ch); 70 static void ordinary(register struct parse *p, register int ch); 71 static void nonnewline(register struct parse *p); 72 static void repeat(register struct parse *p, sopno start, int from, int to); 73 static int seterr(register struct parse *p, int e); 74 static cset *allocset(register struct parse *p); 75 static void freeset(register struct parse *p, register cset *cs); 76 static int freezeset(register struct parse *p, register cset *cs); 77 static int firstch(register struct parse *p, register cset *cs); 78 static int nch(register struct parse *p, register cset *cs); 79 static void mcadd(register struct parse *p, register cset *cs, register char *cp); 80 static void mcsub(register cset *cs, register char *cp); 81 static int mcin(register cset *cs, register char *cp); 82 static char *mcfind(register cset *cs, register char *cp); 83 static void mcinvert(register struct parse *p, register cset *cs); 84 static void mccase(register struct parse *p, register cset *cs); 85 static int isinsets(register struct re_guts *g, int c); 86 static int samesets(register struct re_guts *g, int c1, int c2); 87 static void categorize(struct parse *p, register struct re_guts *g); 88 static sopno dupl(register struct parse *p, sopno start, sopno finish); 89 static void doemit(register struct parse *p, sop op, size_t opnd); 90 static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 91 static void dofwd(register struct parse *p, sopno pos, sop value); 92 static void enlarge(register struct parse *p, sopno size); 93 static void stripsnug(register struct parse *p, register struct re_guts *g); 94 static void findmust(register struct parse *p, register struct re_guts *g); 95 static sopno pluscount(register struct parse *p, register struct re_guts *g); 96 97 #ifdef __cplusplus 98 } 99 #endif 100 /* ========= end header generated by ./mkh ========= */ 101 102 static char nuls[10]; /* place to point scanner in event of error */ 103 104 /* 105 * macros for use with parse structure 106 * BEWARE: these know that the parse structure is named `p' !!! 107 */ 108 #define PEEK() (*p->next) 109 #define PEEK2() (*(p->next+1)) 110 #define MORE() (p->next < p->end) 111 #define MORE2() (p->next+1 < p->end) 112 #define SEE(c) (MORE() && PEEK() == (c)) 113 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 114 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 115 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 116 #define NEXT() (p->next++) 117 #define NEXT2() (p->next += 2) 118 #define NEXTn(n) (p->next += (n)) 119 #define GETNEXT() (*p->next++) 120 #define SETERROR(e) seterr(p, (e)) 121 #define REQUIRE(co, e) ((co) || SETERROR(e)) 122 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 123 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 124 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 125 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 126 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 127 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 128 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 129 #define HERE() (p->slen) 130 #define THERE() (p->slen - 1) 131 #define DROP(n) (p->slen -= (n)) 132 133 #ifndef NDEBUG 134 static int never = 0; /* for use in asserts; shuts lint up */ 135 #else 136 #define never 0 /* some <assert.h>s have bugs too */ 137 #endif 138 139 /* 140 - regcomp - interface for parser and compilation 141 = extern int regcomp(regex_t *, const char *, int); 142 = #define REG_BASIC 0000 143 = #define REG_EXTENDED 0001 144 = #define REG_ICASE 0002 145 = #define REG_NOSUB 0004 146 = #define REG_NEWLINE 0010 147 = #define REG_NOSPEC 0020 148 = #define REG_PEND 0040 149 = #define REG_DUMP 0200 150 */ 151 int /* 0 success, otherwise REG_something */ 152 regcomp(preg, pattern, cflags) 153 regex_t *preg; 154 const char *pattern; 155 int cflags; 156 { 157 struct parse pa; 158 register struct re_guts *g; 159 register struct parse *p = &pa; 160 register int i; 161 register size_t len; 162 #ifdef REDEBUG 163 # define GOODFLAGS(f) (f) 164 #else 165 # define GOODFLAGS(f) ((f)&~REG_DUMP) 166 #endif 167 168 cflags = GOODFLAGS(cflags); 169 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 170 return(REG_INVARG); 171 172 if (cflags®_PEND) { 173 if (preg->re_endp < pattern) 174 return(REG_INVARG); 175 len = preg->re_endp - pattern; 176 } else 177 len = strlen((char *)pattern); 178 179 /* do the mallocs early so failure handling is easy */ 180 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 181 (NC-1)*sizeof(cat_t)); 182 if (g == NULL) 183 return(REG_ESPACE); 184 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 185 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 186 p->slen = 0; 187 if (p->strip == NULL) { 188 free((char *)g); 189 return(REG_ESPACE); 190 } 191 192 /* set things up */ 193 p->g = g; 194 p->next = (char *)pattern; /* convenience; we do not modify it */ 195 p->end = p->next + len; 196 p->error = 0; 197 p->ncsalloc = 0; 198 for (i = 0; i < NPAREN; i++) { 199 p->pbegin[i] = 0; 200 p->pend[i] = 0; 201 } 202 g->csetsize = NC; 203 g->sets = NULL; 204 g->setbits = NULL; 205 g->ncsets = 0; 206 g->cflags = cflags; 207 g->iflags = 0; 208 g->nbol = 0; 209 g->neol = 0; 210 g->must = NULL; 211 g->mlen = 0; 212 g->nsub = 0; 213 g->ncategories = 1; /* category 0 is "everything else" */ 214 g->categories = &g->catspace[-(CHAR_MIN)]; 215 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 216 g->backrefs = 0; 217 218 /* do it */ 219 EMIT(OEND, 0); 220 g->firststate = THERE(); 221 if (cflags®_EXTENDED) 222 p_ere(p, OUT); 223 else if (cflags®_NOSPEC) 224 p_str(p); 225 else 226 p_bre(p, OUT, OUT); 227 EMIT(OEND, 0); 228 g->laststate = THERE(); 229 230 /* tidy up loose ends and fill things in */ 231 categorize(p, g); 232 stripsnug(p, g); 233 findmust(p, g); 234 g->nplus = pluscount(p, g); 235 g->magic = MAGIC2; 236 preg->re_nsub = g->nsub; 237 preg->re_g = g; 238 preg->re_magic = MAGIC1; 239 #ifndef REDEBUG 240 /* not debugging, so can't rely on the assert() in regexec() */ 241 if (g->iflags&BAD) 242 SETERROR(REG_ASSERT); 243 #endif 244 245 /* win or lose, we're done */ 246 if (p->error != 0) /* lose */ 247 regfree(preg); 248 return(p->error); 249 } 250 251 /* 252 - p_ere - ERE parser top level, concatenation and alternation 253 == static void p_ere(register struct parse *p, int stop); 254 */ 255 static void 256 p_ere(p, stop) 257 register struct parse *p; 258 int stop; /* character this ERE should end at */ 259 { 260 register char c; 261 register sopno prevback; 262 register sopno prevfwd; 263 register sopno conc; 264 register int first = 1; /* is this the first alternative? */ 265 266 for (;;) { 267 /* do a bunch of concatenated expressions */ 268 conc = HERE(); 269 while (MORE() && (c = PEEK()) != '|' && c != stop) 270 p_ere_exp(p); 271 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 272 273 if (!EAT('|')) 274 break; /* NOTE BREAK OUT */ 275 276 if (first) { 277 INSERT(OCH_, conc); /* offset is wrong */ 278 prevfwd = conc; 279 prevback = conc; 280 first = 0; 281 } 282 ASTERN(OOR1, prevback); 283 prevback = THERE(); 284 AHEAD(prevfwd); /* fix previous offset */ 285 prevfwd = HERE(); 286 EMIT(OOR2, 0); /* offset is very wrong */ 287 } 288 289 if (!first) { /* tail-end fixups */ 290 AHEAD(prevfwd); 291 ASTERN(O_CH, prevback); 292 } 293 294 assert(!MORE() || SEE(stop)); 295 } 296 297 /* 298 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 299 == static void p_ere_exp(register struct parse *p); 300 */ 301 static void 302 p_ere_exp(p) 303 register struct parse *p; 304 { 305 register char c; 306 register sopno pos; 307 register int count; 308 register int count2; 309 register sopno subno; 310 int wascaret = 0; 311 312 assert(MORE()); /* caller should have ensured this */ 313 c = GETNEXT(); 314 315 pos = HERE(); 316 switch (c) { 317 case '(': 318 REQUIRE(MORE(), REG_EPAREN); 319 p->g->nsub++; 320 subno = p->g->nsub; 321 if (subno < NPAREN) 322 p->pbegin[subno] = HERE(); 323 EMIT(OLPAREN, subno); 324 if (!SEE(')')) 325 p_ere(p, ')'); 326 if (subno < NPAREN) { 327 p->pend[subno] = HERE(); 328 assert(p->pend[subno] != 0); 329 } 330 EMIT(ORPAREN, subno); 331 MUSTEAT(')', REG_EPAREN); 332 break; 333 #ifndef POSIX_MISTAKE 334 case ')': /* happens only if no current unmatched ( */ 335 /* 336 * You may ask, why the ifndef? Because I didn't notice 337 * this until slightly too late for 1003.2, and none of the 338 * other 1003.2 regular-expression reviewers noticed it at 339 * all. So an unmatched ) is legal POSIX, at least until 340 * we can get it fixed. 341 */ 342 SETERROR(REG_EPAREN); 343 break; 344 #endif 345 case '^': 346 EMIT(OBOL, 0); 347 p->g->iflags |= USEBOL; 348 p->g->nbol++; 349 wascaret = 1; 350 break; 351 case '$': 352 EMIT(OEOL, 0); 353 p->g->iflags |= USEEOL; 354 p->g->neol++; 355 break; 356 case '|': 357 SETERROR(REG_EMPTY); 358 break; 359 case '*': 360 case '+': 361 case '?': 362 SETERROR(REG_BADRPT); 363 break; 364 case '.': 365 if (p->g->cflags®_NEWLINE) 366 nonnewline(p); 367 else 368 EMIT(OANY, 0); 369 break; 370 case '[': 371 p_bracket(p); 372 break; 373 case '\\': 374 REQUIRE(MORE(), REG_EESCAPE); 375 c = GETNEXT(); 376 ordinary(p, c); 377 break; 378 case '{': /* okay as ordinary except if digit follows */ 379 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 380 /* FALLTHROUGH */ 381 default: 382 ordinary(p, c); 383 break; 384 } 385 386 if (!MORE()) 387 return; 388 c = PEEK(); 389 /* we call { a repetition if followed by a digit */ 390 if (!( c == '*' || c == '+' || c == '?' || 391 (c == '{' && MORE2() && isdigit(PEEK2())) )) 392 return; /* no repetition, we're done */ 393 NEXT(); 394 395 REQUIRE(!wascaret, REG_BADRPT); 396 switch (c) { 397 case '*': /* implemented as +? */ 398 INSERT(OPLUS_, pos); 399 ASTERN(O_PLUS, pos); 400 INSERT(OQUEST_, pos); 401 ASTERN(O_QUEST, pos); 402 break; 403 case '+': 404 INSERT(OPLUS_, pos); 405 ASTERN(O_PLUS, pos); 406 break; 407 case '?': 408 INSERT(OQUEST_, pos); 409 ASTERN(O_QUEST, pos); 410 break; 411 case '{': 412 count = p_count(p); 413 if (EAT(',')) { 414 if (isdigit(PEEK())) { 415 count2 = p_count(p); 416 REQUIRE(count <= count2, REG_BADBR); 417 } else /* single number with comma */ 418 count2 = INFINITY; 419 } else /* just a single number */ 420 count2 = count; 421 repeat(p, pos, count, count2); 422 if (!EAT('}')) { /* error heuristics */ 423 while (MORE() && PEEK() != '}') 424 NEXT(); 425 REQUIRE(MORE(), REG_EBRACE); 426 SETERROR(REG_BADBR); 427 } 428 break; 429 } 430 431 if (!MORE()) 432 return; 433 c = PEEK(); 434 if (!( c == '*' || c == '+' || c == '?' || 435 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 436 return; 437 SETERROR(REG_BADRPT); 438 } 439 440 /* 441 - p_str - string (no metacharacters) "parser" 442 == static void p_str(register struct parse *p); 443 */ 444 static void 445 p_str(p) 446 register struct parse *p; 447 { 448 REQUIRE(MORE(), REG_EMPTY); 449 while (MORE()) 450 ordinary(p, GETNEXT()); 451 } 452 453 /* 454 - p_bre - BRE parser top level, anchoring and concatenation 455 == static void p_bre(register struct parse *p, register int end1, \ 456 == register int end2); 457 * Giving end1 as OUT essentially eliminates the end1/end2 check. 458 * 459 * This implementation is a bit of a kludge, in that a trailing $ is first 460 * taken as an ordinary character and then revised to be an anchor. The 461 * only undesirable side effect is that '$' gets included as a character 462 * category in such cases. This is fairly harmless; not worth fixing. 463 * The amount of lookahead needed to avoid this kludge is excessive. 464 */ 465 static void 466 p_bre(p, end1, end2) 467 register struct parse *p; 468 register int end1; /* first terminating character */ 469 register int end2; /* second terminating character */ 470 { 471 register sopno start = HERE(); 472 register int first = 1; /* first subexpression? */ 473 register int wasdollar = 0; 474 475 if (EAT('^')) { 476 EMIT(OBOL, 0); 477 p->g->iflags |= USEBOL; 478 p->g->nbol++; 479 } 480 while (MORE() && !SEETWO(end1, end2)) { 481 wasdollar = p_simp_re(p, first); 482 first = 0; 483 } 484 if (wasdollar) { /* oops, that was a trailing anchor */ 485 DROP(1); 486 EMIT(OEOL, 0); 487 p->g->iflags |= USEEOL; 488 p->g->neol++; 489 } 490 491 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 492 } 493 494 /* 495 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 496 == static int p_simp_re(register struct parse *p, int starordinary); 497 */ 498 static int /* was the simple RE an unbackslashed $? */ 499 p_simp_re(p, starordinary) 500 register struct parse *p; 501 int starordinary; /* is a leading * an ordinary character? */ 502 { 503 register int c; 504 register int count; 505 register int count2; 506 register sopno pos; 507 register int i; 508 register sopno subno; 509 # define BACKSL (1<<CHAR_BIT) 510 511 pos = HERE(); /* repetion op, if any, covers from here */ 512 513 assert(MORE()); /* caller should have ensured this */ 514 c = GETNEXT(); 515 if (c == '\\') { 516 REQUIRE(MORE(), REG_EESCAPE); 517 c = BACKSL | (unsigned char)GETNEXT(); 518 } 519 switch (c) { 520 case '.': 521 if (p->g->cflags®_NEWLINE) 522 nonnewline(p); 523 else 524 EMIT(OANY, 0); 525 break; 526 case '[': 527 p_bracket(p); 528 break; 529 case BACKSL|'{': 530 SETERROR(REG_BADRPT); 531 break; 532 case BACKSL|'(': 533 p->g->nsub++; 534 subno = p->g->nsub; 535 if (subno < NPAREN) 536 p->pbegin[subno] = HERE(); 537 EMIT(OLPAREN, subno); 538 /* the MORE here is an error heuristic */ 539 if (MORE() && !SEETWO('\\', ')')) 540 p_bre(p, '\\', ')'); 541 if (subno < NPAREN) { 542 p->pend[subno] = HERE(); 543 assert(p->pend[subno] != 0); 544 } 545 EMIT(ORPAREN, subno); 546 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 547 break; 548 case BACKSL|')': /* should not get here -- must be user */ 549 case BACKSL|'}': 550 SETERROR(REG_EPAREN); 551 break; 552 case BACKSL|'1': 553 case BACKSL|'2': 554 case BACKSL|'3': 555 case BACKSL|'4': 556 case BACKSL|'5': 557 case BACKSL|'6': 558 case BACKSL|'7': 559 case BACKSL|'8': 560 case BACKSL|'9': 561 i = (c&~BACKSL) - '0'; 562 assert(i < NPAREN); 563 if (p->pend[i] != 0) { 564 assert(i <= p->g->nsub); 565 EMIT(OBACK_, i); 566 assert(p->pbegin[i] != 0); 567 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 568 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 569 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 570 EMIT(O_BACK, i); 571 } else 572 SETERROR(REG_ESUBREG); 573 p->g->backrefs = 1; 574 break; 575 case '*': 576 REQUIRE(starordinary, REG_BADRPT); 577 /* FALLTHROUGH */ 578 default: 579 ordinary(p, c &~ BACKSL); 580 break; 581 } 582 583 if (EAT('*')) { /* implemented as +? */ 584 INSERT(OPLUS_, pos); 585 ASTERN(O_PLUS, pos); 586 INSERT(OQUEST_, pos); 587 ASTERN(O_QUEST, pos); 588 } else if (EATTWO('\\', '{')) { 589 count = p_count(p); 590 if (EAT(',')) { 591 if (MORE() && isdigit(PEEK())) { 592 count2 = p_count(p); 593 REQUIRE(count <= count2, REG_BADBR); 594 } else /* single number with comma */ 595 count2 = INFINITY; 596 } else /* just a single number */ 597 count2 = count; 598 repeat(p, pos, count, count2); 599 if (!EATTWO('\\', '}')) { /* error heuristics */ 600 while (MORE() && !SEETWO('\\', '}')) 601 NEXT(); 602 REQUIRE(MORE(), REG_EBRACE); 603 SETERROR(REG_BADBR); 604 } 605 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 606 return(1); 607 608 return(0); 609 } 610 611 /* 612 - p_count - parse a repetition count 613 == static int p_count(register struct parse *p); 614 */ 615 static int /* the value */ 616 p_count(p) 617 register struct parse *p; 618 { 619 register int count = 0; 620 register int ndigits = 0; 621 622 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 623 count = count*10 + (GETNEXT() - '0'); 624 ndigits++; 625 } 626 627 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 628 return(count); 629 } 630 631 /* 632 - p_bracket - parse a bracketed character list 633 == static void p_bracket(register struct parse *p); 634 * 635 * Note a significant property of this code: if the allocset() did SETERROR, 636 * no set operations are done. 637 */ 638 static void 639 p_bracket(p) 640 register struct parse *p; 641 { 642 register char c; 643 register cset *cs = allocset(p); 644 register int invert = 0; 645 646 /* Dept of Truly Sickening Special-Case Kludges */ 647 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 648 EMIT(OBOW, 0); 649 NEXTn(6); 650 return; 651 } 652 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 653 EMIT(OEOW, 0); 654 NEXTn(6); 655 return; 656 } 657 658 if (EAT('^')) 659 invert++; /* make note to invert set at end */ 660 if (EAT(']')) 661 CHadd(cs, ']'); 662 else if (EAT('-')) 663 CHadd(cs, '-'); 664 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 665 p_b_term(p, cs); 666 if (EAT('-')) 667 CHadd(cs, '-'); 668 MUSTEAT(']', REG_EBRACK); 669 670 if (p->error != 0) /* don't mess things up further */ 671 return; 672 673 if (p->g->cflags®_ICASE) { 674 register int i; 675 register int ci; 676 677 for (i = p->g->csetsize - 1; i >= 0; i--) 678 if (CHIN(cs, i) && isalpha(i)) { 679 ci = othercase(i); 680 if (ci != i) 681 CHadd(cs, ci); 682 } 683 if (cs->multis != NULL) 684 mccase(p, cs); 685 } 686 if (invert) { 687 register int i; 688 689 for (i = p->g->csetsize - 1; i >= 0; i--) 690 if (CHIN(cs, i)) 691 CHsub(cs, i); 692 else 693 CHadd(cs, i); 694 if (p->g->cflags®_NEWLINE) 695 CHsub(cs, '\n'); 696 if (cs->multis != NULL) 697 mcinvert(p, cs); 698 } 699 700 assert(cs->multis == NULL); /* xxx */ 701 702 if (nch(p, cs) == 1) { /* optimize singleton sets */ 703 ordinary(p, firstch(p, cs)); 704 freeset(p, cs); 705 } else 706 EMIT(OANYOF, freezeset(p, cs)); 707 } 708 709 /* 710 - p_b_term - parse one term of a bracketed character list 711 == static void p_b_term(register struct parse *p, register cset *cs); 712 */ 713 static void 714 p_b_term(p, cs) 715 register struct parse *p; 716 register cset *cs; 717 { 718 register char c; 719 register char start, finish; 720 register int i; 721 722 /* classify what we've got */ 723 switch ((MORE()) ? PEEK() : '\0') { 724 case '[': 725 c = (MORE2()) ? PEEK2() : '\0'; 726 break; 727 case '-': 728 SETERROR(REG_ERANGE); 729 return; /* NOTE RETURN */ 730 break; 731 default: 732 c = '\0'; 733 break; 734 } 735 736 switch (c) { 737 case ':': /* character class */ 738 NEXT2(); 739 REQUIRE(MORE(), REG_EBRACK); 740 c = PEEK(); 741 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 742 p_b_cclass(p, cs); 743 REQUIRE(MORE(), REG_EBRACK); 744 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 745 break; 746 case '=': /* equivalence class */ 747 NEXT2(); 748 REQUIRE(MORE(), REG_EBRACK); 749 c = PEEK(); 750 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 751 p_b_eclass(p, cs); 752 REQUIRE(MORE(), REG_EBRACK); 753 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 754 break; 755 default: /* symbol, ordinary character, or range */ 756 /* xxx revision needed for multichar stuff */ 757 start = p_b_symbol(p); 758 if (SEE('-') && MORE2() && PEEK2() != ']') { 759 /* range */ 760 NEXT(); 761 if (EAT('-')) 762 finish = '-'; 763 else 764 finish = p_b_symbol(p); 765 } else 766 finish = start; 767 /* xxx what about signed chars here... */ 768 REQUIRE(start <= finish, REG_ERANGE); 769 for (i = start; i <= finish; i++) 770 CHadd(cs, i); 771 break; 772 } 773 } 774 775 /* 776 - p_b_cclass - parse a character-class name and deal with it 777 == static void p_b_cclass(register struct parse *p, register cset *cs); 778 */ 779 static void 780 p_b_cclass(p, cs) 781 register struct parse *p; 782 register cset *cs; 783 { 784 register char *sp = p->next; 785 register struct cclass *cp; 786 register size_t len; 787 register char *u; 788 register char c; 789 790 while (MORE() && isalpha(PEEK())) 791 NEXT(); 792 len = p->next - sp; 793 for (cp = cclasses; cp->name != NULL; cp++) 794 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 795 break; 796 if (cp->name == NULL) { 797 /* oops, didn't find it */ 798 SETERROR(REG_ECTYPE); 799 return; 800 } 801 802 u = cp->chars; 803 while ((c = *u++) != '\0') 804 CHadd(cs, c); 805 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 806 MCadd(p, cs, u); 807 } 808 809 /* 810 - p_b_eclass - parse an equivalence-class name and deal with it 811 == static void p_b_eclass(register struct parse *p, register cset *cs); 812 * 813 * This implementation is incomplete. xxx 814 */ 815 static void 816 p_b_eclass(p, cs) 817 register struct parse *p; 818 register cset *cs; 819 { 820 register char c; 821 822 c = p_b_coll_elem(p, '='); 823 CHadd(cs, c); 824 } 825 826 /* 827 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 828 == static char p_b_symbol(register struct parse *p); 829 */ 830 static char /* value of symbol */ 831 p_b_symbol(p) 832 register struct parse *p; 833 { 834 register char value; 835 836 REQUIRE(MORE(), REG_EBRACK); 837 if (!EATTWO('[', '.')) 838 return(GETNEXT()); 839 840 /* collating symbol */ 841 value = p_b_coll_elem(p, '.'); 842 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 843 return(value); 844 } 845 846 /* 847 - p_b_coll_elem - parse a collating-element name and look it up 848 == static char p_b_coll_elem(register struct parse *p, int endc); 849 */ 850 static char /* value of collating element */ 851 p_b_coll_elem(p, endc) 852 register struct parse *p; 853 int endc; /* name ended by endc,']' */ 854 { 855 register char *sp = p->next; 856 register struct cname *cp; 857 register int len; 858 register char c; 859 860 while (MORE() && !SEETWO(endc, ']')) 861 NEXT(); 862 if (!MORE()) { 863 SETERROR(REG_EBRACK); 864 return(0); 865 } 866 len = p->next - sp; 867 for (cp = cnames; cp->name != NULL; cp++) 868 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 869 return(cp->code); /* known name */ 870 if (len == 1) 871 return(*sp); /* single character */ 872 SETERROR(REG_ECOLLATE); /* neither */ 873 return(0); 874 } 875 876 /* 877 - othercase - return the case counterpart of an alphabetic 878 == static char othercase(int ch); 879 */ 880 static char /* if no counterpart, return ch */ 881 othercase(ch) 882 int ch; 883 { 884 assert(isalpha(ch)); 885 if (isupper(ch)) 886 return(tolower(ch)); 887 else if (islower(ch)) 888 return(toupper(ch)); 889 else /* peculiar, but could happen */ 890 return(ch); 891 } 892 893 /* 894 - bothcases - emit a dualcase version of a two-case character 895 == static void bothcases(register struct parse *p, int ch); 896 * 897 * Boy, is this implementation ever a kludge... 898 */ 899 static void 900 bothcases(p, ch) 901 register struct parse *p; 902 int ch; 903 { 904 register char *oldnext = p->next; 905 register char *oldend = p->end; 906 char bracket[3]; 907 908 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 909 p->next = bracket; 910 p->end = bracket+2; 911 bracket[0] = ch; 912 bracket[1] = ']'; 913 bracket[2] = '\0'; 914 p_bracket(p); 915 assert(p->next == bracket+2); 916 p->next = oldnext; 917 p->end = oldend; 918 } 919 920 /* 921 - ordinary - emit an ordinary character 922 == static void ordinary(register struct parse *p, register int ch); 923 */ 924 static void 925 ordinary(p, ch) 926 register struct parse *p; 927 register int ch; 928 { 929 register cat_t *cap = p->g->categories; 930 931 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 932 bothcases(p, ch); 933 else { 934 EMIT(OCHAR, (unsigned char)ch); 935 if (cap[ch] == 0) 936 cap[ch] = p->g->ncategories++; 937 } 938 } 939 940 /* 941 - nonnewline - emit REG_NEWLINE version of OANY 942 == static void nonnewline(register struct parse *p); 943 * 944 * Boy, is this implementation ever a kludge... 945 */ 946 static void 947 nonnewline(p) 948 register struct parse *p; 949 { 950 register char *oldnext = p->next; 951 register char *oldend = p->end; 952 char bracket[4]; 953 954 p->next = bracket; 955 p->end = bracket+3; 956 bracket[0] = '^'; 957 bracket[1] = '\n'; 958 bracket[2] = ']'; 959 bracket[3] = '\0'; 960 p_bracket(p); 961 assert(p->next == bracket+3); 962 p->next = oldnext; 963 p->end = oldend; 964 } 965 966 /* 967 - repeat - generate code for a bounded repetition, recursively if needed 968 == static void repeat(register struct parse *p, sopno start, int from, int to); 969 */ 970 static void 971 repeat(p, start, from, to) 972 register struct parse *p; 973 sopno start; /* operand from here to end of strip */ 974 int from; /* repeated from this number */ 975 int to; /* to this number of times (maybe INFINITY) */ 976 { 977 register sopno finish = HERE(); 978 # define N 2 979 # define INF 3 980 # define REP(f, t) ((f)*8 + (t)) 981 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 982 register sopno copy; 983 984 if (p->error != 0) /* head off possible runaway recursion */ 985 return; 986 987 assert(from <= to); 988 989 switch (REP(MAP(from), MAP(to))) { 990 case REP(0, 0): /* must be user doing this */ 991 DROP(finish-start); /* drop the operand */ 992 break; 993 case REP(0, 1): /* as x{1,1}? */ 994 case REP(0, N): /* as x{1,n}? */ 995 case REP(0, INF): /* as x{1,}? */ 996 INSERT(OQUEST_, start); /* offset is wrong... */ 997 repeat(p, start+1, 1, to); 998 AHEAD(start); /* ... fix it */ 999 ASTERN(O_QUEST, start); 1000 break; 1001 case REP(1, 1): /* trivial case */ 1002 /* done */ 1003 break; 1004 case REP(1, N): /* as x?x{1,n-1} */ 1005 INSERT(OQUEST_, start); 1006 ASTERN(O_QUEST, start); 1007 copy = dupl(p, start+1, finish+1); 1008 assert(copy == finish+2); 1009 repeat(p, copy, 1, to-1); 1010 break; 1011 case REP(1, INF): /* as x+ */ 1012 INSERT(OPLUS_, start); 1013 ASTERN(O_PLUS, start); 1014 break; 1015 case REP(N, N): /* as xx{m-1,n-1} */ 1016 copy = dupl(p, start, finish); 1017 repeat(p, copy, from-1, to-1); 1018 break; 1019 case REP(N, INF): /* as xx{n-1,INF} */ 1020 copy = dupl(p, start, finish); 1021 repeat(p, copy, from-1, to); 1022 break; 1023 default: /* "can't happen" */ 1024 SETERROR(REG_ASSERT); /* just in case */ 1025 break; 1026 } 1027 } 1028 1029 /* 1030 - seterr - set an error condition 1031 == static int seterr(register struct parse *p, int e); 1032 */ 1033 static int /* useless but makes type checking happy */ 1034 seterr(p, e) 1035 register struct parse *p; 1036 int e; 1037 { 1038 if (p->error == 0) /* keep earliest error condition */ 1039 p->error = e; 1040 p->next = nuls; /* try to bring things to a halt */ 1041 p->end = nuls; 1042 return(0); /* make the return value well-defined */ 1043 } 1044 1045 /* 1046 - allocset - allocate a set of characters for [] 1047 == static cset *allocset(register struct parse *p); 1048 */ 1049 static cset * 1050 allocset(p) 1051 register struct parse *p; 1052 { 1053 register int no = p->g->ncsets++; 1054 register size_t nc; 1055 register size_t nbytes; 1056 register cset *cs; 1057 register size_t css = (size_t)p->g->csetsize; 1058 register int i; 1059 1060 if (no >= p->ncsalloc) { /* need another column of space */ 1061 p->ncsalloc += CHAR_BIT; 1062 nc = p->ncsalloc; 1063 assert(nc % CHAR_BIT == 0); 1064 nbytes = nc / CHAR_BIT * css; 1065 if (p->g->sets == NULL) 1066 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1067 else 1068 p->g->sets = (cset *)realloc((char *)p->g->sets, 1069 nc * sizeof(cset)); 1070 if (p->g->setbits == NULL) 1071 p->g->setbits = (uch *)malloc(nbytes); 1072 else { 1073 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1074 nbytes); 1075 /* xxx this isn't right if setbits is now NULL */ 1076 for (i = 0; i < no; i++) 1077 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1078 } 1079 if (p->g->sets != NULL && p->g->setbits != NULL) 1080 (void) memset((char *)p->g->setbits + (nbytes - css), 1081 0, css); 1082 else { 1083 no = 0; 1084 SETERROR(REG_ESPACE); 1085 /* caller's responsibility not to do set ops */ 1086 } 1087 } 1088 1089 assert(p->g->sets != NULL); /* xxx */ 1090 cs = &p->g->sets[no]; 1091 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1092 cs->mask = 1 << ((no) % CHAR_BIT); 1093 cs->hash = 0; 1094 cs->smultis = 0; 1095 cs->multis = NULL; 1096 1097 return(cs); 1098 } 1099 1100 /* 1101 - freeset - free a now-unused set 1102 == static void freeset(register struct parse *p, register cset *cs); 1103 */ 1104 static void 1105 freeset(p, cs) 1106 register struct parse *p; 1107 register cset *cs; 1108 { 1109 register int i; 1110 register cset *top = &p->g->sets[p->g->ncsets]; 1111 register size_t css = (size_t)p->g->csetsize; 1112 1113 for (i = 0; i < css; i++) 1114 CHsub(cs, i); 1115 if (cs == top-1) /* recover only the easy case */ 1116 p->g->ncsets--; 1117 } 1118 1119 /* 1120 - freezeset - final processing on a set of characters 1121 == static int freezeset(register struct parse *p, register cset *cs); 1122 * 1123 * The main task here is merging identical sets. This is usually a waste 1124 * of time (although the hash code minimizes the overhead), but can win 1125 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1126 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1127 * the same value! 1128 */ 1129 static int /* set number */ 1130 freezeset(p, cs) 1131 register struct parse *p; 1132 register cset *cs; 1133 { 1134 register uch h = cs->hash; 1135 register int i; 1136 register cset *top = &p->g->sets[p->g->ncsets]; 1137 register cset *cs2; 1138 register size_t css = (size_t)p->g->csetsize; 1139 1140 /* look for an earlier one which is the same */ 1141 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1142 if (cs2->hash == h && cs2 != cs) { 1143 /* maybe */ 1144 for (i = 0; i < css; i++) 1145 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1146 break; /* no */ 1147 if (i == css) 1148 break; /* yes */ 1149 } 1150 1151 if (cs2 < top) { /* found one */ 1152 freeset(p, cs); 1153 cs = cs2; 1154 } 1155 1156 return((int)(cs - p->g->sets)); 1157 } 1158 1159 /* 1160 - firstch - return first character in a set (which must have at least one) 1161 == static int firstch(register struct parse *p, register cset *cs); 1162 */ 1163 static int /* character; there is no "none" value */ 1164 firstch(p, cs) 1165 register struct parse *p; 1166 register cset *cs; 1167 { 1168 register int i; 1169 register size_t css = (size_t)p->g->csetsize; 1170 1171 for (i = 0; i < css; i++) 1172 if (CHIN(cs, i)) 1173 return((char)i); 1174 assert(never); 1175 return(0); /* arbitrary */ 1176 } 1177 1178 /* 1179 - nch - number of characters in a set 1180 == static int nch(register struct parse *p, register cset *cs); 1181 */ 1182 static int 1183 nch(p, cs) 1184 register struct parse *p; 1185 register cset *cs; 1186 { 1187 register int i; 1188 register size_t css = (size_t)p->g->csetsize; 1189 register int n = 0; 1190 1191 for (i = 0; i < css; i++) 1192 if (CHIN(cs, i)) 1193 n++; 1194 return(n); 1195 } 1196 1197 /* 1198 - mcadd - add a collating element to a cset 1199 == static void mcadd(register struct parse *p, register cset *cs, \ 1200 == register char *cp); 1201 */ 1202 static void 1203 mcadd(p, cs, cp) 1204 register struct parse *p; 1205 register cset *cs; 1206 register char *cp; 1207 { 1208 register size_t oldend = cs->smultis; 1209 1210 cs->smultis += strlen(cp) + 1; 1211 if (cs->multis == NULL) 1212 cs->multis = malloc(cs->smultis); 1213 else 1214 cs->multis = realloc(cs->multis, cs->smultis); 1215 if (cs->multis == NULL) { 1216 SETERROR(REG_ESPACE); 1217 return; 1218 } 1219 1220 (void) strcpy(cs->multis + oldend - 1, cp); 1221 cs->multis[cs->smultis - 1] = '\0'; 1222 } 1223 1224 /* 1225 - mcsub - subtract a collating element from a cset 1226 == static void mcsub(register cset *cs, register char *cp); 1227 */ 1228 static void 1229 mcsub(cs, cp) 1230 register cset *cs; 1231 register char *cp; 1232 { 1233 register char *fp = mcfind(cs, cp); 1234 register size_t len = strlen(fp); 1235 1236 assert(fp != NULL); 1237 (void) memmove(fp, fp + len + 1, 1238 cs->smultis - (fp + len + 1 - cs->multis)); 1239 cs->smultis -= len; 1240 1241 if (cs->smultis == 0) { 1242 free(cs->multis); 1243 cs->multis = NULL; 1244 return; 1245 } 1246 1247 cs->multis = realloc(cs->multis, cs->smultis); 1248 assert(cs->multis != NULL); 1249 } 1250 1251 /* 1252 - mcin - is a collating element in a cset? 1253 == static int mcin(register cset *cs, register char *cp); 1254 */ 1255 static int 1256 mcin(cs, cp) 1257 register cset *cs; 1258 register char *cp; 1259 { 1260 return(mcfind(cs, cp) != NULL); 1261 } 1262 1263 /* 1264 - mcfind - find a collating element in a cset 1265 == static char *mcfind(register cset *cs, register char *cp); 1266 */ 1267 static char * 1268 mcfind(cs, cp) 1269 register cset *cs; 1270 register char *cp; 1271 { 1272 register char *p; 1273 1274 if (cs->multis == NULL) 1275 return(NULL); 1276 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1277 if (strcmp(cp, p) == 0) 1278 return(p); 1279 return(NULL); 1280 } 1281 1282 /* 1283 - mcinvert - invert the list of collating elements in a cset 1284 == static void mcinvert(register struct parse *p, register cset *cs); 1285 * 1286 * This would have to know the set of possibilities. Implementation 1287 * is deferred. 1288 */ 1289 static void 1290 mcinvert(p, cs) 1291 register struct parse *p; 1292 register cset *cs; 1293 { 1294 assert(cs->multis == NULL); /* xxx */ 1295 } 1296 1297 /* 1298 - mccase - add case counterparts of the list of collating elements in a cset 1299 == static void mccase(register struct parse *p, register cset *cs); 1300 * 1301 * This would have to know the set of possibilities. Implementation 1302 * is deferred. 1303 */ 1304 static void 1305 mccase(p, cs) 1306 register struct parse *p; 1307 register cset *cs; 1308 { 1309 assert(cs->multis == NULL); /* xxx */ 1310 } 1311 1312 /* 1313 - isinsets - is this character in any sets? 1314 == static int isinsets(register struct re_guts *g, int c); 1315 */ 1316 static int /* predicate */ 1317 isinsets(g, c) 1318 register struct re_guts *g; 1319 int c; 1320 { 1321 register uch *col; 1322 register int i; 1323 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1324 register unsigned uc = (unsigned char)c; 1325 1326 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1327 if (col[uc] != 0) 1328 return(1); 1329 return(0); 1330 } 1331 1332 /* 1333 - samesets - are these two characters in exactly the same sets? 1334 == static int samesets(register struct re_guts *g, int c1, int c2); 1335 */ 1336 static int /* predicate */ 1337 samesets(g, c1, c2) 1338 register struct re_guts *g; 1339 int c1; 1340 int c2; 1341 { 1342 register uch *col; 1343 register int i; 1344 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1345 register unsigned uc1 = (unsigned char)c1; 1346 register unsigned uc2 = (unsigned char)c2; 1347 1348 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1349 if (col[uc1] != col[uc2]) 1350 return(0); 1351 return(1); 1352 } 1353 1354 /* 1355 - categorize - sort out character categories 1356 == static void categorize(struct parse *p, register struct re_guts *g); 1357 */ 1358 static void 1359 categorize(p, g) 1360 struct parse *p; 1361 register struct re_guts *g; 1362 { 1363 register cat_t *cats = g->categories; 1364 register int c; 1365 register int c2; 1366 register cat_t cat; 1367 1368 /* avoid making error situations worse */ 1369 if (p->error != 0) 1370 return; 1371 1372 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1373 if (cats[c] == 0 && isinsets(g, c)) { 1374 cat = g->ncategories++; 1375 cats[c] = cat; 1376 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1377 if (cats[c2] == 0 && samesets(g, c, c2)) 1378 cats[c2] = cat; 1379 } 1380 } 1381 1382 /* 1383 - dupl - emit a duplicate of a bunch of sops 1384 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1385 */ 1386 static sopno /* start of duplicate */ 1387 dupl(p, start, finish) 1388 register struct parse *p; 1389 sopno start; /* from here */ 1390 sopno finish; /* to this less one */ 1391 { 1392 register sopno ret = HERE(); 1393 register sopno len = finish - start; 1394 1395 assert(finish >= start); 1396 if (len == 0) 1397 return(ret); 1398 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1399 assert(p->ssize >= p->slen + len); 1400 (void) memcpy((char *)(p->strip + p->slen), 1401 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1402 p->slen += len; 1403 return(ret); 1404 } 1405 1406 /* 1407 - doemit - emit a strip operator 1408 == static void doemit(register struct parse *p, sop op, size_t opnd); 1409 * 1410 * It might seem better to implement this as a macro with a function as 1411 * hard-case backup, but it's just too big and messy unless there are 1412 * some changes to the data structures. Maybe later. 1413 */ 1414 static void 1415 doemit(p, op, opnd) 1416 register struct parse *p; 1417 sop op; 1418 size_t opnd; 1419 { 1420 /* avoid making error situations worse */ 1421 if (p->error != 0) 1422 return; 1423 1424 /* deal with oversize operands ("can't happen", more or less) */ 1425 assert(opnd < 1<<OPSHIFT); 1426 1427 /* deal with undersized strip */ 1428 if (p->slen >= p->ssize) 1429 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1430 assert(p->slen < p->ssize); 1431 1432 /* finally, it's all reduced to the easy case */ 1433 p->strip[p->slen++] = SOP(op, opnd); 1434 } 1435 1436 /* 1437 - doinsert - insert a sop into the strip 1438 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1439 */ 1440 static void 1441 doinsert(p, op, opnd, pos) 1442 register struct parse *p; 1443 sop op; 1444 size_t opnd; 1445 sopno pos; 1446 { 1447 register sopno sn; 1448 register sop s; 1449 register int i; 1450 1451 /* avoid making error situations worse */ 1452 if (p->error != 0) 1453 return; 1454 1455 sn = HERE(); 1456 EMIT(op, opnd); /* do checks, ensure space */ 1457 assert(HERE() == sn+1); 1458 s = p->strip[sn]; 1459 1460 /* adjust paren pointers */ 1461 assert(pos > 0); 1462 for (i = 1; i < NPAREN; i++) { 1463 if (p->pbegin[i] >= pos) { 1464 p->pbegin[i]++; 1465 } 1466 if (p->pend[i] >= pos) { 1467 p->pend[i]++; 1468 } 1469 } 1470 1471 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1472 (HERE()-pos-1)*sizeof(sop)); 1473 p->strip[pos] = s; 1474 } 1475 1476 /* 1477 - dofwd - complete a forward reference 1478 == static void dofwd(register struct parse *p, sopno pos, sop value); 1479 */ 1480 static void 1481 dofwd(p, pos, value) 1482 register struct parse *p; 1483 register sopno pos; 1484 sop value; 1485 { 1486 /* avoid making error situations worse */ 1487 if (p->error != 0) 1488 return; 1489 1490 assert(value < 1<<OPSHIFT); 1491 p->strip[pos] = OP(p->strip[pos]) | value; 1492 } 1493 1494 /* 1495 - enlarge - enlarge the strip 1496 == static void enlarge(register struct parse *p, sopno size); 1497 */ 1498 static void 1499 enlarge(p, size) 1500 register struct parse *p; 1501 register sopno size; 1502 { 1503 register sop *sp; 1504 1505 if (p->ssize >= size) 1506 return; 1507 1508 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1509 if (sp == NULL) { 1510 SETERROR(REG_ESPACE); 1511 return; 1512 } 1513 p->strip = sp; 1514 p->ssize = size; 1515 } 1516 1517 /* 1518 - stripsnug - compact the strip 1519 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1520 */ 1521 static void 1522 stripsnug(p, g) 1523 register struct parse *p; 1524 register struct re_guts *g; 1525 { 1526 g->nstates = p->slen; 1527 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1528 if (g->strip == NULL) { 1529 SETERROR(REG_ESPACE); 1530 g->strip = p->strip; 1531 } 1532 } 1533 1534 /* 1535 - findmust - fill in must and mlen with longest mandatory literal string 1536 == static void findmust(register struct parse *p, register struct re_guts *g); 1537 * 1538 * This algorithm could do fancy things like analyzing the operands of | 1539 * for common subsequences. Someday. This code is simple and finds most 1540 * of the interesting cases. 1541 * 1542 * Note that must and mlen got initialized during setup. 1543 */ 1544 static void 1545 findmust(p, g) 1546 struct parse *p; 1547 register struct re_guts *g; 1548 { 1549 register sop *scan; 1550 sop *start; 1551 register sop *newstart; 1552 register sopno newlen; 1553 register sop s; 1554 register char *cp; 1555 register sopno i; 1556 1557 /* avoid making error situations worse */ 1558 if (p->error != 0) 1559 return; 1560 1561 /* find the longest OCHAR sequence in strip */ 1562 newlen = 0; 1563 scan = g->strip + 1; 1564 do { 1565 s = *scan++; 1566 switch (OP(s)) { 1567 case OCHAR: /* sequence member */ 1568 if (newlen == 0) /* new sequence */ 1569 newstart = scan - 1; 1570 newlen++; 1571 break; 1572 case OPLUS_: /* things that don't break one */ 1573 case OLPAREN: 1574 case ORPAREN: 1575 break; 1576 case OQUEST_: /* things that must be skipped */ 1577 case OCH_: 1578 scan--; 1579 do { 1580 scan += OPND(s); 1581 s = *scan; 1582 /* assert() interferes w debug printouts */ 1583 if (OP(s) != O_QUEST && OP(s) != O_CH && 1584 OP(s) != OOR2) { 1585 g->iflags |= BAD; 1586 return; 1587 } 1588 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1589 /* fallthrough */ 1590 default: /* things that break a sequence */ 1591 if (newlen > g->mlen) { /* ends one */ 1592 start = newstart; 1593 g->mlen = newlen; 1594 } 1595 newlen = 0; 1596 break; 1597 } 1598 } while (OP(s) != OEND); 1599 1600 if (g->mlen == 0) /* there isn't one */ 1601 return; 1602 1603 /* turn it into a character string */ 1604 g->must = malloc((size_t)g->mlen + 1); 1605 if (g->must == NULL) { /* argh; just forget it */ 1606 g->mlen = 0; 1607 return; 1608 } 1609 cp = g->must; 1610 scan = start; 1611 for (i = g->mlen; i > 0; i--) { 1612 while (OP(s = *scan++) != OCHAR) 1613 continue; 1614 assert(cp < g->must + g->mlen); 1615 *cp++ = (char)OPND(s); 1616 } 1617 assert(cp == g->must + g->mlen); 1618 *cp++ = '\0'; /* just on general principles */ 1619 } 1620 1621 /* 1622 - pluscount - count + nesting 1623 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1624 */ 1625 static sopno /* nesting depth */ 1626 pluscount(p, g) 1627 struct parse *p; 1628 register struct re_guts *g; 1629 { 1630 register sop *scan; 1631 register sop s; 1632 register sopno plusnest = 0; 1633 register sopno maxnest = 0; 1634 1635 if (p->error != 0) 1636 return(0); /* there may not be an OEND */ 1637 1638 scan = g->strip + 1; 1639 do { 1640 s = *scan++; 1641 switch (OP(s)) { 1642 case OPLUS_: 1643 plusnest++; 1644 break; 1645 case O_PLUS: 1646 if (plusnest > maxnest) 1647 maxnest = plusnest; 1648 plusnest--; 1649 break; 1650 } 1651 } while (OP(s) != OEND); 1652 if (plusnest != 0) 1653 g->iflags |= BAD; 1654 return(maxnest); 1655 } 1656