1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 8.5 (Berkeley) 03/20/94 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 03/20/94"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <ctype.h> 22 #include <limits.h> 23 #include <stdlib.h> 24 #include <regex.h> 25 26 #include "utils.h" 27 #include "regex2.h" 28 29 #include "cclass.h" 30 #include "cname.h" 31 32 /* 33 * parse structure, passed up and down to avoid global variables and 34 * other clumsinesses 35 */ 36 struct parse { 37 char *next; /* next character in RE */ 38 char *end; /* end of string (-> NUL normally) */ 39 int error; /* has an error been seen? */ 40 sop *strip; /* malloced strip */ 41 sopno ssize; /* malloced strip size (allocated) */ 42 sopno slen; /* malloced strip length (used) */ 43 int ncsalloc; /* number of csets allocated */ 44 struct re_guts *g; 45 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 46 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 47 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 48 }; 49 50 /* ========= begin header generated by ./mkh ========= */ 51 #ifdef __cplusplus 52 extern "C" { 53 #endif 54 55 /* === regcomp.c === */ 56 static void p_ere __P((struct parse *p, int stop)); 57 static void p_ere_exp __P((struct parse *p)); 58 static void p_str __P((struct parse *p)); 59 static void p_bre __P((struct parse *p, int end1, int end2)); 60 static int p_simp_re __P((struct parse *p, int starordinary)); 61 static int p_count __P((struct parse *p)); 62 static void p_bracket __P((struct parse *p)); 63 static void p_b_term __P((struct parse *p, cset *cs)); 64 static void p_b_cclass __P((struct parse *p, cset *cs)); 65 static void p_b_eclass __P((struct parse *p, cset *cs)); 66 static char p_b_symbol __P((struct parse *p)); 67 static char p_b_coll_elem __P((struct parse *p, int endc)); 68 static char othercase __P((int ch)); 69 static void bothcases __P((struct parse *p, int ch)); 70 static void ordinary __P((struct parse *p, int ch)); 71 static void nonnewline __P((struct parse *p)); 72 static void repeat __P((struct parse *p, sopno start, int from, int to)); 73 static int seterr __P((struct parse *p, int e)); 74 static cset *allocset __P((struct parse *p)); 75 static void freeset __P((struct parse *p, cset *cs)); 76 static int freezeset __P((struct parse *p, cset *cs)); 77 static int firstch __P((struct parse *p, cset *cs)); 78 static int nch __P((struct parse *p, cset *cs)); 79 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 80 static void mcsub __P((cset *cs, char *cp)); 81 static int mcin __P((cset *cs, char *cp)); 82 static char *mcfind __P((cset *cs, char *cp)); 83 static void mcinvert __P((struct parse *p, cset *cs)); 84 static void mccase __P((struct parse *p, cset *cs)); 85 static int isinsets __P((struct re_guts *g, int c)); 86 static int samesets __P((struct re_guts *g, int c1, int c2)); 87 static void categorize __P((struct parse *p, struct re_guts *g)); 88 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 89 static void doemit __P((struct parse *p, sop op, size_t opnd)); 90 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 91 static void dofwd __P((struct parse *p, sopno pos, sop value)); 92 static void enlarge __P((struct parse *p, sopno size)); 93 static void stripsnug __P((struct parse *p, struct re_guts *g)); 94 static void findmust __P((struct parse *p, struct re_guts *g)); 95 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 96 97 #ifdef __cplusplus 98 } 99 #endif 100 /* ========= end header generated by ./mkh ========= */ 101 102 static char nuls[10]; /* place to point scanner in event of error */ 103 104 /* 105 * macros for use with parse structure 106 * BEWARE: these know that the parse structure is named `p' !!! 107 */ 108 #define PEEK() (*p->next) 109 #define PEEK2() (*(p->next+1)) 110 #define MORE() (p->next < p->end) 111 #define MORE2() (p->next+1 < p->end) 112 #define SEE(c) (MORE() && PEEK() == (c)) 113 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 114 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 115 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 116 #define NEXT() (p->next++) 117 #define NEXT2() (p->next += 2) 118 #define NEXTn(n) (p->next += (n)) 119 #define GETNEXT() (*p->next++) 120 #define SETERROR(e) seterr(p, (e)) 121 #define REQUIRE(co, e) ((co) || SETERROR(e)) 122 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 123 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 124 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 125 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 126 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 127 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 128 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 129 #define HERE() (p->slen) 130 #define THERE() (p->slen - 1) 131 #define THERETHERE() (p->slen - 2) 132 #define DROP(n) (p->slen -= (n)) 133 134 #ifndef NDEBUG 135 static int never = 0; /* for use in asserts; shuts lint up */ 136 #else 137 #define never 0 /* some <assert.h>s have bugs too */ 138 #endif 139 140 /* 141 - regcomp - interface for parser and compilation 142 = extern int regcomp(regex_t *, const char *, int); 143 = #define REG_BASIC 0000 144 = #define REG_EXTENDED 0001 145 = #define REG_ICASE 0002 146 = #define REG_NOSUB 0004 147 = #define REG_NEWLINE 0010 148 = #define REG_NOSPEC 0020 149 = #define REG_PEND 0040 150 = #define REG_DUMP 0200 151 */ 152 int /* 0 success, otherwise REG_something */ 153 regcomp(preg, pattern, cflags) 154 regex_t *preg; 155 const char *pattern; 156 int cflags; 157 { 158 struct parse pa; 159 register struct re_guts *g; 160 register struct parse *p = &pa; 161 register int i; 162 register size_t len; 163 #ifdef REDEBUG 164 # define GOODFLAGS(f) (f) 165 #else 166 # define GOODFLAGS(f) ((f)&~REG_DUMP) 167 #endif 168 169 cflags = GOODFLAGS(cflags); 170 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 171 return(REG_INVARG); 172 173 if (cflags®_PEND) { 174 if (preg->re_endp < pattern) 175 return(REG_INVARG); 176 len = preg->re_endp - pattern; 177 } else 178 len = strlen((char *)pattern); 179 180 /* do the mallocs early so failure handling is easy */ 181 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 182 (NC-1)*sizeof(cat_t)); 183 if (g == NULL) 184 return(REG_ESPACE); 185 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 186 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 187 p->slen = 0; 188 if (p->strip == NULL) { 189 free((char *)g); 190 return(REG_ESPACE); 191 } 192 193 /* set things up */ 194 p->g = g; 195 p->next = (char *)pattern; /* convenience; we do not modify it */ 196 p->end = p->next + len; 197 p->error = 0; 198 p->ncsalloc = 0; 199 for (i = 0; i < NPAREN; i++) { 200 p->pbegin[i] = 0; 201 p->pend[i] = 0; 202 } 203 g->csetsize = NC; 204 g->sets = NULL; 205 g->setbits = NULL; 206 g->ncsets = 0; 207 g->cflags = cflags; 208 g->iflags = 0; 209 g->nbol = 0; 210 g->neol = 0; 211 g->must = NULL; 212 g->mlen = 0; 213 g->nsub = 0; 214 g->ncategories = 1; /* category 0 is "everything else" */ 215 g->categories = &g->catspace[-(CHAR_MIN)]; 216 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 217 g->backrefs = 0; 218 219 /* do it */ 220 EMIT(OEND, 0); 221 g->firststate = THERE(); 222 if (cflags®_EXTENDED) 223 p_ere(p, OUT); 224 else if (cflags®_NOSPEC) 225 p_str(p); 226 else 227 p_bre(p, OUT, OUT); 228 EMIT(OEND, 0); 229 g->laststate = THERE(); 230 231 /* tidy up loose ends and fill things in */ 232 categorize(p, g); 233 stripsnug(p, g); 234 findmust(p, g); 235 g->nplus = pluscount(p, g); 236 g->magic = MAGIC2; 237 preg->re_nsub = g->nsub; 238 preg->re_g = g; 239 preg->re_magic = MAGIC1; 240 #ifndef REDEBUG 241 /* not debugging, so can't rely on the assert() in regexec() */ 242 if (g->iflags&BAD) 243 SETERROR(REG_ASSERT); 244 #endif 245 246 /* win or lose, we're done */ 247 if (p->error != 0) /* lose */ 248 regfree(preg); 249 return(p->error); 250 } 251 252 /* 253 - p_ere - ERE parser top level, concatenation and alternation 254 == static void p_ere(register struct parse *p, int stop); 255 */ 256 static void 257 p_ere(p, stop) 258 register struct parse *p; 259 int stop; /* character this ERE should end at */ 260 { 261 register char c; 262 register sopno prevback; 263 register sopno prevfwd; 264 register sopno conc; 265 register int first = 1; /* is this the first alternative? */ 266 267 for (;;) { 268 /* do a bunch of concatenated expressions */ 269 conc = HERE(); 270 while (MORE() && (c = PEEK()) != '|' && c != stop) 271 p_ere_exp(p); 272 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 273 274 if (!EAT('|')) 275 break; /* NOTE BREAK OUT */ 276 277 if (first) { 278 INSERT(OCH_, conc); /* offset is wrong */ 279 prevfwd = conc; 280 prevback = conc; 281 first = 0; 282 } 283 ASTERN(OOR1, prevback); 284 prevback = THERE(); 285 AHEAD(prevfwd); /* fix previous offset */ 286 prevfwd = HERE(); 287 EMIT(OOR2, 0); /* offset is very wrong */ 288 } 289 290 if (!first) { /* tail-end fixups */ 291 AHEAD(prevfwd); 292 ASTERN(O_CH, prevback); 293 } 294 295 assert(!MORE() || SEE(stop)); 296 } 297 298 /* 299 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 300 == static void p_ere_exp(register struct parse *p); 301 */ 302 static void 303 p_ere_exp(p) 304 register struct parse *p; 305 { 306 register char c; 307 register sopno pos; 308 register int count; 309 register int count2; 310 register sopno subno; 311 int wascaret = 0; 312 313 assert(MORE()); /* caller should have ensured this */ 314 c = GETNEXT(); 315 316 pos = HERE(); 317 switch (c) { 318 case '(': 319 REQUIRE(MORE(), REG_EPAREN); 320 p->g->nsub++; 321 subno = p->g->nsub; 322 if (subno < NPAREN) 323 p->pbegin[subno] = HERE(); 324 EMIT(OLPAREN, subno); 325 if (!SEE(')')) 326 p_ere(p, ')'); 327 if (subno < NPAREN) { 328 p->pend[subno] = HERE(); 329 assert(p->pend[subno] != 0); 330 } 331 EMIT(ORPAREN, subno); 332 MUSTEAT(')', REG_EPAREN); 333 break; 334 #ifndef POSIX_MISTAKE 335 case ')': /* happens only if no current unmatched ( */ 336 /* 337 * You may ask, why the ifndef? Because I didn't notice 338 * this until slightly too late for 1003.2, and none of the 339 * other 1003.2 regular-expression reviewers noticed it at 340 * all. So an unmatched ) is legal POSIX, at least until 341 * we can get it fixed. 342 */ 343 SETERROR(REG_EPAREN); 344 break; 345 #endif 346 case '^': 347 EMIT(OBOL, 0); 348 p->g->iflags |= USEBOL; 349 p->g->nbol++; 350 wascaret = 1; 351 break; 352 case '$': 353 EMIT(OEOL, 0); 354 p->g->iflags |= USEEOL; 355 p->g->neol++; 356 break; 357 case '|': 358 SETERROR(REG_EMPTY); 359 break; 360 case '*': 361 case '+': 362 case '?': 363 SETERROR(REG_BADRPT); 364 break; 365 case '.': 366 if (p->g->cflags®_NEWLINE) 367 nonnewline(p); 368 else 369 EMIT(OANY, 0); 370 break; 371 case '[': 372 p_bracket(p); 373 break; 374 case '\\': 375 REQUIRE(MORE(), REG_EESCAPE); 376 c = GETNEXT(); 377 ordinary(p, c); 378 break; 379 case '{': /* okay as ordinary except if digit follows */ 380 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 381 /* FALLTHROUGH */ 382 default: 383 ordinary(p, c); 384 break; 385 } 386 387 if (!MORE()) 388 return; 389 c = PEEK(); 390 /* we call { a repetition if followed by a digit */ 391 if (!( c == '*' || c == '+' || c == '?' || 392 (c == '{' && MORE2() && isdigit(PEEK2())) )) 393 return; /* no repetition, we're done */ 394 NEXT(); 395 396 REQUIRE(!wascaret, REG_BADRPT); 397 switch (c) { 398 case '*': /* implemented as +? */ 399 /* this case does not require the (y|) trick, noKLUDGE */ 400 INSERT(OPLUS_, pos); 401 ASTERN(O_PLUS, pos); 402 INSERT(OQUEST_, pos); 403 ASTERN(O_QUEST, pos); 404 break; 405 case '+': 406 INSERT(OPLUS_, pos); 407 ASTERN(O_PLUS, pos); 408 break; 409 case '?': 410 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 411 INSERT(OCH_, pos); /* offset slightly wrong */ 412 ASTERN(OOR1, pos); /* this one's right */ 413 AHEAD(pos); /* fix the OCH_ */ 414 EMIT(OOR2, 0); /* offset very wrong... */ 415 AHEAD(THERE()); /* ...so fix it */ 416 ASTERN(O_CH, THERETHERE()); 417 break; 418 case '{': 419 count = p_count(p); 420 if (EAT(',')) { 421 if (isdigit(PEEK())) { 422 count2 = p_count(p); 423 REQUIRE(count <= count2, REG_BADBR); 424 } else /* single number with comma */ 425 count2 = INFINITY; 426 } else /* just a single number */ 427 count2 = count; 428 repeat(p, pos, count, count2); 429 if (!EAT('}')) { /* error heuristics */ 430 while (MORE() && PEEK() != '}') 431 NEXT(); 432 REQUIRE(MORE(), REG_EBRACE); 433 SETERROR(REG_BADBR); 434 } 435 break; 436 } 437 438 if (!MORE()) 439 return; 440 c = PEEK(); 441 if (!( c == '*' || c == '+' || c == '?' || 442 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 443 return; 444 SETERROR(REG_BADRPT); 445 } 446 447 /* 448 - p_str - string (no metacharacters) "parser" 449 == static void p_str(register struct parse *p); 450 */ 451 static void 452 p_str(p) 453 register struct parse *p; 454 { 455 REQUIRE(MORE(), REG_EMPTY); 456 while (MORE()) 457 ordinary(p, GETNEXT()); 458 } 459 460 /* 461 - p_bre - BRE parser top level, anchoring and concatenation 462 == static void p_bre(register struct parse *p, register int end1, \ 463 == register int end2); 464 * Giving end1 as OUT essentially eliminates the end1/end2 check. 465 * 466 * This implementation is a bit of a kludge, in that a trailing $ is first 467 * taken as an ordinary character and then revised to be an anchor. The 468 * only undesirable side effect is that '$' gets included as a character 469 * category in such cases. This is fairly harmless; not worth fixing. 470 * The amount of lookahead needed to avoid this kludge is excessive. 471 */ 472 static void 473 p_bre(p, end1, end2) 474 register struct parse *p; 475 register int end1; /* first terminating character */ 476 register int end2; /* second terminating character */ 477 { 478 register sopno start = HERE(); 479 register int first = 1; /* first subexpression? */ 480 register int wasdollar = 0; 481 482 if (EAT('^')) { 483 EMIT(OBOL, 0); 484 p->g->iflags |= USEBOL; 485 p->g->nbol++; 486 } 487 while (MORE() && !SEETWO(end1, end2)) { 488 wasdollar = p_simp_re(p, first); 489 first = 0; 490 } 491 if (wasdollar) { /* oops, that was a trailing anchor */ 492 DROP(1); 493 EMIT(OEOL, 0); 494 p->g->iflags |= USEEOL; 495 p->g->neol++; 496 } 497 498 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 499 } 500 501 /* 502 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 503 == static int p_simp_re(register struct parse *p, int starordinary); 504 */ 505 static int /* was the simple RE an unbackslashed $? */ 506 p_simp_re(p, starordinary) 507 register struct parse *p; 508 int starordinary; /* is a leading * an ordinary character? */ 509 { 510 register int c; 511 register int count; 512 register int count2; 513 register sopno pos; 514 register int i; 515 register sopno subno; 516 # define BACKSL (1<<CHAR_BIT) 517 518 pos = HERE(); /* repetion op, if any, covers from here */ 519 520 assert(MORE()); /* caller should have ensured this */ 521 c = GETNEXT(); 522 if (c == '\\') { 523 REQUIRE(MORE(), REG_EESCAPE); 524 c = BACKSL | (unsigned char)GETNEXT(); 525 } 526 switch (c) { 527 case '.': 528 if (p->g->cflags®_NEWLINE) 529 nonnewline(p); 530 else 531 EMIT(OANY, 0); 532 break; 533 case '[': 534 p_bracket(p); 535 break; 536 case BACKSL|'{': 537 SETERROR(REG_BADRPT); 538 break; 539 case BACKSL|'(': 540 p->g->nsub++; 541 subno = p->g->nsub; 542 if (subno < NPAREN) 543 p->pbegin[subno] = HERE(); 544 EMIT(OLPAREN, subno); 545 /* the MORE here is an error heuristic */ 546 if (MORE() && !SEETWO('\\', ')')) 547 p_bre(p, '\\', ')'); 548 if (subno < NPAREN) { 549 p->pend[subno] = HERE(); 550 assert(p->pend[subno] != 0); 551 } 552 EMIT(ORPAREN, subno); 553 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 554 break; 555 case BACKSL|')': /* should not get here -- must be user */ 556 case BACKSL|'}': 557 SETERROR(REG_EPAREN); 558 break; 559 case BACKSL|'1': 560 case BACKSL|'2': 561 case BACKSL|'3': 562 case BACKSL|'4': 563 case BACKSL|'5': 564 case BACKSL|'6': 565 case BACKSL|'7': 566 case BACKSL|'8': 567 case BACKSL|'9': 568 i = (c&~BACKSL) - '0'; 569 assert(i < NPAREN); 570 if (p->pend[i] != 0) { 571 assert(i <= p->g->nsub); 572 EMIT(OBACK_, i); 573 assert(p->pbegin[i] != 0); 574 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 575 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 576 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 577 EMIT(O_BACK, i); 578 } else 579 SETERROR(REG_ESUBREG); 580 p->g->backrefs = 1; 581 break; 582 case '*': 583 REQUIRE(starordinary, REG_BADRPT); 584 /* FALLTHROUGH */ 585 default: 586 ordinary(p, c &~ BACKSL); 587 break; 588 } 589 590 if (EAT('*')) { /* implemented as +? */ 591 /* this case does not require the (y|) trick, noKLUDGE */ 592 INSERT(OPLUS_, pos); 593 ASTERN(O_PLUS, pos); 594 INSERT(OQUEST_, pos); 595 ASTERN(O_QUEST, pos); 596 } else if (EATTWO('\\', '{')) { 597 count = p_count(p); 598 if (EAT(',')) { 599 if (MORE() && isdigit(PEEK())) { 600 count2 = p_count(p); 601 REQUIRE(count <= count2, REG_BADBR); 602 } else /* single number with comma */ 603 count2 = INFINITY; 604 } else /* just a single number */ 605 count2 = count; 606 repeat(p, pos, count, count2); 607 if (!EATTWO('\\', '}')) { /* error heuristics */ 608 while (MORE() && !SEETWO('\\', '}')) 609 NEXT(); 610 REQUIRE(MORE(), REG_EBRACE); 611 SETERROR(REG_BADBR); 612 } 613 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 614 return(1); 615 616 return(0); 617 } 618 619 /* 620 - p_count - parse a repetition count 621 == static int p_count(register struct parse *p); 622 */ 623 static int /* the value */ 624 p_count(p) 625 register struct parse *p; 626 { 627 register int count = 0; 628 register int ndigits = 0; 629 630 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 631 count = count*10 + (GETNEXT() - '0'); 632 ndigits++; 633 } 634 635 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 636 return(count); 637 } 638 639 /* 640 - p_bracket - parse a bracketed character list 641 == static void p_bracket(register struct parse *p); 642 * 643 * Note a significant property of this code: if the allocset() did SETERROR, 644 * no set operations are done. 645 */ 646 static void 647 p_bracket(p) 648 register struct parse *p; 649 { 650 register char c; 651 register cset *cs = allocset(p); 652 register int invert = 0; 653 654 /* Dept of Truly Sickening Special-Case Kludges */ 655 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 656 EMIT(OBOW, 0); 657 NEXTn(6); 658 return; 659 } 660 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 661 EMIT(OEOW, 0); 662 NEXTn(6); 663 return; 664 } 665 666 if (EAT('^')) 667 invert++; /* make note to invert set at end */ 668 if (EAT(']')) 669 CHadd(cs, ']'); 670 else if (EAT('-')) 671 CHadd(cs, '-'); 672 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 673 p_b_term(p, cs); 674 if (EAT('-')) 675 CHadd(cs, '-'); 676 MUSTEAT(']', REG_EBRACK); 677 678 if (p->error != 0) /* don't mess things up further */ 679 return; 680 681 if (p->g->cflags®_ICASE) { 682 register int i; 683 register int ci; 684 685 for (i = p->g->csetsize - 1; i >= 0; i--) 686 if (CHIN(cs, i) && isalpha(i)) { 687 ci = othercase(i); 688 if (ci != i) 689 CHadd(cs, ci); 690 } 691 if (cs->multis != NULL) 692 mccase(p, cs); 693 } 694 if (invert) { 695 register int i; 696 697 for (i = p->g->csetsize - 1; i >= 0; i--) 698 if (CHIN(cs, i)) 699 CHsub(cs, i); 700 else 701 CHadd(cs, i); 702 if (p->g->cflags®_NEWLINE) 703 CHsub(cs, '\n'); 704 if (cs->multis != NULL) 705 mcinvert(p, cs); 706 } 707 708 assert(cs->multis == NULL); /* xxx */ 709 710 if (nch(p, cs) == 1) { /* optimize singleton sets */ 711 ordinary(p, firstch(p, cs)); 712 freeset(p, cs); 713 } else 714 EMIT(OANYOF, freezeset(p, cs)); 715 } 716 717 /* 718 - p_b_term - parse one term of a bracketed character list 719 == static void p_b_term(register struct parse *p, register cset *cs); 720 */ 721 static void 722 p_b_term(p, cs) 723 register struct parse *p; 724 register cset *cs; 725 { 726 register char c; 727 register char start, finish; 728 register int i; 729 730 /* classify what we've got */ 731 switch ((MORE()) ? PEEK() : '\0') { 732 case '[': 733 c = (MORE2()) ? PEEK2() : '\0'; 734 break; 735 case '-': 736 SETERROR(REG_ERANGE); 737 return; /* NOTE RETURN */ 738 break; 739 default: 740 c = '\0'; 741 break; 742 } 743 744 switch (c) { 745 case ':': /* character class */ 746 NEXT2(); 747 REQUIRE(MORE(), REG_EBRACK); 748 c = PEEK(); 749 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 750 p_b_cclass(p, cs); 751 REQUIRE(MORE(), REG_EBRACK); 752 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 753 break; 754 case '=': /* equivalence class */ 755 NEXT2(); 756 REQUIRE(MORE(), REG_EBRACK); 757 c = PEEK(); 758 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 759 p_b_eclass(p, cs); 760 REQUIRE(MORE(), REG_EBRACK); 761 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 762 break; 763 default: /* symbol, ordinary character, or range */ 764 /* xxx revision needed for multichar stuff */ 765 start = p_b_symbol(p); 766 if (SEE('-') && MORE2() && PEEK2() != ']') { 767 /* range */ 768 NEXT(); 769 if (EAT('-')) 770 finish = '-'; 771 else 772 finish = p_b_symbol(p); 773 } else 774 finish = start; 775 /* xxx what about signed chars here... */ 776 REQUIRE(start <= finish, REG_ERANGE); 777 for (i = start; i <= finish; i++) 778 CHadd(cs, i); 779 break; 780 } 781 } 782 783 /* 784 - p_b_cclass - parse a character-class name and deal with it 785 == static void p_b_cclass(register struct parse *p, register cset *cs); 786 */ 787 static void 788 p_b_cclass(p, cs) 789 register struct parse *p; 790 register cset *cs; 791 { 792 register char *sp = p->next; 793 register struct cclass *cp; 794 register size_t len; 795 register char *u; 796 register char c; 797 798 while (MORE() && isalpha(PEEK())) 799 NEXT(); 800 len = p->next - sp; 801 for (cp = cclasses; cp->name != NULL; cp++) 802 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 803 break; 804 if (cp->name == NULL) { 805 /* oops, didn't find it */ 806 SETERROR(REG_ECTYPE); 807 return; 808 } 809 810 u = cp->chars; 811 while ((c = *u++) != '\0') 812 CHadd(cs, c); 813 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 814 MCadd(p, cs, u); 815 } 816 817 /* 818 - p_b_eclass - parse an equivalence-class name and deal with it 819 == static void p_b_eclass(register struct parse *p, register cset *cs); 820 * 821 * This implementation is incomplete. xxx 822 */ 823 static void 824 p_b_eclass(p, cs) 825 register struct parse *p; 826 register cset *cs; 827 { 828 register char c; 829 830 c = p_b_coll_elem(p, '='); 831 CHadd(cs, c); 832 } 833 834 /* 835 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 836 == static char p_b_symbol(register struct parse *p); 837 */ 838 static char /* value of symbol */ 839 p_b_symbol(p) 840 register struct parse *p; 841 { 842 register char value; 843 844 REQUIRE(MORE(), REG_EBRACK); 845 if (!EATTWO('[', '.')) 846 return(GETNEXT()); 847 848 /* collating symbol */ 849 value = p_b_coll_elem(p, '.'); 850 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 851 return(value); 852 } 853 854 /* 855 - p_b_coll_elem - parse a collating-element name and look it up 856 == static char p_b_coll_elem(register struct parse *p, int endc); 857 */ 858 static char /* value of collating element */ 859 p_b_coll_elem(p, endc) 860 register struct parse *p; 861 int endc; /* name ended by endc,']' */ 862 { 863 register char *sp = p->next; 864 register struct cname *cp; 865 register int len; 866 register char c; 867 868 while (MORE() && !SEETWO(endc, ']')) 869 NEXT(); 870 if (!MORE()) { 871 SETERROR(REG_EBRACK); 872 return(0); 873 } 874 len = p->next - sp; 875 for (cp = cnames; cp->name != NULL; cp++) 876 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 877 return(cp->code); /* known name */ 878 if (len == 1) 879 return(*sp); /* single character */ 880 SETERROR(REG_ECOLLATE); /* neither */ 881 return(0); 882 } 883 884 /* 885 - othercase - return the case counterpart of an alphabetic 886 == static char othercase(int ch); 887 */ 888 static char /* if no counterpart, return ch */ 889 othercase(ch) 890 int ch; 891 { 892 assert(isalpha(ch)); 893 if (isupper(ch)) 894 return(tolower(ch)); 895 else if (islower(ch)) 896 return(toupper(ch)); 897 else /* peculiar, but could happen */ 898 return(ch); 899 } 900 901 /* 902 - bothcases - emit a dualcase version of a two-case character 903 == static void bothcases(register struct parse *p, int ch); 904 * 905 * Boy, is this implementation ever a kludge... 906 */ 907 static void 908 bothcases(p, ch) 909 register struct parse *p; 910 int ch; 911 { 912 register char *oldnext = p->next; 913 register char *oldend = p->end; 914 char bracket[3]; 915 916 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 917 p->next = bracket; 918 p->end = bracket+2; 919 bracket[0] = ch; 920 bracket[1] = ']'; 921 bracket[2] = '\0'; 922 p_bracket(p); 923 assert(p->next == bracket+2); 924 p->next = oldnext; 925 p->end = oldend; 926 } 927 928 /* 929 - ordinary - emit an ordinary character 930 == static void ordinary(register struct parse *p, register int ch); 931 */ 932 static void 933 ordinary(p, ch) 934 register struct parse *p; 935 register int ch; 936 { 937 register cat_t *cap = p->g->categories; 938 939 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 940 bothcases(p, ch); 941 else { 942 EMIT(OCHAR, (unsigned char)ch); 943 if (cap[ch] == 0) 944 cap[ch] = p->g->ncategories++; 945 } 946 } 947 948 /* 949 - nonnewline - emit REG_NEWLINE version of OANY 950 == static void nonnewline(register struct parse *p); 951 * 952 * Boy, is this implementation ever a kludge... 953 */ 954 static void 955 nonnewline(p) 956 register struct parse *p; 957 { 958 register char *oldnext = p->next; 959 register char *oldend = p->end; 960 char bracket[4]; 961 962 p->next = bracket; 963 p->end = bracket+3; 964 bracket[0] = '^'; 965 bracket[1] = '\n'; 966 bracket[2] = ']'; 967 bracket[3] = '\0'; 968 p_bracket(p); 969 assert(p->next == bracket+3); 970 p->next = oldnext; 971 p->end = oldend; 972 } 973 974 /* 975 - repeat - generate code for a bounded repetition, recursively if needed 976 == static void repeat(register struct parse *p, sopno start, int from, int to); 977 */ 978 static void 979 repeat(p, start, from, to) 980 register struct parse *p; 981 sopno start; /* operand from here to end of strip */ 982 int from; /* repeated from this number */ 983 int to; /* to this number of times (maybe INFINITY) */ 984 { 985 register sopno finish = HERE(); 986 # define N 2 987 # define INF 3 988 # define REP(f, t) ((f)*8 + (t)) 989 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 990 register sopno copy; 991 992 if (p->error != 0) /* head off possible runaway recursion */ 993 return; 994 995 assert(from <= to); 996 997 switch (REP(MAP(from), MAP(to))) { 998 case REP(0, 0): /* must be user doing this */ 999 DROP(finish-start); /* drop the operand */ 1000 break; 1001 case REP(0, 1): /* as x{1,1}? */ 1002 case REP(0, N): /* as x{1,n}? */ 1003 case REP(0, INF): /* as x{1,}? */ 1004 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1005 INSERT(OCH_, start); /* offset is wrong... */ 1006 repeat(p, start+1, 1, to); 1007 ASTERN(OOR1, start); 1008 AHEAD(start); /* ... fix it */ 1009 EMIT(OOR2, 0); 1010 AHEAD(THERE()); 1011 ASTERN(O_CH, THERETHERE()); 1012 break; 1013 case REP(1, 1): /* trivial case */ 1014 /* done */ 1015 break; 1016 case REP(1, N): /* as x?x{1,n-1} */ 1017 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1018 INSERT(OCH_, start); 1019 ASTERN(OOR1, start); 1020 AHEAD(start); 1021 EMIT(OOR2, 0); /* offset very wrong... */ 1022 AHEAD(THERE()); /* ...so fix it */ 1023 ASTERN(O_CH, THERETHERE()); 1024 copy = dupl(p, start+1, finish+1); 1025 assert(copy == finish+4); 1026 repeat(p, copy, 1, to-1); 1027 break; 1028 case REP(1, INF): /* as x+ */ 1029 INSERT(OPLUS_, start); 1030 ASTERN(O_PLUS, start); 1031 break; 1032 case REP(N, N): /* as xx{m-1,n-1} */ 1033 copy = dupl(p, start, finish); 1034 repeat(p, copy, from-1, to-1); 1035 break; 1036 case REP(N, INF): /* as xx{n-1,INF} */ 1037 copy = dupl(p, start, finish); 1038 repeat(p, copy, from-1, to); 1039 break; 1040 default: /* "can't happen" */ 1041 SETERROR(REG_ASSERT); /* just in case */ 1042 break; 1043 } 1044 } 1045 1046 /* 1047 - seterr - set an error condition 1048 == static int seterr(register struct parse *p, int e); 1049 */ 1050 static int /* useless but makes type checking happy */ 1051 seterr(p, e) 1052 register struct parse *p; 1053 int e; 1054 { 1055 if (p->error == 0) /* keep earliest error condition */ 1056 p->error = e; 1057 p->next = nuls; /* try to bring things to a halt */ 1058 p->end = nuls; 1059 return(0); /* make the return value well-defined */ 1060 } 1061 1062 /* 1063 - allocset - allocate a set of characters for [] 1064 == static cset *allocset(register struct parse *p); 1065 */ 1066 static cset * 1067 allocset(p) 1068 register struct parse *p; 1069 { 1070 register int no = p->g->ncsets++; 1071 register size_t nc; 1072 register size_t nbytes; 1073 register cset *cs; 1074 register size_t css = (size_t)p->g->csetsize; 1075 register int i; 1076 1077 if (no >= p->ncsalloc) { /* need another column of space */ 1078 p->ncsalloc += CHAR_BIT; 1079 nc = p->ncsalloc; 1080 assert(nc % CHAR_BIT == 0); 1081 nbytes = nc / CHAR_BIT * css; 1082 if (p->g->sets == NULL) 1083 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1084 else 1085 p->g->sets = (cset *)realloc((char *)p->g->sets, 1086 nc * sizeof(cset)); 1087 if (p->g->setbits == NULL) 1088 p->g->setbits = (uch *)malloc(nbytes); 1089 else { 1090 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1091 nbytes); 1092 /* xxx this isn't right if setbits is now NULL */ 1093 for (i = 0; i < no; i++) 1094 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1095 } 1096 if (p->g->sets != NULL && p->g->setbits != NULL) 1097 (void) memset((char *)p->g->setbits + (nbytes - css), 1098 0, css); 1099 else { 1100 no = 0; 1101 SETERROR(REG_ESPACE); 1102 /* caller's responsibility not to do set ops */ 1103 } 1104 } 1105 1106 assert(p->g->sets != NULL); /* xxx */ 1107 cs = &p->g->sets[no]; 1108 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1109 cs->mask = 1 << ((no) % CHAR_BIT); 1110 cs->hash = 0; 1111 cs->smultis = 0; 1112 cs->multis = NULL; 1113 1114 return(cs); 1115 } 1116 1117 /* 1118 - freeset - free a now-unused set 1119 == static void freeset(register struct parse *p, register cset *cs); 1120 */ 1121 static void 1122 freeset(p, cs) 1123 register struct parse *p; 1124 register cset *cs; 1125 { 1126 register int i; 1127 register cset *top = &p->g->sets[p->g->ncsets]; 1128 register size_t css = (size_t)p->g->csetsize; 1129 1130 for (i = 0; i < css; i++) 1131 CHsub(cs, i); 1132 if (cs == top-1) /* recover only the easy case */ 1133 p->g->ncsets--; 1134 } 1135 1136 /* 1137 - freezeset - final processing on a set of characters 1138 == static int freezeset(register struct parse *p, register cset *cs); 1139 * 1140 * The main task here is merging identical sets. This is usually a waste 1141 * of time (although the hash code minimizes the overhead), but can win 1142 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1143 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1144 * the same value! 1145 */ 1146 static int /* set number */ 1147 freezeset(p, cs) 1148 register struct parse *p; 1149 register cset *cs; 1150 { 1151 register uch h = cs->hash; 1152 register int i; 1153 register cset *top = &p->g->sets[p->g->ncsets]; 1154 register cset *cs2; 1155 register size_t css = (size_t)p->g->csetsize; 1156 1157 /* look for an earlier one which is the same */ 1158 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1159 if (cs2->hash == h && cs2 != cs) { 1160 /* maybe */ 1161 for (i = 0; i < css; i++) 1162 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1163 break; /* no */ 1164 if (i == css) 1165 break; /* yes */ 1166 } 1167 1168 if (cs2 < top) { /* found one */ 1169 freeset(p, cs); 1170 cs = cs2; 1171 } 1172 1173 return((int)(cs - p->g->sets)); 1174 } 1175 1176 /* 1177 - firstch - return first character in a set (which must have at least one) 1178 == static int firstch(register struct parse *p, register cset *cs); 1179 */ 1180 static int /* character; there is no "none" value */ 1181 firstch(p, cs) 1182 register struct parse *p; 1183 register cset *cs; 1184 { 1185 register int i; 1186 register size_t css = (size_t)p->g->csetsize; 1187 1188 for (i = 0; i < css; i++) 1189 if (CHIN(cs, i)) 1190 return((char)i); 1191 assert(never); 1192 return(0); /* arbitrary */ 1193 } 1194 1195 /* 1196 - nch - number of characters in a set 1197 == static int nch(register struct parse *p, register cset *cs); 1198 */ 1199 static int 1200 nch(p, cs) 1201 register struct parse *p; 1202 register cset *cs; 1203 { 1204 register int i; 1205 register size_t css = (size_t)p->g->csetsize; 1206 register int n = 0; 1207 1208 for (i = 0; i < css; i++) 1209 if (CHIN(cs, i)) 1210 n++; 1211 return(n); 1212 } 1213 1214 /* 1215 - mcadd - add a collating element to a cset 1216 == static void mcadd(register struct parse *p, register cset *cs, \ 1217 == register char *cp); 1218 */ 1219 static void 1220 mcadd(p, cs, cp) 1221 register struct parse *p; 1222 register cset *cs; 1223 register char *cp; 1224 { 1225 register size_t oldend = cs->smultis; 1226 1227 cs->smultis += strlen(cp) + 1; 1228 if (cs->multis == NULL) 1229 cs->multis = malloc(cs->smultis); 1230 else 1231 cs->multis = realloc(cs->multis, cs->smultis); 1232 if (cs->multis == NULL) { 1233 SETERROR(REG_ESPACE); 1234 return; 1235 } 1236 1237 (void) strcpy(cs->multis + oldend - 1, cp); 1238 cs->multis[cs->smultis - 1] = '\0'; 1239 } 1240 1241 /* 1242 - mcsub - subtract a collating element from a cset 1243 == static void mcsub(register cset *cs, register char *cp); 1244 */ 1245 static void 1246 mcsub(cs, cp) 1247 register cset *cs; 1248 register char *cp; 1249 { 1250 register char *fp = mcfind(cs, cp); 1251 register size_t len = strlen(fp); 1252 1253 assert(fp != NULL); 1254 (void) memmove(fp, fp + len + 1, 1255 cs->smultis - (fp + len + 1 - cs->multis)); 1256 cs->smultis -= len; 1257 1258 if (cs->smultis == 0) { 1259 free(cs->multis); 1260 cs->multis = NULL; 1261 return; 1262 } 1263 1264 cs->multis = realloc(cs->multis, cs->smultis); 1265 assert(cs->multis != NULL); 1266 } 1267 1268 /* 1269 - mcin - is a collating element in a cset? 1270 == static int mcin(register cset *cs, register char *cp); 1271 */ 1272 static int 1273 mcin(cs, cp) 1274 register cset *cs; 1275 register char *cp; 1276 { 1277 return(mcfind(cs, cp) != NULL); 1278 } 1279 1280 /* 1281 - mcfind - find a collating element in a cset 1282 == static char *mcfind(register cset *cs, register char *cp); 1283 */ 1284 static char * 1285 mcfind(cs, cp) 1286 register cset *cs; 1287 register char *cp; 1288 { 1289 register char *p; 1290 1291 if (cs->multis == NULL) 1292 return(NULL); 1293 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1294 if (strcmp(cp, p) == 0) 1295 return(p); 1296 return(NULL); 1297 } 1298 1299 /* 1300 - mcinvert - invert the list of collating elements in a cset 1301 == static void mcinvert(register struct parse *p, register cset *cs); 1302 * 1303 * This would have to know the set of possibilities. Implementation 1304 * is deferred. 1305 */ 1306 static void 1307 mcinvert(p, cs) 1308 register struct parse *p; 1309 register cset *cs; 1310 { 1311 assert(cs->multis == NULL); /* xxx */ 1312 } 1313 1314 /* 1315 - mccase - add case counterparts of the list of collating elements in a cset 1316 == static void mccase(register struct parse *p, register cset *cs); 1317 * 1318 * This would have to know the set of possibilities. Implementation 1319 * is deferred. 1320 */ 1321 static void 1322 mccase(p, cs) 1323 register struct parse *p; 1324 register cset *cs; 1325 { 1326 assert(cs->multis == NULL); /* xxx */ 1327 } 1328 1329 /* 1330 - isinsets - is this character in any sets? 1331 == static int isinsets(register struct re_guts *g, int c); 1332 */ 1333 static int /* predicate */ 1334 isinsets(g, c) 1335 register struct re_guts *g; 1336 int c; 1337 { 1338 register uch *col; 1339 register int i; 1340 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1341 register unsigned uc = (unsigned char)c; 1342 1343 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1344 if (col[uc] != 0) 1345 return(1); 1346 return(0); 1347 } 1348 1349 /* 1350 - samesets - are these two characters in exactly the same sets? 1351 == static int samesets(register struct re_guts *g, int c1, int c2); 1352 */ 1353 static int /* predicate */ 1354 samesets(g, c1, c2) 1355 register struct re_guts *g; 1356 int c1; 1357 int c2; 1358 { 1359 register uch *col; 1360 register int i; 1361 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1362 register unsigned uc1 = (unsigned char)c1; 1363 register unsigned uc2 = (unsigned char)c2; 1364 1365 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1366 if (col[uc1] != col[uc2]) 1367 return(0); 1368 return(1); 1369 } 1370 1371 /* 1372 - categorize - sort out character categories 1373 == static void categorize(struct parse *p, register struct re_guts *g); 1374 */ 1375 static void 1376 categorize(p, g) 1377 struct parse *p; 1378 register struct re_guts *g; 1379 { 1380 register cat_t *cats = g->categories; 1381 register int c; 1382 register int c2; 1383 register cat_t cat; 1384 1385 /* avoid making error situations worse */ 1386 if (p->error != 0) 1387 return; 1388 1389 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1390 if (cats[c] == 0 && isinsets(g, c)) { 1391 cat = g->ncategories++; 1392 cats[c] = cat; 1393 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1394 if (cats[c2] == 0 && samesets(g, c, c2)) 1395 cats[c2] = cat; 1396 } 1397 } 1398 1399 /* 1400 - dupl - emit a duplicate of a bunch of sops 1401 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1402 */ 1403 static sopno /* start of duplicate */ 1404 dupl(p, start, finish) 1405 register struct parse *p; 1406 sopno start; /* from here */ 1407 sopno finish; /* to this less one */ 1408 { 1409 register sopno ret = HERE(); 1410 register sopno len = finish - start; 1411 1412 assert(finish >= start); 1413 if (len == 0) 1414 return(ret); 1415 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1416 assert(p->ssize >= p->slen + len); 1417 (void) memcpy((char *)(p->strip + p->slen), 1418 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1419 p->slen += len; 1420 return(ret); 1421 } 1422 1423 /* 1424 - doemit - emit a strip operator 1425 == static void doemit(register struct parse *p, sop op, size_t opnd); 1426 * 1427 * It might seem better to implement this as a macro with a function as 1428 * hard-case backup, but it's just too big and messy unless there are 1429 * some changes to the data structures. Maybe later. 1430 */ 1431 static void 1432 doemit(p, op, opnd) 1433 register struct parse *p; 1434 sop op; 1435 size_t opnd; 1436 { 1437 /* avoid making error situations worse */ 1438 if (p->error != 0) 1439 return; 1440 1441 /* deal with oversize operands ("can't happen", more or less) */ 1442 assert(opnd < 1<<OPSHIFT); 1443 1444 /* deal with undersized strip */ 1445 if (p->slen >= p->ssize) 1446 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1447 assert(p->slen < p->ssize); 1448 1449 /* finally, it's all reduced to the easy case */ 1450 p->strip[p->slen++] = SOP(op, opnd); 1451 } 1452 1453 /* 1454 - doinsert - insert a sop into the strip 1455 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1456 */ 1457 static void 1458 doinsert(p, op, opnd, pos) 1459 register struct parse *p; 1460 sop op; 1461 size_t opnd; 1462 sopno pos; 1463 { 1464 register sopno sn; 1465 register sop s; 1466 register int i; 1467 1468 /* avoid making error situations worse */ 1469 if (p->error != 0) 1470 return; 1471 1472 sn = HERE(); 1473 EMIT(op, opnd); /* do checks, ensure space */ 1474 assert(HERE() == sn+1); 1475 s = p->strip[sn]; 1476 1477 /* adjust paren pointers */ 1478 assert(pos > 0); 1479 for (i = 1; i < NPAREN; i++) { 1480 if (p->pbegin[i] >= pos) { 1481 p->pbegin[i]++; 1482 } 1483 if (p->pend[i] >= pos) { 1484 p->pend[i]++; 1485 } 1486 } 1487 1488 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1489 (HERE()-pos-1)*sizeof(sop)); 1490 p->strip[pos] = s; 1491 } 1492 1493 /* 1494 - dofwd - complete a forward reference 1495 == static void dofwd(register struct parse *p, sopno pos, sop value); 1496 */ 1497 static void 1498 dofwd(p, pos, value) 1499 register struct parse *p; 1500 register sopno pos; 1501 sop value; 1502 { 1503 /* avoid making error situations worse */ 1504 if (p->error != 0) 1505 return; 1506 1507 assert(value < 1<<OPSHIFT); 1508 p->strip[pos] = OP(p->strip[pos]) | value; 1509 } 1510 1511 /* 1512 - enlarge - enlarge the strip 1513 == static void enlarge(register struct parse *p, sopno size); 1514 */ 1515 static void 1516 enlarge(p, size) 1517 register struct parse *p; 1518 register sopno size; 1519 { 1520 register sop *sp; 1521 1522 if (p->ssize >= size) 1523 return; 1524 1525 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1526 if (sp == NULL) { 1527 SETERROR(REG_ESPACE); 1528 return; 1529 } 1530 p->strip = sp; 1531 p->ssize = size; 1532 } 1533 1534 /* 1535 - stripsnug - compact the strip 1536 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1537 */ 1538 static void 1539 stripsnug(p, g) 1540 register struct parse *p; 1541 register struct re_guts *g; 1542 { 1543 g->nstates = p->slen; 1544 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1545 if (g->strip == NULL) { 1546 SETERROR(REG_ESPACE); 1547 g->strip = p->strip; 1548 } 1549 } 1550 1551 /* 1552 - findmust - fill in must and mlen with longest mandatory literal string 1553 == static void findmust(register struct parse *p, register struct re_guts *g); 1554 * 1555 * This algorithm could do fancy things like analyzing the operands of | 1556 * for common subsequences. Someday. This code is simple and finds most 1557 * of the interesting cases. 1558 * 1559 * Note that must and mlen got initialized during setup. 1560 */ 1561 static void 1562 findmust(p, g) 1563 struct parse *p; 1564 register struct re_guts *g; 1565 { 1566 register sop *scan; 1567 sop *start; 1568 register sop *newstart; 1569 register sopno newlen; 1570 register sop s; 1571 register char *cp; 1572 register sopno i; 1573 1574 /* avoid making error situations worse */ 1575 if (p->error != 0) 1576 return; 1577 1578 /* find the longest OCHAR sequence in strip */ 1579 newlen = 0; 1580 scan = g->strip + 1; 1581 do { 1582 s = *scan++; 1583 switch (OP(s)) { 1584 case OCHAR: /* sequence member */ 1585 if (newlen == 0) /* new sequence */ 1586 newstart = scan - 1; 1587 newlen++; 1588 break; 1589 case OPLUS_: /* things that don't break one */ 1590 case OLPAREN: 1591 case ORPAREN: 1592 break; 1593 case OQUEST_: /* things that must be skipped */ 1594 case OCH_: 1595 scan--; 1596 do { 1597 scan += OPND(s); 1598 s = *scan; 1599 /* assert() interferes w debug printouts */ 1600 if (OP(s) != O_QUEST && OP(s) != O_CH && 1601 OP(s) != OOR2) { 1602 g->iflags |= BAD; 1603 return; 1604 } 1605 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1606 /* fallthrough */ 1607 default: /* things that break a sequence */ 1608 if (newlen > g->mlen) { /* ends one */ 1609 start = newstart; 1610 g->mlen = newlen; 1611 } 1612 newlen = 0; 1613 break; 1614 } 1615 } while (OP(s) != OEND); 1616 1617 if (g->mlen == 0) /* there isn't one */ 1618 return; 1619 1620 /* turn it into a character string */ 1621 g->must = malloc((size_t)g->mlen + 1); 1622 if (g->must == NULL) { /* argh; just forget it */ 1623 g->mlen = 0; 1624 return; 1625 } 1626 cp = g->must; 1627 scan = start; 1628 for (i = g->mlen; i > 0; i--) { 1629 while (OP(s = *scan++) != OCHAR) 1630 continue; 1631 assert(cp < g->must + g->mlen); 1632 *cp++ = (char)OPND(s); 1633 } 1634 assert(cp == g->must + g->mlen); 1635 *cp++ = '\0'; /* just on general principles */ 1636 } 1637 1638 /* 1639 - pluscount - count + nesting 1640 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1641 */ 1642 static sopno /* nesting depth */ 1643 pluscount(p, g) 1644 struct parse *p; 1645 register struct re_guts *g; 1646 { 1647 register sop *scan; 1648 register sop s; 1649 register sopno plusnest = 0; 1650 register sopno maxnest = 0; 1651 1652 if (p->error != 0) 1653 return(0); /* there may not be an OEND */ 1654 1655 scan = g->strip + 1; 1656 do { 1657 s = *scan++; 1658 switch (OP(s)) { 1659 case OPLUS_: 1660 plusnest++; 1661 break; 1662 case O_PLUS: 1663 if (plusnest > maxnest) 1664 maxnest = plusnest; 1665 plusnest--; 1666 break; 1667 } 1668 } while (OP(s) != OEND); 1669 if (plusnest != 0) 1670 g->iflags |= BAD; 1671 return(maxnest); 1672 } 1673