1 /*- 2 * Copyright (c) 1992 Henry Spencer. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 5.1 (Berkeley) 08/06/92 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 5.1 (Berkeley) 08/06/92"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 20 #include <stdio.h> 21 #include <string.h> 22 #include <ctype.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <assert.h> 26 #include <regex.h> 27 28 #include "utils.h" 29 #include "regex2.h" 30 31 #include "cclass.h" 32 #include "cname.h" 33 34 static uchar nuls[10]; /* place to point scanner in event of error */ 35 36 /* 37 * parse structure, passed up and down to avoid global variables and 38 * other clumsinesses 39 */ 40 struct parse { 41 uchar *next; /* next character in RE */ 42 int error; /* has an error been seen? */ 43 sop *strip; /* malloced strip */ 44 sopno ssize; /* malloced strip size (allocated) */ 45 sopno slen; /* malloced strip length (used) */ 46 int ncsalloc; /* number of csets allocated */ 47 struct re_guts *g; 48 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 49 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 50 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 51 }; 52 53 STATIC void doemit __P((struct parse *, sop, size_t)); 54 STATIC void dofwd __P((struct parse *, sopno, sop)); 55 STATIC void doinsert __P((struct parse *, sop, size_t, sopno)); 56 STATIC sopno dupl __P((struct parse *, sopno, sopno)); 57 STATIC void enlarge __P((struct parse *, sopno)); 58 STATIC void mcadd __P((struct parse *, cset *, uchar *)); 59 STATIC uchar *mcfind __P((cset *, uchar *)); 60 STATIC int mcin __P((struct parse *, cset *, uchar *)); 61 STATIC void mcinvert __P((struct parse *, cset *)); 62 STATIC void mcsub __P((struct parse *, cset *, uchar *)); 63 STATIC int seterr __P((struct parse *, int)); 64 65 /* 66 * macros for use with parse structure 67 * BEWARE: these know that the parse structure is named `p' !!! 68 */ 69 #define PEEK() ((uchar)*p->next) 70 #define PEEK2() ((uchar)*(p->next+1)) 71 #define SEE(c) (PEEK() == (c)) 72 #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 73 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 74 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 75 #define NEXT() (p->next++) 76 #define NEXT2() (p->next += 2) 77 #define NEXTn(n) (p->next += (n)) 78 #define GETNEXT() ((uchar)*p->next++) 79 #define SETERROR(e) seterr(p, (e)) 80 #define REQUIRE(co, e) ((co) || SETERROR(e)) 81 #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 82 #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 83 #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 84 #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 85 #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 86 #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 87 #define BACK(sop, pos) EMIT(sop, HERE()-pos) 88 #define HERE() (p->slen) 89 #define THERE() (p->slen - 1) 90 #define DROP(n) (p->slen -= (n)) 91 92 /* 93 - regcomp - interface for parser and compilation 94 */ 95 int /* 0 success, otherwise REG_something */ 96 regcomp(preg, pattern, cflags) 97 regex_t *preg; 98 const char *pattern; 99 int cflags; 100 { 101 struct parse pa; 102 register struct re_guts *g; 103 register struct parse *p = &pa; 104 sopno nstates; 105 register int i; 106 STATIC void p_ere(); 107 STATIC void p_bre(); 108 STATIC void stripsnug(); 109 STATIC void categorize(); 110 STATIC void findmust(); 111 STATIC sopno pluscount(); 112 113 /* do the mallocs early so failure handling is easy */ 114 /* the +NUC here is for the category table */ 115 g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 116 if (g == NULL) 117 return(REG_ESPACE); 118 p->ssize = strlen(pattern)/2*3 + 1; 119 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 120 p->slen = 0; 121 if (p->strip == NULL) { 122 free((char *)g); 123 return(REG_ESPACE); 124 } 125 126 /* set things up */ 127 p->g = g; 128 p->next = (uchar *)pattern; 129 p->error = 0; 130 p->ncsalloc = 0; 131 for (i = 0; i < NPAREN; i++) { 132 p->pbegin[i] = 0; 133 p->pend[i] = 0; 134 } 135 g->csetsize = NUC; 136 g->sets = NULL; 137 g->setbits = NULL; 138 g->ncsets = 0; 139 g->cflags = cflags; 140 g->iflags = 0; 141 g->must = NULL; 142 g->mlen = 0; 143 g->nsub = 0; 144 g->ncategories = 1; /* category 0 is "everything else" */ 145 g->categories = (uchar *)g + sizeof(struct re_guts); 146 (void) memset((char *)g->categories, 0, NUC); 147 g->backrefs = 0; 148 g->nplus = 0; 149 150 /* do it */ 151 EMIT(OEND, 0); 152 g->firststate = THERE(); 153 if (cflags®_EXTENDED) 154 p_ere(p, '\0'); 155 else 156 p_bre(p, '\0', '\0'); 157 EMIT(OEND, 0); 158 g->laststate = THERE(); 159 160 /* tidy up loose ends and fill things in */ 161 categorize(p, g); 162 stripsnug(p, g); 163 findmust(p, g); 164 g->nplus = pluscount(p, g); 165 g->magic = MAGIC2; 166 preg->re_nsub = g->nsub; 167 preg->re_g = g; 168 preg->re_magic = MAGIC1; 169 #ifdef NDEBUG 170 /* not debugging, so can't rely on the assert() in regexec() */ 171 if (g->iflags&BAD) 172 SETERROR(REG_ASSERT); 173 #endif 174 175 /* win or lose, we're done */ 176 if (p->error != 0) /* lose */ 177 regfree(preg); 178 return(p->error); 179 } 180 181 /* 182 - p_ere - ERE parser top level, concatenation and alternation 183 */ 184 static void 185 p_ere(p, stop) 186 register struct parse *p; 187 uchar stop; /* character this ERE should end at */ 188 { 189 STATIC void p_ere_exp(); 190 register uchar c; 191 register sopno prevback; 192 register sopno prevfwd; 193 register sopno conc; 194 register int first = 1; /* is this the first alternative? */ 195 196 for (;;) { 197 /* do a bunch of concatenated expressions */ 198 conc = HERE(); 199 while ((c = PEEK()) != '|' && c != stop && c != '\0') 200 p_ere_exp(p); 201 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 202 203 if (!EAT('|')) 204 break; /* NOTE BREAK OUT */ 205 206 if (first) { 207 INSERT(OCH_, conc); /* offset is wrong */ 208 prevfwd = conc; 209 prevback = conc; 210 first = 0; 211 } 212 BACK(OOR1, prevback); 213 prevback = THERE(); 214 FWD(prevfwd); /* fix previous offset */ 215 prevfwd = HERE(); 216 EMIT(OOR2, 0); /* offset is very wrong */ 217 } 218 219 if (!first) { /* tail-end fixups */ 220 FWD(prevfwd); 221 BACK(O_CH, prevback); 222 } 223 224 assert(SEE(stop) || SEE('\0')); 225 } 226 227 /* 228 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 229 */ 230 static void 231 p_ere_exp(p) 232 register struct parse *p; 233 { 234 STATIC int p_count(); 235 STATIC void p_bracket(); 236 STATIC void ordinary(); 237 STATIC void nonnewline(); 238 STATIC void repeat(); 239 register uchar c; 240 register sopno pos; 241 register sopno sub; 242 register int count; 243 register int count2; 244 register sopno subno; 245 int wascaret = 0; 246 /* we call { a repetition if followed by a digit */ 247 # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 248 (c1 == '{' && isdigit(c2))) 249 250 c = GETNEXT(); 251 assert(c != '\0'); /* caller should have ensured this */ 252 253 pos = HERE(); 254 switch (c) { 255 case '(': 256 MUSTNOTSEE('\0', REG_EPAREN); 257 p->g->nsub++; 258 subno = p->g->nsub; 259 if (subno < NPAREN) 260 p->pbegin[subno] = HERE(); 261 EMIT(OLPAREN, subno); 262 if (!SEE(')')) 263 p_ere(p, ')'); 264 if (subno < NPAREN) { 265 p->pend[subno] = HERE(); 266 assert(p->pend[subno] != 0); 267 } 268 EMIT(ORPAREN, subno); 269 MUSTEAT(')', REG_EPAREN); 270 break; 271 #ifndef POSIX_MISTAKE 272 case ')': /* happens only if no current unmatched ( */ 273 /* 274 * You may ask, why the ifndef? Because I didn't notice 275 * this until slightly too late for 1003.2, and none of the 276 * other 1003.2 regular-expression reviewers noticed it at 277 * all. So an unmatched ) is legal POSIX, at least until 278 * we can get it fixed. 279 */ 280 SETERROR(REG_EPAREN); 281 break; 282 #endif 283 case '^': 284 EMIT(OBOL, 0); 285 p->g->iflags |= USEBOL; 286 wascaret = 1; 287 break; 288 case '$': 289 EMIT(OEOL, 0); 290 p->g->iflags |= USEEOL; 291 break; 292 case '|': 293 SETERROR(REG_EMPTY); 294 break; 295 case '*': 296 case '+': 297 case '?': 298 SETERROR(REG_BADRPT); 299 break; 300 case '.': 301 if (p->g->cflags®_NEWLINE) 302 nonnewline(p); 303 else 304 EMIT(OANY, 0); 305 break; 306 case '[': 307 p_bracket(p); 308 break; 309 case '\\': 310 c = GETNEXT(); 311 if (c == '^' || c == '.' || c == '[' || c == '$' || 312 c == '(' || c == ')' || c == '|' || 313 c == '*' || c == '+' || c == '?' || 314 c == '{' || c == '\\') 315 ordinary(p, c); 316 else 317 SETERROR(REG_EESCAPE); 318 break; 319 case '{': /* okay as ordinary except if digit follows */ 320 REQUIRE(!isdigit(PEEK()), REG_BADRPT); 321 /* FALLTHROUGH */ 322 default: 323 ordinary(p, c); 324 break; 325 } 326 327 c = PEEK(); 328 if (!ISRPT(c, PEEK2())) 329 return; /* no repetition, we're done */ 330 NEXT(); 331 332 REQUIRE(!wascaret, REG_BADRPT); 333 switch (c) { 334 case '*': /* implemented as +? */ 335 INSERT(OPLUS_, pos); 336 BACK(O_PLUS, pos); 337 INSERT(OQUEST_, pos); 338 BACK(O_QUEST, pos); 339 break; 340 case '+': 341 INSERT(OPLUS_, pos); 342 BACK(O_PLUS, pos); 343 break; 344 case '?': 345 INSERT(OQUEST_, pos); 346 BACK(O_QUEST, pos); 347 break; 348 case '{': 349 count = p_count(p); 350 if (EAT(',')) { 351 if (isdigit(PEEK())) { 352 count2 = p_count(p); 353 REQUIRE(count <= count2, REG_BADBR); 354 } else /* single number with comma */ 355 count2 = INFINITY; 356 } else /* just a single number */ 357 count2 = count; 358 repeat(p, pos, count, count2); 359 if (!EAT('}')) { /* error heuristics */ 360 while ((c = PEEK()) != '\0' && c != '}') 361 NEXT(); 362 if (c == '\0') 363 SETERROR(REG_EBRACE); 364 else 365 SETERROR(REG_BADBR); 366 } 367 break; 368 } 369 370 c = PEEK(); 371 REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 372 } 373 374 /* 375 - p_bre - BRE parser top level, anchoring and concatenation 376 * 377 * Giving end1 as '\0' essentially eliminates the end1/end2 check. 378 */ 379 static void 380 p_bre(p, end1, end2) 381 register struct parse *p; 382 register uchar end1; /* first terminating character */ 383 register uchar end2; /* second terminating character */ 384 { 385 STATIC int p_simp_re(); 386 register sopno start = HERE(); 387 register int first = 1; /* first subexpression? */ 388 register int wasdollar; 389 390 if (EAT('^')) { 391 EMIT(OBOL, 0); 392 p->g->iflags |= USEBOL; 393 } 394 while (!SEE('\0') && !SEETWO(end1, end2)) { 395 wasdollar = p_simp_re(p, first); 396 first = 0; 397 } 398 if (wasdollar) { /* oops, that was a trailing anchor */ 399 DROP(1); 400 EMIT(OEOL, 0); 401 p->g->iflags |= USEEOL; 402 } 403 404 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 405 } 406 407 /* 408 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 409 */ 410 static int /* was the simple RE an unbackslashed $? */ 411 p_simp_re(p, starordinary) 412 register struct parse *p; 413 int starordinary; /* is a leading * an ordinary character? */ 414 { 415 STATIC int p_count(); 416 STATIC void p_bracket(); 417 STATIC void ordinary(); 418 STATIC void nonnewline(); 419 STATIC void repeat(); 420 register int c; 421 register int count; 422 register int count2; 423 register sopno pos; 424 register sopno sub; 425 register int i; 426 register sopno subno; 427 # define BACKSL (1<<CHAR_BIT) 428 429 pos = HERE(); /* repetion op, if any, covers from here */ 430 431 c = GETNEXT(); 432 assert(c != '\0'); /* caller should have ensured this */ 433 if (c == '\\') 434 c = BACKSL | PEEK(); 435 switch (c) { 436 case '.': 437 if (p->g->cflags®_NEWLINE) 438 nonnewline(p); 439 else 440 EMIT(OANY, 0); 441 break; 442 case '[': 443 p_bracket(p); 444 break; 445 case BACKSL|'^': 446 case BACKSL|'.': 447 case BACKSL|'*': 448 case BACKSL|'[': 449 case BACKSL|'$': 450 case BACKSL|'\\': 451 ordinary(p, c&~BACKSL); 452 NEXT(); 453 break; 454 case BACKSL|'{': 455 SETERROR(REG_BADRPT); 456 break; 457 case BACKSL|'(': 458 NEXT(); 459 p->g->nsub++; 460 subno = p->g->nsub; 461 if (subno < NPAREN) 462 p->pbegin[subno] = HERE(); 463 EMIT(OLPAREN, subno); 464 /* the SEE here is an error heuristic */ 465 if (!SEE('\0') && !SEETWO('\\', ')')) 466 p_bre(p, '\\', ')'); 467 if (subno < NPAREN) { 468 p->pend[subno] = HERE(); 469 assert(p->pend[subno] != 0); 470 } 471 EMIT(ORPAREN, subno); 472 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 473 break; 474 case BACKSL|')': /* should not get here -- must be user */ 475 case BACKSL|'}': 476 SETERROR(REG_EPAREN); 477 break; 478 case BACKSL|'1': 479 case BACKSL|'2': 480 case BACKSL|'3': 481 case BACKSL|'4': 482 case BACKSL|'5': 483 case BACKSL|'6': 484 case BACKSL|'7': 485 case BACKSL|'8': 486 case BACKSL|'9': 487 i = (c&~BACKSL) - '0'; 488 assert(i < NPAREN); 489 if (p->pend[i] != 0) { 490 assert(i <= p->g->nsub); 491 EMIT(OBACK_, i); 492 assert(p->pbegin[i] != 0); 493 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 494 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 495 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 496 EMIT(O_BACK, i); 497 } else 498 SETERROR(REG_ESUBREG); 499 p->g->backrefs = 1; 500 NEXT(); 501 break; 502 case '*': 503 REQUIRE(starordinary, REG_BADRPT); 504 /* FALLTHROUGH */ 505 default: 506 if (c & BACKSL) { 507 SETERROR(REG_EESCAPE); 508 c &= ~BACKSL; 509 } 510 ordinary(p, (uchar)c); 511 break; 512 } 513 514 if (EAT('*')) { /* implemented as +? */ 515 INSERT(OPLUS_, pos); 516 BACK(O_PLUS, pos); 517 INSERT(OQUEST_, pos); 518 BACK(O_QUEST, pos); 519 } else if (EATTWO('\\', '{')) { 520 count = p_count(p); 521 if (EAT(',')) { 522 if (isdigit(PEEK())) { 523 count2 = p_count(p); 524 REQUIRE(count <= count2, REG_BADBR); 525 } else /* single number with comma */ 526 count2 = INFINITY; 527 } else /* just a single number */ 528 count2 = count; 529 repeat(p, pos, count, count2); 530 if (!EATTWO('\\', '}')) { /* error heuristics */ 531 while (!SEE('\0') && !SEETWO('\\', '}')) 532 NEXT(); 533 if (SEE('\0')) 534 SETERROR(REG_EBRACE); 535 else 536 SETERROR(REG_BADBR); 537 } 538 } else if (c == '$') /* unbackslashed $ not followed by reptn */ 539 return(1); 540 541 return(0); 542 } 543 544 /* 545 - p_count - parse a repetition count 546 */ 547 static int /* the value */ 548 p_count(p) 549 register struct parse *p; 550 { 551 register int count = 0; 552 register int ndigits = 0; 553 554 while (isdigit(PEEK()) && count <= DUPMAX) { 555 count = count*10 + (GETNEXT() - '0'); 556 ndigits++; 557 } 558 559 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 560 return(count); 561 } 562 563 /* 564 - p_bracket - parse a bracketed character list 565 * 566 * Note a significant property of this code: if the allocset() did SETERROR, 567 * no set operations are done. 568 */ 569 static void 570 p_bracket(p) 571 register struct parse *p; 572 { 573 STATIC void p_b_term(); 574 STATIC cset *allocset(); 575 STATIC int freezeset(); 576 register uchar c; 577 register cset *cs = allocset(p); 578 register int invert = 0; 579 580 if (EAT('^')) 581 invert++; /* make note to invert set at end */ 582 if (EAT(']')) 583 CHadd(cs, ']'); 584 while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 585 p_b_term(p, cs); 586 if (EAT('-')) 587 CHadd(cs, '-'); 588 MUSTEAT(']', REG_EBRACK); 589 590 if (invert && p->error == 0) { 591 register int i; 592 593 for (i = p->g->csetsize - 1; i >= 0; i--) 594 if (CHIN(cs, i)) 595 CHsub(cs, i); 596 else 597 CHadd(cs, i); 598 if (p->g->cflags®_NEWLINE) 599 CHsub(cs, '\n'); 600 if (cs->multis != NULL) 601 mcinvert(p, cs); 602 } 603 assert(cs->multis == NULL); /* xxx */ 604 EMIT(OANYOF, freezeset(p, cs)); 605 } 606 607 /* 608 - p_b_term - parse one term of a bracketed character list 609 */ 610 static void 611 p_b_term(p, cs) 612 register struct parse *p; 613 register cset *cs; 614 { 615 STATIC uchar p_b_symbol(); 616 STATIC void p_b_cclass(); 617 STATIC void p_b_eclass(); 618 STATIC uchar othercase(); 619 register uchar c; 620 register uchar start, finish; 621 register int i; 622 623 /* classify what we've got */ 624 switch (PEEK()) { 625 case '[': 626 c = PEEK2(); 627 break; 628 case '-': 629 SETERROR(REG_ERANGE); 630 return; /* NOTE RETURN */ 631 break; 632 default: 633 c = '\0'; 634 break; 635 } 636 637 switch (c) { 638 case ':': /* character class */ 639 NEXT2(); 640 c = PEEK(); 641 REQUIRE(c != '\0', REG_EBRACK); 642 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 643 p_b_cclass(p, cs); 644 MUSTNOTSEE('\0', REG_EBRACK); 645 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 646 break; 647 case '=': /* equivalence class */ 648 NEXT2(); 649 c = PEEK(); 650 REQUIRE(c != '\0', REG_EBRACK); 651 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 652 p_b_eclass(p, cs); 653 MUSTNOTSEE('\0', REG_EBRACK); 654 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 655 break; 656 default: /* symbol, ordinary character, or range */ 657 /* xxx revision needed for multichar stuff */ 658 start = p_b_symbol(p); 659 if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 660 /* range */ 661 NEXT(); 662 if (EAT('-')) 663 finish = '-'; 664 else 665 finish = p_b_symbol(p); 666 } else 667 finish = start; 668 REQUIRE(start <= finish, REG_ERANGE); 669 for (i = start; i <= finish; i++) { 670 CHadd(cs, i); 671 if ((p->g->cflags®_ICASE) && isalpha(i)) { 672 c = othercase((uchar)i); 673 CHadd(cs, c); 674 } 675 } 676 break; 677 } 678 } 679 680 /* 681 - p_b_cclass - parse a character-class name and deal with it 682 */ 683 static void 684 p_b_cclass(p, cs) 685 register struct parse *p; 686 register cset *cs; 687 { 688 register uchar *sb = p->next; 689 register uchar *se = sb; 690 register struct cclass *cp; 691 register int len; 692 register uchar *u; 693 register uchar c; 694 695 while (isalpha(*se)) 696 se++; 697 len = se - sb; 698 NEXTn(len); 699 for (cp = cclasses; cp->name != NULL; cp++) 700 if (strncmp(cp->name, (char *)sb, len) == 0 && 701 cp->name[len] == '\0') 702 break; 703 if (cp->name == NULL) { 704 /* oops, didn't find it */ 705 SETERROR(REG_ECTYPE); 706 return; 707 } 708 709 u = (uchar *)cp->chars; 710 while ((c = *u++) != '\0') 711 CHadd(cs, c); 712 for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 713 MCadd(cs, u); 714 } 715 716 /* 717 - p_b_eclass - parse an equivalence-class name and deal with it 718 * 719 * This implementation is incomplete. xxx 720 */ 721 static void 722 p_b_eclass(p, cs) 723 register struct parse *p; 724 register cset *cs; 725 { 726 register uchar c; 727 STATIC uchar p_b_coll_elem(); 728 729 c = p_b_coll_elem(p, '='); 730 CHadd(cs, c); 731 } 732 733 /* 734 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 735 */ 736 static uchar /* value of symbol */ 737 p_b_symbol(p) 738 register struct parse *p; 739 { 740 STATIC uchar p_b_coll_elem(); 741 register uchar value; 742 743 if (!EATTWO('[', '.')) { 744 MUSTNOTSEE('\0', REG_EBRACK); 745 return(GETNEXT()); 746 } 747 748 /* collating symbol */ 749 MUSTNOTSEE('\0', REG_EBRACK); 750 value = p_b_coll_elem(p, '.'); 751 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 752 return(value); 753 } 754 755 /* 756 - p_b_coll_elem - parse a collating-element name and look it up 757 */ 758 static uchar /* value of collating element */ 759 p_b_coll_elem(p, endc) 760 register struct parse *p; 761 uchar endc; /* name ended by endc,']' */ 762 { 763 register uchar *sp = p->next; 764 register struct cname *cp; 765 register int len; 766 register uchar c; 767 768 while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 769 NEXT(); 770 if (c == '\0') { 771 SETERROR(REG_EBRACK); 772 return(0); 773 } 774 len = p->next - sp; 775 for (cp = cnames; cp->name != NULL; cp++) 776 if (strncmp(cp->name, (char *)sp, len) == 0 && 777 cp->name[len] == '\0') 778 return(cp->code); /* known name */ 779 if (len == 1) 780 return(*sp); /* single character */ 781 SETERROR(REG_ECOLLATE); /* neither */ 782 return(0); 783 } 784 785 /* 786 - othercase - return the case counterpart of an alphabetic 787 */ 788 static uchar 789 othercase(ch) 790 uchar ch; 791 { 792 assert(isalpha(ch)); 793 if (isupper(ch)) 794 return(tolower(ch)); 795 else if (islower(ch)) 796 return(toupper(ch)); 797 else /* peculiar, but could happen */ 798 return(ch); 799 } 800 801 /* 802 - bothcases - emit a dualcase version of a character 803 * Boy, is this implementation ever a kludge... 804 */ 805 static void 806 bothcases(p, ch) 807 register struct parse *p; 808 uchar ch; 809 { 810 register uchar *oldnext; 811 uchar bracket[3]; 812 813 oldnext = p->next; 814 p->next = bracket; 815 bracket[0] = ch; 816 bracket[1] = ']'; 817 bracket[2] = '\0'; 818 p_bracket(p); 819 assert(p->next == bracket+2); 820 p->next = oldnext; 821 } 822 823 /* 824 - ordinary - emit an ordinary character 825 */ 826 static void 827 ordinary(p, ch) 828 register struct parse *p; 829 register uchar ch; 830 { 831 register uchar *cap = p->g->categories; 832 833 if ((p->g->cflags®_ICASE) && isalpha(ch)) { 834 bothcases(p, ch); 835 return; 836 } 837 838 EMIT(OCHAR, ch); 839 if (cap[ch] == 0) 840 cap[ch] = p->g->ncategories++; 841 } 842 843 /* 844 - nonnewline - emit REG_NEWLINE version of OANY 845 * Boy, is this implementation ever a kludge... 846 */ 847 static void 848 nonnewline(p) 849 register struct parse *p; 850 { 851 register uchar *oldnext; 852 uchar bracket[4]; 853 854 oldnext = p->next; 855 p->next = bracket; 856 bracket[0] = '^'; 857 bracket[1] = '\n'; 858 bracket[2] = ']'; 859 bracket[3] = '\0'; 860 p_bracket(p); 861 assert(p->next == bracket+3); 862 p->next = oldnext; 863 } 864 865 /* 866 - repeat - generate code for a bounded repetition, recursively if needed 867 */ 868 static void 869 repeat(p, start, from, to) 870 register struct parse *p; 871 sopno start; /* operand from here to end of strip */ 872 int from; /* repeated from this number */ 873 int to; /* to this number of times (maybe INFINITY) */ 874 { 875 register sopno finish = HERE(); 876 # define N 2 877 # define INF 3 878 # define REP(f, t) ((f)*8 + (t)) 879 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 880 register sopno copy; 881 882 if (p->error != 0) /* head off possible runaway recursion */ 883 return; 884 885 assert(from <= to); 886 887 switch (REP(MAP(from), MAP(to))) { 888 case REP(0, 0): /* must be user doing this */ 889 DROP(finish-start); /* drop the operand */ 890 break; 891 case REP(0, 1): /* as x{1,1}? */ 892 case REP(0, N): /* as x{1,n}? */ 893 case REP(0, INF): /* as x{1,}? */ 894 INSERT(OQUEST_, start); /* offset is wrong... */ 895 repeat(p, start+1, 1, to); 896 FWD(start); /* ... fix it */ 897 BACK(O_QUEST, start); 898 break; 899 case REP(1, 1): /* trivial case */ 900 /* done */ 901 break; 902 case REP(1, N): /* as x?x{1,n-1} */ 903 INSERT(OQUEST_, start); 904 BACK(O_QUEST, start); 905 copy = dupl(p, start+1, finish+1); 906 assert(copy == finish+2); 907 repeat(p, copy, 1, to-1); 908 break; 909 case REP(1, INF): /* as x+ */ 910 INSERT(OPLUS_, start); 911 BACK(O_PLUS, start); 912 break; 913 case REP(N, N): /* as xx{m-1,n-1} */ 914 copy = dupl(p, start, finish); 915 repeat(p, copy, from-1, to-1); 916 break; 917 case REP(N, INF): /* as xx{n-1,INF} */ 918 copy = dupl(p, start, finish); 919 repeat(p, copy, from-1, to); 920 break; 921 default: /* "can't happen" */ 922 SETERROR(REG_ASSERT); /* just in case */ 923 break; 924 } 925 } 926 927 /* 928 - seterr - set an error condition 929 */ 930 static int /* useless but makes type checking happy */ 931 seterr(p, e) 932 register struct parse *p; 933 int e; 934 { 935 if (p->error == 0) /* keep earliest error condition */ 936 p->error = e; 937 p->next = nuls; /* try to bring things to a halt */ 938 return(0); /* make the return value well-defined */ 939 } 940 941 /* 942 - allocset - allocate a set of characters for [] 943 */ 944 static cset * 945 allocset(p) 946 register struct parse *p; 947 { 948 register int no = p->g->ncsets++; 949 register size_t nc; 950 register size_t nbytes; 951 register cset *cs; 952 register int i; 953 register size_t css = (size_t)p->g->csetsize; 954 955 if (no >= p->ncsalloc) { /* need another column of space */ 956 p->ncsalloc += CHAR_BIT; 957 nc = p->ncsalloc; 958 assert(nc % CHAR_BIT == 0); 959 nbytes = nc / CHAR_BIT * css; 960 if (p->g->sets == NULL) 961 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 962 else 963 p->g->sets = (cset *)realloc((char *)p->g->sets, 964 nc * sizeof(cset)); 965 if (p->g->setbits == NULL) 966 p->g->setbits = (uchar *)malloc(nbytes); 967 else 968 p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 969 nbytes); 970 if (p->g->sets != NULL && p->g->setbits != NULL) 971 (void) memset((char *)p->g->setbits + (nbytes - css), 972 0, css); 973 else { 974 no = 0; 975 SETERROR(REG_ESPACE); 976 /* caller's responsibility not to do set ops */ 977 } 978 } 979 980 assert(p->g->sets != NULL); /* xxx */ 981 cs = &p->g->sets[no]; 982 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 983 cs->mask = 1 << ((no) % CHAR_BIT); 984 cs->hash = 0; 985 cs->smultis = 0; 986 cs->multis = NULL; 987 988 return(cs); 989 } 990 991 /* 992 - freezeset - final processing on a set of characters 993 * 994 * The main task here is merging identical sets. This is usually a waste 995 * of time (although the hash code minimizes the overhead), but can win 996 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 997 * is done using addition rather than xor -- all ASCII [aA] sets xor to 998 * the same value! 999 */ 1000 static int /* set number */ 1001 freezeset(p, cs) 1002 register struct parse *p; 1003 register cset *cs; 1004 { 1005 register uchar h = cs->hash; 1006 register int i; 1007 register cset *top = &p->g->sets[p->g->ncsets]; 1008 register uchar c; 1009 register cset *cs2; 1010 register size_t css = (size_t)p->g->csetsize; 1011 1012 /* look for an earlier one which is the same */ 1013 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1014 if (cs2->hash == h && cs2 != cs) { 1015 /* maybe */ 1016 for (i = 0; i < css; i++) 1017 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1018 break; /* no */ 1019 if (i == css) 1020 break; /* yes */ 1021 } 1022 1023 if (cs2 < top) { /* found one */ 1024 assert(cs == top-1); 1025 p->g->ncsets--; 1026 for (i = 0; i < css; i++) 1027 CHsub(cs, i); 1028 cs = cs2; 1029 } 1030 1031 return((int)(cs - p->g->sets)); 1032 } 1033 1034 /* 1035 - mcadd - add a collating element to a cset 1036 */ 1037 static void 1038 mcadd(p, cs, cp) 1039 register struct parse *p; 1040 register cset *cs; 1041 register uchar *cp; 1042 { 1043 register size_t oldend = cs->smultis; 1044 1045 cs->smultis += strlen((char *)cp) + 1; 1046 if (cs->multis == NULL) 1047 cs->multis = (uchar *)malloc(cs->smultis); 1048 else 1049 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1050 if (cs->multis == NULL) { 1051 SETERROR(REG_ESPACE); 1052 return; 1053 } 1054 1055 (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 1056 cs->multis[cs->smultis - 1] = '\0'; 1057 } 1058 1059 /* 1060 - mcsub - subtract a collating element from a cset 1061 */ 1062 static void 1063 mcsub(p, cs, cp) 1064 register struct parse *p; 1065 register cset *cs; 1066 register uchar *cp; 1067 { 1068 register uchar *fp = mcfind(cs, cp); 1069 register size_t len = strlen((char *)fp); 1070 1071 assert(p != NULL); 1072 (void) memmove((char *)fp, (char *)(fp + len + 1), 1073 cs->smultis - (fp + len + 1 - cs->multis)); 1074 cs->smultis -= len; 1075 1076 if (cs->smultis == 0) { 1077 free((char *)cs->multis); 1078 cs->multis = NULL; 1079 return; 1080 } 1081 1082 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1083 assert(cs->multis != NULL); 1084 } 1085 1086 /* 1087 - mcin - is a collating element in a cset? 1088 */ 1089 static int 1090 mcin(p, cs, cp) 1091 register struct parse *p; 1092 register cset *cs; 1093 register uchar *cp; 1094 { 1095 return(mcfind(cs, cp) != NULL); 1096 } 1097 1098 /* 1099 - mcfind - find a collating element in a cset 1100 */ 1101 static uchar * 1102 mcfind(cs, cp) 1103 register cset *cs; 1104 register uchar *cp; 1105 { 1106 register uchar *p; 1107 1108 if (cs->multis == NULL) 1109 return(NULL); 1110 for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 1111 if (strcmp((char *)cp, (char *)p) == 0) 1112 return(p); 1113 return(NULL); 1114 } 1115 1116 /* 1117 - mcinvert - invert the list of collating elements in a cset 1118 * 1119 * This would have to know the set of possibilities. Implementation 1120 * is deferred. 1121 */ 1122 static void 1123 mcinvert(p, cs) 1124 register struct parse *p; 1125 register cset *cs; 1126 { 1127 assert(cs->multis == NULL); /* xxx */ 1128 } 1129 1130 /* 1131 - isinsets - is this character in any sets? 1132 */ 1133 static int /* predicate */ 1134 isinsets(g, c) 1135 register struct re_guts *g; 1136 uchar c; 1137 { 1138 register uchar *col; 1139 register int i; 1140 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1141 1142 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1143 if (col[c] != 0) 1144 return(1); 1145 return(0); 1146 } 1147 1148 /* 1149 - samesets - are these two characters in exactly the same sets? 1150 */ 1151 static int /* predicate */ 1152 samesets(g, c1, c2) 1153 register struct re_guts *g; 1154 register uchar c1; 1155 register uchar c2; 1156 { 1157 register uchar *col; 1158 register int i; 1159 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1160 1161 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1162 if (col[c1] != col[c2]) 1163 return(0); 1164 return(1); 1165 } 1166 1167 /* 1168 - categorize - sort out character categories 1169 */ 1170 static void 1171 categorize(p, g) 1172 struct parse *p; 1173 register struct re_guts *g; 1174 { 1175 register uchar *cats = g->categories; 1176 register unsigned c; 1177 register unsigned c2; 1178 register uchar cat; 1179 1180 /* avoid making error situations worse */ 1181 if (p->error != 0) 1182 return; 1183 1184 for (c = 0; c < g->csetsize; c++) 1185 if (cats[c] == 0 && isinsets(g, c)) { 1186 cat = g->ncategories++; 1187 cats[c] = cat; 1188 for (c2 = c+1; c2 < g->csetsize; c2++) 1189 if (cats[c2] == 0 && samesets(g, c, c2)) 1190 cats[c2] = cat; 1191 } 1192 } 1193 1194 /* 1195 - dupl - emit a duplicate of a bunch of sops 1196 */ 1197 static sopno /* start of duplicate */ 1198 dupl(p, start, finish) 1199 register struct parse *p; 1200 sopno start; /* from here */ 1201 sopno finish; /* to this less one */ 1202 { 1203 register int i; 1204 register sopno ret = HERE(); 1205 register sopno len = finish - start; 1206 1207 assert(finish >= start); 1208 if (len == 0) 1209 return(ret); 1210 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1211 assert(p->ssize >= p->slen + len); 1212 (void) memcpy((char *)(p->strip + p->slen), 1213 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1214 p->slen += len; 1215 return(ret); 1216 } 1217 1218 /* 1219 - doemit - emit a strip operator 1220 * 1221 * It might seem better to implement this as a macro with a function as 1222 * hard-case backup, but it's just too big and messy unless there are 1223 * some changes to the data structures. Maybe later. 1224 */ 1225 static void 1226 doemit(p, op, opnd) 1227 register struct parse *p; 1228 sop op; 1229 size_t opnd; 1230 { 1231 /* avoid making error situations worse */ 1232 if (p->error != 0) 1233 return; 1234 1235 /* deal with oversize operands ("can't happen", more or less) */ 1236 assert(opnd < 1<<OPSHIFT); 1237 1238 /* deal with undersized strip */ 1239 if (p->slen >= p->ssize) 1240 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1241 assert(p->slen < p->ssize); 1242 1243 /* finally, it's all reduced to the easy case */ 1244 p->strip[p->slen++] = SOP(op, opnd); 1245 } 1246 1247 /* 1248 - doinsert - insert a sop into the strip 1249 */ 1250 static void 1251 doinsert(p, op, opnd, pos) 1252 register struct parse *p; 1253 sop op; 1254 size_t opnd; 1255 sopno pos; 1256 { 1257 register sopno sn; 1258 register sop s; 1259 register int i; 1260 1261 /* avoid making error situations worse */ 1262 if (p->error != 0) 1263 return; 1264 1265 sn = HERE(); 1266 EMIT(op, opnd); /* do checks, ensure space */ 1267 assert(HERE() == sn+1); 1268 s = p->strip[sn]; 1269 1270 /* adjust paren pointers */ 1271 assert(pos > 0); 1272 for (i = 1; i < NPAREN; i++) { 1273 if (p->pbegin[i] >= pos) { 1274 p->pbegin[i]++; 1275 } 1276 if (p->pend[i] >= pos) { 1277 p->pend[i]++; 1278 } 1279 } 1280 1281 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1282 (HERE()-pos-1)*sizeof(sop)); 1283 p->strip[pos] = s; 1284 } 1285 1286 /* 1287 - dofwd - complete a forward reference 1288 */ 1289 static void 1290 dofwd(p, pos, value) 1291 register struct parse *p; 1292 register sopno pos; 1293 sop value; 1294 { 1295 /* avoid making error situations worse */ 1296 if (p->error != 0) 1297 return; 1298 1299 assert(value < 1<<OPSHIFT); 1300 p->strip[pos] = OP(p->strip[pos]) | value; 1301 } 1302 1303 /* 1304 - enlarge - enlarge the strip 1305 */ 1306 static void 1307 enlarge(p, size) 1308 register struct parse *p; 1309 register sopno size; 1310 { 1311 register sop *sp; 1312 1313 if (p->ssize >= size) 1314 return; 1315 1316 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1317 if (sp == NULL) { 1318 SETERROR(REG_ESPACE); 1319 return; 1320 } 1321 p->strip = sp; 1322 p->ssize = size; 1323 } 1324 1325 /* 1326 - stripsnug - compact the strip 1327 */ 1328 static void 1329 stripsnug(p, g) 1330 register struct parse *p; 1331 register struct re_guts *g; 1332 { 1333 g->nstates = p->slen; 1334 g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 1335 if (g->strip == NULL) { 1336 SETERROR(REG_ESPACE); 1337 g->strip = p->strip; 1338 } 1339 } 1340 1341 /* 1342 - findmust - fill in must and mlen with longest mandatory literal string 1343 * 1344 * This algorithm could do fancy things like analyzing the operands of | 1345 * for common subsequences. Someday. This code is simple and finds most 1346 * of the interesting cases. 1347 * 1348 * Note that must and mlen got initialized during setup. 1349 */ 1350 STATIC void 1351 findmust(p, g) 1352 struct parse *p; 1353 register struct re_guts *g; 1354 { 1355 register sop *scan; 1356 sop *start; 1357 register sop *newstart; 1358 register sopno newlen; 1359 register sop s; 1360 register char *cp; 1361 register sopno i; 1362 1363 /* avoid making error situations worse */ 1364 if (p->error != 0) 1365 return; 1366 1367 /* find the longest OCHAR sequence in strip */ 1368 newlen = 0; 1369 scan = g->strip + 1; 1370 do { 1371 s = *scan++; 1372 switch (OP(s)) { 1373 case OCHAR: /* sequence member */ 1374 if (newlen == 0) /* new sequence */ 1375 newstart = scan - 1; 1376 newlen++; 1377 break; 1378 case OPLUS_: /* things that don't break one */ 1379 case OLPAREN: 1380 case ORPAREN: 1381 break; 1382 case OQUEST_: /* things that must be skipped */ 1383 case OCH_: 1384 scan--; 1385 do { 1386 scan += OPND(s); 1387 s = *scan; 1388 /* assert() interferes w debug printouts */ 1389 if (OP(s) != O_QUEST && OP(s) != O_CH && 1390 OP(s) != OOR2) { 1391 g->iflags |= BAD; 1392 return; 1393 } 1394 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1395 /* fallthrough */ 1396 default: /* things that break a sequence */ 1397 if (newlen > g->mlen) { /* ends one */ 1398 start = newstart; 1399 g->mlen = newlen; 1400 } 1401 newlen = 0; 1402 break; 1403 } 1404 } while (OP(s) != OEND); 1405 1406 if (g->mlen == 0) /* there isn't one */ 1407 return; 1408 1409 /* turn it into a character string */ 1410 g->must = malloc((size_t)g->mlen + 1); 1411 if (g->must == NULL) { /* argh; just forget it */ 1412 g->mlen = 0; 1413 return; 1414 } 1415 cp = g->must; 1416 scan = start; 1417 for (i = g->mlen; i > 0; i--) { 1418 while (OP(s = *scan++) != OCHAR) 1419 continue; 1420 *cp++ = OPND(s); 1421 } 1422 *cp++ = '\0'; /* just on general principles */ 1423 } 1424 1425 /* 1426 - pluscount - count + nesting 1427 */ 1428 STATIC sopno /* nesting depth */ 1429 pluscount(p, g) 1430 struct parse *p; 1431 register struct re_guts *g; 1432 { 1433 register sop *scan; 1434 register sop s; 1435 register sopno plusnest = 0; 1436 register sopno maxnest = 0; 1437 1438 if (p->error != 0) 1439 return(0); /* there may not be an OEND */ 1440 1441 scan = g->strip + 1; 1442 do { 1443 s = *scan++; 1444 switch (OP(s)) { 1445 case OPLUS_: 1446 plusnest++; 1447 break; 1448 case O_PLUS: 1449 if (plusnest > maxnest) 1450 maxnest = plusnest; 1451 plusnest--; 1452 break; 1453 } 1454 } while (OP(s) != OEND); 1455 if (plusnest != 0) 1456 g->iflags |= BAD; 1457 return(maxnest); 1458 } 1459