1 /*- 2 * Copyright (c) 1992 Henry Spencer. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 5.4 (Berkeley) 10/01/92 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 5.4 (Berkeley) 10/01/92"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <ctype.h> 22 #include <limits.h> 23 #include <stdlib.h> 24 #include <regex.h> 25 26 #include "utils.h" 27 #include "regex2.h" 28 29 #include "cclass.h" 30 #include "cname.h" 31 32 /* 33 * parse structure, passed up and down to avoid global variables and 34 * other clumsinesses 35 */ 36 struct parse { 37 uchar *next; /* next character in RE */ 38 int error; /* has an error been seen? */ 39 sop *strip; /* malloced strip */ 40 sopno ssize; /* malloced strip size (allocated) */ 41 sopno slen; /* malloced strip length (used) */ 42 int ncsalloc; /* number of csets allocated */ 43 struct re_guts *g; 44 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 45 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 46 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 47 }; 48 49 static uchar nuls[10]; /* place to point scanner in event of error */ 50 51 /* 52 * macros for use with parse structure 53 * BEWARE: these know that the parse structure is named `p' !!! 54 */ 55 #define PEEK() ((uchar)*p->next) 56 #define PEEK2() ((uchar)*(p->next+1)) 57 #define SEE(c) (PEEK() == (c)) 58 #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 59 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 60 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 61 #define NEXT() (p->next++) 62 #define NEXT2() (p->next += 2) 63 #define NEXTn(n) (p->next += (n)) 64 #define GETNEXT() ((uchar)*p->next++) 65 #define SETERROR(e) seterr(p, (e)) 66 #define REQUIRE(co, e) ((co) || SETERROR(e)) 67 #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 68 #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 69 #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 70 #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 71 #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 72 #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 73 #define BACK(sop, pos) EMIT(sop, HERE()-pos) 74 #define HERE() (p->slen) 75 #define THERE() (p->slen - 1) 76 #define DROP(n) (p->slen -= (n)) 77 78 static cset *allocset __P((struct parse *)); 79 static void bothcases __P((struct parse *, u_int)); 80 static void categorize __P((struct parse *, struct re_guts *)); 81 static void doemit __P((struct parse *, sop, size_t)); 82 static void dofwd __P((struct parse *, sopno, sop)); 83 static void doinsert __P((struct parse *, sop, size_t, sopno)); 84 static sopno dupl __P((struct parse *, sopno, sopno)); 85 static void enlarge __P((struct parse *, sopno)); 86 static void findmust __P((struct parse *, struct re_guts *)); 87 static int freezeset __P((struct parse *, cset *)); 88 static int isinsets __P((struct re_guts *, u_int)); 89 static void mcadd __P((struct parse *, cset *, uchar *)); 90 static uchar *mcfind __P((cset *, u_int *)); 91 static int mcin __P((struct parse *, cset *, u_int *)); 92 static void mcinvert __P((struct parse *, cset *)); 93 static void mcsub __P((struct parse *, cset *, u_int *)); 94 static void nonnewline __P((struct parse *)); 95 static void ordinary __P((struct parse *, u_int)); 96 static uchar othercase __P((u_int)); 97 static void p_b_cclass __P((struct parse *, cset *)); 98 static uchar p_b_coll_elem __P((struct parse *, u_int)); 99 static void p_b_eclass __P((struct parse *, cset *)); 100 static uchar p_b_symbol __P((struct parse *)); 101 static void p_b_term __P((struct parse *, cset *)); 102 static void p_bracket __P((struct parse *)); 103 static void p_bre __P((struct parse *, u_int, u_int)); 104 static int p_count __P((struct parse *)); 105 static void p_ere __P((struct parse *, u_int)); 106 static void p_ere_exp __P((struct parse *)); 107 static int p_simp_re __P((struct parse *, int)); 108 static sopno pluscount __P((struct parse *, struct re_guts *)); 109 static void repeat __P((struct parse *, sopno, int, int)); 110 static int samesets __P((struct re_guts *, u_int, u_int)); 111 static int seterr __P((struct parse *, int)); 112 static void stripsnug __P((struct parse *, struct re_guts *)); 113 114 /* 115 - regcomp - interface for parser and compilation 116 */ 117 int /* 0 success, otherwise REG_something */ 118 regcomp(preg, pattern, cflags) 119 regex_t *preg; 120 const char *pattern; 121 int cflags; 122 { 123 struct parse pa; 124 register struct re_guts *g; 125 register struct parse *p = &pa; 126 register int i; 127 128 /* do the mallocs early so failure handling is easy */ 129 /* the +NUC here is for the category table */ 130 g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 131 if (g == NULL) 132 return(REG_ESPACE); 133 p->ssize = strlen(pattern)/2*3 + 1; 134 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 135 p->slen = 0; 136 if (p->strip == NULL) { 137 free((char *)g); 138 return(REG_ESPACE); 139 } 140 141 /* set things up */ 142 p->g = g; 143 p->next = (uchar *)pattern; 144 p->error = 0; 145 p->ncsalloc = 0; 146 for (i = 0; i < NPAREN; i++) { 147 p->pbegin[i] = 0; 148 p->pend[i] = 0; 149 } 150 g->csetsize = NUC; 151 g->sets = NULL; 152 g->setbits = NULL; 153 g->ncsets = 0; 154 g->cflags = cflags; 155 g->iflags = 0; 156 g->must = NULL; 157 g->mlen = 0; 158 g->nsub = 0; 159 g->ncategories = 1; /* category 0 is "everything else" */ 160 g->categories = (uchar *)g + sizeof(struct re_guts); 161 (void) memset((char *)g->categories, 0, NUC); 162 g->backrefs = 0; 163 g->nplus = 0; 164 165 /* do it */ 166 EMIT(OEND, 0); 167 g->firststate = THERE(); 168 if (cflags®_EXTENDED) 169 p_ere(p, '\0'); 170 else 171 p_bre(p, '\0', '\0'); 172 EMIT(OEND, 0); 173 g->laststate = THERE(); 174 175 /* tidy up loose ends and fill things in */ 176 categorize(p, g); 177 stripsnug(p, g); 178 findmust(p, g); 179 g->nplus = pluscount(p, g); 180 g->magic = MAGIC2; 181 preg->re_nsub = g->nsub; 182 preg->re_g = g; 183 preg->re_magic = MAGIC1; 184 #ifndef REDEBUG 185 /* not debugging, so can't rely on the assert() in regexec() */ 186 if (g->iflags&BAD) 187 SETERROR(REG_ASSERT); 188 #endif 189 190 /* win or lose, we're done */ 191 if (p->error != 0) /* lose */ 192 regfree(preg); 193 return(p->error); 194 } 195 196 /* 197 - p_ere - ERE parser top level, concatenation and alternation 198 */ 199 static void 200 p_ere(p, stop) 201 register struct parse *p; 202 u_int stop; /* character this ERE should end at */ 203 { 204 register uchar c; 205 register sopno prevback; 206 register sopno prevfwd; 207 register sopno conc; 208 register int first = 1; /* is this the first alternative? */ 209 210 for (;;) { 211 /* do a bunch of concatenated expressions */ 212 conc = HERE(); 213 while ((c = PEEK()) != '|' && c != stop && c != '\0') 214 p_ere_exp(p); 215 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 216 217 if (!EAT('|')) 218 break; /* NOTE BREAK OUT */ 219 220 if (first) { 221 INSERT(OCH_, conc); /* offset is wrong */ 222 prevfwd = conc; 223 prevback = conc; 224 first = 0; 225 } 226 BACK(OOR1, prevback); 227 prevback = THERE(); 228 FWD(prevfwd); /* fix previous offset */ 229 prevfwd = HERE(); 230 EMIT(OOR2, 0); /* offset is very wrong */ 231 } 232 233 if (!first) { /* tail-end fixups */ 234 FWD(prevfwd); 235 BACK(O_CH, prevback); 236 } 237 238 assert(SEE(stop) || SEE('\0')); 239 } 240 241 /* 242 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 243 */ 244 static void 245 p_ere_exp(p) 246 register struct parse *p; 247 { 248 register uchar c; 249 register sopno pos; 250 register int count; 251 register int count2; 252 register sopno subno; 253 int wascaret = 0; 254 /* we call { a repetition if followed by a digit */ 255 # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 256 (c1 == '{' && isdigit(c2))) 257 258 c = GETNEXT(); 259 assert(c != '\0'); /* caller should have ensured this */ 260 261 pos = HERE(); 262 switch (c) { 263 case '(': 264 MUSTNOTSEE('\0', REG_EPAREN); 265 p->g->nsub++; 266 subno = p->g->nsub; 267 if (subno < NPAREN) 268 p->pbegin[subno] = HERE(); 269 EMIT(OLPAREN, subno); 270 if (!SEE(')')) 271 p_ere(p, ')'); 272 if (subno < NPAREN) { 273 p->pend[subno] = HERE(); 274 assert(p->pend[subno] != 0); 275 } 276 EMIT(ORPAREN, subno); 277 MUSTEAT(')', REG_EPAREN); 278 break; 279 #ifndef POSIX_MISTAKE 280 case ')': /* happens only if no current unmatched ( */ 281 /* 282 * You may ask, why the ifndef? Because I didn't notice 283 * this until slightly too late for 1003.2, and none of the 284 * other 1003.2 regular-expression reviewers noticed it at 285 * all. So an unmatched ) is legal POSIX, at least until 286 * we can get it fixed. 287 */ 288 SETERROR(REG_EPAREN); 289 break; 290 #endif 291 case '^': 292 EMIT(OBOL, 0); 293 p->g->iflags |= USEBOL; 294 wascaret = 1; 295 break; 296 case '$': 297 EMIT(OEOL, 0); 298 p->g->iflags |= USEEOL; 299 break; 300 case '|': 301 SETERROR(REG_EMPTY); 302 break; 303 case '*': 304 case '+': 305 case '?': 306 SETERROR(REG_BADRPT); 307 break; 308 case '.': 309 if (p->g->cflags®_NEWLINE) 310 nonnewline(p); 311 else 312 EMIT(OANY, 0); 313 break; 314 case '[': 315 p_bracket(p); 316 break; 317 case '\\': 318 c = GETNEXT(); 319 #ifdef xxx 320 if (c == '^' || c == '.' || c == '[' || c == '$' || 321 c == '(' || c == ')' || c == '|' || 322 c == '*' || c == '+' || c == '?' || 323 c == '{' || c == '\\') 324 #else 325 if (c != '\0') 326 #endif 327 ordinary(p, c); 328 else 329 SETERROR(REG_EESCAPE); 330 break; 331 case '{': /* okay as ordinary except if digit follows */ 332 REQUIRE(!isdigit(PEEK()), REG_BADRPT); 333 /* FALLTHROUGH */ 334 default: 335 ordinary(p, c); 336 break; 337 } 338 339 c = PEEK(); 340 if (!ISRPT(c, PEEK2())) 341 return; /* no repetition, we're done */ 342 NEXT(); 343 344 REQUIRE(!wascaret, REG_BADRPT); 345 switch (c) { 346 case '*': /* implemented as +? */ 347 INSERT(OPLUS_, pos); 348 BACK(O_PLUS, pos); 349 INSERT(OQUEST_, pos); 350 BACK(O_QUEST, pos); 351 break; 352 case '+': 353 INSERT(OPLUS_, pos); 354 BACK(O_PLUS, pos); 355 break; 356 case '?': 357 INSERT(OQUEST_, pos); 358 BACK(O_QUEST, pos); 359 break; 360 case '{': 361 count = p_count(p); 362 if (EAT(',')) { 363 if (isdigit(PEEK())) { 364 count2 = p_count(p); 365 REQUIRE(count <= count2, REG_BADBR); 366 } else /* single number with comma */ 367 count2 = INFINITY; 368 } else /* just a single number */ 369 count2 = count; 370 repeat(p, pos, count, count2); 371 if (!EAT('}')) { /* error heuristics */ 372 while ((c = PEEK()) != '\0' && c != '}') 373 NEXT(); 374 if (c == '\0') 375 SETERROR(REG_EBRACE); 376 else 377 SETERROR(REG_BADBR); 378 } 379 break; 380 } 381 382 c = PEEK(); 383 REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 384 } 385 386 /* 387 - p_bre - BRE parser top level, anchoring and concatenation 388 * Giving end1 as '\0' essentially eliminates the end1/end2 check. 389 * 390 * This implementation is a bit of a kludge, in that a trailing $ is first 391 * taken as an ordinary character and then revised to be an anchor. The 392 * only undesirable side effect is that '$' gets included as a character 393 * category in such cases. This is fairly harmless; not worth fixing. 394 * The amount of lookahead needed to avoid this kludge is excessive, 395 * especially since things like "$*" appear to be legal. xxx 396 */ 397 static void 398 p_bre(p, end1, end2) 399 register struct parse *p; 400 register u_int end1; /* first terminating character */ 401 register u_int end2; /* second terminating character */ 402 { 403 register sopno start = HERE(); 404 register int first = 1; /* first subexpression? */ 405 register int wasdollar = 0; 406 407 if (EAT('^')) { 408 EMIT(OBOL, 0); 409 p->g->iflags |= USEBOL; 410 } 411 while (!SEE('\0') && !SEETWO(end1, end2)) { 412 wasdollar = p_simp_re(p, first); 413 first = 0; 414 } 415 if (wasdollar) { /* oops, that was a trailing anchor */ 416 DROP(1); 417 EMIT(OEOL, 0); 418 p->g->iflags |= USEEOL; 419 } 420 421 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 422 } 423 424 /* 425 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 426 */ 427 static int /* was the simple RE an unbackslashed $? */ 428 p_simp_re(p, starordinary) 429 register struct parse *p; 430 int starordinary; /* is a leading * an ordinary character? */ 431 { 432 register int c; 433 register int count; 434 register int count2; 435 register sopno pos; 436 register int i; 437 register sopno subno; 438 # define BACKSL (1<<CHAR_BIT) 439 440 pos = HERE(); /* repetion op, if any, covers from here */ 441 442 c = GETNEXT(); 443 assert(c != '\0'); /* caller should have ensured this */ 444 if (c == '\\') 445 c = BACKSL | PEEK(); 446 switch (c) { 447 case '.': 448 if (p->g->cflags®_NEWLINE) 449 nonnewline(p); 450 else 451 EMIT(OANY, 0); 452 break; 453 case '[': 454 p_bracket(p); 455 break; 456 case BACKSL|'{': 457 SETERROR(REG_BADRPT); 458 break; 459 case BACKSL|'(': 460 NEXT(); 461 p->g->nsub++; 462 subno = p->g->nsub; 463 if (subno < NPAREN) 464 p->pbegin[subno] = HERE(); 465 EMIT(OLPAREN, subno); 466 /* the SEE here is an error heuristic */ 467 if (!SEE('\0') && !SEETWO('\\', ')')) 468 p_bre(p, '\\', ')'); 469 if (subno < NPAREN) { 470 p->pend[subno] = HERE(); 471 assert(p->pend[subno] != 0); 472 } 473 EMIT(ORPAREN, subno); 474 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 475 break; 476 case BACKSL|')': /* should not get here -- must be user */ 477 case BACKSL|'}': 478 SETERROR(REG_EPAREN); 479 break; 480 case BACKSL|'1': 481 case BACKSL|'2': 482 case BACKSL|'3': 483 case BACKSL|'4': 484 case BACKSL|'5': 485 case BACKSL|'6': 486 case BACKSL|'7': 487 case BACKSL|'8': 488 case BACKSL|'9': 489 i = (c&~BACKSL) - '0'; 490 assert(i < NPAREN); 491 if (p->pend[i] != 0) { 492 assert(i <= p->g->nsub); 493 EMIT(OBACK_, i); 494 assert(p->pbegin[i] != 0); 495 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 496 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 497 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 498 EMIT(O_BACK, i); 499 } else 500 SETERROR(REG_ESUBREG); 501 p->g->backrefs = 1; 502 NEXT(); 503 break; 504 case BACKSL|'\0': 505 SETERROR(REG_EESCAPE); 506 break; 507 case '*': 508 REQUIRE(starordinary, REG_BADRPT); 509 /* FALLTHROUGH */ 510 default: 511 if (c & BACKSL) 512 NEXT(); 513 ordinary(p, (uchar)(c &~ BACKSL)); 514 break; 515 } 516 517 if (EAT('*')) { /* implemented as +? */ 518 INSERT(OPLUS_, pos); 519 BACK(O_PLUS, pos); 520 INSERT(OQUEST_, pos); 521 BACK(O_QUEST, pos); 522 } else if (EATTWO('\\', '{')) { 523 count = p_count(p); 524 if (EAT(',')) { 525 if (isdigit(PEEK())) { 526 count2 = p_count(p); 527 REQUIRE(count <= count2, REG_BADBR); 528 } else /* single number with comma */ 529 count2 = INFINITY; 530 } else /* just a single number */ 531 count2 = count; 532 repeat(p, pos, count, count2); 533 if (!EATTWO('\\', '}')) { /* error heuristics */ 534 while (!SEE('\0') && !SEETWO('\\', '}')) 535 NEXT(); 536 if (SEE('\0')) 537 SETERROR(REG_EBRACE); 538 else 539 SETERROR(REG_BADBR); 540 } 541 } else if (c == '$') /* unbackslashed $ not followed by reptn */ 542 return(1); 543 544 return(0); 545 } 546 547 /* 548 - p_count - parse a repetition count 549 */ 550 static int /* the value */ 551 p_count(p) 552 register struct parse *p; 553 { 554 register int count = 0; 555 register int ndigits = 0; 556 557 while (isdigit(PEEK()) && count <= DUPMAX) { 558 count = count*10 + (GETNEXT() - '0'); 559 ndigits++; 560 } 561 562 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 563 return(count); 564 } 565 566 /* 567 - p_bracket - parse a bracketed character list 568 * 569 * Note a significant property of this code: if the allocset() did SETERROR, 570 * no set operations are done. 571 */ 572 static void 573 p_bracket(p) 574 register struct parse *p; 575 { 576 register uchar c; 577 register cset *cs = allocset(p); 578 register int invert = 0; 579 580 if (EAT('^')) 581 invert++; /* make note to invert set at end */ 582 if (EAT(']')) 583 CHadd(cs, ']'); 584 while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 585 p_b_term(p, cs); 586 if (EAT('-')) 587 CHadd(cs, '-'); 588 MUSTEAT(']', REG_EBRACK); 589 590 if (invert && p->error == 0) { 591 register int i; 592 593 for (i = p->g->csetsize - 1; i >= 0; i--) 594 if (CHIN(cs, i)) 595 CHsub(cs, i); 596 else 597 CHadd(cs, i); 598 if (p->g->cflags®_NEWLINE) 599 CHsub(cs, '\n'); 600 if (cs->multis != NULL) 601 mcinvert(p, cs); 602 } 603 assert(cs->multis == NULL); /* xxx */ 604 EMIT(OANYOF, freezeset(p, cs)); 605 } 606 607 /* 608 - p_b_term - parse one term of a bracketed character list 609 */ 610 static void 611 p_b_term(p, cs) 612 register struct parse *p; 613 register cset *cs; 614 { 615 register uchar c; 616 register uchar start, finish; 617 register int i; 618 619 /* classify what we've got */ 620 switch (PEEK()) { 621 case '[': 622 c = PEEK2(); 623 break; 624 case '-': 625 SETERROR(REG_ERANGE); 626 return; /* NOTE RETURN */ 627 break; 628 default: 629 c = '\0'; 630 break; 631 } 632 633 switch (c) { 634 case ':': /* character class */ 635 NEXT2(); 636 c = PEEK(); 637 REQUIRE(c != '\0', REG_EBRACK); 638 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 639 p_b_cclass(p, cs); 640 MUSTNOTSEE('\0', REG_EBRACK); 641 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 642 break; 643 case '=': /* equivalence class */ 644 NEXT2(); 645 c = PEEK(); 646 REQUIRE(c != '\0', REG_EBRACK); 647 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 648 p_b_eclass(p, cs); 649 MUSTNOTSEE('\0', REG_EBRACK); 650 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 651 break; 652 default: /* symbol, ordinary character, or range */ 653 /* xxx revision needed for multichar stuff */ 654 start = p_b_symbol(p); 655 if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 656 /* range */ 657 NEXT(); 658 if (EAT('-')) 659 finish = '-'; 660 else 661 finish = p_b_symbol(p); 662 } else 663 finish = start; 664 REQUIRE(start <= finish, REG_ERANGE); 665 for (i = start; i <= finish; i++) { 666 CHadd(cs, i); 667 if ((p->g->cflags®_ICASE) && isalpha(i)) { 668 c = othercase((uchar)i); 669 CHadd(cs, c); 670 } 671 } 672 break; 673 } 674 } 675 676 /* 677 - p_b_cclass - parse a character-class name and deal with it 678 */ 679 static void 680 p_b_cclass(p, cs) 681 register struct parse *p; 682 register cset *cs; 683 { 684 register uchar *sb = p->next; 685 register uchar *se = sb; 686 register struct cclass *cp; 687 register int len; 688 register uchar *u; 689 register uchar c; 690 691 while (isalpha(*se)) 692 se++; 693 len = se - sb; 694 NEXTn(len); 695 for (cp = cclasses; cp->name != NULL; cp++) 696 if (strncmp(cp->name, (char *)sb, len) == 0 && 697 cp->name[len] == '\0') 698 break; 699 if (cp->name == NULL) { 700 /* oops, didn't find it */ 701 SETERROR(REG_ECTYPE); 702 return; 703 } 704 705 u = (uchar *)cp->chars; 706 while ((c = *u++) != '\0') 707 CHadd(cs, c); 708 for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 709 MCadd(cs, u); 710 } 711 712 /* 713 - p_b_eclass - parse an equivalence-class name and deal with it 714 * 715 * This implementation is incomplete. xxx 716 */ 717 static void 718 p_b_eclass(p, cs) 719 register struct parse *p; 720 register cset *cs; 721 { 722 register uchar c; 723 724 c = p_b_coll_elem(p, '='); 725 CHadd(cs, c); 726 } 727 728 /* 729 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 730 */ 731 static uchar /* value of symbol */ 732 p_b_symbol(p) 733 register struct parse *p; 734 { 735 register uchar value; 736 737 if (!EATTWO('[', '.')) { 738 MUSTNOTSEE('\0', REG_EBRACK); 739 return(GETNEXT()); 740 } 741 742 /* collating symbol */ 743 MUSTNOTSEE('\0', REG_EBRACK); 744 value = p_b_coll_elem(p, '.'); 745 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 746 return(value); 747 } 748 749 /* 750 - p_b_coll_elem - parse a collating-element name and look it up 751 */ 752 static uchar /* value of collating element */ 753 p_b_coll_elem(p, endc) 754 register struct parse *p; 755 u_int endc; /* name ended by endc,']' */ 756 { 757 register uchar *sp = p->next; 758 register struct cname *cp; 759 register int len; 760 register uchar c; 761 762 while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 763 NEXT(); 764 if (c == '\0') { 765 SETERROR(REG_EBRACK); 766 return(0); 767 } 768 len = p->next - sp; 769 for (cp = cnames; cp->name != NULL; cp++) 770 if (strncmp(cp->name, (char *)sp, len) == 0 && 771 cp->name[len] == '\0') 772 return(cp->code); /* known name */ 773 if (len == 1) 774 return(*sp); /* single character */ 775 SETERROR(REG_ECOLLATE); /* neither */ 776 return(0); 777 } 778 779 /* 780 - othercase - return the case counterpart of an alphabetic 781 */ 782 static uchar 783 othercase(ch) 784 u_int ch; 785 { 786 assert(isalpha(ch)); 787 if (isupper(ch)) 788 return(tolower(ch)); 789 else if (islower(ch)) 790 return(toupper(ch)); 791 else /* peculiar, but could happen */ 792 return(ch); 793 } 794 795 /* 796 - bothcases - emit a dualcase version of a character 797 * 798 * Boy, is this implementation ever a kludge... 799 */ 800 static void 801 bothcases(p, ch) 802 register struct parse *p; 803 u_int ch; 804 { 805 register uchar *oldnext; 806 uchar bracket[3]; 807 808 oldnext = p->next; 809 p->next = bracket; 810 bracket[0] = ch; 811 bracket[1] = ']'; 812 bracket[2] = '\0'; 813 p_bracket(p); 814 assert(p->next == bracket+2); 815 p->next = oldnext; 816 } 817 818 /* 819 - ordinary - emit an ordinary character 820 */ 821 static void 822 ordinary(p, ch) 823 register struct parse *p; 824 register u_int ch; 825 { 826 register uchar *cap = p->g->categories; 827 828 if ((p->g->cflags®_ICASE) && isalpha(ch)) { 829 bothcases(p, ch); 830 return; 831 } 832 833 EMIT(OCHAR, ch); 834 if (cap[ch] == 0) 835 cap[ch] = p->g->ncategories++; 836 } 837 838 /* 839 - nonnewline - emit REG_NEWLINE version of OANY 840 * 841 * Boy, is this implementation ever a kludge... 842 */ 843 static void 844 nonnewline(p) 845 register struct parse *p; 846 { 847 register uchar *oldnext; 848 uchar bracket[4]; 849 850 oldnext = p->next; 851 p->next = bracket; 852 bracket[0] = '^'; 853 bracket[1] = '\n'; 854 bracket[2] = ']'; 855 bracket[3] = '\0'; 856 p_bracket(p); 857 assert(p->next == bracket+3); 858 p->next = oldnext; 859 } 860 861 /* 862 - repeat - generate code for a bounded repetition, recursively if needed 863 */ 864 static void 865 repeat(p, start, from, to) 866 register struct parse *p; 867 sopno start; /* operand from here to end of strip */ 868 int from; /* repeated from this number */ 869 int to; /* to this number of times (maybe INFINITY) */ 870 { 871 register sopno finish = HERE(); 872 # define N 2 873 # define INF 3 874 # define REP(f, t) ((f)*8 + (t)) 875 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 876 register sopno copy; 877 878 if (p->error != 0) /* head off possible runaway recursion */ 879 return; 880 881 assert(from <= to); 882 883 switch (REP(MAP(from), MAP(to))) { 884 case REP(0, 0): /* must be user doing this */ 885 DROP(finish-start); /* drop the operand */ 886 break; 887 case REP(0, 1): /* as x{1,1}? */ 888 case REP(0, N): /* as x{1,n}? */ 889 case REP(0, INF): /* as x{1,}? */ 890 INSERT(OQUEST_, start); /* offset is wrong... */ 891 repeat(p, start+1, 1, to); 892 FWD(start); /* ... fix it */ 893 BACK(O_QUEST, start); 894 break; 895 case REP(1, 1): /* trivial case */ 896 /* done */ 897 break; 898 case REP(1, N): /* as x?x{1,n-1} */ 899 INSERT(OQUEST_, start); 900 BACK(O_QUEST, start); 901 copy = dupl(p, start+1, finish+1); 902 assert(copy == finish+2); 903 repeat(p, copy, 1, to-1); 904 break; 905 case REP(1, INF): /* as x+ */ 906 INSERT(OPLUS_, start); 907 BACK(O_PLUS, start); 908 break; 909 case REP(N, N): /* as xx{m-1,n-1} */ 910 copy = dupl(p, start, finish); 911 repeat(p, copy, from-1, to-1); 912 break; 913 case REP(N, INF): /* as xx{n-1,INF} */ 914 copy = dupl(p, start, finish); 915 repeat(p, copy, from-1, to); 916 break; 917 default: /* "can't happen" */ 918 SETERROR(REG_ASSERT); /* just in case */ 919 break; 920 } 921 } 922 923 /* 924 - seterr - set an error condition 925 */ 926 static int /* useless but makes type checking happy */ 927 seterr(p, e) 928 register struct parse *p; 929 int e; 930 { 931 if (p->error == 0) /* keep earliest error condition */ 932 p->error = e; 933 p->next = nuls; /* try to bring things to a halt */ 934 return(0); /* make the return value well-defined */ 935 } 936 937 /* 938 - allocset - allocate a set of characters for [] 939 */ 940 static cset * 941 allocset(p) 942 register struct parse *p; 943 { 944 register int no = p->g->ncsets++; 945 register size_t nc; 946 register size_t nbytes; 947 register cset *cs; 948 register size_t css = (size_t)p->g->csetsize; 949 950 if (no >= p->ncsalloc) { /* need another column of space */ 951 p->ncsalloc += CHAR_BIT; 952 nc = p->ncsalloc; 953 assert(nc % CHAR_BIT == 0); 954 nbytes = nc / CHAR_BIT * css; 955 if (p->g->sets == NULL) 956 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 957 else 958 p->g->sets = (cset *)realloc((char *)p->g->sets, 959 nc * sizeof(cset)); 960 if (p->g->setbits == NULL) 961 p->g->setbits = (uchar *)malloc(nbytes); 962 else 963 p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 964 nbytes); 965 if (p->g->sets != NULL && p->g->setbits != NULL) 966 (void) memset((char *)p->g->setbits + (nbytes - css), 967 0, css); 968 else { 969 no = 0; 970 SETERROR(REG_ESPACE); 971 /* caller's responsibility not to do set ops */ 972 } 973 } 974 975 assert(p->g->sets != NULL); /* xxx */ 976 cs = &p->g->sets[no]; 977 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 978 cs->mask = 1 << ((no) % CHAR_BIT); 979 cs->hash = 0; 980 cs->smultis = 0; 981 cs->multis = NULL; 982 983 return(cs); 984 } 985 986 /* 987 - freezeset - final processing on a set of characters 988 * 989 * The main task here is merging identical sets. This is usually a waste 990 * of time (although the hash code minimizes the overhead), but can win 991 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 992 * is done using addition rather than xor -- all ASCII [aA] sets xor to 993 * the same value! 994 */ 995 static int /* set number */ 996 freezeset(p, cs) 997 register struct parse *p; 998 register cset *cs; 999 { 1000 register uchar h = cs->hash; 1001 register int i; 1002 register cset *top = &p->g->sets[p->g->ncsets]; 1003 register cset *cs2; 1004 register size_t css = (size_t)p->g->csetsize; 1005 1006 /* look for an earlier one which is the same */ 1007 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1008 if (cs2->hash == h && cs2 != cs) { 1009 /* maybe */ 1010 for (i = 0; i < css; i++) 1011 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1012 break; /* no */ 1013 if (i == css) 1014 break; /* yes */ 1015 } 1016 1017 if (cs2 < top) { /* found one */ 1018 assert(cs == top-1); 1019 p->g->ncsets--; 1020 for (i = 0; i < css; i++) 1021 CHsub(cs, i); 1022 cs = cs2; 1023 } 1024 1025 return((int)(cs - p->g->sets)); 1026 } 1027 1028 /* 1029 - mcadd - add a collating element to a cset 1030 */ 1031 static void 1032 mcadd(p, cs, cp) 1033 register struct parse *p; 1034 register cset *cs; 1035 register uchar *cp; 1036 { 1037 register size_t oldend = cs->smultis; 1038 1039 cs->smultis += strlen((char *)cp) + 1; 1040 if (cs->multis == NULL) 1041 cs->multis = (uchar *)malloc(cs->smultis); 1042 else 1043 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1044 if (cs->multis == NULL) { 1045 SETERROR(REG_ESPACE); 1046 return; 1047 } 1048 1049 (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 1050 cs->multis[cs->smultis - 1] = '\0'; 1051 } 1052 1053 /* 1054 - mcsub - subtract a collating element from a cset 1055 */ 1056 static void 1057 mcsub(p, cs, cp) 1058 register struct parse *p; 1059 register cset *cs; 1060 register u_int *cp; 1061 { 1062 register uchar *fp = mcfind(cs, cp); 1063 register size_t len = strlen((char *)fp); 1064 1065 assert(p != NULL); 1066 (void) memmove((char *)fp, (char *)(fp + len + 1), 1067 cs->smultis - (fp + len + 1 - cs->multis)); 1068 cs->smultis -= len; 1069 1070 if (cs->smultis == 0) { 1071 free((char *)cs->multis); 1072 cs->multis = NULL; 1073 return; 1074 } 1075 1076 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1077 assert(cs->multis != NULL); 1078 } 1079 1080 /* 1081 - mcin - is a collating element in a cset? 1082 */ 1083 static int 1084 mcin(p, cs, cp) 1085 register struct parse *p; 1086 register cset *cs; 1087 register u_int *cp; 1088 { 1089 return(mcfind(cs, cp) != NULL); 1090 } 1091 1092 /* 1093 - mcfind - find a collating element in a cset 1094 */ 1095 static uchar * 1096 mcfind(cs, cp) 1097 register cset *cs; 1098 register u_int *cp; 1099 { 1100 register uchar *p; 1101 1102 if (cs->multis == NULL) 1103 return(NULL); 1104 for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 1105 if (strcmp((char *)cp, (char *)p) == 0) 1106 return(p); 1107 return(NULL); 1108 } 1109 1110 /* 1111 - mcinvert - invert the list of collating elements in a cset 1112 * 1113 * This would have to know the set of possibilities. Implementation 1114 * is deferred. 1115 */ 1116 static void 1117 mcinvert(p, cs) 1118 register struct parse *p; 1119 register cset *cs; 1120 { 1121 assert(cs->multis == NULL); /* xxx */ 1122 } 1123 1124 /* 1125 - isinsets - is this character in any sets? 1126 */ 1127 static int /* predicate */ 1128 isinsets(g, c) 1129 register struct re_guts *g; 1130 u_int c; 1131 { 1132 register uchar *col; 1133 register int i; 1134 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1135 1136 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1137 if (col[c] != 0) 1138 return(1); 1139 return(0); 1140 } 1141 1142 /* 1143 - samesets - are these two characters in exactly the same sets? 1144 */ 1145 static int /* predicate */ 1146 samesets(g, c1, c2) 1147 register struct re_guts *g; 1148 register u_int c1; 1149 register u_int c2; 1150 { 1151 register uchar *col; 1152 register int i; 1153 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1154 1155 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1156 if (col[c1] != col[c2]) 1157 return(0); 1158 return(1); 1159 } 1160 1161 /* 1162 - categorize - sort out character categories 1163 */ 1164 static void 1165 categorize(p, g) 1166 struct parse *p; 1167 register struct re_guts *g; 1168 { 1169 register uchar *cats = g->categories; 1170 register unsigned c; 1171 register unsigned c2; 1172 register uchar cat; 1173 1174 /* avoid making error situations worse */ 1175 if (p->error != 0) 1176 return; 1177 1178 for (c = 0; c < g->csetsize; c++) 1179 if (cats[c] == 0 && isinsets(g, c)) { 1180 cat = g->ncategories++; 1181 cats[c] = cat; 1182 for (c2 = c+1; c2 < g->csetsize; c2++) 1183 if (cats[c2] == 0 && samesets(g, c, c2)) 1184 cats[c2] = cat; 1185 } 1186 } 1187 1188 /* 1189 - dupl - emit a duplicate of a bunch of sops 1190 */ 1191 static sopno /* start of duplicate */ 1192 dupl(p, start, finish) 1193 register struct parse *p; 1194 sopno start; /* from here */ 1195 sopno finish; /* to this less one */ 1196 { 1197 register sopno ret = HERE(); 1198 register sopno len = finish - start; 1199 1200 assert(finish >= start); 1201 if (len == 0) 1202 return(ret); 1203 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1204 assert(p->ssize >= p->slen + len); 1205 (void) memcpy((char *)(p->strip + p->slen), 1206 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1207 p->slen += len; 1208 return(ret); 1209 } 1210 1211 /* 1212 - doemit - emit a strip operator 1213 * 1214 * It might seem better to implement this as a macro with a function as 1215 * hard-case backup, but it's just too big and messy unless there are 1216 * some changes to the data structures. Maybe later. 1217 */ 1218 static void 1219 doemit(p, op, opnd) 1220 register struct parse *p; 1221 sop op; 1222 size_t opnd; 1223 { 1224 /* avoid making error situations worse */ 1225 if (p->error != 0) 1226 return; 1227 1228 /* deal with oversize operands ("can't happen", more or less) */ 1229 assert(opnd < 1<<OPSHIFT); 1230 1231 /* deal with undersized strip */ 1232 if (p->slen >= p->ssize) 1233 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1234 assert(p->slen < p->ssize); 1235 1236 /* finally, it's all reduced to the easy case */ 1237 p->strip[p->slen++] = SOP(op, opnd); 1238 } 1239 1240 /* 1241 - doinsert - insert a sop into the strip 1242 */ 1243 static void 1244 doinsert(p, op, opnd, pos) 1245 register struct parse *p; 1246 sop op; 1247 size_t opnd; 1248 sopno pos; 1249 { 1250 register sopno sn; 1251 register sop s; 1252 register int i; 1253 1254 /* avoid making error situations worse */ 1255 if (p->error != 0) 1256 return; 1257 1258 sn = HERE(); 1259 EMIT(op, opnd); /* do checks, ensure space */ 1260 assert(HERE() == sn+1); 1261 s = p->strip[sn]; 1262 1263 /* adjust paren pointers */ 1264 assert(pos > 0); 1265 for (i = 1; i < NPAREN; i++) { 1266 if (p->pbegin[i] >= pos) { 1267 p->pbegin[i]++; 1268 } 1269 if (p->pend[i] >= pos) { 1270 p->pend[i]++; 1271 } 1272 } 1273 1274 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1275 (HERE()-pos-1)*sizeof(sop)); 1276 p->strip[pos] = s; 1277 } 1278 1279 /* 1280 - dofwd - complete a forward reference 1281 */ 1282 static void 1283 dofwd(p, pos, value) 1284 register struct parse *p; 1285 register sopno pos; 1286 sop value; 1287 { 1288 /* avoid making error situations worse */ 1289 if (p->error != 0) 1290 return; 1291 1292 assert(value < 1<<OPSHIFT); 1293 p->strip[pos] = OP(p->strip[pos]) | value; 1294 } 1295 1296 /* 1297 - enlarge - enlarge the strip 1298 */ 1299 static void 1300 enlarge(p, size) 1301 register struct parse *p; 1302 register sopno size; 1303 { 1304 register sop *sp; 1305 1306 if (p->ssize >= size) 1307 return; 1308 1309 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1310 if (sp == NULL) { 1311 SETERROR(REG_ESPACE); 1312 return; 1313 } 1314 p->strip = sp; 1315 p->ssize = size; 1316 } 1317 1318 /* 1319 - stripsnug - compact the strip 1320 */ 1321 static void 1322 stripsnug(p, g) 1323 register struct parse *p; 1324 register struct re_guts *g; 1325 { 1326 g->nstates = p->slen; 1327 g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 1328 if (g->strip == NULL) { 1329 SETERROR(REG_ESPACE); 1330 g->strip = p->strip; 1331 } 1332 } 1333 1334 /* 1335 - findmust - fill in must and mlen with longest mandatory literal string 1336 * 1337 * This algorithm could do fancy things like analyzing the operands of | 1338 * for common subsequences. Someday. This code is simple and finds most 1339 * of the interesting cases. 1340 * 1341 * Note that must and mlen got initialized during setup. 1342 */ 1343 static void 1344 findmust(p, g) 1345 struct parse *p; 1346 register struct re_guts *g; 1347 { 1348 register sop *scan; 1349 sop *start; 1350 register sop *newstart; 1351 register sopno newlen; 1352 register sop s; 1353 register char *cp; 1354 register sopno i; 1355 1356 /* avoid making error situations worse */ 1357 if (p->error != 0) 1358 return; 1359 1360 /* find the longest OCHAR sequence in strip */ 1361 newlen = 0; 1362 scan = g->strip + 1; 1363 do { 1364 s = *scan++; 1365 switch (OP(s)) { 1366 case OCHAR: /* sequence member */ 1367 if (newlen == 0) /* new sequence */ 1368 newstart = scan - 1; 1369 newlen++; 1370 break; 1371 case OPLUS_: /* things that don't break one */ 1372 case OLPAREN: 1373 case ORPAREN: 1374 break; 1375 case OQUEST_: /* things that must be skipped */ 1376 case OCH_: 1377 scan--; 1378 do { 1379 scan += OPND(s); 1380 s = *scan; 1381 /* assert() interferes w debug printouts */ 1382 if (OP(s) != O_QUEST && OP(s) != O_CH && 1383 OP(s) != OOR2) { 1384 g->iflags |= BAD; 1385 return; 1386 } 1387 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1388 /* fallthrough */ 1389 default: /* things that break a sequence */ 1390 if (newlen > g->mlen) { /* ends one */ 1391 start = newstart; 1392 g->mlen = newlen; 1393 } 1394 newlen = 0; 1395 break; 1396 } 1397 } while (OP(s) != OEND); 1398 1399 if (g->mlen == 0) /* there isn't one */ 1400 return; 1401 1402 /* turn it into a character string */ 1403 g->must = malloc((size_t)g->mlen + 1); 1404 if (g->must == NULL) { /* argh; just forget it */ 1405 g->mlen = 0; 1406 return; 1407 } 1408 cp = g->must; 1409 scan = start; 1410 for (i = g->mlen; i > 0; i--) { 1411 while (OP(s = *scan++) != OCHAR) 1412 continue; 1413 *cp++ = OPND(s); 1414 } 1415 *cp++ = '\0'; /* just on general principles */ 1416 } 1417 1418 /* 1419 - pluscount - count + nesting 1420 */ 1421 static sopno /* nesting depth */ 1422 pluscount(p, g) 1423 struct parse *p; 1424 register struct re_guts *g; 1425 { 1426 register sop *scan; 1427 register sop s; 1428 register sopno plusnest = 0; 1429 register sopno maxnest = 0; 1430 1431 if (p->error != 0) 1432 return(0); /* there may not be an OEND */ 1433 1434 scan = g->strip + 1; 1435 do { 1436 s = *scan++; 1437 switch (OP(s)) { 1438 case OPLUS_: 1439 plusnest++; 1440 break; 1441 case O_PLUS: 1442 if (plusnest > maxnest) 1443 maxnest = plusnest; 1444 plusnest--; 1445 break; 1446 } 1447 } while (OP(s) != OEND); 1448 if (plusnest != 0) 1449 g->iflags |= BAD; 1450 return(maxnest); 1451 } 1452