1 /*- 2 * Copyright (c) 1992 Henry Spencer. 3 * Copyright (c) 1992 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer of the University of Toronto. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)regcomp.c 5.5 (Berkeley) 10/23/92 12 */ 13 14 #if defined(LIBC_SCCS) && !defined(lint) 15 static char sccsid[] = "@(#)regcomp.c 5.5 (Berkeley) 10/23/92"; 16 #endif /* LIBC_SCCS and not lint */ 17 18 #include <sys/types.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <ctype.h> 22 #include <limits.h> 23 #include <stdlib.h> 24 #include <regex.h> 25 26 #include "utils.h" 27 #include "regex2.h" 28 29 #include "cclass.h" 30 #include "cname.h" 31 32 /* 33 * parse structure, passed up and down to avoid global variables and 34 * other clumsinesses 35 */ 36 struct parse { 37 uchar *next; /* next character in RE */ 38 int error; /* has an error been seen? */ 39 sop *strip; /* malloced strip */ 40 sopno ssize; /* malloced strip size (allocated) */ 41 sopno slen; /* malloced strip length (used) */ 42 int ncsalloc; /* number of csets allocated */ 43 struct re_guts *g; 44 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 45 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 46 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 47 }; 48 49 static uchar nuls[10]; /* place to point scanner in event of error */ 50 51 /* 52 * macros for use with parse structure 53 * BEWARE: these know that the parse structure is named `p' !!! 54 */ 55 #define PEEK() ((uchar)*p->next) 56 #define PEEK2() ((uchar)*(p->next+1)) 57 #define SEE(c) (PEEK() == (c)) 58 #define SEETWO(a, b) (PEEK() == (a) && PEEK2() == (b)) 59 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 60 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 61 #define NEXT() (p->next++) 62 #define NEXT2() (p->next += 2) 63 #define NEXTn(n) (p->next += (n)) 64 #define GETNEXT() ((uchar)*p->next++) 65 #define SETERROR(e) seterr(p, (e)) 66 #define REQUIRE(co, e) ((co) || SETERROR(e)) 67 #define MUSTSEE(c, e) (REQUIRE(PEEK() == (c), e)) 68 #define MUSTEAT(c, e) (REQUIRE(GETNEXT() == (c), e)) 69 #define MUSTNOTSEE(c, e) (REQUIRE(PEEK() != (c), e)) 70 #define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd)) 71 #define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos) 72 #define FWD(pos) dofwd(p, pos, HERE()-(pos)) 73 #define BACK(sop, pos) EMIT(sop, HERE()-pos) 74 #define HERE() (p->slen) 75 #define THERE() (p->slen - 1) 76 #define DROP(n) (p->slen -= (n)) 77 78 static cset *allocset __P((struct parse *)); 79 static void bothcases __P((struct parse *, u_int)); 80 static void categorize __P((struct parse *, struct re_guts *)); 81 static void doemit __P((struct parse *, sop, size_t)); 82 static void dofwd __P((struct parse *, sopno, sop)); 83 static void doinsert __P((struct parse *, sop, size_t, sopno)); 84 static sopno dupl __P((struct parse *, sopno, sopno)); 85 static void enlarge __P((struct parse *, sopno)); 86 static void findmust __P((struct parse *, struct re_guts *)); 87 static int freezeset __P((struct parse *, cset *)); 88 static int isinsets __P((struct re_guts *, u_int)); 89 static void mcadd __P((struct parse *, cset *, uchar *)); 90 static uchar *mcfind __P((cset *, u_int *)); 91 static int mcin __P((struct parse *, cset *, u_int *)); 92 static void mcinvert __P((struct parse *, cset *)); 93 static void mcsub __P((struct parse *, cset *, u_int *)); 94 static void nonnewline __P((struct parse *)); 95 static void ordinary __P((struct parse *, u_int)); 96 static uchar othercase __P((u_int)); 97 static void p_b_cclass __P((struct parse *, cset *)); 98 static uchar p_b_coll_elem __P((struct parse *, u_int)); 99 static void p_b_eclass __P((struct parse *, cset *)); 100 static uchar p_b_symbol __P((struct parse *)); 101 static void p_b_term __P((struct parse *, cset *)); 102 static void p_bracket __P((struct parse *)); 103 static void p_bre __P((struct parse *, u_int, u_int)); 104 static int p_count __P((struct parse *)); 105 static void p_ere __P((struct parse *, u_int)); 106 static void p_ere_exp __P((struct parse *)); 107 static int p_simp_re __P((struct parse *, int)); 108 static sopno pluscount __P((struct parse *, struct re_guts *)); 109 static void repeat __P((struct parse *, sopno, int, int)); 110 static int samesets __P((struct re_guts *, u_int, u_int)); 111 static int seterr __P((struct parse *, int)); 112 static void stripsnug __P((struct parse *, struct re_guts *)); 113 114 /* 115 - regcomp - interface for parser and compilation 116 */ 117 int /* 0 success, otherwise REG_something */ 118 regcomp(preg, pattern, cflags) 119 regex_t *preg; 120 const char *pattern; 121 int cflags; 122 { 123 struct parse pa; 124 register struct re_guts *g; 125 register struct parse *p = &pa; 126 register int i; 127 128 /* do the mallocs early so failure handling is easy */ 129 /* the +NUC here is for the category table */ 130 g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC); 131 if (g == NULL) 132 return(REG_ESPACE); 133 p->ssize = strlen(pattern)/2*3 + 1; 134 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 135 p->slen = 0; 136 if (p->strip == NULL) { 137 free((char *)g); 138 return(REG_ESPACE); 139 } 140 141 /* set things up */ 142 p->g = g; 143 p->next = (uchar *)pattern; 144 p->error = 0; 145 p->ncsalloc = 0; 146 for (i = 0; i < NPAREN; i++) { 147 p->pbegin[i] = 0; 148 p->pend[i] = 0; 149 } 150 g->csetsize = NUC; 151 g->sets = NULL; 152 g->setbits = NULL; 153 g->ncsets = 0; 154 g->cflags = cflags; 155 g->iflags = 0; 156 g->must = NULL; 157 g->mlen = 0; 158 g->nsub = 0; 159 g->ncategories = 1; /* category 0 is "everything else" */ 160 g->categories = (uchar *)g + sizeof(struct re_guts); 161 (void) memset((char *)g->categories, 0, NUC); 162 g->backrefs = 0; 163 g->nplus = 0; 164 165 /* do it */ 166 EMIT(OEND, 0); 167 g->firststate = THERE(); 168 if (cflags®_EXTENDED) 169 p_ere(p, '\0'); 170 else 171 p_bre(p, '\0', '\0'); 172 EMIT(OEND, 0); 173 g->laststate = THERE(); 174 175 /* tidy up loose ends and fill things in */ 176 categorize(p, g); 177 stripsnug(p, g); 178 findmust(p, g); 179 g->nplus = pluscount(p, g); 180 g->magic = MAGIC2; 181 preg->re_nsub = g->nsub; 182 preg->re_g = g; 183 preg->re_magic = MAGIC1; 184 #ifndef REDEBUG 185 /* not debugging, so can't rely on the assert() in regexec() */ 186 if (g->iflags&BAD) 187 SETERROR(REG_ASSERT); 188 #endif 189 190 /* win or lose, we're done */ 191 if (p->error != 0) /* lose */ 192 regfree(preg); 193 return(p->error); 194 } 195 196 /* 197 - p_ere - ERE parser top level, concatenation and alternation 198 */ 199 static void 200 p_ere(p, stop) 201 register struct parse *p; 202 u_int stop; /* character this ERE should end at */ 203 { 204 register uchar c; 205 register sopno prevback; 206 register sopno prevfwd; 207 register sopno conc; 208 register int first = 1; /* is this the first alternative? */ 209 210 for (;;) { 211 /* do a bunch of concatenated expressions */ 212 conc = HERE(); 213 while ((c = PEEK()) != '|' && c != stop && c != '\0') 214 p_ere_exp(p); 215 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 216 217 if (!EAT('|')) 218 break; /* NOTE BREAK OUT */ 219 220 if (first) { 221 INSERT(OCH_, conc); /* offset is wrong */ 222 prevfwd = conc; 223 prevback = conc; 224 first = 0; 225 } 226 BACK(OOR1, prevback); 227 prevback = THERE(); 228 FWD(prevfwd); /* fix previous offset */ 229 prevfwd = HERE(); 230 EMIT(OOR2, 0); /* offset is very wrong */ 231 } 232 233 if (!first) { /* tail-end fixups */ 234 FWD(prevfwd); 235 BACK(O_CH, prevback); 236 } 237 238 assert(SEE(stop) || SEE('\0')); 239 } 240 241 /* 242 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 243 */ 244 static void 245 p_ere_exp(p) 246 register struct parse *p; 247 { 248 register uchar c; 249 register sopno pos; 250 register int count; 251 register int count2; 252 register sopno subno; 253 int wascaret = 0; 254 /* we call { a repetition if followed by a digit */ 255 # define ISRPT(c1, c2) (c1 == '*' || c1 == '+' || c1 == '?' || \ 256 (c1 == '{' && isdigit(c2))) 257 258 c = GETNEXT(); 259 assert(c != '\0'); /* caller should have ensured this */ 260 261 pos = HERE(); 262 switch (c) { 263 case '(': 264 MUSTNOTSEE('\0', REG_EPAREN); 265 p->g->nsub++; 266 subno = p->g->nsub; 267 if (subno < NPAREN) 268 p->pbegin[subno] = HERE(); 269 EMIT(OLPAREN, subno); 270 if (!SEE(')')) 271 p_ere(p, ')'); 272 if (subno < NPAREN) { 273 p->pend[subno] = HERE(); 274 assert(p->pend[subno] != 0); 275 } 276 EMIT(ORPAREN, subno); 277 MUSTEAT(')', REG_EPAREN); 278 break; 279 #ifndef POSIX_MISTAKE 280 case ')': /* happens only if no current unmatched ( */ 281 /* 282 * You may ask, why the ifndef? Because I didn't notice 283 * this until slightly too late for 1003.2, and none of the 284 * other 1003.2 regular-expression reviewers noticed it at 285 * all. So an unmatched ) is legal POSIX, at least until 286 * we can get it fixed. 287 */ 288 SETERROR(REG_EPAREN); 289 break; 290 #endif 291 case '^': 292 EMIT(OBOL, 0); 293 p->g->iflags |= USEBOL; 294 wascaret = 1; 295 break; 296 case '$': 297 EMIT(OEOL, 0); 298 p->g->iflags |= USEEOL; 299 break; 300 case '|': 301 SETERROR(REG_EMPTY); 302 break; 303 case '*': 304 case '+': 305 case '?': 306 SETERROR(REG_BADRPT); 307 break; 308 case '.': 309 if (p->g->cflags®_NEWLINE) 310 nonnewline(p); 311 else 312 EMIT(OANY, 0); 313 break; 314 case '[': 315 p_bracket(p); 316 break; 317 case '\\': 318 c = GETNEXT(); 319 #ifdef xxx 320 if (c == '^' || c == '.' || c == '[' || c == '$' || 321 c == '(' || c == ')' || c == '|' || 322 c == '*' || c == '+' || c == '?' || 323 c == '{' || c == '\\') 324 #else 325 if (c != '\0') 326 #endif 327 ordinary(p, c); 328 else 329 SETERROR(REG_EESCAPE); 330 break; 331 case '{': /* okay as ordinary except if digit follows */ 332 REQUIRE(!isdigit(PEEK()), REG_BADRPT); 333 /* FALLTHROUGH */ 334 default: 335 ordinary(p, c); 336 break; 337 } 338 339 c = PEEK(); 340 if (!ISRPT(c, PEEK2())) 341 return; /* no repetition, we're done */ 342 NEXT(); 343 344 REQUIRE(!wascaret, REG_BADRPT); 345 switch (c) { 346 case '*': /* implemented as +? */ 347 INSERT(OPLUS_, pos); 348 BACK(O_PLUS, pos); 349 INSERT(OQUEST_, pos); 350 BACK(O_QUEST, pos); 351 break; 352 case '+': 353 INSERT(OPLUS_, pos); 354 BACK(O_PLUS, pos); 355 break; 356 case '?': 357 INSERT(OQUEST_, pos); 358 BACK(O_QUEST, pos); 359 break; 360 case '{': 361 count = p_count(p); 362 if (EAT(',')) { 363 if (isdigit(PEEK())) { 364 count2 = p_count(p); 365 REQUIRE(count <= count2, REG_BADBR); 366 } else /* single number with comma */ 367 count2 = INFINITY; 368 } else /* just a single number */ 369 count2 = count; 370 repeat(p, pos, count, count2); 371 if (!EAT('}')) { /* error heuristics */ 372 while ((c = PEEK()) != '\0' && c != '}') 373 NEXT(); 374 if (c == '\0') 375 SETERROR(REG_EBRACE); 376 else 377 SETERROR(REG_BADBR); 378 } 379 break; 380 } 381 382 c = PEEK(); 383 REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT); 384 } 385 386 /* 387 - p_bre - BRE parser top level, anchoring and concatenation 388 * Giving end1 as '\0' essentially eliminates the end1/end2 check. 389 * 390 * This implementation is a bit of a kludge, in that a trailing $ is first 391 * taken as an ordinary character and then revised to be an anchor. The 392 * only undesirable side effect is that '$' gets included as a character 393 * category in such cases. This is fairly harmless; not worth fixing. 394 * The amount of lookahead needed to avoid this kludge is excessive, 395 * especially since things like "$*" appear to be legal. xxx 396 */ 397 static void 398 p_bre(p, end1, end2) 399 register struct parse *p; 400 register u_int end1; /* first terminating character */ 401 register u_int end2; /* second terminating character */ 402 { 403 register sopno start = HERE(); 404 register int first = 1; /* first subexpression? */ 405 register int wasdollar = 0; 406 407 if (EAT('^')) { 408 EMIT(OBOL, 0); 409 p->g->iflags |= USEBOL; 410 } 411 while (!SEE('\0') && !SEETWO(end1, end2)) { 412 wasdollar = p_simp_re(p, first); 413 first = 0; 414 } 415 if (wasdollar) { /* oops, that was a trailing anchor */ 416 DROP(1); 417 EMIT(OEOL, 0); 418 p->g->iflags |= USEEOL; 419 } 420 421 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 422 } 423 424 /* 425 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 426 */ 427 static int /* was the simple RE an unbackslashed $? */ 428 p_simp_re(p, starordinary) 429 register struct parse *p; 430 int starordinary; /* is a leading * an ordinary character? */ 431 { 432 register int c; 433 register int count; 434 register int count2; 435 register sopno pos; 436 register int i; 437 register sopno subno; 438 # define BACKSL (1<<CHAR_BIT) 439 440 pos = HERE(); /* repetion op, if any, covers from here */ 441 442 c = GETNEXT(); 443 assert(c != '\0'); /* caller should have ensured this */ 444 if (c == '\\') 445 c = BACKSL | PEEK(); 446 switch (c) { 447 case '.': 448 if (p->g->cflags®_NEWLINE) 449 nonnewline(p); 450 else 451 EMIT(OANY, 0); 452 break; 453 case '[': 454 p_bracket(p); 455 break; 456 case BACKSL|'{': 457 SETERROR(REG_BADRPT); 458 break; 459 case BACKSL|'(': 460 NEXT(); 461 p->g->nsub++; 462 subno = p->g->nsub; 463 if (subno < NPAREN) 464 p->pbegin[subno] = HERE(); 465 EMIT(OLPAREN, subno); 466 /* the SEE here is an error heuristic */ 467 if (!SEE('\0') && !SEETWO('\\', ')')) 468 p_bre(p, '\\', ')'); 469 if (subno < NPAREN) { 470 p->pend[subno] = HERE(); 471 assert(p->pend[subno] != 0); 472 } 473 EMIT(ORPAREN, subno); 474 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 475 break; 476 case BACKSL|')': /* should not get here -- must be user */ 477 case BACKSL|'}': 478 SETERROR(REG_EPAREN); 479 break; 480 case BACKSL|'1': 481 case BACKSL|'2': 482 case BACKSL|'3': 483 case BACKSL|'4': 484 case BACKSL|'5': 485 case BACKSL|'6': 486 case BACKSL|'7': 487 case BACKSL|'8': 488 case BACKSL|'9': 489 i = (c&~BACKSL) - '0'; 490 assert(i < NPAREN); 491 if (p->pend[i] != 0) { 492 assert(i <= p->g->nsub); 493 EMIT(OBACK_, i); 494 assert(p->pbegin[i] != 0); 495 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 496 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 497 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 498 EMIT(O_BACK, i); 499 } else 500 SETERROR(REG_ESUBREG); 501 p->g->backrefs = 1; 502 NEXT(); 503 break; 504 case BACKSL|'\0': 505 SETERROR(REG_EESCAPE); 506 break; 507 case '*': 508 REQUIRE(starordinary, REG_BADRPT); 509 /* FALLTHROUGH */ 510 default: 511 if (c & BACKSL) 512 NEXT(); 513 ordinary(p, (uchar)(c &~ BACKSL)); 514 break; 515 } 516 517 if (EAT('*')) { /* implemented as +? */ 518 INSERT(OPLUS_, pos); 519 BACK(O_PLUS, pos); 520 INSERT(OQUEST_, pos); 521 BACK(O_QUEST, pos); 522 } else if (EATTWO('\\', '{')) { 523 count = p_count(p); 524 if (EAT(',')) { 525 if (isdigit(PEEK())) { 526 count2 = p_count(p); 527 REQUIRE(count <= count2, REG_BADBR); 528 } else /* single number with comma */ 529 count2 = INFINITY; 530 } else /* just a single number */ 531 count2 = count; 532 repeat(p, pos, count, count2); 533 if (!EATTWO('\\', '}')) { /* error heuristics */ 534 while (!SEE('\0') && !SEETWO('\\', '}')) 535 NEXT(); 536 if (SEE('\0')) 537 SETERROR(REG_EBRACE); 538 else 539 SETERROR(REG_BADBR); 540 } 541 } else if (c == '$') /* unbackslashed $ not followed by reptn */ 542 return(1); 543 544 return(0); 545 } 546 547 /* 548 - p_count - parse a repetition count 549 */ 550 static int /* the value */ 551 p_count(p) 552 register struct parse *p; 553 { 554 register int count = 0; 555 register int ndigits = 0; 556 557 while (isdigit(PEEK()) && count <= DUPMAX) { 558 count = count*10 + (GETNEXT() - '0'); 559 ndigits++; 560 } 561 562 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 563 return(count); 564 } 565 566 /* 567 - p_bracket - parse a bracketed character list 568 * 569 * Note a significant property of this code: if the allocset() did SETERROR, 570 * no set operations are done. 571 */ 572 static void 573 p_bracket(p) 574 register struct parse *p; 575 { 576 register uchar c; 577 register cset *cs = allocset(p); 578 register int invert = 0; 579 580 if (EAT('^')) 581 invert++; /* make note to invert set at end */ 582 if (EAT(']')) 583 CHadd(cs, ']'); 584 else if (EAT('-')) 585 CHadd(cs, '-'); 586 while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']')) 587 p_b_term(p, cs); 588 if (EAT('-')) 589 CHadd(cs, '-'); 590 MUSTEAT(']', REG_EBRACK); 591 592 if (invert && p->error == 0) { 593 register int i; 594 595 for (i = p->g->csetsize - 1; i >= 0; i--) 596 if (CHIN(cs, i)) 597 CHsub(cs, i); 598 else 599 CHadd(cs, i); 600 if (p->g->cflags®_NEWLINE) 601 CHsub(cs, '\n'); 602 if (cs->multis != NULL) 603 mcinvert(p, cs); 604 } 605 assert(cs->multis == NULL); /* xxx */ 606 EMIT(OANYOF, freezeset(p, cs)); 607 } 608 609 /* 610 - p_b_term - parse one term of a bracketed character list 611 */ 612 static void 613 p_b_term(p, cs) 614 register struct parse *p; 615 register cset *cs; 616 { 617 register uchar c; 618 register uchar start, finish; 619 register int i; 620 621 /* classify what we've got */ 622 switch (PEEK()) { 623 case '[': 624 c = PEEK2(); 625 break; 626 case '-': 627 SETERROR(REG_ERANGE); 628 return; /* NOTE RETURN */ 629 break; 630 default: 631 c = '\0'; 632 break; 633 } 634 635 switch (c) { 636 case ':': /* character class */ 637 NEXT2(); 638 c = PEEK(); 639 REQUIRE(c != '\0', REG_EBRACK); 640 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 641 p_b_cclass(p, cs); 642 MUSTNOTSEE('\0', REG_EBRACK); 643 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 644 break; 645 case '=': /* equivalence class */ 646 NEXT2(); 647 c = PEEK(); 648 REQUIRE(c != '\0', REG_EBRACK); 649 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 650 p_b_eclass(p, cs); 651 MUSTNOTSEE('\0', REG_EBRACK); 652 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 653 break; 654 default: /* symbol, ordinary character, or range */ 655 /* xxx revision needed for multichar stuff */ 656 start = p_b_symbol(p); 657 if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') { 658 /* range */ 659 NEXT(); 660 if (EAT('-')) 661 finish = '-'; 662 else 663 finish = p_b_symbol(p); 664 } else 665 finish = start; 666 REQUIRE(start <= finish, REG_ERANGE); 667 for (i = start; i <= finish; i++) { 668 CHadd(cs, i); 669 if ((p->g->cflags®_ICASE) && isalpha(i)) { 670 c = othercase((uchar)i); 671 CHadd(cs, c); 672 } 673 } 674 break; 675 } 676 } 677 678 /* 679 - p_b_cclass - parse a character-class name and deal with it 680 */ 681 static void 682 p_b_cclass(p, cs) 683 register struct parse *p; 684 register cset *cs; 685 { 686 register uchar *sb = p->next; 687 register uchar *se = sb; 688 register struct cclass *cp; 689 register int len; 690 register uchar *u; 691 register uchar c; 692 693 while (isalpha(*se)) 694 se++; 695 len = se - sb; 696 NEXTn(len); 697 for (cp = cclasses; cp->name != NULL; cp++) 698 if (strncmp(cp->name, (char *)sb, len) == 0 && 699 cp->name[len] == '\0') 700 break; 701 if (cp->name == NULL) { 702 /* oops, didn't find it */ 703 SETERROR(REG_ECTYPE); 704 return; 705 } 706 707 u = (uchar *)cp->chars; 708 while ((c = *u++) != '\0') 709 CHadd(cs, c); 710 for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1) 711 MCadd(cs, u); 712 } 713 714 /* 715 - p_b_eclass - parse an equivalence-class name and deal with it 716 * 717 * This implementation is incomplete. xxx 718 */ 719 static void 720 p_b_eclass(p, cs) 721 register struct parse *p; 722 register cset *cs; 723 { 724 register uchar c; 725 726 c = p_b_coll_elem(p, '='); 727 CHadd(cs, c); 728 } 729 730 /* 731 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 732 */ 733 static uchar /* value of symbol */ 734 p_b_symbol(p) 735 register struct parse *p; 736 { 737 register uchar value; 738 739 if (!EATTWO('[', '.')) { 740 MUSTNOTSEE('\0', REG_EBRACK); 741 return(GETNEXT()); 742 } 743 744 /* collating symbol */ 745 MUSTNOTSEE('\0', REG_EBRACK); 746 value = p_b_coll_elem(p, '.'); 747 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 748 return(value); 749 } 750 751 /* 752 - p_b_coll_elem - parse a collating-element name and look it up 753 */ 754 static uchar /* value of collating element */ 755 p_b_coll_elem(p, endc) 756 register struct parse *p; 757 u_int endc; /* name ended by endc,']' */ 758 { 759 register uchar *sp = p->next; 760 register struct cname *cp; 761 register int len; 762 register uchar c; 763 764 while ((c = PEEK()) != '\0' && !SEETWO(endc, ']')) 765 NEXT(); 766 if (c == '\0') { 767 SETERROR(REG_EBRACK); 768 return(0); 769 } 770 len = p->next - sp; 771 for (cp = cnames; cp->name != NULL; cp++) 772 if (strncmp(cp->name, (char *)sp, len) == 0 && 773 cp->name[len] == '\0') 774 return(cp->code); /* known name */ 775 if (len == 1) 776 return(*sp); /* single character */ 777 SETERROR(REG_ECOLLATE); /* neither */ 778 return(0); 779 } 780 781 /* 782 - othercase - return the case counterpart of an alphabetic 783 */ 784 static uchar 785 othercase(ch) 786 u_int ch; 787 { 788 assert(isalpha(ch)); 789 if (isupper(ch)) 790 return(tolower(ch)); 791 else if (islower(ch)) 792 return(toupper(ch)); 793 else /* peculiar, but could happen */ 794 return(ch); 795 } 796 797 /* 798 - bothcases - emit a dualcase version of a character 799 * 800 * Boy, is this implementation ever a kludge... 801 */ 802 static void 803 bothcases(p, ch) 804 register struct parse *p; 805 u_int ch; 806 { 807 register uchar *oldnext; 808 uchar bracket[3]; 809 810 oldnext = p->next; 811 p->next = bracket; 812 bracket[0] = ch; 813 bracket[1] = ']'; 814 bracket[2] = '\0'; 815 p_bracket(p); 816 assert(p->next == bracket+2); 817 p->next = oldnext; 818 } 819 820 /* 821 - ordinary - emit an ordinary character 822 */ 823 static void 824 ordinary(p, ch) 825 register struct parse *p; 826 register u_int ch; 827 { 828 register uchar *cap = p->g->categories; 829 830 if ((p->g->cflags®_ICASE) && isalpha(ch)) { 831 bothcases(p, ch); 832 return; 833 } 834 835 EMIT(OCHAR, ch); 836 if (cap[ch] == 0) 837 cap[ch] = p->g->ncategories++; 838 } 839 840 /* 841 - nonnewline - emit REG_NEWLINE version of OANY 842 * 843 * Boy, is this implementation ever a kludge... 844 */ 845 static void 846 nonnewline(p) 847 register struct parse *p; 848 { 849 register uchar *oldnext; 850 uchar bracket[4]; 851 852 oldnext = p->next; 853 p->next = bracket; 854 bracket[0] = '^'; 855 bracket[1] = '\n'; 856 bracket[2] = ']'; 857 bracket[3] = '\0'; 858 p_bracket(p); 859 assert(p->next == bracket+3); 860 p->next = oldnext; 861 } 862 863 /* 864 - repeat - generate code for a bounded repetition, recursively if needed 865 */ 866 static void 867 repeat(p, start, from, to) 868 register struct parse *p; 869 sopno start; /* operand from here to end of strip */ 870 int from; /* repeated from this number */ 871 int to; /* to this number of times (maybe INFINITY) */ 872 { 873 register sopno finish = HERE(); 874 # define N 2 875 # define INF 3 876 # define REP(f, t) ((f)*8 + (t)) 877 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 878 register sopno copy; 879 880 if (p->error != 0) /* head off possible runaway recursion */ 881 return; 882 883 assert(from <= to); 884 885 switch (REP(MAP(from), MAP(to))) { 886 case REP(0, 0): /* must be user doing this */ 887 DROP(finish-start); /* drop the operand */ 888 break; 889 case REP(0, 1): /* as x{1,1}? */ 890 case REP(0, N): /* as x{1,n}? */ 891 case REP(0, INF): /* as x{1,}? */ 892 INSERT(OQUEST_, start); /* offset is wrong... */ 893 repeat(p, start+1, 1, to); 894 FWD(start); /* ... fix it */ 895 BACK(O_QUEST, start); 896 break; 897 case REP(1, 1): /* trivial case */ 898 /* done */ 899 break; 900 case REP(1, N): /* as x?x{1,n-1} */ 901 INSERT(OQUEST_, start); 902 BACK(O_QUEST, start); 903 copy = dupl(p, start+1, finish+1); 904 assert(copy == finish+2); 905 repeat(p, copy, 1, to-1); 906 break; 907 case REP(1, INF): /* as x+ */ 908 INSERT(OPLUS_, start); 909 BACK(O_PLUS, start); 910 break; 911 case REP(N, N): /* as xx{m-1,n-1} */ 912 copy = dupl(p, start, finish); 913 repeat(p, copy, from-1, to-1); 914 break; 915 case REP(N, INF): /* as xx{n-1,INF} */ 916 copy = dupl(p, start, finish); 917 repeat(p, copy, from-1, to); 918 break; 919 default: /* "can't happen" */ 920 SETERROR(REG_ASSERT); /* just in case */ 921 break; 922 } 923 } 924 925 /* 926 - seterr - set an error condition 927 */ 928 static int /* useless but makes type checking happy */ 929 seterr(p, e) 930 register struct parse *p; 931 int e; 932 { 933 if (p->error == 0) /* keep earliest error condition */ 934 p->error = e; 935 p->next = nuls; /* try to bring things to a halt */ 936 return(0); /* make the return value well-defined */ 937 } 938 939 /* 940 - allocset - allocate a set of characters for [] 941 */ 942 static cset * 943 allocset(p) 944 register struct parse *p; 945 { 946 register int no = p->g->ncsets++; 947 register size_t nc; 948 register size_t nbytes; 949 register cset *cs; 950 register size_t css = (size_t)p->g->csetsize; 951 952 if (no >= p->ncsalloc) { /* need another column of space */ 953 p->ncsalloc += CHAR_BIT; 954 nc = p->ncsalloc; 955 assert(nc % CHAR_BIT == 0); 956 nbytes = nc / CHAR_BIT * css; 957 if (p->g->sets == NULL) 958 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 959 else 960 p->g->sets = (cset *)realloc((char *)p->g->sets, 961 nc * sizeof(cset)); 962 if (p->g->setbits == NULL) 963 p->g->setbits = (uchar *)malloc(nbytes); 964 else 965 p->g->setbits = (uchar *)realloc((char *)p->g->setbits, 966 nbytes); 967 if (p->g->sets != NULL && p->g->setbits != NULL) 968 (void) memset((char *)p->g->setbits + (nbytes - css), 969 0, css); 970 else { 971 no = 0; 972 SETERROR(REG_ESPACE); 973 /* caller's responsibility not to do set ops */ 974 } 975 } 976 977 assert(p->g->sets != NULL); /* xxx */ 978 cs = &p->g->sets[no]; 979 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 980 cs->mask = 1 << ((no) % CHAR_BIT); 981 cs->hash = 0; 982 cs->smultis = 0; 983 cs->multis = NULL; 984 985 return(cs); 986 } 987 988 /* 989 - freezeset - final processing on a set of characters 990 * 991 * The main task here is merging identical sets. This is usually a waste 992 * of time (although the hash code minimizes the overhead), but can win 993 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 994 * is done using addition rather than xor -- all ASCII [aA] sets xor to 995 * the same value! 996 */ 997 static int /* set number */ 998 freezeset(p, cs) 999 register struct parse *p; 1000 register cset *cs; 1001 { 1002 register uchar h = cs->hash; 1003 register int i; 1004 register cset *top = &p->g->sets[p->g->ncsets]; 1005 register cset *cs2; 1006 register size_t css = (size_t)p->g->csetsize; 1007 1008 /* look for an earlier one which is the same */ 1009 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1010 if (cs2->hash == h && cs2 != cs) { 1011 /* maybe */ 1012 for (i = 0; i < css; i++) 1013 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1014 break; /* no */ 1015 if (i == css) 1016 break; /* yes */ 1017 } 1018 1019 if (cs2 < top) { /* found one */ 1020 assert(cs == top-1); 1021 p->g->ncsets--; 1022 for (i = 0; i < css; i++) 1023 CHsub(cs, i); 1024 cs = cs2; 1025 } 1026 1027 return((int)(cs - p->g->sets)); 1028 } 1029 1030 /* 1031 - mcadd - add a collating element to a cset 1032 */ 1033 static void 1034 mcadd(p, cs, cp) 1035 register struct parse *p; 1036 register cset *cs; 1037 register uchar *cp; 1038 { 1039 register size_t oldend = cs->smultis; 1040 1041 cs->smultis += strlen((char *)cp) + 1; 1042 if (cs->multis == NULL) 1043 cs->multis = (uchar *)malloc(cs->smultis); 1044 else 1045 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1046 if (cs->multis == NULL) { 1047 SETERROR(REG_ESPACE); 1048 return; 1049 } 1050 1051 (void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp); 1052 cs->multis[cs->smultis - 1] = '\0'; 1053 } 1054 1055 /* 1056 - mcsub - subtract a collating element from a cset 1057 */ 1058 static void 1059 mcsub(p, cs, cp) 1060 register struct parse *p; 1061 register cset *cs; 1062 register u_int *cp; 1063 { 1064 register uchar *fp = mcfind(cs, cp); 1065 register size_t len = strlen((char *)fp); 1066 1067 assert(p != NULL); 1068 (void) memmove((char *)fp, (char *)(fp + len + 1), 1069 cs->smultis - (fp + len + 1 - cs->multis)); 1070 cs->smultis -= len; 1071 1072 if (cs->smultis == 0) { 1073 free((char *)cs->multis); 1074 cs->multis = NULL; 1075 return; 1076 } 1077 1078 cs->multis = (uchar *)realloc(cs->multis, cs->smultis); 1079 assert(cs->multis != NULL); 1080 } 1081 1082 /* 1083 - mcin - is a collating element in a cset? 1084 */ 1085 static int 1086 mcin(p, cs, cp) 1087 register struct parse *p; 1088 register cset *cs; 1089 register u_int *cp; 1090 { 1091 return(mcfind(cs, cp) != NULL); 1092 } 1093 1094 /* 1095 - mcfind - find a collating element in a cset 1096 */ 1097 static uchar * 1098 mcfind(cs, cp) 1099 register cset *cs; 1100 register u_int *cp; 1101 { 1102 register uchar *p; 1103 1104 if (cs->multis == NULL) 1105 return(NULL); 1106 for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1) 1107 if (strcmp((char *)cp, (char *)p) == 0) 1108 return(p); 1109 return(NULL); 1110 } 1111 1112 /* 1113 - mcinvert - invert the list of collating elements in a cset 1114 * 1115 * This would have to know the set of possibilities. Implementation 1116 * is deferred. 1117 */ 1118 static void 1119 mcinvert(p, cs) 1120 register struct parse *p; 1121 register cset *cs; 1122 { 1123 assert(cs->multis == NULL); /* xxx */ 1124 } 1125 1126 /* 1127 - isinsets - is this character in any sets? 1128 */ 1129 static int /* predicate */ 1130 isinsets(g, c) 1131 register struct re_guts *g; 1132 u_int c; 1133 { 1134 register uchar *col; 1135 register int i; 1136 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1137 1138 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1139 if (col[c] != 0) 1140 return(1); 1141 return(0); 1142 } 1143 1144 /* 1145 - samesets - are these two characters in exactly the same sets? 1146 */ 1147 static int /* predicate */ 1148 samesets(g, c1, c2) 1149 register struct re_guts *g; 1150 register u_int c1; 1151 register u_int c2; 1152 { 1153 register uchar *col; 1154 register int i; 1155 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1156 1157 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1158 if (col[c1] != col[c2]) 1159 return(0); 1160 return(1); 1161 } 1162 1163 /* 1164 - categorize - sort out character categories 1165 */ 1166 static void 1167 categorize(p, g) 1168 struct parse *p; 1169 register struct re_guts *g; 1170 { 1171 register uchar *cats = g->categories; 1172 register unsigned c; 1173 register unsigned c2; 1174 register uchar cat; 1175 1176 /* avoid making error situations worse */ 1177 if (p->error != 0) 1178 return; 1179 1180 for (c = 0; c < g->csetsize; c++) 1181 if (cats[c] == 0 && isinsets(g, c)) { 1182 cat = g->ncategories++; 1183 cats[c] = cat; 1184 for (c2 = c+1; c2 < g->csetsize; c2++) 1185 if (cats[c2] == 0 && samesets(g, c, c2)) 1186 cats[c2] = cat; 1187 } 1188 } 1189 1190 /* 1191 - dupl - emit a duplicate of a bunch of sops 1192 */ 1193 static sopno /* start of duplicate */ 1194 dupl(p, start, finish) 1195 register struct parse *p; 1196 sopno start; /* from here */ 1197 sopno finish; /* to this less one */ 1198 { 1199 register sopno ret = HERE(); 1200 register sopno len = finish - start; 1201 1202 assert(finish >= start); 1203 if (len == 0) 1204 return(ret); 1205 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1206 assert(p->ssize >= p->slen + len); 1207 (void) memcpy((char *)(p->strip + p->slen), 1208 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1209 p->slen += len; 1210 return(ret); 1211 } 1212 1213 /* 1214 - doemit - emit a strip operator 1215 * 1216 * It might seem better to implement this as a macro with a function as 1217 * hard-case backup, but it's just too big and messy unless there are 1218 * some changes to the data structures. Maybe later. 1219 */ 1220 static void 1221 doemit(p, op, opnd) 1222 register struct parse *p; 1223 sop op; 1224 size_t opnd; 1225 { 1226 /* avoid making error situations worse */ 1227 if (p->error != 0) 1228 return; 1229 1230 /* deal with oversize operands ("can't happen", more or less) */ 1231 assert(opnd < 1<<OPSHIFT); 1232 1233 /* deal with undersized strip */ 1234 if (p->slen >= p->ssize) 1235 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1236 assert(p->slen < p->ssize); 1237 1238 /* finally, it's all reduced to the easy case */ 1239 p->strip[p->slen++] = SOP(op, opnd); 1240 } 1241 1242 /* 1243 - doinsert - insert a sop into the strip 1244 */ 1245 static void 1246 doinsert(p, op, opnd, pos) 1247 register struct parse *p; 1248 sop op; 1249 size_t opnd; 1250 sopno pos; 1251 { 1252 register sopno sn; 1253 register sop s; 1254 register int i; 1255 1256 /* avoid making error situations worse */ 1257 if (p->error != 0) 1258 return; 1259 1260 sn = HERE(); 1261 EMIT(op, opnd); /* do checks, ensure space */ 1262 assert(HERE() == sn+1); 1263 s = p->strip[sn]; 1264 1265 /* adjust paren pointers */ 1266 assert(pos > 0); 1267 for (i = 1; i < NPAREN; i++) { 1268 if (p->pbegin[i] >= pos) { 1269 p->pbegin[i]++; 1270 } 1271 if (p->pend[i] >= pos) { 1272 p->pend[i]++; 1273 } 1274 } 1275 1276 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1277 (HERE()-pos-1)*sizeof(sop)); 1278 p->strip[pos] = s; 1279 } 1280 1281 /* 1282 - dofwd - complete a forward reference 1283 */ 1284 static void 1285 dofwd(p, pos, value) 1286 register struct parse *p; 1287 register sopno pos; 1288 sop value; 1289 { 1290 /* avoid making error situations worse */ 1291 if (p->error != 0) 1292 return; 1293 1294 assert(value < 1<<OPSHIFT); 1295 p->strip[pos] = OP(p->strip[pos]) | value; 1296 } 1297 1298 /* 1299 - enlarge - enlarge the strip 1300 */ 1301 static void 1302 enlarge(p, size) 1303 register struct parse *p; 1304 register sopno size; 1305 { 1306 register sop *sp; 1307 1308 if (p->ssize >= size) 1309 return; 1310 1311 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1312 if (sp == NULL) { 1313 SETERROR(REG_ESPACE); 1314 return; 1315 } 1316 p->strip = sp; 1317 p->ssize = size; 1318 } 1319 1320 /* 1321 - stripsnug - compact the strip 1322 */ 1323 static void 1324 stripsnug(p, g) 1325 register struct parse *p; 1326 register struct re_guts *g; 1327 { 1328 g->nstates = p->slen; 1329 g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop)); 1330 if (g->strip == NULL) { 1331 SETERROR(REG_ESPACE); 1332 g->strip = p->strip; 1333 } 1334 } 1335 1336 /* 1337 - findmust - fill in must and mlen with longest mandatory literal string 1338 * 1339 * This algorithm could do fancy things like analyzing the operands of | 1340 * for common subsequences. Someday. This code is simple and finds most 1341 * of the interesting cases. 1342 * 1343 * Note that must and mlen got initialized during setup. 1344 */ 1345 static void 1346 findmust(p, g) 1347 struct parse *p; 1348 register struct re_guts *g; 1349 { 1350 register sop *scan; 1351 sop *start; 1352 register sop *newstart; 1353 register sopno newlen; 1354 register sop s; 1355 register char *cp; 1356 register sopno i; 1357 1358 /* avoid making error situations worse */ 1359 if (p->error != 0) 1360 return; 1361 1362 /* find the longest OCHAR sequence in strip */ 1363 newlen = 0; 1364 scan = g->strip + 1; 1365 do { 1366 s = *scan++; 1367 switch (OP(s)) { 1368 case OCHAR: /* sequence member */ 1369 if (newlen == 0) /* new sequence */ 1370 newstart = scan - 1; 1371 newlen++; 1372 break; 1373 case OPLUS_: /* things that don't break one */ 1374 case OLPAREN: 1375 case ORPAREN: 1376 break; 1377 case OQUEST_: /* things that must be skipped */ 1378 case OCH_: 1379 scan--; 1380 do { 1381 scan += OPND(s); 1382 s = *scan; 1383 /* assert() interferes w debug printouts */ 1384 if (OP(s) != O_QUEST && OP(s) != O_CH && 1385 OP(s) != OOR2) { 1386 g->iflags |= BAD; 1387 return; 1388 } 1389 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1390 /* fallthrough */ 1391 default: /* things that break a sequence */ 1392 if (newlen > g->mlen) { /* ends one */ 1393 start = newstart; 1394 g->mlen = newlen; 1395 } 1396 newlen = 0; 1397 break; 1398 } 1399 } while (OP(s) != OEND); 1400 1401 if (g->mlen == 0) /* there isn't one */ 1402 return; 1403 1404 /* turn it into a character string */ 1405 g->must = malloc((size_t)g->mlen + 1); 1406 if (g->must == NULL) { /* argh; just forget it */ 1407 g->mlen = 0; 1408 return; 1409 } 1410 cp = g->must; 1411 scan = start; 1412 for (i = g->mlen; i > 0; i--) { 1413 while (OP(s = *scan++) != OCHAR) 1414 continue; 1415 *cp++ = OPND(s); 1416 } 1417 *cp++ = '\0'; /* just on general principles */ 1418 } 1419 1420 /* 1421 - pluscount - count + nesting 1422 */ 1423 static sopno /* nesting depth */ 1424 pluscount(p, g) 1425 struct parse *p; 1426 register struct re_guts *g; 1427 { 1428 register sop *scan; 1429 register sop s; 1430 register sopno plusnest = 0; 1431 register sopno maxnest = 0; 1432 1433 if (p->error != 0) 1434 return(0); /* there may not be an OEND */ 1435 1436 scan = g->strip + 1; 1437 do { 1438 s = *scan++; 1439 switch (OP(s)) { 1440 case OPLUS_: 1441 plusnest++; 1442 break; 1443 case O_PLUS: 1444 if (plusnest > maxnest) 1445 maxnest = plusnest; 1446 plusnest--; 1447 break; 1448 } 1449 } while (OP(s) != OEND); 1450 if (plusnest != 0) 1451 g->iflags |= BAD; 1452 return(maxnest); 1453 } 1454