1 /* $NetBSD: regcomp.c,v 1.17 2000/07/07 08:03:40 itohy Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 40 */ 41 42 #include <sys/cdefs.h> 43 #if defined(LIBC_SCCS) && !defined(lint) 44 #if 0 45 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 46 #else 47 __RCSID("$NetBSD: regcomp.c,v 1.17 2000/07/07 08:03:40 itohy Exp $"); 48 #endif 49 #endif /* LIBC_SCCS and not lint */ 50 51 #include "namespace.h" 52 #include <sys/types.h> 53 54 #include <assert.h> 55 #include <ctype.h> 56 #include <limits.h> 57 #include <regex.h> 58 #include <stdio.h> 59 #include <stdlib.h> 60 #include <string.h> 61 62 #ifdef __weak_alias 63 __weak_alias(regcomp,_regcomp) 64 #endif 65 66 #include "utils.h" 67 #include "regex2.h" 68 69 #include "cclass.h" 70 #include "cname.h" 71 72 /* 73 * parse structure, passed up and down to avoid global variables and 74 * other clumsinesses 75 */ 76 struct parse { 77 char *next; /* next character in RE */ 78 char *end; /* end of string (-> NUL normally) */ 79 int error; /* has an error been seen? */ 80 sop *strip; /* malloced strip */ 81 sopno ssize; /* malloced strip size (allocated) */ 82 sopno slen; /* malloced strip length (used) */ 83 int ncsalloc; /* number of csets allocated */ 84 struct re_guts *g; 85 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 86 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 87 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 88 }; 89 90 /* ========= begin header generated by ./mkh ========= */ 91 #ifdef __cplusplus 92 extern "C" { 93 #endif 94 95 /* === regcomp.c === */ 96 static void p_ere __P((struct parse *p, int stop)); 97 static void p_ere_exp __P((struct parse *p)); 98 static void p_str __P((struct parse *p)); 99 static void p_bre __P((struct parse *p, int end1, int end2)); 100 static int p_simp_re __P((struct parse *p, int starordinary)); 101 static int p_count __P((struct parse *p)); 102 static void p_bracket __P((struct parse *p)); 103 static void p_b_term __P((struct parse *p, cset *cs)); 104 static void p_b_cclass __P((struct parse *p, cset *cs)); 105 static void p_b_eclass __P((struct parse *p, cset *cs)); 106 static char p_b_symbol __P((struct parse *p)); 107 static char p_b_coll_elem __P((struct parse *p, int endc)); 108 static char othercase __P((int ch)); 109 static void bothcases __P((struct parse *p, int ch)); 110 static void ordinary __P((struct parse *p, int ch)); 111 static void nonnewline __P((struct parse *p)); 112 static void repeat __P((struct parse *p, sopno start, int from, int to)); 113 static int seterr __P((struct parse *p, int e)); 114 static cset *allocset __P((struct parse *p)); 115 static void freeset __P((struct parse *p, cset *cs)); 116 static int freezeset __P((struct parse *p, cset *cs)); 117 static int firstch __P((struct parse *p, cset *cs)); 118 static int nch __P((struct parse *p, cset *cs)); 119 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 120 #if 0 121 static void mcsub __P((cset *cs, char *cp)); 122 static int mcin __P((cset *cs, char *cp)); 123 static char *mcfind __P((cset *cs, char *cp)); 124 #endif 125 static void mcinvert __P((struct parse *p, cset *cs)); 126 static void mccase __P((struct parse *p, cset *cs)); 127 static int isinsets __P((struct re_guts *g, int c)); 128 static int samesets __P((struct re_guts *g, int c1, int c2)); 129 static void categorize __P((struct parse *p, struct re_guts *g)); 130 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 131 static void doemit __P((struct parse *p, sop op, sopno opnd)); 132 static void doinsert __P((struct parse *p, sop op, sopno opnd, sopno pos)); 133 static void dofwd __P((struct parse *p, sopno pos, sopno value)); 134 static void enlarge __P((struct parse *p, sopno size)); 135 static void stripsnug __P((struct parse *p, struct re_guts *g)); 136 static void findmust __P((struct parse *p, struct re_guts *g)); 137 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 138 139 #ifdef __cplusplus 140 } 141 #endif 142 /* ========= end header generated by ./mkh ========= */ 143 144 static char nuls[10]; /* place to point scanner in event of error */ 145 146 /* 147 * macros for use with parse structure 148 * BEWARE: these know that the parse structure is named `p' !!! 149 */ 150 #define PEEK() (*p->next) 151 #define PEEK2() (*(p->next+1)) 152 #define MORE() (p->next < p->end) 153 #define MORE2() (p->next+1 < p->end) 154 #define SEE(c) (MORE() && PEEK() == (c)) 155 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 156 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 157 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 158 #define NEXT() (p->next++) 159 #define NEXT2() (p->next += 2) 160 #define NEXTn(n) (p->next += (n)) 161 #define GETNEXT() (*p->next++) 162 #define SETERROR(e) seterr(p, (e)) 163 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 164 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 165 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 166 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 167 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 168 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 169 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 170 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 171 #define HERE() (p->slen) 172 #define THERE() (p->slen - 1) 173 #define THERETHERE() (p->slen - 2) 174 #define DROP(n) (p->slen -= (n)) 175 176 #ifndef NDEBUG 177 static int never = 0; /* for use in asserts; shuts lint up */ 178 #else 179 #define never 0 /* some <assert.h>s have bugs too */ 180 #endif 181 182 /* 183 - regcomp - interface for parser and compilation 184 = extern int regcomp(regex_t *, const char *, int); 185 = #define REG_BASIC 0000 186 = #define REG_EXTENDED 0001 187 = #define REG_ICASE 0002 188 = #define REG_NOSUB 0004 189 = #define REG_NEWLINE 0010 190 = #define REG_NOSPEC 0020 191 = #define REG_PEND 0040 192 = #define REG_DUMP 0200 193 */ 194 int /* 0 success, otherwise REG_something */ 195 regcomp(preg, pattern, cflags) 196 regex_t *preg; 197 const char *pattern; 198 int cflags; 199 { 200 struct parse pa; 201 struct re_guts *g; 202 struct parse *p = &pa; 203 int i; 204 size_t len; 205 #ifdef REDEBUG 206 # define GOODFLAGS(f) (f) 207 #else 208 # define GOODFLAGS(f) ((f)&~REG_DUMP) 209 #endif 210 211 _DIAGASSERT(preg != NULL); 212 _DIAGASSERT(pattern != NULL); 213 214 cflags = GOODFLAGS(cflags); 215 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 216 return(REG_INVARG); 217 218 if (cflags®_PEND) { 219 if (preg->re_endp < pattern) 220 return(REG_INVARG); 221 len = preg->re_endp - pattern; 222 } else 223 len = strlen(pattern); 224 225 /* do the mallocs early so failure handling is easy */ 226 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 227 (NC-1)*sizeof(cat_t)); 228 if (g == NULL) 229 return(REG_ESPACE); 230 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 231 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 232 p->slen = 0; 233 if (p->strip == NULL) { 234 free(g); 235 return(REG_ESPACE); 236 } 237 238 /* set things up */ 239 p->g = g; 240 /* LINTED convenience; we do not modify it */ 241 p->next = (char *)pattern; 242 p->end = p->next + len; 243 p->error = 0; 244 p->ncsalloc = 0; 245 for (i = 0; i < NPAREN; i++) { 246 p->pbegin[i] = 0; 247 p->pend[i] = 0; 248 } 249 g->csetsize = NC; 250 g->sets = NULL; 251 g->setbits = NULL; 252 g->ncsets = 0; 253 g->cflags = cflags; 254 g->iflags = 0; 255 g->nbol = 0; 256 g->neol = 0; 257 g->must = NULL; 258 g->mlen = 0; 259 g->nsub = 0; 260 g->ncategories = 1; /* category 0 is "everything else" */ 261 g->categories = &g->catspace[-(CHAR_MIN)]; 262 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 263 g->backrefs = 0; 264 265 /* do it */ 266 EMIT(OEND, 0); 267 g->firststate = THERE(); 268 if (cflags®_EXTENDED) 269 p_ere(p, OUT); 270 else if (cflags®_NOSPEC) 271 p_str(p); 272 else 273 p_bre(p, OUT, OUT); 274 EMIT(OEND, 0); 275 g->laststate = THERE(); 276 277 /* tidy up loose ends and fill things in */ 278 categorize(p, g); 279 stripsnug(p, g); 280 findmust(p, g); 281 g->nplus = pluscount(p, g); 282 g->magic = MAGIC2; 283 preg->re_nsub = g->nsub; 284 preg->re_g = g; 285 preg->re_magic = MAGIC1; 286 #ifndef REDEBUG 287 /* not debugging, so can't rely on the assert() in regexec() */ 288 if (g->iflags&BAD) 289 SETERROR(REG_ASSERT); 290 #endif 291 292 /* win or lose, we're done */ 293 if (p->error != 0) /* lose */ 294 regfree(preg); 295 return(p->error); 296 } 297 298 /* 299 - p_ere - ERE parser top level, concatenation and alternation 300 == static void p_ere(struct parse *p, int stop); 301 */ 302 static void 303 p_ere(p, stop) 304 struct parse *p; 305 int stop; /* character this ERE should end at */ 306 { 307 char c; 308 sopno prevback = 0; /* pacify gcc */ 309 sopno prevfwd = 0; /* pacify gcc */ 310 sopno conc; 311 int first = 1; /* is this the first alternative? */ 312 313 _DIAGASSERT(p != NULL); 314 315 for (;;) { 316 /* do a bunch of concatenated expressions */ 317 conc = HERE(); 318 while (MORE() && (c = PEEK()) != '|' && c != stop) 319 p_ere_exp(p); 320 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 321 322 if (!EAT('|')) 323 break; /* NOTE BREAK OUT */ 324 325 if (first) { 326 INSERT(OCH_, conc); /* offset is wrong */ 327 prevfwd = conc; 328 prevback = conc; 329 first = 0; 330 } 331 ASTERN(OOR1, prevback); 332 prevback = THERE(); 333 AHEAD(prevfwd); /* fix previous offset */ 334 prevfwd = HERE(); 335 EMIT(OOR2, 0); /* offset is very wrong */ 336 } 337 338 if (!first) { /* tail-end fixups */ 339 AHEAD(prevfwd); 340 ASTERN(O_CH, prevback); 341 } 342 343 assert(!MORE() || SEE(stop)); 344 } 345 346 /* 347 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 348 == static void p_ere_exp(struct parse *p); 349 */ 350 static void 351 p_ere_exp(p) 352 struct parse *p; 353 { 354 char c; 355 sopno pos; 356 int count; 357 int count2; 358 sopno subno; 359 int wascaret = 0; 360 361 _DIAGASSERT(p != NULL); 362 363 assert(MORE()); /* caller should have ensured this */ 364 c = GETNEXT(); 365 366 pos = HERE(); 367 switch (c) { 368 case '(': 369 REQUIRE(MORE(), REG_EPAREN); 370 p->g->nsub++; 371 subno = p->g->nsub; 372 if (subno < NPAREN) 373 p->pbegin[subno] = HERE(); 374 EMIT(OLPAREN, subno); 375 if (!SEE(')')) 376 p_ere(p, ')'); 377 if (subno < NPAREN) { 378 p->pend[subno] = HERE(); 379 assert(p->pend[subno] != 0); 380 } 381 EMIT(ORPAREN, subno); 382 MUSTEAT(')', REG_EPAREN); 383 break; 384 #ifndef POSIX_MISTAKE 385 case ')': /* happens only if no current unmatched ( */ 386 /* 387 * You may ask, why the ifndef? Because I didn't notice 388 * this until slightly too late for 1003.2, and none of the 389 * other 1003.2 regular-expression reviewers noticed it at 390 * all. So an unmatched ) is legal POSIX, at least until 391 * we can get it fixed. 392 */ 393 SETERROR(REG_EPAREN); 394 break; 395 #endif 396 case '^': 397 EMIT(OBOL, 0); 398 p->g->iflags |= USEBOL; 399 p->g->nbol++; 400 wascaret = 1; 401 break; 402 case '$': 403 EMIT(OEOL, 0); 404 p->g->iflags |= USEEOL; 405 p->g->neol++; 406 break; 407 case '|': 408 SETERROR(REG_EMPTY); 409 break; 410 case '*': 411 case '+': 412 case '?': 413 SETERROR(REG_BADRPT); 414 break; 415 case '.': 416 if (p->g->cflags®_NEWLINE) 417 nonnewline(p); 418 else 419 EMIT(OANY, 0); 420 break; 421 case '[': 422 p_bracket(p); 423 break; 424 case '\\': 425 REQUIRE(MORE(), REG_EESCAPE); 426 c = GETNEXT(); 427 ordinary(p, c); 428 break; 429 case '{': /* okay as ordinary except if digit follows */ 430 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 431 /* FALLTHROUGH */ 432 default: 433 ordinary(p, c); 434 break; 435 } 436 437 if (!MORE()) 438 return; 439 c = PEEK(); 440 /* we call { a repetition if followed by a digit */ 441 if (!( c == '*' || c == '+' || c == '?' || 442 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 443 return; /* no repetition, we're done */ 444 NEXT(); 445 446 REQUIRE(!wascaret, REG_BADRPT); 447 switch (c) { 448 case '*': /* implemented as +? */ 449 /* this case does not require the (y|) trick, noKLUDGE */ 450 INSERT(OPLUS_, pos); 451 ASTERN(O_PLUS, pos); 452 INSERT(OQUEST_, pos); 453 ASTERN(O_QUEST, pos); 454 break; 455 case '+': 456 INSERT(OPLUS_, pos); 457 ASTERN(O_PLUS, pos); 458 break; 459 case '?': 460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 461 INSERT(OCH_, pos); /* offset slightly wrong */ 462 ASTERN(OOR1, pos); /* this one's right */ 463 AHEAD(pos); /* fix the OCH_ */ 464 EMIT(OOR2, 0); /* offset very wrong... */ 465 AHEAD(THERE()); /* ...so fix it */ 466 ASTERN(O_CH, THERETHERE()); 467 break; 468 case '{': 469 count = p_count(p); 470 if (EAT(',')) { 471 if (isdigit((unsigned char)PEEK())) { 472 count2 = p_count(p); 473 REQUIRE(count <= count2, REG_BADBR); 474 } else /* single number with comma */ 475 count2 = INFINITY; 476 } else /* just a single number */ 477 count2 = count; 478 repeat(p, pos, count, count2); 479 if (!EAT('}')) { /* error heuristics */ 480 while (MORE() && PEEK() != '}') 481 NEXT(); 482 REQUIRE(MORE(), REG_EBRACE); 483 SETERROR(REG_BADBR); 484 } 485 break; 486 } 487 488 if (!MORE()) 489 return; 490 c = PEEK(); 491 if (!( c == '*' || c == '+' || c == '?' || 492 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 493 return; 494 SETERROR(REG_BADRPT); 495 } 496 497 /* 498 - p_str - string (no metacharacters) "parser" 499 == static void p_str(struct parse *p); 500 */ 501 static void 502 p_str(p) 503 struct parse *p; 504 { 505 506 _DIAGASSERT(p != NULL); 507 508 REQUIRE(MORE(), REG_EMPTY); 509 while (MORE()) 510 ordinary(p, GETNEXT()); 511 } 512 513 /* 514 - p_bre - BRE parser top level, anchoring and concatenation 515 == static void p_bre(struct parse *p, int end1, \ 516 == int end2); 517 * Giving end1 as OUT essentially eliminates the end1/end2 check. 518 * 519 * This implementation is a bit of a kludge, in that a trailing $ is first 520 * taken as an ordinary character and then revised to be an anchor. The 521 * only undesirable side effect is that '$' gets included as a character 522 * category in such cases. This is fairly harmless; not worth fixing. 523 * The amount of lookahead needed to avoid this kludge is excessive. 524 */ 525 static void 526 p_bre(p, end1, end2) 527 struct parse *p; 528 int end1; /* first terminating character */ 529 int end2; /* second terminating character */ 530 { 531 sopno start; 532 int first = 1; /* first subexpression? */ 533 int wasdollar = 0; 534 535 _DIAGASSERT(p != NULL); 536 537 start = HERE(); 538 539 if (EAT('^')) { 540 EMIT(OBOL, 0); 541 p->g->iflags |= USEBOL; 542 p->g->nbol++; 543 } 544 while (MORE() && !SEETWO(end1, end2)) { 545 wasdollar = p_simp_re(p, first); 546 first = 0; 547 } 548 if (wasdollar) { /* oops, that was a trailing anchor */ 549 DROP(1); 550 EMIT(OEOL, 0); 551 p->g->iflags |= USEEOL; 552 p->g->neol++; 553 } 554 555 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 556 } 557 558 /* 559 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 560 == static int p_simp_re(struct parse *p, int starordinary); 561 */ 562 static int /* was the simple RE an unbackslashed $? */ 563 p_simp_re(p, starordinary) 564 struct parse *p; 565 int starordinary; /* is a leading * an ordinary character? */ 566 { 567 int c; 568 int count; 569 int count2; 570 sopno pos; 571 int i; 572 sopno subno; 573 # define BACKSL (1<<CHAR_BIT) 574 575 _DIAGASSERT(p != NULL); 576 577 pos = HERE(); /* repetion op, if any, covers from here */ 578 579 assert(MORE()); /* caller should have ensured this */ 580 c = GETNEXT(); 581 if (c == '\\') { 582 REQUIRE(MORE(), REG_EESCAPE); 583 c = BACKSL | (unsigned char)GETNEXT(); 584 } 585 switch (c) { 586 case '.': 587 if (p->g->cflags®_NEWLINE) 588 nonnewline(p); 589 else 590 EMIT(OANY, 0); 591 break; 592 case '[': 593 p_bracket(p); 594 break; 595 case BACKSL|'{': 596 SETERROR(REG_BADRPT); 597 break; 598 case BACKSL|'(': 599 p->g->nsub++; 600 subno = p->g->nsub; 601 if (subno < NPAREN) 602 p->pbegin[subno] = HERE(); 603 EMIT(OLPAREN, subno); 604 /* the MORE here is an error heuristic */ 605 if (MORE() && !SEETWO('\\', ')')) 606 p_bre(p, '\\', ')'); 607 if (subno < NPAREN) { 608 p->pend[subno] = HERE(); 609 assert(p->pend[subno] != 0); 610 } 611 EMIT(ORPAREN, subno); 612 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 613 break; 614 case BACKSL|')': /* should not get here -- must be user */ 615 case BACKSL|'}': 616 SETERROR(REG_EPAREN); 617 break; 618 case BACKSL|'1': 619 case BACKSL|'2': 620 case BACKSL|'3': 621 case BACKSL|'4': 622 case BACKSL|'5': 623 case BACKSL|'6': 624 case BACKSL|'7': 625 case BACKSL|'8': 626 case BACKSL|'9': 627 i = (c&~BACKSL) - '0'; 628 assert(i < NPAREN); 629 if (p->pend[i] != 0) { 630 assert(i <= p->g->nsub); 631 EMIT(OBACK_, i); 632 assert(p->pbegin[i] != 0); 633 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 634 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 635 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 636 EMIT(O_BACK, i); 637 } else 638 SETERROR(REG_ESUBREG); 639 p->g->backrefs = 1; 640 break; 641 case '*': 642 REQUIRE(starordinary, REG_BADRPT); 643 /* FALLTHROUGH */ 644 default: 645 ordinary(p, c &~ BACKSL); 646 break; 647 } 648 649 if (EAT('*')) { /* implemented as +? */ 650 /* this case does not require the (y|) trick, noKLUDGE */ 651 INSERT(OPLUS_, pos); 652 ASTERN(O_PLUS, pos); 653 INSERT(OQUEST_, pos); 654 ASTERN(O_QUEST, pos); 655 } else if (EATTWO('\\', '{')) { 656 count = p_count(p); 657 if (EAT(',')) { 658 if (MORE() && isdigit((unsigned char)PEEK())) { 659 count2 = p_count(p); 660 REQUIRE(count <= count2, REG_BADBR); 661 } else /* single number with comma */ 662 count2 = INFINITY; 663 } else /* just a single number */ 664 count2 = count; 665 repeat(p, pos, count, count2); 666 if (!EATTWO('\\', '}')) { /* error heuristics */ 667 while (MORE() && !SEETWO('\\', '}')) 668 NEXT(); 669 REQUIRE(MORE(), REG_EBRACE); 670 SETERROR(REG_BADBR); 671 } 672 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 673 return(1); 674 675 return(0); 676 } 677 678 /* 679 - p_count - parse a repetition count 680 == static int p_count(struct parse *p); 681 */ 682 static int /* the value */ 683 p_count(p) 684 struct parse *p; 685 { 686 int count = 0; 687 int ndigits = 0; 688 689 _DIAGASSERT(p != NULL); 690 691 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 692 count = count*10 + (GETNEXT() - '0'); 693 ndigits++; 694 } 695 696 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 697 return(count); 698 } 699 700 /* 701 - p_bracket - parse a bracketed character list 702 == static void p_bracket(struct parse *p); 703 * 704 * Note a significant property of this code: if the allocset() did SETERROR, 705 * no set operations are done. 706 */ 707 static void 708 p_bracket(p) 709 struct parse *p; 710 { 711 cset *cs; 712 int invert = 0; 713 714 _DIAGASSERT(p != NULL); 715 716 cs = allocset(p); 717 718 /* Dept of Truly Sickening Special-Case Kludges */ 719 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 720 (size_t)6) == 0) { 721 EMIT(OBOW, 0); 722 NEXTn(6); 723 return; 724 } 725 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 726 (size_t)6) == 0) { 727 EMIT(OEOW, 0); 728 NEXTn(6); 729 return; 730 } 731 732 if (EAT('^')) 733 invert++; /* make note to invert set at end */ 734 if (EAT(']')) 735 CHadd(cs, ']'); 736 else if (EAT('-')) 737 CHadd(cs, '-'); 738 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 739 p_b_term(p, cs); 740 if (EAT('-')) 741 CHadd(cs, '-'); 742 MUSTEAT(']', REG_EBRACK); 743 744 if (p->error != 0) /* don't mess things up further */ 745 return; 746 747 if (p->g->cflags®_ICASE) { 748 int i; 749 int ci; 750 751 for (i = p->g->csetsize - 1; i >= 0; i--) 752 if (CHIN(cs, i) && isalpha(i)) { 753 ci = othercase(i); 754 if (ci != i) 755 CHadd(cs, ci); 756 } 757 if (cs->multis != NULL) 758 mccase(p, cs); 759 } 760 if (invert) { 761 int i; 762 763 for (i = p->g->csetsize - 1; i >= 0; i--) 764 if (CHIN(cs, i)) 765 CHsub(cs, i); 766 else 767 CHadd(cs, i); 768 if (p->g->cflags®_NEWLINE) 769 CHsub(cs, '\n'); 770 if (cs->multis != NULL) 771 mcinvert(p, cs); 772 } 773 774 assert(cs->multis == NULL); /* xxx */ 775 776 if (nch(p, cs) == 1) { /* optimize singleton sets */ 777 ordinary(p, firstch(p, cs)); 778 freeset(p, cs); 779 } else 780 EMIT(OANYOF, freezeset(p, cs)); 781 } 782 783 /* 784 - p_b_term - parse one term of a bracketed character list 785 == static void p_b_term(struct parse *p, cset *cs); 786 */ 787 static void 788 p_b_term(p, cs) 789 struct parse *p; 790 cset *cs; 791 { 792 char c; 793 char start, finish; 794 int i; 795 796 _DIAGASSERT(p != NULL); 797 _DIAGASSERT(cs != NULL); 798 799 /* classify what we've got */ 800 switch ((MORE()) ? PEEK() : '\0') { 801 case '[': 802 c = (MORE2()) ? PEEK2() : '\0'; 803 break; 804 805 case '-': 806 SETERROR(REG_ERANGE); 807 return; /* NOTE RETURN */ 808 809 default: 810 c = '\0'; 811 break; 812 } 813 814 switch (c) { 815 case ':': /* character class */ 816 NEXT2(); 817 REQUIRE(MORE(), REG_EBRACK); 818 c = PEEK(); 819 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 820 p_b_cclass(p, cs); 821 REQUIRE(MORE(), REG_EBRACK); 822 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 823 break; 824 case '=': /* equivalence class */ 825 NEXT2(); 826 REQUIRE(MORE(), REG_EBRACK); 827 c = PEEK(); 828 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 829 p_b_eclass(p, cs); 830 REQUIRE(MORE(), REG_EBRACK); 831 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 832 break; 833 default: /* symbol, ordinary character, or range */ 834 /* xxx revision needed for multichar stuff */ 835 start = p_b_symbol(p); 836 if (SEE('-') && MORE2() && PEEK2() != ']') { 837 /* range */ 838 NEXT(); 839 if (EAT('-')) 840 finish = '-'; 841 else 842 finish = p_b_symbol(p); 843 } else 844 finish = start; 845 /* xxx what about signed chars here... */ 846 REQUIRE(start <= finish, REG_ERANGE); 847 for (i = start; i <= finish; i++) 848 CHadd(cs, i); 849 break; 850 } 851 } 852 853 /* 854 - p_b_cclass - parse a character-class name and deal with it 855 == static void p_b_cclass(struct parse *p, cset *cs); 856 */ 857 static void 858 p_b_cclass(p, cs) 859 struct parse *p; 860 cset *cs; 861 { 862 char *sp; 863 const struct cclass *cp; 864 size_t len; 865 const char *u; 866 char c; 867 868 _DIAGASSERT(p != NULL); 869 _DIAGASSERT(cs != NULL); 870 871 sp = p->next; 872 873 while (MORE() && isalpha((unsigned char)PEEK())) 874 NEXT(); 875 len = p->next - sp; 876 for (cp = cclasses; cp->name != NULL; cp++) 877 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 878 break; 879 if (cp->name == NULL) { 880 /* oops, didn't find it */ 881 SETERROR(REG_ECTYPE); 882 return; 883 } 884 885 u = cp->chars; 886 while ((c = *u++) != '\0') 887 CHadd(cs, c); 888 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 889 MCadd(p, cs, u); 890 } 891 892 /* 893 - p_b_eclass - parse an equivalence-class name and deal with it 894 == static void p_b_eclass(struct parse *p, cset *cs); 895 * 896 * This implementation is incomplete. xxx 897 */ 898 static void 899 p_b_eclass(p, cs) 900 struct parse *p; 901 cset *cs; 902 { 903 char c; 904 905 _DIAGASSERT(p != NULL); 906 _DIAGASSERT(cs != NULL); 907 908 c = p_b_coll_elem(p, '='); 909 CHadd(cs, c); 910 } 911 912 /* 913 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 914 == static char p_b_symbol(struct parse *p); 915 */ 916 static char /* value of symbol */ 917 p_b_symbol(p) 918 struct parse *p; 919 { 920 char value; 921 922 _DIAGASSERT(p != NULL); 923 924 REQUIRE(MORE(), REG_EBRACK); 925 if (!EATTWO('[', '.')) 926 return(GETNEXT()); 927 928 /* collating symbol */ 929 value = p_b_coll_elem(p, '.'); 930 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 931 return(value); 932 } 933 934 /* 935 - p_b_coll_elem - parse a collating-element name and look it up 936 == static char p_b_coll_elem(struct parse *p, int endc); 937 */ 938 static char /* value of collating element */ 939 p_b_coll_elem(p, endc) 940 struct parse *p; 941 int endc; /* name ended by endc,']' */ 942 { 943 char *sp; 944 const struct cname *cp; 945 size_t len; 946 947 _DIAGASSERT(p != NULL); 948 949 sp = p->next; 950 951 while (MORE() && !SEETWO(endc, ']')) 952 NEXT(); 953 if (!MORE()) { 954 SETERROR(REG_EBRACK); 955 return(0); 956 } 957 len = p->next - sp; 958 for (cp = cnames; cp->name != NULL; cp++) 959 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 960 return(cp->code); /* known name */ 961 if (len == 1) 962 return(*sp); /* single character */ 963 SETERROR(REG_ECOLLATE); /* neither */ 964 return(0); 965 } 966 967 /* 968 - othercase - return the case counterpart of an alphabetic 969 == static char othercase(int ch); 970 */ 971 static char /* if no counterpart, return ch */ 972 othercase(ch) 973 int ch; 974 { 975 assert(isalpha(ch)); 976 if (isupper(ch)) 977 return(tolower(ch)); 978 else if (islower(ch)) 979 return(toupper(ch)); 980 else /* peculiar, but could happen */ 981 return(ch); 982 } 983 984 /* 985 - bothcases - emit a dualcase version of a two-case character 986 == static void bothcases(struct parse *p, int ch); 987 * 988 * Boy, is this implementation ever a kludge... 989 */ 990 static void 991 bothcases(p, ch) 992 struct parse *p; 993 int ch; 994 { 995 char *oldnext; 996 char *oldend; 997 char bracket[3]; 998 999 _DIAGASSERT(p != NULL); 1000 1001 oldnext = p->next; 1002 oldend = p->end; 1003 1004 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1005 p->next = bracket; 1006 p->end = bracket+2; 1007 bracket[0] = ch; 1008 bracket[1] = ']'; 1009 bracket[2] = '\0'; 1010 p_bracket(p); 1011 assert(p->next == bracket+2); 1012 p->next = oldnext; 1013 p->end = oldend; 1014 } 1015 1016 /* 1017 - ordinary - emit an ordinary character 1018 == static void ordinary(struct parse *p, int ch); 1019 */ 1020 static void 1021 ordinary(p, ch) 1022 struct parse *p; 1023 int ch; 1024 { 1025 cat_t *cap; 1026 1027 _DIAGASSERT(p != NULL); 1028 1029 cap = p->g->categories; 1030 if ((p->g->cflags®_ICASE) && isalpha((unsigned char) ch) 1031 && othercase((unsigned char) ch) != (unsigned char) ch) 1032 bothcases(p, (unsigned char) ch); 1033 else { 1034 EMIT(OCHAR, (unsigned char)ch); 1035 if (cap[ch] == 0) 1036 cap[ch] = p->g->ncategories++; 1037 } 1038 } 1039 1040 /* 1041 - nonnewline - emit REG_NEWLINE version of OANY 1042 == static void nonnewline(struct parse *p); 1043 * 1044 * Boy, is this implementation ever a kludge... 1045 */ 1046 static void 1047 nonnewline(p) 1048 struct parse *p; 1049 { 1050 char *oldnext; 1051 char *oldend; 1052 char bracket[4]; 1053 1054 _DIAGASSERT(p != NULL); 1055 1056 oldnext = p->next; 1057 oldend = p->end; 1058 1059 p->next = bracket; 1060 p->end = bracket+3; 1061 bracket[0] = '^'; 1062 bracket[1] = '\n'; 1063 bracket[2] = ']'; 1064 bracket[3] = '\0'; 1065 p_bracket(p); 1066 assert(p->next == bracket+3); 1067 p->next = oldnext; 1068 p->end = oldend; 1069 } 1070 1071 /* 1072 - repeat - generate code for a bounded repetition, recursively if needed 1073 == static void repeat(struct parse *p, sopno start, int from, int to); 1074 */ 1075 static void 1076 repeat(p, start, from, to) 1077 struct parse *p; 1078 sopno start; /* operand from here to end of strip */ 1079 int from; /* repeated from this number */ 1080 int to; /* to this number of times (maybe INFINITY) */ 1081 { 1082 sopno finish; 1083 # define N 2 1084 # define INF 3 1085 # define REP(f, t) ((f)*8 + (t)) 1086 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1087 sopno copy; 1088 1089 _DIAGASSERT(p != NULL); 1090 1091 finish = HERE(); 1092 1093 if (p->error != 0) /* head off possible runaway recursion */ 1094 return; 1095 1096 assert(from <= to); 1097 1098 switch (REP(MAP(from), MAP(to))) { 1099 case REP(0, 0): /* must be user doing this */ 1100 DROP(finish-start); /* drop the operand */ 1101 break; 1102 case REP(0, 1): /* as x{1,1}? */ 1103 case REP(0, N): /* as x{1,n}? */ 1104 case REP(0, INF): /* as x{1,}? */ 1105 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1106 INSERT(OCH_, start); /* offset is wrong... */ 1107 repeat(p, start+1, 1, to); 1108 ASTERN(OOR1, start); 1109 AHEAD(start); /* ... fix it */ 1110 EMIT(OOR2, 0); 1111 AHEAD(THERE()); 1112 ASTERN(O_CH, THERETHERE()); 1113 break; 1114 case REP(1, 1): /* trivial case */ 1115 /* done */ 1116 break; 1117 case REP(1, N): /* as x?x{1,n-1} */ 1118 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1119 INSERT(OCH_, start); 1120 ASTERN(OOR1, start); 1121 AHEAD(start); 1122 EMIT(OOR2, 0); /* offset very wrong... */ 1123 AHEAD(THERE()); /* ...so fix it */ 1124 ASTERN(O_CH, THERETHERE()); 1125 copy = dupl(p, start+1, finish+1); 1126 assert(copy == finish+4); 1127 repeat(p, copy, 1, to-1); 1128 break; 1129 case REP(1, INF): /* as x+ */ 1130 INSERT(OPLUS_, start); 1131 ASTERN(O_PLUS, start); 1132 break; 1133 case REP(N, N): /* as xx{m-1,n-1} */ 1134 copy = dupl(p, start, finish); 1135 repeat(p, copy, from-1, to-1); 1136 break; 1137 case REP(N, INF): /* as xx{n-1,INF} */ 1138 copy = dupl(p, start, finish); 1139 repeat(p, copy, from-1, to); 1140 break; 1141 default: /* "can't happen" */ 1142 SETERROR(REG_ASSERT); /* just in case */ 1143 break; 1144 } 1145 } 1146 1147 /* 1148 - seterr - set an error condition 1149 == static int seterr(struct parse *p, int e); 1150 */ 1151 static int /* useless but makes type checking happy */ 1152 seterr(p, e) 1153 struct parse *p; 1154 int e; 1155 { 1156 1157 _DIAGASSERT(p != NULL); 1158 1159 if (p->error == 0) /* keep earliest error condition */ 1160 p->error = e; 1161 p->next = nuls; /* try to bring things to a halt */ 1162 p->end = nuls; 1163 return(0); /* make the return value well-defined */ 1164 } 1165 1166 /* 1167 - allocset - allocate a set of characters for [] 1168 == static cset *allocset(struct parse *p); 1169 */ 1170 static cset * 1171 allocset(p) 1172 struct parse *p; 1173 { 1174 int no; 1175 size_t nc; 1176 size_t nbytes; 1177 cset *cs; 1178 size_t css; 1179 int i; 1180 1181 _DIAGASSERT(p != NULL); 1182 1183 no = p->g->ncsets++; 1184 css = (size_t)p->g->csetsize; 1185 if (no >= p->ncsalloc) { /* need another column of space */ 1186 p->ncsalloc += CHAR_BIT; 1187 nc = p->ncsalloc; 1188 assert(nc % CHAR_BIT == 0); 1189 nbytes = nc / CHAR_BIT * css; 1190 if (p->g->sets == NULL) 1191 p->g->sets = malloc(nc * sizeof(cset)); 1192 else 1193 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1194 if (p->g->setbits == NULL) 1195 p->g->setbits = malloc(nbytes); 1196 else { 1197 p->g->setbits = realloc(p->g->setbits, nbytes); 1198 /* xxx this isn't right if setbits is now NULL */ 1199 for (i = 0; i < no; i++) 1200 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1201 } 1202 if (p->g->sets != NULL && p->g->setbits != NULL) 1203 (void) memset((char *)p->g->setbits + (nbytes - css), 1204 0, css); 1205 else { 1206 no = 0; 1207 SETERROR(REG_ESPACE); 1208 /* caller's responsibility not to do set ops */ 1209 } 1210 } 1211 1212 assert(p->g->sets != NULL); /* xxx */ 1213 cs = &p->g->sets[no]; 1214 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1215 cs->mask = 1 << ((no) % CHAR_BIT); 1216 cs->hash = 0; 1217 cs->smultis = 0; 1218 cs->multis = NULL; 1219 1220 return(cs); 1221 } 1222 1223 /* 1224 - freeset - free a now-unused set 1225 == static void freeset(struct parse *p, cset *cs); 1226 */ 1227 static void 1228 freeset(p, cs) 1229 struct parse *p; 1230 cset *cs; 1231 { 1232 int i; 1233 cset *top; 1234 size_t css; 1235 1236 _DIAGASSERT(p != NULL); 1237 _DIAGASSERT(cs != NULL); 1238 1239 top = &p->g->sets[p->g->ncsets]; 1240 css = (size_t)p->g->csetsize; 1241 1242 for (i = 0; i < css; i++) 1243 CHsub(cs, i); 1244 if (cs == top-1) /* recover only the easy case */ 1245 p->g->ncsets--; 1246 } 1247 1248 /* 1249 - freezeset - final processing on a set of characters 1250 == static int freezeset(struct parse *p, cset *cs); 1251 * 1252 * The main task here is merging identical sets. This is usually a waste 1253 * of time (although the hash code minimizes the overhead), but can win 1254 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1255 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1256 * the same value! 1257 */ 1258 static int /* set number */ 1259 freezeset(p, cs) 1260 struct parse *p; 1261 cset *cs; 1262 { 1263 uch h; 1264 int i; 1265 cset *top; 1266 cset *cs2; 1267 size_t css; 1268 1269 _DIAGASSERT(p != NULL); 1270 _DIAGASSERT(cs != NULL); 1271 1272 h = cs->hash; 1273 top = &p->g->sets[p->g->ncsets]; 1274 css = (size_t)p->g->csetsize; 1275 1276 /* look for an earlier one which is the same */ 1277 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1278 if (cs2->hash == h && cs2 != cs) { 1279 /* maybe */ 1280 for (i = 0; i < css; i++) 1281 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1282 break; /* no */ 1283 if (i == css) 1284 break; /* yes */ 1285 } 1286 1287 if (cs2 < top) { /* found one */ 1288 freeset(p, cs); 1289 cs = cs2; 1290 } 1291 1292 return((int)(cs - p->g->sets)); 1293 } 1294 1295 /* 1296 - firstch - return first character in a set (which must have at least one) 1297 == static int firstch(struct parse *p, cset *cs); 1298 */ 1299 static int /* character; there is no "none" value */ 1300 firstch(p, cs) 1301 struct parse *p; 1302 cset *cs; 1303 { 1304 int i; 1305 size_t css; 1306 1307 _DIAGASSERT(p != NULL); 1308 _DIAGASSERT(cs != NULL); 1309 1310 css = (size_t)p->g->csetsize; 1311 1312 for (i = 0; i < css; i++) 1313 if (CHIN(cs, i)) 1314 return((char)i); 1315 assert(never); 1316 return(0); /* arbitrary */ 1317 } 1318 1319 /* 1320 - nch - number of characters in a set 1321 == static int nch(struct parse *p, cset *cs); 1322 */ 1323 static int 1324 nch(p, cs) 1325 struct parse *p; 1326 cset *cs; 1327 { 1328 int i; 1329 size_t css; 1330 int n = 0; 1331 1332 _DIAGASSERT(p != NULL); 1333 _DIAGASSERT(cs != NULL); 1334 1335 css = (size_t)p->g->csetsize; 1336 1337 for (i = 0; i < css; i++) 1338 if (CHIN(cs, i)) 1339 n++; 1340 return(n); 1341 } 1342 1343 /* 1344 - mcadd - add a collating element to a cset 1345 == static void mcadd(struct parse *p, cset *cs, \ 1346 == char *cp); 1347 */ 1348 static void 1349 mcadd(p, cs, cp) 1350 struct parse *p; 1351 cset *cs; 1352 const char *cp; 1353 { 1354 size_t oldend; 1355 1356 _DIAGASSERT(p != NULL); 1357 _DIAGASSERT(cs != NULL); 1358 _DIAGASSERT(cp != NULL); 1359 1360 oldend = cs->smultis; 1361 1362 cs->smultis += strlen(cp) + 1; 1363 if (cs->multis == NULL) 1364 cs->multis = malloc(cs->smultis); 1365 else 1366 cs->multis = realloc(cs->multis, cs->smultis); 1367 if (cs->multis == NULL) { 1368 SETERROR(REG_ESPACE); 1369 return; 1370 } 1371 1372 (void) strcpy(cs->multis + oldend - 1, cp); 1373 cs->multis[cs->smultis - 1] = '\0'; 1374 } 1375 1376 #if 0 1377 /* 1378 - mcsub - subtract a collating element from a cset 1379 == static void mcsub(cset *cs, char *cp); 1380 */ 1381 static void 1382 mcsub(cs, cp) 1383 cset *cs; 1384 char *cp; 1385 { 1386 char *fp; 1387 size_t len; 1388 1389 _DIAGASSERT(cs != NULL); 1390 _DIAGASSERT(cp != NULL); 1391 1392 fp = mcfind(cs, cp); 1393 len = strlen(fp); 1394 1395 assert(fp != NULL); 1396 (void) memmove(fp, fp + len + 1, 1397 cs->smultis - (fp + len + 1 - cs->multis)); 1398 cs->smultis -= len; 1399 1400 if (cs->smultis == 0) { 1401 free(cs->multis); 1402 cs->multis = NULL; 1403 return; 1404 } 1405 1406 cs->multis = realloc(cs->multis, cs->smultis); 1407 assert(cs->multis != NULL); 1408 } 1409 1410 /* 1411 - mcin - is a collating element in a cset? 1412 == static int mcin(cset *cs, char *cp); 1413 */ 1414 static int 1415 mcin(cs, cp) 1416 cset *cs; 1417 char *cp; 1418 { 1419 1420 _DIAGASSERT(cs != NULL); 1421 _DIAGASSERT(cp != NULL); 1422 1423 return(mcfind(cs, cp) != NULL); 1424 } 1425 1426 /* 1427 - mcfind - find a collating element in a cset 1428 == static char *mcfind(cset *cs, char *cp); 1429 */ 1430 static char * 1431 mcfind(cs, cp) 1432 cset *cs; 1433 char *cp; 1434 { 1435 char *p; 1436 1437 _DIAGASSERT(cs != NULL); 1438 _DIAGASSERT(cp != NULL); 1439 1440 if (cs->multis == NULL) 1441 return(NULL); 1442 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1443 if (strcmp(cp, p) == 0) 1444 return(p); 1445 return(NULL); 1446 } 1447 #endif 1448 1449 /* 1450 - mcinvert - invert the list of collating elements in a cset 1451 == static void mcinvert(struct parse *p, cset *cs); 1452 * 1453 * This would have to know the set of possibilities. Implementation 1454 * is deferred. 1455 */ 1456 /* ARGSUSED */ 1457 static void 1458 mcinvert(p, cs) 1459 struct parse *p; 1460 cset *cs; 1461 { 1462 1463 _DIAGASSERT(p != NULL); 1464 _DIAGASSERT(cs != NULL); 1465 1466 assert(cs->multis == NULL); /* xxx */ 1467 } 1468 1469 /* 1470 - mccase - add case counterparts of the list of collating elements in a cset 1471 == static void mccase(struct parse *p, cset *cs); 1472 * 1473 * This would have to know the set of possibilities. Implementation 1474 * is deferred. 1475 */ 1476 /* ARGSUSED */ 1477 static void 1478 mccase(p, cs) 1479 struct parse *p; 1480 cset *cs; 1481 { 1482 1483 _DIAGASSERT(p != NULL); 1484 _DIAGASSERT(cs != NULL); 1485 1486 assert(cs->multis == NULL); /* xxx */ 1487 } 1488 1489 /* 1490 - isinsets - is this character in any sets? 1491 == static int isinsets(struct re_guts *g, int c); 1492 */ 1493 static int /* predicate */ 1494 isinsets(g, c) 1495 struct re_guts *g; 1496 int c; 1497 { 1498 uch *col; 1499 int i; 1500 int ncols; 1501 unsigned uc = (unsigned char)c; 1502 1503 _DIAGASSERT(g != NULL); 1504 1505 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1506 1507 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1508 if (col[uc] != 0) 1509 return(1); 1510 return(0); 1511 } 1512 1513 /* 1514 - samesets - are these two characters in exactly the same sets? 1515 == static int samesets(struct re_guts *g, int c1, int c2); 1516 */ 1517 static int /* predicate */ 1518 samesets(g, c1, c2) 1519 struct re_guts *g; 1520 int c1; 1521 int c2; 1522 { 1523 uch *col; 1524 int i; 1525 int ncols; 1526 unsigned uc1 = (unsigned char)c1; 1527 unsigned uc2 = (unsigned char)c2; 1528 1529 _DIAGASSERT(g != NULL); 1530 1531 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1532 1533 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1534 if (col[uc1] != col[uc2]) 1535 return(0); 1536 return(1); 1537 } 1538 1539 /* 1540 - categorize - sort out character categories 1541 == static void categorize(struct parse *p, struct re_guts *g); 1542 */ 1543 static void 1544 categorize(p, g) 1545 struct parse *p; 1546 struct re_guts *g; 1547 { 1548 cat_t *cats; 1549 int c; 1550 int c2; 1551 cat_t cat; 1552 1553 _DIAGASSERT(p != NULL); 1554 _DIAGASSERT(g != NULL); 1555 1556 cats = g->categories; 1557 1558 /* avoid making error situations worse */ 1559 if (p->error != 0) 1560 return; 1561 1562 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1563 if (cats[c] == 0 && isinsets(g, c)) { 1564 cat = g->ncategories++; 1565 cats[c] = cat; 1566 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1567 if (cats[c2] == 0 && samesets(g, c, c2)) 1568 cats[c2] = cat; 1569 } 1570 } 1571 1572 /* 1573 - dupl - emit a duplicate of a bunch of sops 1574 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1575 */ 1576 static sopno /* start of duplicate */ 1577 dupl(p, start, finish) 1578 struct parse *p; 1579 sopno start; /* from here */ 1580 sopno finish; /* to this less one */ 1581 { 1582 sopno ret; 1583 sopno len = finish - start; 1584 1585 _DIAGASSERT(p != NULL); 1586 1587 ret = HERE(); 1588 1589 assert(finish >= start); 1590 if (len == 0) 1591 return(ret); 1592 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1593 assert(p->ssize >= p->slen + len); 1594 (void)memcpy(p->strip + p->slen, p->strip + start, 1595 (size_t)len * sizeof(sop)); 1596 p->slen += len; 1597 return(ret); 1598 } 1599 1600 /* 1601 - doemit - emit a strip operator 1602 == static void doemit(struct parse *p, sop op, size_t opnd); 1603 * 1604 * It might seem better to implement this as a macro with a function as 1605 * hard-case backup, but it's just too big and messy unless there are 1606 * some changes to the data structures. Maybe later. 1607 */ 1608 static void 1609 doemit(p, op, opnd) 1610 struct parse *p; 1611 sop op; 1612 sopno opnd; 1613 { 1614 1615 _DIAGASSERT(p != NULL); 1616 1617 /* avoid making error situations worse */ 1618 if (p->error != 0) 1619 return; 1620 1621 /* deal with oversize operands ("can't happen", more or less) */ 1622 assert(opnd < 1<<OPSHIFT); 1623 1624 /* deal with undersized strip */ 1625 if (p->slen >= p->ssize) 1626 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1627 assert(p->slen < p->ssize); 1628 1629 /* finally, it's all reduced to the easy case */ 1630 p->strip[p->slen++] = SOP(op, opnd); 1631 } 1632 1633 /* 1634 - doinsert - insert a sop into the strip 1635 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1636 */ 1637 static void 1638 doinsert(p, op, opnd, pos) 1639 struct parse *p; 1640 sop op; 1641 sopno opnd; 1642 sopno pos; 1643 { 1644 sopno sn; 1645 sop s; 1646 int i; 1647 1648 _DIAGASSERT(p != NULL); 1649 1650 /* avoid making error situations worse */ 1651 if (p->error != 0) 1652 return; 1653 1654 sn = HERE(); 1655 EMIT(op, opnd); /* do checks, ensure space */ 1656 assert(HERE() == sn+1); 1657 s = p->strip[sn]; 1658 1659 /* adjust paren pointers */ 1660 assert(pos > 0); 1661 for (i = 1; i < NPAREN; i++) { 1662 if (p->pbegin[i] >= pos) { 1663 p->pbegin[i]++; 1664 } 1665 if (p->pend[i] >= pos) { 1666 p->pend[i]++; 1667 } 1668 } 1669 1670 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1671 p->strip[pos] = s; 1672 } 1673 1674 /* 1675 - dofwd - complete a forward reference 1676 == static void dofwd(struct parse *p, sopno pos, sop value); 1677 */ 1678 static void 1679 dofwd(p, pos, value) 1680 struct parse *p; 1681 sopno pos; 1682 sopno value; 1683 { 1684 1685 _DIAGASSERT(p != NULL); 1686 1687 /* avoid making error situations worse */ 1688 if (p->error != 0) 1689 return; 1690 1691 assert(value < 1<<OPSHIFT); 1692 p->strip[pos] = OP(p->strip[pos]) | value; 1693 } 1694 1695 /* 1696 - enlarge - enlarge the strip 1697 == static void enlarge(struct parse *p, sopno size); 1698 */ 1699 static void 1700 enlarge(p, size) 1701 struct parse *p; 1702 sopno size; 1703 { 1704 sop *sp; 1705 1706 _DIAGASSERT(p != NULL); 1707 1708 if (p->ssize >= size) 1709 return; 1710 1711 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1712 if (sp == NULL) { 1713 SETERROR(REG_ESPACE); 1714 return; 1715 } 1716 p->strip = sp; 1717 p->ssize = size; 1718 } 1719 1720 /* 1721 - stripsnug - compact the strip 1722 == static void stripsnug(struct parse *p, struct re_guts *g); 1723 */ 1724 static void 1725 stripsnug(p, g) 1726 struct parse *p; 1727 struct re_guts *g; 1728 { 1729 1730 _DIAGASSERT(p != NULL); 1731 _DIAGASSERT(g != NULL); 1732 1733 g->nstates = p->slen; 1734 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1735 if (g->strip == NULL) { 1736 SETERROR(REG_ESPACE); 1737 g->strip = p->strip; 1738 } 1739 } 1740 1741 /* 1742 - findmust - fill in must and mlen with longest mandatory literal string 1743 == static void findmust(struct parse *p, struct re_guts *g); 1744 * 1745 * This algorithm could do fancy things like analyzing the operands of | 1746 * for common subsequences. Someday. This code is simple and finds most 1747 * of the interesting cases. 1748 * 1749 * Note that must and mlen got initialized during setup. 1750 */ 1751 static void 1752 findmust(p, g) 1753 struct parse *p; 1754 struct re_guts *g; 1755 { 1756 sop *scan; 1757 sop *start = NULL; 1758 sop *newstart = NULL; 1759 sopno newlen; 1760 sop s; 1761 char *cp; 1762 sopno i; 1763 1764 _DIAGASSERT(p != NULL); 1765 _DIAGASSERT(g != NULL); 1766 1767 /* avoid making error situations worse */ 1768 if (p->error != 0) 1769 return; 1770 1771 /* find the longest OCHAR sequence in strip */ 1772 newlen = 0; 1773 scan = g->strip + 1; 1774 do { 1775 s = *scan++; 1776 switch (OP(s)) { 1777 case OCHAR: /* sequence member */ 1778 if (newlen == 0) /* new sequence */ 1779 newstart = scan - 1; 1780 newlen++; 1781 break; 1782 case OPLUS_: /* things that don't break one */ 1783 case OLPAREN: 1784 case ORPAREN: 1785 break; 1786 case OQUEST_: /* things that must be skipped */ 1787 case OCH_: 1788 scan--; 1789 do { 1790 scan += OPND(s); 1791 s = *scan; 1792 /* assert() interferes w debug printouts */ 1793 if (OP(s) != O_QUEST && OP(s) != O_CH && 1794 OP(s) != OOR2) { 1795 g->iflags |= BAD; 1796 return; 1797 } 1798 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1799 /* FALLTHROUGH */ 1800 default: /* things that break a sequence */ 1801 if (newlen > g->mlen) { /* ends one */ 1802 start = newstart; 1803 g->mlen = newlen; 1804 } 1805 newlen = 0; 1806 break; 1807 } 1808 } while (OP(s) != OEND); 1809 1810 if (g->mlen == 0) /* there isn't one */ 1811 return; 1812 1813 /* turn it into a character string */ 1814 g->must = malloc((size_t)g->mlen + 1); 1815 if (g->must == NULL) { /* argh; just forget it */ 1816 g->mlen = 0; 1817 return; 1818 } 1819 cp = g->must; 1820 scan = start; 1821 for (i = g->mlen; i > 0; i--) { 1822 while (OP(s = *scan++) != OCHAR) 1823 continue; 1824 assert(cp < g->must + g->mlen); 1825 *cp++ = (char)OPND(s); 1826 } 1827 assert(cp == g->must + g->mlen); 1828 *cp++ = '\0'; /* just on general principles */ 1829 } 1830 1831 /* 1832 - pluscount - count + nesting 1833 == static sopno pluscount(struct parse *p, struct re_guts *g); 1834 */ 1835 static sopno /* nesting depth */ 1836 pluscount(p, g) 1837 struct parse *p; 1838 struct re_guts *g; 1839 { 1840 sop *scan; 1841 sop s; 1842 sopno plusnest = 0; 1843 sopno maxnest = 0; 1844 1845 _DIAGASSERT(p != NULL); 1846 _DIAGASSERT(g != NULL); 1847 1848 if (p->error != 0) 1849 return(0); /* there may not be an OEND */ 1850 1851 scan = g->strip + 1; 1852 do { 1853 s = *scan++; 1854 switch (OP(s)) { 1855 case OPLUS_: 1856 plusnest++; 1857 break; 1858 case O_PLUS: 1859 if (plusnest > maxnest) 1860 maxnest = plusnest; 1861 plusnest--; 1862 break; 1863 } 1864 } while (OP(s) != OEND); 1865 if (plusnest != 0) 1866 g->iflags |= BAD; 1867 return(maxnest); 1868 } 1869