1 /* $NetBSD: regcomp.c,v 1.36 2015/09/12 19:08:47 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37 /*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74 #include <sys/cdefs.h> 75 #if defined(LIBC_SCCS) && !defined(lint) 76 #if 0 77 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78 #else 79 __RCSID("$NetBSD: regcomp.c,v 1.36 2015/09/12 19:08:47 christos Exp $"); 80 #endif 81 #endif /* LIBC_SCCS and not lint */ 82 83 #include "namespace.h" 84 #include <sys/types.h> 85 86 #include <assert.h> 87 #include <ctype.h> 88 #include <limits.h> 89 #include <stdio.h> 90 #include <stdlib.h> 91 #include <string.h> 92 #include <regex.h> 93 94 #ifdef __weak_alias 95 __weak_alias(regcomp,_regcomp) 96 #endif 97 98 #include "utils.h" 99 #include "regex2.h" 100 101 #include "cclass.h" 102 #include "cname.h" 103 104 /* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108 struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 size_t ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120 }; 121 122 /* ========= begin header generated by ./mkh ========= */ 123 #ifdef __cplusplus 124 extern "C" { 125 #endif 126 127 /* === regcomp.c === */ 128 static void p_ere(struct parse *p, int stop, size_t reclimit); 129 static void p_ere_exp(struct parse *p, size_t reclimit); 130 static void p_str(struct parse *p); 131 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 132 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 133 static int p_count(struct parse *p); 134 static void p_bracket(struct parse *p); 135 static void p_b_term(struct parse *p, cset *cs); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_eclass(struct parse *p, cset *cs); 138 static char p_b_symbol(struct parse *p); 139 static char p_b_coll_elem(struct parse *p, int endc); 140 static int othercase(int ch); 141 static void bothcases(struct parse *p, int ch); 142 static void ordinary(struct parse *p, int ch); 143 static void nonnewline(struct parse *p); 144 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 145 static int seterr(struct parse *p, int e); 146 static cset *allocset(struct parse *p); 147 static void freeset(struct parse *p, cset *cs); 148 static sopno freezeset(struct parse *p, cset *cs); 149 static int firstch(struct parse *p, cset *cs); 150 static int nch(struct parse *p, cset *cs); 151 static void mcadd(struct parse *p, cset *cs, const char *cp); 152 #if 0 153 static void mcsub(cset *cs, char *cp); 154 static int mcin(cset *cs, char *cp); 155 static char *mcfind(cset *cs, char *cp); 156 #endif 157 static void mcinvert(struct parse *p, cset *cs); 158 static void mccase(struct parse *p, cset *cs); 159 static int isinsets(struct re_guts *g, int c); 160 static int samesets(struct re_guts *g, int c1, int c2); 161 static void categorize(struct parse *p, struct re_guts *g); 162 static sopno dupl(struct parse *p, sopno start, sopno finish); 163 static void doemit(struct parse *p, sop op, sopno opnd); 164 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165 static void dofwd(struct parse *p, sopno pos, sopno value); 166 static int enlarge(struct parse *p, sopno size); 167 static void stripsnug(struct parse *p, struct re_guts *g); 168 static void findmust(struct parse *p, struct re_guts *g); 169 static sopno pluscount(struct parse *p, struct re_guts *g); 170 171 #ifdef __cplusplus 172 } 173 #endif 174 /* ========= end header generated by ./mkh ========= */ 175 176 static char nuls[10]; /* place to point scanner in event of error */ 177 178 /* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182 #define PEEK() (*p->next) 183 #define PEEK2() (*(p->next+1)) 184 #define MORE() (p->next < p->end) 185 #define MORE2() (p->next+1 < p->end) 186 #define SEE(c) (MORE() && PEEK() == (c)) 187 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190 #define NEXT() (p->next++) 191 #define NEXT2() (p->next += 2) 192 #define NEXTn(n) (p->next += (n)) 193 #define GETNEXT() (*p->next++) 194 #define SETERROR(e) seterr(p, (e)) 195 #define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203 #define HERE() (p->slen) 204 #define THERE() (p->slen - 1) 205 #define THERETHERE() (p->slen - 2) 206 #define DROP(n) (p->slen -= (n)) 207 208 #ifndef NDEBUG 209 static int never = 0; /* for use in asserts; shuts lint up */ 210 #else 211 #define never 0 /* some <assert.h>s have bugs too */ 212 #endif 213 214 #define MEMLIMIT 0x8000000 215 #define MEMSIZE(p) \ 216 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 217 (p)->ncsalloc * sizeof(cset) + \ 218 (p)->ssize * sizeof(sop)) 219 #define RECLIMIT 256 220 221 /* 222 - regcomp - interface for parser and compilation 223 = extern int regcomp(regex_t *, const char *, int); 224 = #define REG_BASIC 0000 225 = #define REG_EXTENDED 0001 226 = #define REG_ICASE 0002 227 = #define REG_NOSUB 0004 228 = #define REG_NEWLINE 0010 229 = #define REG_NOSPEC 0020 230 = #define REG_PEND 0040 231 = #define REG_DUMP 0200 232 */ 233 int /* 0 success, otherwise REG_something */ 234 regcomp( 235 regex_t *preg, 236 const char *pattern, 237 int cflags) 238 { 239 struct parse pa; 240 struct re_guts *g; 241 struct parse *p = &pa; 242 int i; 243 size_t len; 244 #ifdef REDEBUG 245 # define GOODFLAGS(f) (f) 246 #else 247 # define GOODFLAGS(f) ((f)&~REG_DUMP) 248 #endif 249 250 _DIAGASSERT(preg != NULL); 251 _DIAGASSERT(pattern != NULL); 252 253 cflags = GOODFLAGS(cflags); 254 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 255 return(REG_INVARG); 256 257 if (cflags®_PEND) { 258 if (preg->re_endp < pattern) 259 return(REG_INVARG); 260 len = preg->re_endp - pattern; 261 } else 262 len = strlen(pattern); 263 264 /* do the mallocs early so failure handling is easy */ 265 g = malloc(sizeof(struct re_guts) + (NC - 1) * sizeof(cat_t)); 266 if (g == NULL) 267 return(REG_ESPACE); 268 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 269 p->strip = calloc(p->ssize, sizeof(sop)); 270 p->slen = 0; 271 if (p->strip == NULL) { 272 free(g); 273 return(REG_ESPACE); 274 } 275 276 /* set things up */ 277 p->g = g; 278 p->next = pattern; 279 p->end = p->next + len; 280 p->error = 0; 281 p->ncsalloc = 0; 282 for (i = 0; i < NPAREN; i++) { 283 p->pbegin[i] = 0; 284 p->pend[i] = 0; 285 } 286 g->csetsize = NC; 287 g->sets = NULL; 288 g->setbits = NULL; 289 g->ncsets = 0; 290 g->cflags = cflags; 291 g->iflags = 0; 292 g->nbol = 0; 293 g->neol = 0; 294 g->must = NULL; 295 g->mlen = 0; 296 g->nsub = 0; 297 g->ncategories = 1; /* category 0 is "everything else" */ 298 g->categories = &g->catspace[-(CHAR_MIN)]; 299 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 300 g->backrefs = 0; 301 302 /* do it */ 303 EMIT(OEND, 0); 304 g->firststate = THERE(); 305 if (cflags®_EXTENDED) 306 p_ere(p, OUT, 0); 307 else if (cflags®_NOSPEC) 308 p_str(p); 309 else 310 p_bre(p, OUT, OUT, 0); 311 EMIT(OEND, 0); 312 g->laststate = THERE(); 313 314 /* tidy up loose ends and fill things in */ 315 categorize(p, g); 316 stripsnug(p, g); 317 findmust(p, g); 318 g->nplus = pluscount(p, g); 319 g->magic = MAGIC2; 320 preg->re_nsub = g->nsub; 321 preg->re_g = g; 322 preg->re_magic = MAGIC1; 323 #ifndef REDEBUG 324 /* not debugging, so can't rely on the assert() in regexec() */ 325 if (g->iflags&BAD) 326 SETERROR(REG_ASSERT); 327 #endif 328 329 /* win or lose, we're done */ 330 if (p->error != 0) /* lose */ 331 regfree(preg); 332 return(p->error); 333 } 334 335 /* 336 - p_ere - ERE parser top level, concatenation and alternation 337 == static void p_ere(struct parse *p, int stop, size_t reclimit); 338 */ 339 static void 340 p_ere( 341 struct parse *p, 342 int stop, /* character this ERE should end at */ 343 size_t reclimit) 344 { 345 char c; 346 sopno prevback = 0; /* pacify gcc */ 347 sopno prevfwd = 0; /* pacify gcc */ 348 sopno conc; 349 int first = 1; /* is this the first alternative? */ 350 351 _DIAGASSERT(p != NULL); 352 353 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 354 p->error = REG_ESPACE; 355 return; 356 } 357 358 for (;;) { 359 /* do a bunch of concatenated expressions */ 360 conc = HERE(); 361 while (MORE() && (c = PEEK()) != '|' && c != stop) 362 p_ere_exp(p, reclimit); 363 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 364 365 if (!EAT('|')) 366 break; /* NOTE BREAK OUT */ 367 368 if (first) { 369 INSERT(OCH_, conc); /* offset is wrong */ 370 prevfwd = conc; 371 prevback = conc; 372 first = 0; 373 } 374 ASTERN(OOR1, prevback); 375 prevback = THERE(); 376 AHEAD(prevfwd); /* fix previous offset */ 377 prevfwd = HERE(); 378 EMIT(OOR2, 0); /* offset is very wrong */ 379 } 380 381 if (!first) { /* tail-end fixups */ 382 AHEAD(prevfwd); 383 ASTERN(O_CH, prevback); 384 } 385 386 assert(!MORE() || SEE(stop)); 387 } 388 389 /* 390 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 391 == static void p_ere_exp(struct parse *p, size_t reclimit); 392 */ 393 static void 394 p_ere_exp( 395 struct parse *p, 396 size_t reclimit) 397 { 398 char c; 399 sopno pos; 400 int count; 401 int count2; 402 sopno subno; 403 int wascaret = 0; 404 405 _DIAGASSERT(p != NULL); 406 407 assert(MORE()); /* caller should have ensured this */ 408 c = GETNEXT(); 409 410 pos = HERE(); 411 switch (c) { 412 case '(': 413 REQUIRE(MORE(), REG_EPAREN); 414 p->g->nsub++; 415 subno = p->g->nsub; 416 if (subno < NPAREN) 417 p->pbegin[subno] = HERE(); 418 EMIT(OLPAREN, subno); 419 if (!SEE(')')) 420 p_ere(p, ')', reclimit); 421 if (subno < NPAREN) { 422 p->pend[subno] = HERE(); 423 assert(p->pend[subno] != 0); 424 } 425 EMIT(ORPAREN, subno); 426 MUSTEAT(')', REG_EPAREN); 427 break; 428 #ifndef POSIX_MISTAKE 429 case ')': /* happens only if no current unmatched ( */ 430 /* 431 * You may ask, why the ifndef? Because I didn't notice 432 * this until slightly too late for 1003.2, and none of the 433 * other 1003.2 regular-expression reviewers noticed it at 434 * all. So an unmatched ) is legal POSIX, at least until 435 * we can get it fixed. 436 */ 437 SETERROR(REG_EPAREN); 438 break; 439 #endif 440 case '^': 441 EMIT(OBOL, 0); 442 p->g->iflags |= USEBOL; 443 p->g->nbol++; 444 wascaret = 1; 445 break; 446 case '$': 447 EMIT(OEOL, 0); 448 p->g->iflags |= USEEOL; 449 p->g->neol++; 450 break; 451 case '|': 452 SETERROR(REG_EMPTY); 453 break; 454 case '*': 455 case '+': 456 case '?': 457 SETERROR(REG_BADRPT); 458 break; 459 case '.': 460 if (p->g->cflags®_NEWLINE) 461 nonnewline(p); 462 else 463 EMIT(OANY, 0); 464 break; 465 case '[': 466 p_bracket(p); 467 break; 468 case '\\': 469 REQUIRE(MORE(), REG_EESCAPE); 470 c = GETNEXT(); 471 ordinary(p, c); 472 break; 473 case '{': /* okay as ordinary except if digit follows */ 474 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 475 /* FALLTHROUGH */ 476 default: 477 ordinary(p, c); 478 break; 479 } 480 481 if (!MORE()) 482 return; 483 c = PEEK(); 484 /* we call { a repetition if followed by a digit */ 485 if (!( c == '*' || c == '+' || c == '?' || 486 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 487 return; /* no repetition, we're done */ 488 NEXT(); 489 490 REQUIRE(!wascaret, REG_BADRPT); 491 switch (c) { 492 case '*': /* implemented as +? */ 493 /* this case does not require the (y|) trick, noKLUDGE */ 494 INSERT(OPLUS_, pos); 495 ASTERN(O_PLUS, pos); 496 INSERT(OQUEST_, pos); 497 ASTERN(O_QUEST, pos); 498 break; 499 case '+': 500 INSERT(OPLUS_, pos); 501 ASTERN(O_PLUS, pos); 502 break; 503 case '?': 504 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 505 INSERT(OCH_, pos); /* offset slightly wrong */ 506 ASTERN(OOR1, pos); /* this one's right */ 507 AHEAD(pos); /* fix the OCH_ */ 508 EMIT(OOR2, 0); /* offset very wrong... */ 509 AHEAD(THERE()); /* ...so fix it */ 510 ASTERN(O_CH, THERETHERE()); 511 break; 512 case '{': 513 count = p_count(p); 514 if (EAT(',')) { 515 if (isdigit((unsigned char)PEEK())) { 516 count2 = p_count(p); 517 REQUIRE(count <= count2, REG_BADBR); 518 } else /* single number with comma */ 519 count2 = INFINITY; 520 } else /* just a single number */ 521 count2 = count; 522 repeat(p, pos, count, count2, 0); 523 if (!EAT('}')) { /* error heuristics */ 524 while (MORE() && PEEK() != '}') 525 NEXT(); 526 REQUIRE(MORE(), REG_EBRACE); 527 SETERROR(REG_BADBR); 528 } 529 break; 530 } 531 532 if (!MORE()) 533 return; 534 c = PEEK(); 535 if (!( c == '*' || c == '+' || c == '?' || 536 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 537 return; 538 SETERROR(REG_BADRPT); 539 } 540 541 /* 542 - p_str - string (no metacharacters) "parser" 543 == static void p_str(struct parse *p); 544 */ 545 static void 546 p_str( 547 struct parse *p) 548 { 549 550 _DIAGASSERT(p != NULL); 551 552 REQUIRE(MORE(), REG_EMPTY); 553 while (MORE()) 554 ordinary(p, GETNEXT()); 555 } 556 557 /* 558 - p_bre - BRE parser top level, anchoring and concatenation 559 == static void p_bre(struct parse *p, int end1, \ 560 == int end2, size_t reclimit); 561 * Giving end1 as OUT essentially eliminates the end1/end2 check. 562 * 563 * This implementation is a bit of a kludge, in that a trailing $ is first 564 * taken as an ordinary character and then revised to be an anchor. The 565 * only undesirable side effect is that '$' gets included as a character 566 * category in such cases. This is fairly harmless; not worth fixing. 567 * The amount of lookahead needed to avoid this kludge is excessive. 568 */ 569 static void 570 p_bre( 571 struct parse *p, 572 int end1, /* first terminating character */ 573 int end2, /* second terminating character */ 574 size_t reclimit) 575 { 576 sopno start; 577 int first = 1; /* first subexpression? */ 578 int wasdollar = 0; 579 580 _DIAGASSERT(p != NULL); 581 582 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 583 p->error = REG_ESPACE; 584 return; 585 } 586 587 start = HERE(); 588 589 if (EAT('^')) { 590 EMIT(OBOL, 0); 591 p->g->iflags |= USEBOL; 592 p->g->nbol++; 593 } 594 while (MORE() && !SEETWO(end1, end2)) { 595 wasdollar = p_simp_re(p, first, reclimit); 596 first = 0; 597 } 598 if (wasdollar) { /* oops, that was a trailing anchor */ 599 DROP(1); 600 EMIT(OEOL, 0); 601 p->g->iflags |= USEEOL; 602 p->g->neol++; 603 } 604 605 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 606 } 607 608 /* 609 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 610 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 611 */ 612 static int /* was the simple RE an unbackslashed $? */ 613 p_simp_re( 614 struct parse *p, 615 int starordinary, /* is a leading * an ordinary character? */ 616 size_t reclimit) 617 { 618 int c; 619 int count; 620 int count2; 621 sopno pos, i; 622 sopno subno; 623 # define BACKSL (1<<CHAR_BIT) 624 625 _DIAGASSERT(p != NULL); 626 627 pos = HERE(); /* repetion op, if any, covers from here */ 628 629 assert(MORE()); /* caller should have ensured this */ 630 c = GETNEXT(); 631 if (c == '\\') { 632 REQUIRE(MORE(), REG_EESCAPE); 633 c = BACKSL | (unsigned char)GETNEXT(); 634 } 635 switch (c) { 636 case '.': 637 if (p->g->cflags®_NEWLINE) 638 nonnewline(p); 639 else 640 EMIT(OANY, 0); 641 break; 642 case '[': 643 p_bracket(p); 644 break; 645 case BACKSL|'{': 646 SETERROR(REG_BADRPT); 647 break; 648 case BACKSL|'(': 649 p->g->nsub++; 650 subno = p->g->nsub; 651 if (subno < NPAREN) 652 p->pbegin[subno] = HERE(); 653 EMIT(OLPAREN, subno); 654 /* the MORE here is an error heuristic */ 655 if (MORE() && !SEETWO('\\', ')')) 656 p_bre(p, '\\', ')', reclimit); 657 if (subno < NPAREN) { 658 p->pend[subno] = HERE(); 659 assert(p->pend[subno] != 0); 660 } 661 EMIT(ORPAREN, subno); 662 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 663 break; 664 case BACKSL|')': /* should not get here -- must be user */ 665 case BACKSL|'}': 666 SETERROR(REG_EPAREN); 667 break; 668 case BACKSL|'1': 669 case BACKSL|'2': 670 case BACKSL|'3': 671 case BACKSL|'4': 672 case BACKSL|'5': 673 case BACKSL|'6': 674 case BACKSL|'7': 675 case BACKSL|'8': 676 case BACKSL|'9': 677 i = (c&~BACKSL) - '0'; 678 assert(i < NPAREN); 679 if (p->pend[i] != 0) { 680 assert(i <= p->g->nsub); 681 EMIT(OBACK_, i); 682 assert(p->pbegin[i] != 0); 683 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 684 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 685 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 686 EMIT(O_BACK, i); 687 } else 688 SETERROR(REG_ESUBREG); 689 p->g->backrefs = 1; 690 break; 691 case '*': 692 REQUIRE(starordinary, REG_BADRPT); 693 /* FALLTHROUGH */ 694 default: 695 ordinary(p, c &~ BACKSL); 696 break; 697 } 698 699 if (EAT('*')) { /* implemented as +? */ 700 /* this case does not require the (y|) trick, noKLUDGE */ 701 INSERT(OPLUS_, pos); 702 ASTERN(O_PLUS, pos); 703 INSERT(OQUEST_, pos); 704 ASTERN(O_QUEST, pos); 705 } else if (EATTWO('\\', '{')) { 706 count = p_count(p); 707 if (EAT(',')) { 708 if (MORE() && isdigit((unsigned char)PEEK())) { 709 count2 = p_count(p); 710 REQUIRE(count <= count2, REG_BADBR); 711 } else /* single number with comma */ 712 count2 = INFINITY; 713 } else /* just a single number */ 714 count2 = count; 715 repeat(p, pos, count, count2, 0); 716 if (!EATTWO('\\', '}')) { /* error heuristics */ 717 while (MORE() && !SEETWO('\\', '}')) 718 NEXT(); 719 REQUIRE(MORE(), REG_EBRACE); 720 SETERROR(REG_BADBR); 721 } 722 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 723 return(1); 724 725 return(0); 726 } 727 728 /* 729 - p_count - parse a repetition count 730 == static int p_count(struct parse *p); 731 */ 732 static int /* the value */ 733 p_count( 734 struct parse *p) 735 { 736 int count = 0; 737 int ndigits = 0; 738 739 _DIAGASSERT(p != NULL); 740 741 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 742 count = count*10 + (GETNEXT() - '0'); 743 ndigits++; 744 } 745 746 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 747 return(count); 748 } 749 750 /* 751 - p_bracket - parse a bracketed character list 752 == static void p_bracket(struct parse *p); 753 * 754 * Note a significant property of this code: if the allocset() did SETERROR, 755 * no set operations are done. 756 */ 757 static void 758 p_bracket( 759 struct parse *p) 760 { 761 cset *cs; 762 int invert = 0; 763 _DIAGASSERT(p != NULL); 764 765 cs = allocset(p); 766 if (cs == NULL) 767 return; 768 769 /* Dept of Truly Sickening Special-Case Kludges */ 770 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 771 (size_t)6) == 0) { 772 EMIT(OBOW, 0); 773 NEXTn(6); 774 return; 775 } 776 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 777 (size_t)6) == 0) { 778 EMIT(OEOW, 0); 779 NEXTn(6); 780 return; 781 } 782 783 if (EAT('^')) 784 invert++; /* make note to invert set at end */ 785 if (EAT(']')) 786 CHadd(cs, ']'); 787 else if (EAT('-')) 788 CHadd(cs, '-'); 789 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 790 p_b_term(p, cs); 791 if (EAT('-')) 792 CHadd(cs, '-'); 793 MUSTEAT(']', REG_EBRACK); 794 795 if (p->error != 0) /* don't mess things up further */ 796 return; 797 798 if (p->g->cflags®_ICASE) { 799 ssize_t i; 800 int ci; 801 802 for (i = p->g->csetsize - 1; i >= 0; i--) 803 if (CHIN(cs, i) && isalpha(i)) { 804 ci = othercase((int)i); 805 if (ci != i) 806 CHadd(cs, ci); 807 } 808 if (cs->multis != NULL) 809 mccase(p, cs); 810 } 811 if (invert) { 812 ssize_t i; 813 814 for (i = p->g->csetsize - 1; i >= 0; i--) 815 if (CHIN(cs, i)) 816 CHsub(cs, (int)i); 817 else 818 CHadd(cs, (int)i); 819 if (p->g->cflags®_NEWLINE) 820 CHsub(cs, '\n'); 821 if (cs->multis != NULL) 822 mcinvert(p, cs); 823 } 824 825 assert(cs->multis == NULL); /* xxx */ 826 827 if (nch(p, cs) == 1) { /* optimize singleton sets */ 828 ordinary(p, firstch(p, cs)); 829 freeset(p, cs); 830 } else 831 EMIT(OANYOF, freezeset(p, cs)); 832 } 833 834 /* 835 - p_b_term - parse one term of a bracketed character list 836 == static void p_b_term(struct parse *p, cset *cs); 837 */ 838 static void 839 p_b_term( 840 struct parse *p, 841 cset *cs) 842 { 843 char c; 844 char start, finish; 845 int i; 846 847 _DIAGASSERT(p != NULL); 848 _DIAGASSERT(cs != NULL); 849 850 /* classify what we've got */ 851 switch ((MORE()) ? PEEK() : '\0') { 852 case '[': 853 c = (MORE2()) ? PEEK2() : '\0'; 854 break; 855 856 case '-': 857 SETERROR(REG_ERANGE); 858 return; /* NOTE RETURN */ 859 860 default: 861 c = '\0'; 862 break; 863 } 864 865 switch (c) { 866 case ':': /* character class */ 867 NEXT2(); 868 REQUIRE(MORE(), REG_EBRACK); 869 c = PEEK(); 870 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 871 p_b_cclass(p, cs); 872 REQUIRE(MORE(), REG_EBRACK); 873 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 874 break; 875 case '=': /* equivalence class */ 876 NEXT2(); 877 REQUIRE(MORE(), REG_EBRACK); 878 c = PEEK(); 879 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 880 p_b_eclass(p, cs); 881 REQUIRE(MORE(), REG_EBRACK); 882 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 883 break; 884 default: /* symbol, ordinary character, or range */ 885 /* xxx revision needed for multichar stuff */ 886 start = p_b_symbol(p); 887 if (SEE('-') && MORE2() && PEEK2() != ']') { 888 /* range */ 889 NEXT(); 890 if (EAT('-')) 891 finish = '-'; 892 else 893 finish = p_b_symbol(p); 894 } else 895 finish = start; 896 /* xxx what about signed chars here... */ 897 REQUIRE(start <= finish, REG_ERANGE); 898 for (i = start; i <= finish; i++) 899 CHadd(cs, i); 900 break; 901 } 902 } 903 904 /* 905 - p_b_cclass - parse a character-class name and deal with it 906 == static void p_b_cclass(struct parse *p, cset *cs); 907 */ 908 static void 909 p_b_cclass( 910 struct parse *p, 911 cset *cs) 912 { 913 const char *sp; 914 const struct cclass *cp; 915 size_t len; 916 const char *u; 917 char c; 918 919 _DIAGASSERT(p != NULL); 920 _DIAGASSERT(cs != NULL); 921 922 sp = p->next; 923 924 while (MORE() && isalpha((unsigned char)PEEK())) 925 NEXT(); 926 len = p->next - sp; 927 for (cp = cclasses; cp->name != NULL; cp++) 928 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 929 break; 930 if (cp->name == NULL) { 931 /* oops, didn't find it */ 932 SETERROR(REG_ECTYPE); 933 return; 934 } 935 936 u = cp->chars; 937 while ((c = *u++) != '\0') 938 CHadd(cs, c); 939 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 940 MCadd(p, cs, u); 941 } 942 943 /* 944 - p_b_eclass - parse an equivalence-class name and deal with it 945 == static void p_b_eclass(struct parse *p, cset *cs); 946 * 947 * This implementation is incomplete. xxx 948 */ 949 static void 950 p_b_eclass( 951 struct parse *p, 952 cset *cs) 953 { 954 char c; 955 956 _DIAGASSERT(p != NULL); 957 _DIAGASSERT(cs != NULL); 958 959 c = p_b_coll_elem(p, '='); 960 CHadd(cs, c); 961 } 962 963 /* 964 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 965 == static char p_b_symbol(struct parse *p); 966 */ 967 static char /* value of symbol */ 968 p_b_symbol( 969 struct parse *p) 970 { 971 char value; 972 973 _DIAGASSERT(p != NULL); 974 975 REQUIRE(MORE(), REG_EBRACK); 976 if (!EATTWO('[', '.')) 977 return(GETNEXT()); 978 979 /* collating symbol */ 980 value = p_b_coll_elem(p, '.'); 981 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 982 return(value); 983 } 984 985 /* 986 - p_b_coll_elem - parse a collating-element name and look it up 987 == static char p_b_coll_elem(struct parse *p, int endc); 988 */ 989 static char /* value of collating element */ 990 p_b_coll_elem( 991 struct parse *p, 992 int endc) /* name ended by endc,']' */ 993 { 994 const char *sp; 995 const struct cname *cp; 996 size_t len; 997 998 _DIAGASSERT(p != NULL); 999 1000 sp = p->next; 1001 1002 while (MORE() && !SEETWO(endc, ']')) 1003 NEXT(); 1004 if (!MORE()) { 1005 SETERROR(REG_EBRACK); 1006 return(0); 1007 } 1008 len = p->next - sp; 1009 for (cp = cnames; cp->name != NULL; cp++) 1010 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1011 return(cp->code); /* known name */ 1012 if (len == 1) 1013 return(*sp); /* single character */ 1014 SETERROR(REG_ECOLLATE); /* neither */ 1015 return(0); 1016 } 1017 1018 /* 1019 - othercase - return the case counterpart of an alphabetic 1020 == static int othercase(int ch); 1021 */ 1022 static int /* if no counterpart, return ch */ 1023 othercase( 1024 int ch) 1025 { 1026 assert(isalpha(ch)); 1027 if (isupper(ch)) 1028 return(tolower(ch)); 1029 else if (islower(ch)) 1030 return(toupper(ch)); 1031 else /* peculiar, but could happen */ 1032 return(ch); 1033 } 1034 1035 /* 1036 - bothcases - emit a dualcase version of a two-case character 1037 == static void bothcases(struct parse *p, int ch); 1038 * 1039 * Boy, is this implementation ever a kludge... 1040 */ 1041 static void 1042 bothcases( 1043 struct parse *p, 1044 int ch) 1045 { 1046 const char *oldnext; 1047 const char *oldend; 1048 char bracket[3]; 1049 1050 _DIAGASSERT(p != NULL); 1051 1052 oldnext = p->next; 1053 oldend = p->end; 1054 1055 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1056 p->next = bracket; 1057 p->end = bracket+2; 1058 bracket[0] = ch; 1059 bracket[1] = ']'; 1060 bracket[2] = '\0'; 1061 p_bracket(p); 1062 assert(p->next == bracket+2); 1063 p->next = oldnext; 1064 p->end = oldend; 1065 } 1066 1067 /* 1068 - ordinary - emit an ordinary character 1069 == static void ordinary(struct parse *p, int ch); 1070 */ 1071 static void 1072 ordinary( 1073 struct parse *p, 1074 int ch) 1075 { 1076 cat_t *cap; 1077 unsigned char uc = (unsigned char)ch; 1078 1079 _DIAGASSERT(p != NULL); 1080 1081 cap = p->g->categories; 1082 if ((p->g->cflags & REG_ICASE) && isalpha(uc) && othercase(uc) != uc) 1083 bothcases(p, uc); 1084 else { 1085 EMIT(OCHAR, (sopno)uc); 1086 if (cap[uc] == 0) { 1087 _DIAGASSERT(__type_fit(unsigned char, 1088 p->g->ncategories + 1)); 1089 cap[uc] = (unsigned char)p->g->ncategories++; 1090 } 1091 } 1092 } 1093 1094 /* 1095 - nonnewline - emit REG_NEWLINE version of OANY 1096 == static void nonnewline(struct parse *p); 1097 * 1098 * Boy, is this implementation ever a kludge... 1099 */ 1100 static void 1101 nonnewline( 1102 struct parse *p) 1103 { 1104 const char *oldnext; 1105 const char *oldend; 1106 char bracket[4]; 1107 1108 _DIAGASSERT(p != NULL); 1109 1110 oldnext = p->next; 1111 oldend = p->end; 1112 1113 p->next = bracket; 1114 p->end = bracket+3; 1115 bracket[0] = '^'; 1116 bracket[1] = '\n'; 1117 bracket[2] = ']'; 1118 bracket[3] = '\0'; 1119 p_bracket(p); 1120 assert(p->next == bracket+3); 1121 p->next = oldnext; 1122 p->end = oldend; 1123 } 1124 1125 /* 1126 - repeat - generate code for a bounded repetition, recursively if needed 1127 == static void repeat(struct parse *p, sopno start, int from, int to, 1128 == size_t reclimit); 1129 */ 1130 static void 1131 repeat( 1132 struct parse *p, 1133 sopno start, /* operand from here to end of strip */ 1134 int from, /* repeated from this number */ 1135 int to, /* to this number of times (maybe INFINITY) */ 1136 size_t reclimit) 1137 { 1138 sopno finish; 1139 # define N 2 1140 # define INF 3 1141 # define REP(f, t) ((f)*8 + (t)) 1142 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1143 sopno copy; 1144 1145 _DIAGASSERT(p != NULL); 1146 1147 if (reclimit++ > RECLIMIT) 1148 p->error = REG_ESPACE; 1149 if (p->error) 1150 return; 1151 1152 finish = HERE(); 1153 1154 assert(from <= to); 1155 1156 switch (REP(MAP(from), MAP(to))) { 1157 case REP(0, 0): /* must be user doing this */ 1158 DROP(finish-start); /* drop the operand */ 1159 break; 1160 case REP(0, 1): /* as x{1,1}? */ 1161 case REP(0, N): /* as x{1,n}? */ 1162 case REP(0, INF): /* as x{1,}? */ 1163 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1164 INSERT(OCH_, start); /* offset is wrong... */ 1165 repeat(p, start+1, 1, to, reclimit); 1166 ASTERN(OOR1, start); 1167 AHEAD(start); /* ... fix it */ 1168 EMIT(OOR2, 0); 1169 AHEAD(THERE()); 1170 ASTERN(O_CH, THERETHERE()); 1171 break; 1172 case REP(1, 1): /* trivial case */ 1173 /* done */ 1174 break; 1175 case REP(1, N): /* as x?x{1,n-1} */ 1176 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1177 INSERT(OCH_, start); 1178 ASTERN(OOR1, start); 1179 AHEAD(start); 1180 EMIT(OOR2, 0); /* offset very wrong... */ 1181 AHEAD(THERE()); /* ...so fix it */ 1182 ASTERN(O_CH, THERETHERE()); 1183 copy = dupl(p, start+1, finish+1); 1184 assert(copy == finish+4); 1185 repeat(p, copy, 1, to-1, reclimit); 1186 break; 1187 case REP(1, INF): /* as x+ */ 1188 INSERT(OPLUS_, start); 1189 ASTERN(O_PLUS, start); 1190 break; 1191 case REP(N, N): /* as xx{m-1,n-1} */ 1192 copy = dupl(p, start, finish); 1193 repeat(p, copy, from-1, to-1, reclimit); 1194 break; 1195 case REP(N, INF): /* as xx{n-1,INF} */ 1196 copy = dupl(p, start, finish); 1197 repeat(p, copy, from-1, to, reclimit); 1198 break; 1199 default: /* "can't happen" */ 1200 SETERROR(REG_ASSERT); /* just in case */ 1201 break; 1202 } 1203 } 1204 1205 /* 1206 - seterr - set an error condition 1207 == static int seterr(struct parse *p, int e); 1208 */ 1209 static int /* useless but makes type checking happy */ 1210 seterr( 1211 struct parse *p, 1212 int e) 1213 { 1214 1215 _DIAGASSERT(p != NULL); 1216 1217 if (p->error == 0) /* keep earliest error condition */ 1218 p->error = e; 1219 p->next = nuls; /* try to bring things to a halt */ 1220 p->end = nuls; 1221 return(0); /* make the return value well-defined */ 1222 } 1223 1224 /* 1225 - allocset - allocate a set of characters for [] 1226 == static cset *allocset(struct parse *p); 1227 */ 1228 static cset * 1229 allocset( 1230 struct parse *p) 1231 { 1232 size_t no; 1233 size_t nc; 1234 size_t nbytes; 1235 cset *cs; 1236 size_t css; 1237 size_t i; 1238 void *old_ptr; 1239 1240 _DIAGASSERT(p != NULL); 1241 1242 no = p->g->ncsets++; 1243 css = (size_t)p->g->csetsize; 1244 if (no >= p->ncsalloc) { /* need another column of space */ 1245 p->ncsalloc += CHAR_BIT; 1246 nc = p->ncsalloc; 1247 assert(nc % CHAR_BIT == 0); 1248 nbytes = nc / CHAR_BIT * css; 1249 if (MEMSIZE(p) > MEMLIMIT) 1250 goto oomem; 1251 if (reallocarr(&p->g->sets, nc, sizeof(cset))) 1252 goto oomem; 1253 old_ptr = p->g->setbits; 1254 if (reallocarr(&p->g->setbits, nc / CHAR_BIT, css)) { 1255 free(old_ptr); 1256 goto oomem; 1257 } 1258 if (old_ptr != p->g->setbits) { 1259 for (i = 0; i < no; i++) 1260 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1261 } 1262 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 1263 } 1264 1265 cs = &p->g->sets[no]; 1266 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1267 cs->mask = 1 << (unsigned int)((no) % CHAR_BIT); 1268 cs->hash = 0; 1269 cs->smultis = 0; 1270 cs->multis = NULL; 1271 1272 return(cs); 1273 1274 oomem: 1275 SETERROR(REG_ESPACE); 1276 /* caller's responsibility not to do set ops */ 1277 return NULL; 1278 } 1279 1280 /* 1281 - freeset - free a now-unused set 1282 == static void freeset(struct parse *p, cset *cs); 1283 */ 1284 static void 1285 freeset( 1286 struct parse *p, 1287 cset *cs) 1288 { 1289 size_t i; 1290 cset *top; 1291 size_t css; 1292 1293 _DIAGASSERT(p != NULL); 1294 _DIAGASSERT(cs != NULL); 1295 1296 top = &p->g->sets[p->g->ncsets]; 1297 css = (size_t)p->g->csetsize; 1298 1299 for (i = 0; i < css; i++) 1300 CHsub(cs, (int)i); 1301 if (cs == top-1) /* recover only the easy case */ 1302 p->g->ncsets--; 1303 } 1304 1305 /* 1306 - freezeset - final processing on a set of characters 1307 == static int freezeset(struct parse *p, cset *cs); 1308 * 1309 * The main task here is merging identical sets. This is usually a waste 1310 * of time (although the hash code minimizes the overhead), but can win 1311 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1312 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1313 * the same value! 1314 */ 1315 static sopno /* set number */ 1316 freezeset( 1317 struct parse *p, 1318 cset *cs) 1319 { 1320 uch h; 1321 size_t i; 1322 cset *top; 1323 cset *cs2; 1324 size_t css; 1325 1326 _DIAGASSERT(p != NULL); 1327 _DIAGASSERT(cs != NULL); 1328 1329 h = cs->hash; 1330 top = &p->g->sets[p->g->ncsets]; 1331 css = (size_t)p->g->csetsize; 1332 1333 /* look for an earlier one which is the same */ 1334 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1335 if (cs2->hash == h && cs2 != cs) { 1336 /* maybe */ 1337 for (i = 0; i < css; i++) 1338 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1339 break; /* no */ 1340 if (i == css) 1341 break; /* yes */ 1342 } 1343 1344 if (cs2 < top) { /* found one */ 1345 freeset(p, cs); 1346 cs = cs2; 1347 } 1348 1349 return (sopno)(cs - p->g->sets); 1350 } 1351 1352 /* 1353 - firstch - return first character in a set (which must have at least one) 1354 == static int firstch(struct parse *p, cset *cs); 1355 */ 1356 static int /* character; there is no "none" value */ 1357 firstch( 1358 struct parse *p, 1359 cset *cs) 1360 { 1361 size_t i; 1362 size_t css; 1363 1364 _DIAGASSERT(p != NULL); 1365 _DIAGASSERT(cs != NULL); 1366 1367 css = (size_t)p->g->csetsize; 1368 1369 for (i = 0; i < css; i++) 1370 if (CHIN(cs, i)) 1371 return((char)i); 1372 assert(never); 1373 return(0); /* arbitrary */ 1374 } 1375 1376 /* 1377 - nch - number of characters in a set 1378 == static int nch(struct parse *p, cset *cs); 1379 */ 1380 static int 1381 nch( 1382 struct parse *p, 1383 cset *cs) 1384 { 1385 size_t i; 1386 size_t css; 1387 int n = 0; 1388 1389 _DIAGASSERT(p != NULL); 1390 _DIAGASSERT(cs != NULL); 1391 1392 css = (size_t)p->g->csetsize; 1393 1394 for (i = 0; i < css; i++) 1395 if (CHIN(cs, i)) 1396 n++; 1397 return(n); 1398 } 1399 1400 /* 1401 - mcadd - add a collating element to a cset 1402 == static void mcadd(struct parse *p, cset *cs, \ 1403 == char *cp); 1404 */ 1405 static void 1406 mcadd( 1407 struct parse *p, 1408 cset *cs, 1409 const char *cp) 1410 { 1411 size_t oldend; 1412 1413 _DIAGASSERT(p != NULL); 1414 _DIAGASSERT(cs != NULL); 1415 _DIAGASSERT(cp != NULL); 1416 1417 oldend = cs->smultis; 1418 1419 cs->smultis += strlen(cp) + 1; 1420 if (cs->multis == NULL) 1421 cs->multis = malloc(cs->smultis); 1422 else 1423 cs->multis = realloc(cs->multis, cs->smultis); 1424 if (cs->multis == NULL) { 1425 SETERROR(REG_ESPACE); 1426 return; 1427 } 1428 1429 (void) strcpy(cs->multis + oldend - 1, cp); 1430 cs->multis[cs->smultis - 1] = '\0'; 1431 } 1432 1433 #if 0 1434 /* 1435 - mcsub - subtract a collating element from a cset 1436 == static void mcsub(cset *cs, char *cp); 1437 */ 1438 static void 1439 mcsub( 1440 cset *cs, 1441 char *cp) 1442 { 1443 char *fp; 1444 size_t len; 1445 1446 _DIAGASSERT(cs != NULL); 1447 _DIAGASSERT(cp != NULL); 1448 1449 fp = mcfind(cs, cp); 1450 len = strlen(fp); 1451 1452 assert(fp != NULL); 1453 (void) memmove(fp, fp + len + 1, 1454 cs->smultis - (fp + len + 1 - cs->multis)); 1455 cs->smultis -= len; 1456 1457 if (cs->smultis == 0) { 1458 free(cs->multis); 1459 cs->multis = NULL; 1460 return; 1461 } 1462 1463 cs->multis = realloc(cs->multis, cs->smultis); 1464 assert(cs->multis != NULL); 1465 } 1466 1467 /* 1468 - mcin - is a collating element in a cset? 1469 == static int mcin(cset *cs, char *cp); 1470 */ 1471 static int 1472 mcin( 1473 cset *cs, 1474 char *cp) 1475 { 1476 1477 _DIAGASSERT(cs != NULL); 1478 _DIAGASSERT(cp != NULL); 1479 1480 return(mcfind(cs, cp) != NULL); 1481 } 1482 1483 /* 1484 - mcfind - find a collating element in a cset 1485 == static char *mcfind(cset *cs, char *cp); 1486 */ 1487 static char * 1488 mcfind( 1489 cset *cs, 1490 char *cp) 1491 { 1492 char *p; 1493 1494 _DIAGASSERT(cs != NULL); 1495 _DIAGASSERT(cp != NULL); 1496 1497 if (cs->multis == NULL) 1498 return(NULL); 1499 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1500 if (strcmp(cp, p) == 0) 1501 return(p); 1502 return(NULL); 1503 } 1504 #endif 1505 1506 /* 1507 - mcinvert - invert the list of collating elements in a cset 1508 == static void mcinvert(struct parse *p, cset *cs); 1509 * 1510 * This would have to know the set of possibilities. Implementation 1511 * is deferred. 1512 */ 1513 /* ARGSUSED */ 1514 static void 1515 mcinvert( 1516 struct parse *p, 1517 cset *cs) 1518 { 1519 1520 _DIAGASSERT(p != NULL); 1521 _DIAGASSERT(cs != NULL); 1522 1523 assert(cs->multis == NULL); /* xxx */ 1524 } 1525 1526 /* 1527 - mccase - add case counterparts of the list of collating elements in a cset 1528 == static void mccase(struct parse *p, cset *cs); 1529 * 1530 * This would have to know the set of possibilities. Implementation 1531 * is deferred. 1532 */ 1533 /* ARGSUSED */ 1534 static void 1535 mccase( 1536 struct parse *p, 1537 cset *cs) 1538 { 1539 1540 _DIAGASSERT(p != NULL); 1541 _DIAGASSERT(cs != NULL); 1542 1543 assert(cs->multis == NULL); /* xxx */ 1544 } 1545 1546 /* 1547 - isinsets - is this character in any sets? 1548 == static int isinsets(struct re_guts *g, int c); 1549 */ 1550 static int /* predicate */ 1551 isinsets( 1552 struct re_guts *g, 1553 int c) 1554 { 1555 uch *col; 1556 size_t i; 1557 size_t ncols; 1558 unsigned uc = (unsigned char)c; 1559 1560 _DIAGASSERT(g != NULL); 1561 1562 if (g->setbits == NULL) 1563 return 0; 1564 1565 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1566 1567 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1568 if (col[uc] != 0) 1569 return(1); 1570 return(0); 1571 } 1572 1573 /* 1574 - samesets - are these two characters in exactly the same sets? 1575 == static int samesets(struct re_guts *g, int c1, int c2); 1576 */ 1577 static int /* predicate */ 1578 samesets( 1579 struct re_guts *g, 1580 int c1, 1581 int c2) 1582 { 1583 uch *col; 1584 size_t i; 1585 size_t ncols; 1586 unsigned uc1 = (unsigned char)c1; 1587 unsigned uc2 = (unsigned char)c2; 1588 1589 _DIAGASSERT(g != NULL); 1590 1591 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1592 1593 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1594 if (col[uc1] != col[uc2]) 1595 return(0); 1596 return(1); 1597 } 1598 1599 /* 1600 - categorize - sort out character categories 1601 == static void categorize(struct parse *p, struct re_guts *g); 1602 */ 1603 static void 1604 categorize( 1605 struct parse *p, 1606 struct re_guts *g) 1607 { 1608 cat_t *cats; 1609 int c; 1610 int c2; 1611 cat_t cat; 1612 1613 _DIAGASSERT(p != NULL); 1614 _DIAGASSERT(g != NULL); 1615 1616 cats = g->categories; 1617 1618 /* avoid making error situations worse */ 1619 if (p->error != 0) 1620 return; 1621 1622 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1623 if (cats[c] == 0 && isinsets(g, c)) { 1624 _DIAGASSERT(__type_fit(unsigned char, 1625 g->ncategories + 1)); 1626 cat = g->ncategories++; 1627 cats[c] = cat; 1628 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1629 if (cats[c2] == 0 && samesets(g, c, c2)) 1630 cats[c2] = cat; 1631 } 1632 } 1633 1634 /* 1635 - dupl - emit a duplicate of a bunch of sops 1636 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1637 */ 1638 static sopno /* start of duplicate */ 1639 dupl( 1640 struct parse *p, 1641 sopno start, /* from here */ 1642 sopno finish) /* to this less one */ 1643 { 1644 sopno ret; 1645 sopno len = finish - start; 1646 1647 _DIAGASSERT(p != NULL); 1648 1649 ret = HERE(); 1650 1651 assert(finish >= start); 1652 if (len == 0) 1653 return(ret); 1654 if (!enlarge(p, p->ssize + len))/* this many unexpected additions */ 1655 return ret; 1656 (void)memcpy(p->strip + p->slen, p->strip + start, 1657 (size_t)len * sizeof(sop)); 1658 p->slen += len; 1659 return(ret); 1660 } 1661 1662 /* 1663 - doemit - emit a strip operator 1664 == static void doemit(struct parse *p, sop op, size_t opnd); 1665 * 1666 * It might seem better to implement this as a macro with a function as 1667 * hard-case backup, but it's just too big and messy unless there are 1668 * some changes to the data structures. Maybe later. 1669 */ 1670 static void 1671 doemit( 1672 struct parse *p, 1673 sop op, 1674 sopno opnd) 1675 { 1676 _DIAGASSERT(p != NULL); 1677 1678 /* avoid making error situations worse */ 1679 if (p->error != 0) 1680 return; 1681 1682 /* deal with oversize operands ("can't happen", more or less) */ 1683 assert(opnd < 1<<OPSHIFT); 1684 1685 /* deal with undersized strip */ 1686 if (p->slen >= p->ssize) 1687 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1688 return; 1689 1690 /* finally, it's all reduced to the easy case */ 1691 p->strip[p->slen++] = (sop)SOP(op, opnd); 1692 } 1693 1694 /* 1695 - doinsert - insert a sop into the strip 1696 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1697 */ 1698 static void 1699 doinsert( 1700 struct parse *p, 1701 sop op, 1702 sopno opnd, 1703 sopno pos) 1704 { 1705 sopno sn; 1706 sop s; 1707 int i; 1708 1709 _DIAGASSERT(p != NULL); 1710 1711 /* avoid making error situations worse */ 1712 if (p->error != 0) 1713 return; 1714 1715 sn = HERE(); 1716 EMIT(op, opnd); /* do checks, ensure space */ 1717 assert(HERE() == sn+1); 1718 s = p->strip[sn]; 1719 1720 /* adjust paren pointers */ 1721 assert(pos > 0); 1722 for (i = 1; i < NPAREN; i++) { 1723 if (p->pbegin[i] >= pos) { 1724 p->pbegin[i]++; 1725 } 1726 if (p->pend[i] >= pos) { 1727 p->pend[i]++; 1728 } 1729 } 1730 1731 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1732 p->strip[pos] = s; 1733 } 1734 1735 /* 1736 - dofwd - complete a forward reference 1737 == static void dofwd(struct parse *p, sopno pos, sop value); 1738 */ 1739 static void 1740 dofwd( 1741 struct parse *p, 1742 sopno pos, 1743 sopno value) 1744 { 1745 1746 _DIAGASSERT(p != NULL); 1747 1748 /* avoid making error situations worse */ 1749 if (p->error != 0) 1750 return; 1751 1752 assert(value < 1<<OPSHIFT); 1753 p->strip[pos] = (sop)(OP(p->strip[pos]) | value); 1754 } 1755 1756 /* 1757 - enlarge - enlarge the strip 1758 == static void enlarge(struct parse *p, sopno size); 1759 */ 1760 static int 1761 enlarge(struct parse *p, sopno size) 1762 { 1763 _DIAGASSERT(p != NULL); 1764 1765 if (p->ssize >= size) 1766 return 1; 1767 1768 if (MEMSIZE(p) > MEMLIMIT || reallocarr(&p->strip, size, sizeof(sop))) { 1769 SETERROR(REG_ESPACE); 1770 return 0; 1771 } 1772 p->ssize = size; 1773 return 1; 1774 } 1775 1776 /* 1777 - stripsnug - compact the strip 1778 == static void stripsnug(struct parse *p, struct re_guts *g); 1779 */ 1780 static void 1781 stripsnug( 1782 struct parse *p, 1783 struct re_guts *g) 1784 { 1785 1786 _DIAGASSERT(p != NULL); 1787 _DIAGASSERT(g != NULL); 1788 1789 g->nstates = p->slen; 1790 g->strip = p->strip; 1791 reallocarr(&g->strip, p->slen, sizeof(sop)); 1792 /* Ignore error as tries to free memory only. */ 1793 } 1794 1795 /* 1796 - findmust - fill in must and mlen with longest mandatory literal string 1797 == static void findmust(struct parse *p, struct re_guts *g); 1798 * 1799 * This algorithm could do fancy things like analyzing the operands of | 1800 * for common subsequences. Someday. This code is simple and finds most 1801 * of the interesting cases. 1802 * 1803 * Note that must and mlen got initialized during setup. 1804 */ 1805 static void 1806 findmust( 1807 struct parse *p, 1808 struct re_guts *g) 1809 { 1810 sop *scan; 1811 sop *start = NULL; 1812 sop *newstart = NULL; 1813 sopno newlen; 1814 sop s; 1815 char *cp; 1816 sopno i; 1817 1818 _DIAGASSERT(p != NULL); 1819 _DIAGASSERT(g != NULL); 1820 1821 /* avoid making error situations worse */ 1822 if (p->error != 0) 1823 return; 1824 1825 /* find the longest OCHAR sequence in strip */ 1826 newlen = 0; 1827 scan = g->strip + 1; 1828 do { 1829 s = *scan++; 1830 switch (OP(s)) { 1831 case OCHAR: /* sequence member */ 1832 if (newlen == 0) /* new sequence */ 1833 newstart = scan - 1; 1834 newlen++; 1835 break; 1836 case OPLUS_: /* things that don't break one */ 1837 case OLPAREN: 1838 case ORPAREN: 1839 break; 1840 case OQUEST_: /* things that must be skipped */ 1841 case OCH_: 1842 scan--; 1843 do { 1844 scan += OPND(s); 1845 s = *scan; 1846 /* assert() interferes w debug printouts */ 1847 if (OP(s) != O_QUEST && OP(s) != O_CH && 1848 OP(s) != OOR2) { 1849 g->iflags |= BAD; 1850 return; 1851 } 1852 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1853 /* FALLTHROUGH */ 1854 default: /* things that break a sequence */ 1855 if (newlen > g->mlen) { /* ends one */ 1856 start = newstart; 1857 g->mlen = newlen; 1858 } 1859 newlen = 0; 1860 break; 1861 } 1862 } while (OP(s) != OEND); 1863 1864 if (start == NULL) 1865 g->mlen = 0; 1866 1867 if (g->mlen == 0) /* there isn't one */ 1868 return; 1869 1870 /* turn it into a character string */ 1871 g->must = malloc((size_t)g->mlen + 1); 1872 if (g->must == NULL) { /* argh; just forget it */ 1873 g->mlen = 0; 1874 return; 1875 } 1876 cp = g->must; 1877 scan = start; 1878 for (i = g->mlen; i > 0; i--) { 1879 while (OP(s = *scan++) != OCHAR) 1880 continue; 1881 assert(cp < g->must + g->mlen); 1882 *cp++ = (char)OPND(s); 1883 } 1884 assert(cp == g->must + g->mlen); 1885 *cp++ = '\0'; /* just on general principles */ 1886 } 1887 1888 /* 1889 - pluscount - count + nesting 1890 == static sopno pluscount(struct parse *p, struct re_guts *g); 1891 */ 1892 static sopno /* nesting depth */ 1893 pluscount( 1894 struct parse *p, 1895 struct re_guts *g) 1896 { 1897 sop *scan; 1898 sop s; 1899 sopno plusnest = 0; 1900 sopno maxnest = 0; 1901 1902 _DIAGASSERT(p != NULL); 1903 _DIAGASSERT(g != NULL); 1904 1905 if (p->error != 0) 1906 return(0); /* there may not be an OEND */ 1907 1908 scan = g->strip + 1; 1909 do { 1910 s = *scan++; 1911 switch (OP(s)) { 1912 case OPLUS_: 1913 plusnest++; 1914 break; 1915 case O_PLUS: 1916 if (plusnest > maxnest) 1917 maxnest = plusnest; 1918 plusnest--; 1919 break; 1920 } 1921 } while (OP(s) != OEND); 1922 if (plusnest != 0) 1923 g->iflags |= BAD; 1924 return(maxnest); 1925 } 1926