1 /* dfa.c - deterministic extended regexp routines for GNU 2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software 3 Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ 19 20 /* Written June, 1988 by Mike Haertel 21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */ 22 23 #include <config.h> 24 25 #include "dfa.h" 26 27 #include "flexmember.h" 28 29 #include <assert.h> 30 #include <ctype.h> 31 #include <stdint.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <limits.h> 35 #include <string.h> 36 37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative 38 indexes into objects. It is signed to help catch integer overflow. 39 It has its own name because it is for nonnegative values only. */ 40 typedef ptrdiff_t idx_t; 41 static idx_t const IDX_MAX = PTRDIFF_MAX; 42 43 static bool 44 streq (char const *a, char const *b) 45 { 46 return strcmp (a, b) == 0; 47 } 48 49 static bool 50 isasciidigit (char c) 51 { 52 return '0' <= c && c <= '9'; 53 } 54 55 #include "gettext.h" 56 #define _(str) gettext (str) 57 58 #include <wchar.h> 59 60 #include "intprops.h" 61 #include "xalloc.h" 62 #include "localeinfo.h" 63 64 #ifndef FALLTHROUGH 65 # if __GNUC__ < 7 66 # define FALLTHROUGH ((void) 0) 67 # else 68 # define FALLTHROUGH __attribute__ ((__fallthrough__)) 69 # endif 70 #endif 71 72 #ifndef MIN 73 # define MIN(a,b) ((a) < (b) ? (a) : (b)) 74 #endif 75 76 /* HPUX defines these as macros in sys/param.h. */ 77 #ifdef setbit 78 # undef setbit 79 #endif 80 #ifdef clrbit 81 # undef clrbit 82 #endif 83 84 /* First integer value that is greater than any character code. */ 85 enum { NOTCHAR = 1 << CHAR_BIT }; 86 87 /* Number of bits used in a charclass word. */ 88 enum { CHARCLASS_WORD_BITS = 64 }; 89 90 /* This represents part of a character class. It must be unsigned and 91 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */ 92 typedef uint_least64_t charclass_word; 93 94 /* An initializer for a charclass whose 64-bit words are A through D. */ 95 #define CHARCLASS_INIT(a, b, c, d) {{a, b, c, d}} 96 97 /* The maximum useful value of a charclass_word; all used bits are 1. */ 98 static charclass_word const CHARCLASS_WORD_MASK 99 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1; 100 101 /* Number of words required to hold a bit for every character. */ 102 enum 103 { 104 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS 105 }; 106 107 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ 108 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass; 109 110 /* Convert a possibly-signed character to an unsigned character. This is 111 a bit safer than casting to unsigned char, since it catches some type 112 errors that the cast doesn't. */ 113 static unsigned char 114 to_uchar (char ch) 115 { 116 return ch; 117 } 118 119 /* Contexts tell us whether a character is a newline or a word constituent. 120 Word-constituent characters are those that satisfy iswalnum, plus '_'. 121 Each character has a single CTX_* value; bitmasks of CTX_* values denote 122 a particular character class. 123 124 A state also stores a context value, which is a bitmask of CTX_* values. 125 A state's context represents a set of characters that the state's 126 predecessors must match. For example, a state whose context does not 127 include CTX_LETTER will never have transitions where the previous 128 character is a word constituent. A state whose context is CTX_ANY 129 might have transitions from any character. */ 130 131 enum 132 { 133 CTX_NONE = 1, 134 CTX_LETTER = 2, 135 CTX_NEWLINE = 4, 136 CTX_ANY = 7 137 }; 138 139 /* Sometimes characters can only be matched depending on the surrounding 140 context. Such context decisions depend on what the previous character 141 was, and the value of the current (lookahead) character. Context 142 dependent constraints are encoded as 9-bit integers. Each bit that 143 is set indicates that the constraint succeeds in the corresponding 144 context. 145 146 bit 6-8 - valid contexts when next character is CTX_NEWLINE 147 bit 3-5 - valid contexts when next character is CTX_LETTER 148 bit 0-2 - valid contexts when next character is CTX_NONE 149 150 succeeds_in_context determines whether a given constraint 151 succeeds in a particular context. Prev is a bitmask of possible 152 context values for the previous character, curr is the (single-bit) 153 context value for the lookahead character. */ 154 static int 155 newline_constraint (int constraint) 156 { 157 return (constraint >> 6) & 7; 158 } 159 static int 160 letter_constraint (int constraint) 161 { 162 return (constraint >> 3) & 7; 163 } 164 static int 165 other_constraint (int constraint) 166 { 167 return constraint & 7; 168 } 169 170 static bool 171 succeeds_in_context (int constraint, int prev, int curr) 172 { 173 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \ 174 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \ 175 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \ 176 & prev); 177 } 178 179 /* The following describe what a constraint depends on. */ 180 static bool 181 prev_newline_dependent (int constraint) 182 { 183 return ((constraint ^ constraint >> 2) & 0111) != 0; 184 } 185 static bool 186 prev_letter_dependent (int constraint) 187 { 188 return ((constraint ^ constraint >> 1) & 0111) != 0; 189 } 190 191 /* Tokens that match the empty string subject to some constraint actually 192 work by applying that constraint to determine what may follow them, 193 taking into account what has gone before. The following values are 194 the constraints corresponding to the special tokens previously defined. */ 195 enum 196 { 197 NO_CONSTRAINT = 0777, 198 BEGLINE_CONSTRAINT = 0444, 199 ENDLINE_CONSTRAINT = 0700, 200 BEGWORD_CONSTRAINT = 0050, 201 ENDWORD_CONSTRAINT = 0202, 202 LIMWORD_CONSTRAINT = 0252, 203 NOTLIMWORD_CONSTRAINT = 0525 204 }; 205 206 /* The regexp is parsed into an array of tokens in postfix form. Some tokens 207 are operators and others are terminal symbols. Most (but not all) of these 208 codes are returned by the lexical analyzer. */ 209 210 typedef ptrdiff_t token; 211 static token const TOKEN_MAX = PTRDIFF_MAX; 212 213 /* States are indexed by state_num values. These are normally 214 nonnegative but -1 is used as a special value. */ 215 typedef ptrdiff_t state_num; 216 217 /* Predefined token values. */ 218 enum 219 { 220 END = -1, /* END is a terminal symbol that matches the 221 end of input; any value of END or less in 222 the parse tree is such a symbol. Accepting 223 states of the DFA are those that would have 224 a transition on END. This is -1, not some 225 more-negative value, to tweak the speed of 226 comparisons to END. */ 227 228 /* Ordinary character values are terminal symbols that match themselves. */ 229 230 /* CSET must come last in the following list of special tokens. Otherwise, 231 the list order matters only for performance. Related special tokens 232 should have nearby values so that code like (t == ANYCHAR || t == MBCSET 233 || CSET <= t) can be done with a single machine-level comparison. */ 234 235 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches 236 the empty string. */ 237 238 QMARK, /* QMARK is an operator of one argument that 239 matches zero or one occurrences of its 240 argument. */ 241 242 STAR, /* STAR is an operator of one argument that 243 matches the Kleene closure (zero or more 244 occurrences) of its argument. */ 245 246 PLUS, /* PLUS is an operator of one argument that 247 matches the positive closure (one or more 248 occurrences) of its argument. */ 249 250 REPMN, /* REPMN is a lexical token corresponding 251 to the {m,n} construct. REPMN never 252 appears in the compiled token vector. */ 253 254 CAT, /* CAT is an operator of two arguments that 255 matches the concatenation of its 256 arguments. CAT is never returned by the 257 lexical analyzer. */ 258 259 OR, /* OR is an operator of two arguments that 260 matches either of its arguments. */ 261 262 LPAREN, /* LPAREN never appears in the parse tree, 263 it is only a lexeme. */ 264 265 RPAREN, /* RPAREN never appears in the parse tree. */ 266 267 WCHAR, /* Only returned by lex. wctok contains 268 the wide character representation. */ 269 270 ANYCHAR, /* ANYCHAR is a terminal symbol that matches 271 a valid multibyte (or single byte) character. 272 It is used only if MB_CUR_MAX > 1. */ 273 274 BEG, /* BEG is an initial symbol that matches the 275 beginning of input. */ 276 277 BEGLINE, /* BEGLINE is a terminal symbol that matches 278 the empty string at the beginning of a 279 line. */ 280 281 ENDLINE, /* ENDLINE is a terminal symbol that matches 282 the empty string at the end of a line. */ 283 284 BEGWORD, /* BEGWORD is a terminal symbol that matches 285 the empty string at the beginning of a 286 word. */ 287 288 ENDWORD, /* ENDWORD is a terminal symbol that matches 289 the empty string at the end of a word. */ 290 291 LIMWORD, /* LIMWORD is a terminal symbol that matches 292 the empty string at the beginning or the 293 end of a word. */ 294 295 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that 296 matches the empty string not at 297 the beginning or end of a word. */ 298 299 BACKREF, /* BACKREF is generated by \<digit> 300 or by any other construct that 301 is not completely handled. If the scanner 302 detects a transition on backref, it returns 303 a kind of "semi-success" indicating that 304 the match will have to be verified with 305 a backtracking matcher. */ 306 307 MBCSET, /* MBCSET is similar to CSET, but for 308 multibyte characters. */ 309 310 CSET /* CSET and (and any value greater) is a 311 terminal symbol that matches any of a 312 class of characters. */ 313 }; 314 315 316 /* States of the recognizer correspond to sets of positions in the parse 317 tree, together with the constraints under which they may be matched. 318 So a position is encoded as an index into the parse tree together with 319 a constraint. */ 320 typedef struct 321 { 322 idx_t index; /* Index into the parse array. */ 323 unsigned int constraint; /* Constraint for matching this position. */ 324 } position; 325 326 /* Sets of positions are stored as arrays. */ 327 typedef struct 328 { 329 position *elems; /* Elements of this position set. */ 330 idx_t nelem; /* Number of elements in this set. */ 331 idx_t alloc; /* Number of elements allocated in ELEMS. */ 332 } position_set; 333 334 /* A state of the dfa consists of a set of positions, some flags, 335 and the token value of the lowest-numbered position of the state that 336 contains an END token. */ 337 typedef struct 338 { 339 size_t hash; /* Hash of the positions of this state. */ 340 position_set elems; /* Positions this state could match. */ 341 unsigned char context; /* Context from previous state. */ 342 unsigned short constraint; /* Constraint for this state to accept. */ 343 token first_end; /* Token value of the first END in elems. */ 344 position_set mbps; /* Positions which can match multibyte 345 characters or the follows, e.g., period. 346 Used only if MB_CUR_MAX > 1. */ 347 state_num mb_trindex; /* Index of this state in MB_TRANS, or 348 negative if the state does not have 349 ANYCHAR. */ 350 } dfa_state; 351 352 /* Maximum for any transition table count. This should be at least 3, 353 for the initial state setup. */ 354 enum { MAX_TRCOUNT = 1024 }; 355 356 /* A bracket operator. 357 e.g., [a-c], [[:alpha:]], etc. */ 358 struct mb_char_classes 359 { 360 ptrdiff_t cset; 361 bool invert; 362 wchar_t *chars; /* Normal characters. */ 363 idx_t nchars; 364 idx_t nchars_alloc; 365 }; 366 367 struct regex_syntax 368 { 369 /* Syntax bits controlling the behavior of the lexical analyzer. */ 370 reg_syntax_t syntax_bits; 371 bool syntax_bits_set; 372 373 /* Flag for case-folding letters into sets. */ 374 bool case_fold; 375 376 /* True if ^ and $ match only the start and end of data, and do not match 377 end-of-line within data. */ 378 bool anchor; 379 380 /* End-of-line byte in data. */ 381 unsigned char eolbyte; 382 383 /* Cache of char-context values. */ 384 char sbit[NOTCHAR]; 385 386 /* If never_trail[B], the byte B cannot be a non-initial byte in a 387 multibyte character. */ 388 bool never_trail[NOTCHAR]; 389 390 /* Set of characters considered letters. */ 391 charclass letters; 392 393 /* Set of characters that are newline. */ 394 charclass newline; 395 }; 396 397 /* Lexical analyzer. All the dross that deals with the obnoxious 398 GNU Regex syntax bits is located here. The poor, suffering 399 reader is referred to the GNU Regex documentation for the 400 meaning of the @#%!@#%^!@ syntax bits. */ 401 struct lexer_state 402 { 403 char const *ptr; /* Pointer to next input character. */ 404 idx_t left; /* Number of characters remaining. */ 405 token lasttok; /* Previous token returned; initially END. */ 406 idx_t parens; /* Count of outstanding left parens. */ 407 int minrep, maxrep; /* Repeat counts for {m,n}. */ 408 409 /* Wide character representation of the current multibyte character, 410 or WEOF if there was an encoding error. Used only if 411 MB_CUR_MAX > 1. */ 412 wint_t wctok; 413 414 /* The most recently analyzed multibyte bracket expression. */ 415 struct mb_char_classes brack; 416 417 /* We're separated from beginning or (, | only by zero-width characters. */ 418 bool laststart; 419 }; 420 421 /* Recursive descent parser for regular expressions. */ 422 423 struct parser_state 424 { 425 token tok; /* Lookahead token. */ 426 idx_t depth; /* Current depth of a hypothetical stack 427 holding deferred productions. This is 428 used to determine the depth that will be 429 required of the real stack later on in 430 dfaanalyze. */ 431 }; 432 433 /* A compiled regular expression. */ 434 struct dfa 435 { 436 /* Fields filled by the scanner. */ 437 charclass *charclasses; /* Array of character sets for CSET tokens. */ 438 idx_t cindex; /* Index for adding new charclasses. */ 439 idx_t calloc; /* Number of charclasses allocated. */ 440 ptrdiff_t canychar; /* Index of anychar class, or -1. */ 441 442 /* Scanner state */ 443 struct lexer_state lex; 444 445 /* Parser state */ 446 struct parser_state parse; 447 448 /* Fields filled by the parser. */ 449 token *tokens; /* Postfix parse array. */ 450 idx_t tindex; /* Index for adding new tokens. */ 451 idx_t talloc; /* Number of tokens currently allocated. */ 452 idx_t depth; /* Depth required of an evaluation stack 453 used for depth-first traversal of the 454 parse tree. */ 455 idx_t nleaves; /* Number of leaves on the parse tree. */ 456 idx_t nregexps; /* Count of parallel regexps being built 457 with dfaparse. */ 458 bool fast; /* The DFA is fast. */ 459 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */ 460 mbstate_t mbs; /* Multibyte conversion state. */ 461 462 /* The following are valid only if MB_CUR_MAX > 1. */ 463 464 /* The value of multibyte_prop[i] is defined by following rule. 465 if tokens[i] < NOTCHAR 466 bit 0 : tokens[i] is the first byte of a character, including 467 single-byte characters. 468 bit 1 : tokens[i] is the last byte of a character, including 469 single-byte characters. 470 471 e.g. 472 tokens 473 = 'single_byte_a', 'multi_byte_A', single_byte_b' 474 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b' 475 multibyte_prop 476 = 3 , 1 , 0 , 2 , 3 477 */ 478 char *multibyte_prop; 479 480 /* Fields filled by the superset. */ 481 struct dfa *superset; /* Hint of the dfa. */ 482 483 /* Fields filled by the state builder. */ 484 dfa_state *states; /* States of the dfa. */ 485 state_num sindex; /* Index for adding new states. */ 486 idx_t salloc; /* Number of states currently allocated. */ 487 488 /* Fields filled by the parse tree->NFA conversion. */ 489 position_set *follows; /* Array of follow sets, indexed by position 490 index. The follow of a position is the set 491 of positions containing characters that 492 could conceivably follow a character 493 matching the given position in a string 494 matching the regexp. Allocated to the 495 maximum possible position index. */ 496 bool searchflag; /* We are supposed to build a searching 497 as opposed to an exact matcher. A searching 498 matcher finds the first and shortest string 499 matching a regexp anywhere in the buffer, 500 whereas an exact matcher finds the longest 501 string matching, but anchored to the 502 beginning of the buffer. */ 503 504 /* Fields filled by dfaanalyze. */ 505 int *constraints; /* Array of union of accepting constraints 506 in the follow of a position. */ 507 int *separates; /* Array of contexts on follow of a 508 position. */ 509 510 /* Fields filled by dfaexec. */ 511 state_num tralloc; /* Number of transition tables that have 512 slots so far, not counting trans[-1] and 513 trans[-2]. */ 514 int trcount; /* Number of transition tables that have 515 been built, other than for initial 516 states. */ 517 int min_trcount; /* Number of initial states. Equivalently, 518 the minimum state number for which trcount 519 counts transitions. */ 520 state_num **trans; /* Transition tables for states that can 521 never accept. If the transitions for a 522 state have not yet been computed, or the 523 state could possibly accept, its entry in 524 this table is NULL. This points to two 525 past the start of the allocated array, 526 and trans[-1] and trans[-2] are always 527 NULL. */ 528 state_num **fails; /* Transition tables after failing to accept 529 on a state that potentially could do so. 530 If trans[i] is non-null, fails[i] must 531 be null. */ 532 char *success; /* Table of acceptance conditions used in 533 dfaexec and computed in build_state. */ 534 state_num *newlines; /* Transitions on newlines. The entry for a 535 newline in any transition table is always 536 -1 so we can count lines without wasting 537 too many cycles. The transition for a 538 newline is stored separately and handled 539 as a special case. Newline is also used 540 as a sentinel at the end of the buffer. */ 541 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE 542 context in multibyte locales, in which we 543 do not distinguish between their contexts, 544 as not supported word. */ 545 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */ 546 state_num **mb_trans; /* Transition tables for states with 547 ANYCHAR. */ 548 state_num mb_trcount; /* Number of transition tables for states with 549 ANYCHAR that have actually been built. */ 550 551 /* Syntax configuration. This is near the end so that dfacopysyntax 552 can memset up to here. */ 553 struct regex_syntax syntax; 554 555 /* Information derived from the locale. This is at the end so that 556 a quick memset need not clear it specially. */ 557 558 /* dfaexec implementation. */ 559 char *(*dfaexec) (struct dfa *, char const *, char *, 560 bool, ptrdiff_t *, bool *); 561 562 /* Other cached information derived from the locale. */ 563 struct localeinfo localeinfo; 564 }; 565 566 /* User access to dfa internals. */ 567 568 /* S could possibly be an accepting state of R. */ 569 static bool 570 accepting (state_num s, struct dfa const *r) 571 { 572 return r->states[s].constraint != 0; 573 } 574 575 /* STATE accepts in the specified context. */ 576 static bool 577 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa) 578 { 579 return succeeds_in_context (dfa->states[state].constraint, prev, curr); 580 } 581 582 static void regexp (struct dfa *dfa); 583 584 /* Store into *PWC the result of converting the leading bytes of the 585 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc 586 and updating the conversion state in *D. On conversion error, 587 convert just a single byte, to WEOF. Return the number of bytes 588 converted. 589 590 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows: 591 592 * PWC points to wint_t, not to wchar_t. 593 * The last arg is a dfa *D instead of merely a multibyte conversion 594 state D->mbs. 595 * N must be at least 1. 596 * S[N - 1] must be a sentinel byte. 597 * Shift encodings are not supported. 598 * The return value is always in the range 1..N. 599 * D->mbs is always valid afterwards. 600 * *PWC is always set to something. */ 601 static int 602 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) 603 { 604 unsigned char uc = s[0]; 605 wint_t wc = d->localeinfo.sbctowc[uc]; 606 607 if (wc == WEOF) 608 { 609 wchar_t wch; 610 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs); 611 if (0 < nbytes && nbytes < (size_t) -2) 612 { 613 *pwc = wch; 614 return nbytes; 615 } 616 memset (&d->mbs, 0, sizeof d->mbs); 617 } 618 619 *pwc = wc; 620 return 1; 621 } 622 623 #ifdef DEBUG 624 625 static void 626 prtok (token t) 627 { 628 if (t <= END) 629 fprintf (stderr, "END"); 630 else if (0 <= t && t < NOTCHAR) 631 { 632 unsigned int ch = t; 633 fprintf (stderr, "0x%02x", ch); 634 } 635 else 636 { 637 char const *s; 638 switch (t) 639 { 640 case BEG: 641 s = "BEG"; 642 break; 643 case EMPTY: 644 s = "EMPTY"; 645 break; 646 case BACKREF: 647 s = "BACKREF"; 648 break; 649 case BEGLINE: 650 s = "BEGLINE"; 651 break; 652 case ENDLINE: 653 s = "ENDLINE"; 654 break; 655 case BEGWORD: 656 s = "BEGWORD"; 657 break; 658 case ENDWORD: 659 s = "ENDWORD"; 660 break; 661 case LIMWORD: 662 s = "LIMWORD"; 663 break; 664 case NOTLIMWORD: 665 s = "NOTLIMWORD"; 666 break; 667 case QMARK: 668 s = "QMARK"; 669 break; 670 case STAR: 671 s = "STAR"; 672 break; 673 case PLUS: 674 s = "PLUS"; 675 break; 676 case CAT: 677 s = "CAT"; 678 break; 679 case OR: 680 s = "OR"; 681 break; 682 case LPAREN: 683 s = "LPAREN"; 684 break; 685 case RPAREN: 686 s = "RPAREN"; 687 break; 688 case ANYCHAR: 689 s = "ANYCHAR"; 690 break; 691 case MBCSET: 692 s = "MBCSET"; 693 break; 694 default: 695 s = "CSET"; 696 break; 697 } 698 fprintf (stderr, "%s", s); 699 } 700 } 701 #endif /* DEBUG */ 702 703 /* Stuff pertaining to charclasses. */ 704 705 static bool 706 tstbit (unsigned int b, charclass const *c) 707 { 708 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1; 709 } 710 711 static void 712 setbit (unsigned int b, charclass *c) 713 { 714 charclass_word one = 1; 715 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS; 716 } 717 718 static void 719 clrbit (unsigned int b, charclass *c) 720 { 721 charclass_word one = 1; 722 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS); 723 } 724 725 static void 726 zeroset (charclass *s) 727 { 728 memset (s, 0, sizeof *s); 729 } 730 731 static void 732 fillset (charclass *s) 733 { 734 for (int i = 0; i < CHARCLASS_WORDS; i++) 735 s->w[i] = CHARCLASS_WORD_MASK; 736 } 737 738 static void 739 notset (charclass *s) 740 { 741 for (int i = 0; i < CHARCLASS_WORDS; ++i) 742 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i]; 743 } 744 745 static bool 746 equal (charclass const *s1, charclass const *s2) 747 { 748 charclass_word w = 0; 749 for (int i = 0; i < CHARCLASS_WORDS; i++) 750 w |= s1->w[i] ^ s2->w[i]; 751 return w == 0; 752 } 753 754 static bool 755 emptyset (charclass const *s) 756 { 757 charclass_word w = 0; 758 for (int i = 0; i < CHARCLASS_WORDS; i++) 759 w |= s->w[i]; 760 return w == 0; 761 } 762 763 /* Grow PA, which points to an array of *NITEMS items, and return the 764 location of the reallocated array, updating *NITEMS to reflect its 765 new size. The new array will contain at least NITEMS_INCR_MIN more 766 items, but will not contain more than NITEMS_MAX items total. 767 ITEM_SIZE is the size of each item, in bytes. 768 769 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be 770 nonnegative. If NITEMS_MAX is -1, it is treated as if it were 771 infinity. 772 773 If PA is null, then allocate a new array instead of reallocating 774 the old one. 775 776 Thus, to grow an array A without saving its old contents, do 777 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */ 778 779 static void * 780 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min, 781 ptrdiff_t nitems_max, idx_t item_size) 782 { 783 idx_t n0 = *nitems; 784 785 /* The approximate size to use for initial small allocation 786 requests. This is the largest "small" request for the GNU C 787 library malloc. */ 788 enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 }; 789 790 /* If the array is tiny, grow it to about (but no greater than) 791 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%. 792 Adjust the growth according to three constraints: NITEMS_INCR_MIN, 793 NITEMS_MAX, and what the C language can represent safely. */ 794 795 idx_t n, nbytes; 796 if (INT_ADD_WRAPV (n0, n0 >> 1, &n)) 797 n = IDX_MAX; 798 if (0 <= nitems_max && nitems_max < n) 799 n = nitems_max; 800 801 idx_t adjusted_nbytes 802 = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes) 803 ? MIN (IDX_MAX, SIZE_MAX) 804 : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0); 805 if (adjusted_nbytes) 806 { 807 n = adjusted_nbytes / item_size; 808 nbytes = adjusted_nbytes - adjusted_nbytes % item_size; 809 } 810 811 if (! pa) 812 *nitems = 0; 813 if (n - n0 < nitems_incr_min 814 && (INT_ADD_WRAPV (n0, nitems_incr_min, &n) 815 || (0 <= nitems_max && nitems_max < n) 816 || INT_MULTIPLY_WRAPV (n, item_size, &nbytes))) 817 xalloc_die (); 818 pa = xrealloc (pa, nbytes); 819 *nitems = n; 820 return pa; 821 } 822 823 /* Ensure that the array addressed by PA holds at least I + 1 items. 824 Either return PA, or reallocate the array and return its new address. 825 Although PA may be null, the returned value is never null. 826 827 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS 828 is updated on reallocation. If PA is null, *NITEMS must be zero. 829 Do not allocate more than NITEMS_MAX items total; -1 means no limit. 830 ITEM_SIZE is the size of one item; it must be positive. 831 Avoid O(N**2) behavior on arrays growing linearly. */ 832 static void * 833 maybe_realloc (void *pa, idx_t i, idx_t *nitems, 834 ptrdiff_t nitems_max, idx_t item_size) 835 { 836 if (i < *nitems) 837 return pa; 838 return xpalloc (pa, nitems, 1, nitems_max, item_size); 839 } 840 841 /* In DFA D, find the index of charclass S, or allocate a new one. */ 842 static idx_t 843 charclass_index (struct dfa *d, charclass const *s) 844 { 845 idx_t i; 846 847 for (i = 0; i < d->cindex; ++i) 848 if (equal (s, &d->charclasses[i])) 849 return i; 850 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc, 851 TOKEN_MAX - CSET, sizeof *d->charclasses); 852 ++d->cindex; 853 d->charclasses[i] = *s; 854 return i; 855 } 856 857 static bool 858 unibyte_word_constituent (struct dfa const *dfa, unsigned char c) 859 { 860 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); 861 } 862 863 static int 864 char_context (struct dfa const *dfa, unsigned char c) 865 { 866 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) 867 return CTX_NEWLINE; 868 if (unibyte_word_constituent (dfa, c)) 869 return CTX_LETTER; 870 return CTX_NONE; 871 } 872 873 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC 874 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, 875 this may happen when folding case in weird Turkish locales where 876 dotless i/dotted I are not included in the chosen character set. 877 Return whether a bit was set in the charclass. */ 878 static bool 879 setbit_wc (wint_t wc, charclass *c) 880 { 881 int b = wctob (wc); 882 if (b < 0) 883 return false; 884 885 setbit (b, c); 886 return true; 887 } 888 889 /* Set a bit for B and its case variants in the charclass C. 890 MB_CUR_MAX must be 1. */ 891 static void 892 setbit_case_fold_c (int b, charclass *c) 893 { 894 int ub = toupper (b); 895 for (int i = 0; i < NOTCHAR; i++) 896 if (toupper (i) == ub) 897 setbit (i, c); 898 } 899 900 /* Fetch the next lexical input character from the pattern. There 901 must at least one byte of pattern input. Set DFA->lex.wctok to the 902 value of the character or to WEOF depending on whether the input is 903 a valid multibyte character (possibly of length 1). Then return 904 the next input byte value, except return EOF if the input is a 905 multibyte character of length greater than 1. */ 906 static int 907 fetch_wc (struct dfa *dfa) 908 { 909 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left, 910 dfa); 911 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF; 912 dfa->lex.ptr += nbytes; 913 dfa->lex.left -= nbytes; 914 return c; 915 } 916 917 /* If there is no more input, report an error about unbalanced brackets. 918 Otherwise, behave as with fetch_wc (DFA). */ 919 static int 920 bracket_fetch_wc (struct dfa *dfa) 921 { 922 if (! dfa->lex.left) 923 dfaerror (_("unbalanced [")); 924 return fetch_wc (dfa); 925 } 926 927 typedef int predicate (int); 928 929 /* The following list maps the names of the Posix named character classes 930 to predicate functions that determine whether a given character is in 931 the class. The leading [ has already been eaten by the lexical 932 analyzer. */ 933 struct dfa_ctype 934 { 935 const char *name; 936 predicate *func; 937 bool single_byte_only; 938 }; 939 940 static const struct dfa_ctype prednames[] = { 941 {"alpha", isalpha, false}, 942 {"upper", isupper, false}, 943 {"lower", islower, false}, 944 {"digit", isdigit, true}, 945 {"xdigit", isxdigit, false}, 946 {"space", isspace, false}, 947 {"punct", ispunct, false}, 948 {"alnum", isalnum, false}, 949 {"print", isprint, false}, 950 {"graph", isgraph, false}, 951 {"cntrl", iscntrl, false}, 952 {"blank", isblank, false}, 953 {NULL, NULL, false} 954 }; 955 956 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE 957 find_pred (const char *str) 958 { 959 for (int i = 0; prednames[i].name; i++) 960 if (streq (str, prednames[i].name)) 961 return &prednames[i]; 962 return NULL; 963 } 964 965 /* Parse a bracket expression, which possibly includes multibyte 966 characters. */ 967 static token 968 parse_bracket_exp (struct dfa *dfa) 969 { 970 /* This is a bracket expression that dfaexec is known to 971 process correctly. */ 972 bool known_bracket_exp = true; 973 974 /* Used to warn about [:space:]. 975 Bit 0 = first character is a colon. 976 Bit 1 = last character is a colon. 977 Bit 2 = includes any other character but a colon. 978 Bit 3 = includes ranges, char/equiv classes or collation elements. */ 979 int colon_warning_state; 980 981 dfa->lex.brack.nchars = 0; 982 charclass ccl; 983 zeroset (&ccl); 984 int c = bracket_fetch_wc (dfa); 985 bool invert = c == '^'; 986 if (invert) 987 { 988 c = bracket_fetch_wc (dfa); 989 known_bracket_exp = dfa->localeinfo.simple; 990 } 991 wint_t wc = dfa->lex.wctok; 992 int c1; 993 wint_t wc1; 994 colon_warning_state = (c == ':'); 995 do 996 { 997 c1 = NOTCHAR; /* Mark c1 as not initialized. */ 998 colon_warning_state &= ~2; 999 1000 /* Note that if we're looking at some other [:...:] construct, 1001 we just treat it as a bunch of ordinary characters. We can do 1002 this because we assume regex has checked for syntax errors before 1003 dfa is ever called. */ 1004 if (c == '[') 1005 { 1006 c1 = bracket_fetch_wc (dfa); 1007 wc1 = dfa->lex.wctok; 1008 1009 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES)) 1010 || c1 == '.' || c1 == '=') 1011 { 1012 enum { MAX_BRACKET_STRING_LEN = 32 }; 1013 char str[MAX_BRACKET_STRING_LEN + 1]; 1014 int len = 0; 1015 for (;;) 1016 { 1017 c = bracket_fetch_wc (dfa); 1018 if (dfa->lex.left == 0 1019 || (c == c1 && dfa->lex.ptr[0] == ']')) 1020 break; 1021 if (len < MAX_BRACKET_STRING_LEN) 1022 str[len++] = c; 1023 else 1024 /* This is in any case an invalid class name. */ 1025 str[0] = '\0'; 1026 } 1027 str[len] = '\0'; 1028 1029 /* Fetch bracket. */ 1030 c = bracket_fetch_wc (dfa); 1031 wc = dfa->lex.wctok; 1032 if (c1 == ':') 1033 /* Build character class. POSIX allows character 1034 classes to match multicharacter collating elements, 1035 but the regex code does not support that, so do not 1036 worry about that possibility. */ 1037 { 1038 char const *class 1039 = (dfa->syntax.case_fold && (streq (str, "upper") 1040 || streq (str, "lower")) 1041 ? "alpha" : str); 1042 const struct dfa_ctype *pred = find_pred (class); 1043 if (!pred) 1044 dfaerror (_("invalid character class")); 1045 1046 if (dfa->localeinfo.multibyte && !pred->single_byte_only) 1047 known_bracket_exp = false; 1048 else 1049 for (int c2 = 0; c2 < NOTCHAR; ++c2) 1050 if (pred->func (c2)) 1051 setbit (c2, &ccl); 1052 } 1053 else 1054 known_bracket_exp = false; 1055 1056 colon_warning_state |= 8; 1057 1058 /* Fetch new lookahead character. */ 1059 c1 = bracket_fetch_wc (dfa); 1060 wc1 = dfa->lex.wctok; 1061 continue; 1062 } 1063 1064 /* We treat '[' as a normal character here. c/c1/wc/wc1 1065 are already set up. */ 1066 } 1067 1068 if (c == '\\' 1069 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1070 { 1071 c = bracket_fetch_wc (dfa); 1072 wc = dfa->lex.wctok; 1073 } 1074 1075 if (c1 == NOTCHAR) 1076 { 1077 c1 = bracket_fetch_wc (dfa); 1078 wc1 = dfa->lex.wctok; 1079 } 1080 1081 if (c1 == '-') 1082 /* build range characters. */ 1083 { 1084 int c2 = bracket_fetch_wc (dfa); 1085 wint_t wc2 = dfa->lex.wctok; 1086 1087 /* A bracket expression like [a-[.aa.]] matches an unknown set. 1088 Treat it like [-a[.aa.]] while parsing it, and 1089 remember that the set is unknown. */ 1090 if (c2 == '[' && dfa->lex.ptr[0] == '.') 1091 { 1092 known_bracket_exp = false; 1093 c2 = ']'; 1094 } 1095 1096 if (c2 == ']') 1097 { 1098 /* In the case [x-], the - is an ordinary hyphen, 1099 which is left in c1, the lookahead character. */ 1100 dfa->lex.ptr--; 1101 dfa->lex.left++; 1102 } 1103 else 1104 { 1105 if (c2 == '\\' && (dfa->syntax.syntax_bits 1106 & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1107 { 1108 c2 = bracket_fetch_wc (dfa); 1109 wc2 = dfa->lex.wctok; 1110 } 1111 1112 colon_warning_state |= 8; 1113 c1 = bracket_fetch_wc (dfa); 1114 wc1 = dfa->lex.wctok; 1115 1116 /* Treat [x-y] as a range if x != y. */ 1117 if (wc != wc2 || wc == WEOF) 1118 { 1119 if (dfa->localeinfo.simple 1120 || (isasciidigit (c) & isasciidigit (c2))) 1121 { 1122 for (int ci = c; ci <= c2; ci++) 1123 if (dfa->syntax.case_fold && isalpha (ci)) 1124 setbit_case_fold_c (ci, &ccl); 1125 else 1126 setbit (ci, &ccl); 1127 } 1128 else 1129 known_bracket_exp = false; 1130 1131 continue; 1132 } 1133 } 1134 } 1135 1136 colon_warning_state |= (c == ':') ? 2 : 4; 1137 1138 if (!dfa->localeinfo.multibyte) 1139 { 1140 if (dfa->syntax.case_fold && isalpha (c)) 1141 setbit_case_fold_c (c, &ccl); 1142 else 1143 setbit (c, &ccl); 1144 continue; 1145 } 1146 1147 if (wc == WEOF) 1148 known_bracket_exp = false; 1149 else 1150 { 1151 wchar_t folded[CASE_FOLDED_BUFSIZE + 1]; 1152 int n = (dfa->syntax.case_fold 1153 ? case_folded_counterparts (wc, folded + 1) + 1 1154 : 1); 1155 folded[0] = wc; 1156 for (int i = 0; i < n; i++) 1157 if (!setbit_wc (folded[i], &ccl)) 1158 { 1159 dfa->lex.brack.chars 1160 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars, 1161 &dfa->lex.brack.nchars_alloc, -1, 1162 sizeof *dfa->lex.brack.chars); 1163 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i]; 1164 } 1165 } 1166 } 1167 while ((wc = wc1, (c = c1) != ']')); 1168 1169 if (colon_warning_state == 7) 1170 dfawarn (_("character class syntax is [[:space:]], not [:space:]")); 1171 1172 if (! known_bracket_exp) 1173 return BACKREF; 1174 1175 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0)) 1176 { 1177 dfa->lex.brack.invert = invert; 1178 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl); 1179 return MBCSET; 1180 } 1181 1182 if (invert) 1183 { 1184 notset (&ccl); 1185 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) 1186 clrbit ('\n', &ccl); 1187 } 1188 1189 return CSET + charclass_index (dfa, &ccl); 1190 } 1191 1192 struct lexptr 1193 { 1194 char const *ptr; 1195 idx_t left; 1196 }; 1197 1198 static void 1199 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) 1200 { 1201 ls->ptr = dfa->lex.ptr; 1202 ls->left = dfa->lex.left; 1203 dfa->lex.ptr = s; 1204 dfa->lex.left = strlen (s); 1205 } 1206 1207 static void 1208 pop_lex_state (struct dfa *dfa, struct lexptr const *ls) 1209 { 1210 dfa->lex.ptr = ls->ptr; 1211 dfa->lex.left = ls->left; 1212 } 1213 1214 static token 1215 lex (struct dfa *dfa) 1216 { 1217 bool backslash = false; 1218 1219 /* Basic plan: We fetch a character. If it's a backslash, 1220 we set the backslash flag and go through the loop again. 1221 On the plus side, this avoids having a duplicate of the 1222 main switch inside the backslash case. On the minus side, 1223 it means that just about every case begins with 1224 "if (backslash) ...". */ 1225 for (int i = 0; i < 2; ++i) 1226 { 1227 if (! dfa->lex.left) 1228 return dfa->lex.lasttok = END; 1229 int c = fetch_wc (dfa); 1230 1231 switch (c) 1232 { 1233 case '\\': 1234 if (backslash) 1235 goto normal_char; 1236 if (dfa->lex.left == 0) 1237 dfaerror (_("unfinished \\ escape")); 1238 backslash = true; 1239 break; 1240 1241 case '^': 1242 if (backslash) 1243 goto normal_char; 1244 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS 1245 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN 1246 || dfa->lex.lasttok == OR) 1247 return dfa->lex.lasttok = BEGLINE; 1248 goto normal_char; 1249 1250 case '$': 1251 if (backslash) 1252 goto normal_char; 1253 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS 1254 || dfa->lex.left == 0 1255 || ((dfa->lex.left 1256 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)) 1257 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) 1258 & (dfa->lex.ptr[0] == '\\')] 1259 == ')')) 1260 || ((dfa->lex.left 1261 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)) 1262 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR) 1263 & (dfa->lex.ptr[0] == '\\')] 1264 == '|')) 1265 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT) 1266 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n')) 1267 return dfa->lex.lasttok = ENDLINE; 1268 goto normal_char; 1269 1270 case '1': 1271 case '2': 1272 case '3': 1273 case '4': 1274 case '5': 1275 case '6': 1276 case '7': 1277 case '8': 1278 case '9': 1279 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS)) 1280 { 1281 dfa->lex.laststart = false; 1282 return dfa->lex.lasttok = BACKREF; 1283 } 1284 goto normal_char; 1285 1286 case '`': 1287 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1288 { 1289 /* FIXME: should be beginning of string */ 1290 return dfa->lex.lasttok = BEGLINE; 1291 } 1292 goto normal_char; 1293 1294 case '\'': 1295 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1296 { 1297 /* FIXME: should be end of string */ 1298 return dfa->lex.lasttok = ENDLINE; 1299 } 1300 goto normal_char; 1301 1302 case '<': 1303 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1304 return dfa->lex.lasttok = BEGWORD; 1305 goto normal_char; 1306 1307 case '>': 1308 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1309 return dfa->lex.lasttok = ENDWORD; 1310 goto normal_char; 1311 1312 case 'b': 1313 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1314 return dfa->lex.lasttok = LIMWORD; 1315 goto normal_char; 1316 1317 case 'B': 1318 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1319 return dfa->lex.lasttok = NOTLIMWORD; 1320 goto normal_char; 1321 1322 case '?': 1323 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) 1324 goto normal_char; 1325 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) 1326 goto normal_char; 1327 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) 1328 && dfa->lex.laststart) 1329 goto normal_char; 1330 return dfa->lex.lasttok = QMARK; 1331 1332 case '*': 1333 if (backslash) 1334 goto normal_char; 1335 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) 1336 && dfa->lex.laststart) 1337 goto normal_char; 1338 return dfa->lex.lasttok = STAR; 1339 1340 case '+': 1341 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) 1342 goto normal_char; 1343 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0)) 1344 goto normal_char; 1345 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) 1346 && dfa->lex.laststart) 1347 goto normal_char; 1348 return dfa->lex.lasttok = PLUS; 1349 1350 case '{': 1351 if (!(dfa->syntax.syntax_bits & RE_INTERVALS)) 1352 goto normal_char; 1353 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0)) 1354 goto normal_char; 1355 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS) 1356 && dfa->lex.laststart) 1357 goto normal_char; 1358 1359 /* Cases: 1360 {M} - exact count 1361 {M,} - minimum count, maximum is infinity 1362 {,N} - 0 through N 1363 {,} - 0 to infinity (same as '*') 1364 {M,N} - M through N */ 1365 { 1366 char const *p = dfa->lex.ptr; 1367 char const *lim = p + dfa->lex.left; 1368 dfa->lex.minrep = dfa->lex.maxrep = -1; 1369 for (; p != lim && isasciidigit (*p); p++) 1370 dfa->lex.minrep = (dfa->lex.minrep < 0 1371 ? *p - '0' 1372 : MIN (RE_DUP_MAX + 1, 1373 dfa->lex.minrep * 10 + *p - '0')); 1374 if (p != lim) 1375 { 1376 if (*p != ',') 1377 dfa->lex.maxrep = dfa->lex.minrep; 1378 else 1379 { 1380 if (dfa->lex.minrep < 0) 1381 dfa->lex.minrep = 0; 1382 while (++p != lim && isasciidigit (*p)) 1383 dfa->lex.maxrep 1384 = (dfa->lex.maxrep < 0 1385 ? *p - '0' 1386 : MIN (RE_DUP_MAX + 1, 1387 dfa->lex.maxrep * 10 + *p - '0')); 1388 } 1389 } 1390 if (! ((! backslash || (p != lim && *p++ == '\\')) 1391 && p != lim && *p++ == '}' 1392 && 0 <= dfa->lex.minrep 1393 && (dfa->lex.maxrep < 0 1394 || dfa->lex.minrep <= dfa->lex.maxrep))) 1395 { 1396 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD) 1397 goto normal_char; 1398 dfaerror (_("invalid content of \\{\\}")); 1399 } 1400 if (RE_DUP_MAX < dfa->lex.maxrep) 1401 dfaerror (_("regular expression too big")); 1402 dfa->lex.ptr = p; 1403 dfa->lex.left = lim - p; 1404 } 1405 dfa->lex.laststart = false; 1406 return dfa->lex.lasttok = REPMN; 1407 1408 case '|': 1409 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS) 1410 goto normal_char; 1411 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0)) 1412 goto normal_char; 1413 dfa->lex.laststart = true; 1414 return dfa->lex.lasttok = OR; 1415 1416 case '\n': 1417 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS 1418 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT)) 1419 goto normal_char; 1420 dfa->lex.laststart = true; 1421 return dfa->lex.lasttok = OR; 1422 1423 case '(': 1424 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) 1425 goto normal_char; 1426 dfa->lex.parens++; 1427 dfa->lex.laststart = true; 1428 return dfa->lex.lasttok = LPAREN; 1429 1430 case ')': 1431 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0)) 1432 goto normal_char; 1433 if (dfa->lex.parens == 0 1434 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) 1435 goto normal_char; 1436 dfa->lex.parens--; 1437 dfa->lex.laststart = false; 1438 return dfa->lex.lasttok = RPAREN; 1439 1440 case '.': 1441 if (backslash) 1442 goto normal_char; 1443 if (dfa->canychar < 0) 1444 { 1445 charclass ccl; 1446 fillset (&ccl); 1447 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) 1448 clrbit ('\n', &ccl); 1449 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) 1450 clrbit ('\0', &ccl); 1451 if (dfa->localeinfo.multibyte) 1452 for (int c2 = 0; c2 < NOTCHAR; c2++) 1453 if (dfa->localeinfo.sbctowc[c2] == WEOF) 1454 clrbit (c2, &ccl); 1455 dfa->canychar = charclass_index (dfa, &ccl); 1456 } 1457 dfa->lex.laststart = false; 1458 return dfa->lex.lasttok = (dfa->localeinfo.multibyte 1459 ? ANYCHAR 1460 : CSET + dfa->canychar); 1461 1462 case 's': 1463 case 'S': 1464 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1465 goto normal_char; 1466 if (!dfa->localeinfo.multibyte) 1467 { 1468 charclass ccl; 1469 zeroset (&ccl); 1470 for (int c2 = 0; c2 < NOTCHAR; ++c2) 1471 if (isspace (c2)) 1472 setbit (c2, &ccl); 1473 if (c == 'S') 1474 notset (&ccl); 1475 dfa->lex.laststart = false; 1476 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl); 1477 } 1478 1479 /* FIXME: see if optimizing this, as is done with ANYCHAR and 1480 add_utf8_anychar, makes sense. */ 1481 1482 /* \s and \S are documented to be equivalent to [[:space:]] and 1483 [^[:space:]] respectively, so tell the lexer to process those 1484 strings, each minus its "already processed" '['. */ 1485 { 1486 struct lexptr ls; 1487 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']); 1488 dfa->lex.lasttok = parse_bracket_exp (dfa); 1489 pop_lex_state (dfa, &ls); 1490 } 1491 1492 dfa->lex.laststart = false; 1493 return dfa->lex.lasttok; 1494 1495 case 'w': 1496 case 'W': 1497 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) 1498 goto normal_char; 1499 1500 if (!dfa->localeinfo.multibyte) 1501 { 1502 charclass ccl; 1503 zeroset (&ccl); 1504 for (int c2 = 0; c2 < NOTCHAR; ++c2) 1505 if (dfa->syntax.sbit[c2] == CTX_LETTER) 1506 setbit (c2, &ccl); 1507 if (c == 'W') 1508 notset (&ccl); 1509 dfa->lex.laststart = false; 1510 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl); 1511 } 1512 1513 /* FIXME: see if optimizing this, as is done with ANYCHAR and 1514 add_utf8_anychar, makes sense. */ 1515 1516 /* \w and \W are documented to be equivalent to [_[:alnum:]] and 1517 [^_[:alnum:]] respectively, so tell the lexer to process those 1518 strings, each minus its "already processed" '['. */ 1519 { 1520 struct lexptr ls; 1521 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']); 1522 dfa->lex.lasttok = parse_bracket_exp (dfa); 1523 pop_lex_state (dfa, &ls); 1524 } 1525 1526 dfa->lex.laststart = false; 1527 return dfa->lex.lasttok; 1528 1529 case '[': 1530 if (backslash) 1531 goto normal_char; 1532 dfa->lex.laststart = false; 1533 return dfa->lex.lasttok = parse_bracket_exp (dfa); 1534 1535 default: 1536 normal_char: 1537 dfa->lex.laststart = false; 1538 /* For multibyte character sets, folding is done in atom. Always 1539 return WCHAR. */ 1540 if (dfa->localeinfo.multibyte) 1541 return dfa->lex.lasttok = WCHAR; 1542 1543 if (dfa->syntax.case_fold && isalpha (c)) 1544 { 1545 charclass ccl; 1546 zeroset (&ccl); 1547 setbit_case_fold_c (c, &ccl); 1548 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl); 1549 } 1550 1551 return dfa->lex.lasttok = c; 1552 } 1553 } 1554 1555 /* The above loop should consume at most a backslash 1556 and some other character. */ 1557 abort (); 1558 return END; /* keeps pedantic compilers happy. */ 1559 } 1560 1561 static void 1562 addtok_mb (struct dfa *dfa, token t, char mbprop) 1563 { 1564 if (dfa->talloc == dfa->tindex) 1565 { 1566 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1, 1567 sizeof *dfa->tokens); 1568 if (dfa->localeinfo.multibyte) 1569 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, 1570 sizeof *dfa->multibyte_prop); 1571 } 1572 if (dfa->localeinfo.multibyte) 1573 dfa->multibyte_prop[dfa->tindex] = mbprop; 1574 dfa->tokens[dfa->tindex++] = t; 1575 1576 switch (t) 1577 { 1578 case QMARK: 1579 case STAR: 1580 case PLUS: 1581 break; 1582 1583 case CAT: 1584 case OR: 1585 dfa->parse.depth--; 1586 break; 1587 1588 case BACKREF: 1589 dfa->fast = false; 1590 FALLTHROUGH; 1591 default: 1592 dfa->nleaves++; 1593 FALLTHROUGH; 1594 case EMPTY: 1595 dfa->parse.depth++; 1596 break; 1597 } 1598 if (dfa->parse.depth > dfa->depth) 1599 dfa->depth = dfa->parse.depth; 1600 } 1601 1602 static void addtok_wc (struct dfa *dfa, wint_t wc); 1603 1604 /* Add the given token to the parse tree, maintaining the depth count and 1605 updating the maximum depth if necessary. */ 1606 static void 1607 addtok (struct dfa *dfa, token t) 1608 { 1609 if (dfa->localeinfo.multibyte && t == MBCSET) 1610 { 1611 bool need_or = false; 1612 1613 /* Extract wide characters into alternations for better performance. 1614 This does not require UTF-8. */ 1615 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++) 1616 { 1617 addtok_wc (dfa, dfa->lex.brack.chars[i]); 1618 if (need_or) 1619 addtok (dfa, OR); 1620 need_or = true; 1621 } 1622 dfa->lex.brack.nchars = 0; 1623 1624 /* Wide characters have been handled above, so it is possible 1625 that the set is empty now. Do nothing in that case. */ 1626 if (dfa->lex.brack.cset != -1) 1627 { 1628 addtok (dfa, CSET + dfa->lex.brack.cset); 1629 if (need_or) 1630 addtok (dfa, OR); 1631 } 1632 } 1633 else 1634 { 1635 addtok_mb (dfa, t, 3); 1636 } 1637 } 1638 1639 /* We treat a multibyte character as a single atom, so that DFA 1640 can treat a multibyte character as a single expression. 1641 1642 e.g., we construct the following tree from "<mb1><mb2>". 1643 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> 1644 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */ 1645 static void 1646 addtok_wc (struct dfa *dfa, wint_t wc) 1647 { 1648 unsigned char buf[MB_LEN_MAX]; 1649 mbstate_t s = { 0 }; 1650 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); 1651 int buflen; 1652 1653 if (stored_bytes != (size_t) -1) 1654 buflen = stored_bytes; 1655 else 1656 { 1657 /* This is merely stop-gap. buf[0] is undefined, yet skipping 1658 the addtok_mb call altogether can corrupt the heap. */ 1659 buflen = 1; 1660 buf[0] = 0; 1661 } 1662 1663 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1); 1664 for (int i = 1; i < buflen; i++) 1665 { 1666 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0); 1667 addtok (dfa, CAT); 1668 } 1669 } 1670 1671 static void 1672 add_utf8_anychar (struct dfa *dfa) 1673 { 1674 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed 1675 UTF-8 byte sequence has been defined as follows: 1676 1677 ([\x00-\x7f] 1678 |[\xc2-\xdf][\x80-\xbf] 1679 |[\xe0][\xa0-\xbf][\x80-\xbf] 1680 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf] 1681 |[\xed][\x80-\x9f][\x80-\xbf] 1682 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf]) 1683 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf] 1684 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf]) 1685 1686 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC", 1687 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf], 1688 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed], 1689 H = [\x80-\x9f], I = [\xf0], 1690 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f]. 1691 1692 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */ 1693 1694 /* Mnemonics for classes containing two or more bytes. */ 1695 enum { A, B, C, E, F, H, J, K, M }; 1696 1697 /* Mnemonics for single-byte tokens. */ 1698 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 }; 1699 1700 static charclass const utf8_classes[] = { 1701 /* A. 00-7f: 1-byte sequence. */ 1702 CHARCLASS_INIT (0xffffffffffffffff, 0xffffffffffffffff, 0, 0), 1703 1704 /* B. c2-df: 1st byte of a 2-byte sequence. */ 1705 CHARCLASS_INIT (0, 0, 0, 0x00000000fffffffc), 1706 1707 /* C. 80-bf: non-leading bytes. */ 1708 CHARCLASS_INIT (0, 0, 0xffffffffffffffff, 0), 1709 1710 /* D. e0 (just a token). */ 1711 1712 /* E. a0-bf: 2nd byte of a "DEC" sequence. */ 1713 CHARCLASS_INIT (0, 0, 0xffffffff00000000, 0), 1714 1715 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */ 1716 CHARCLASS_INIT (0, 0, 0, 0x0000dffe00000000), 1717 1718 /* G. ed (just a token). */ 1719 1720 /* H. 80-9f: 2nd byte of a "GHC" sequence. */ 1721 CHARCLASS_INIT (0, 0, 0x000000000000ffff, 0), 1722 1723 /* I. f0 (just a token). */ 1724 1725 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */ 1726 CHARCLASS_INIT (0, 0, 0xffffffffffff0000, 0), 1727 1728 /* K. f1-f3: 1st byte of a "KCCC" sequence. */ 1729 CHARCLASS_INIT (0, 0, 0, 0x000e000000000000), 1730 1731 /* L. f4 (just a token). */ 1732 1733 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */ 1734 CHARCLASS_INIT (0, 0, 0x00000000000000ff, 0), 1735 }; 1736 1737 /* Define the character classes that are needed below. */ 1738 if (dfa->utf8_anychar_classes[0] == 0) 1739 { 1740 charclass c = utf8_classes[0]; 1741 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) 1742 clrbit ('\n', &c); 1743 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) 1744 clrbit ('\0', &c); 1745 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c); 1746 1747 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++) 1748 dfa->utf8_anychar_classes[i] 1749 = CSET + charclass_index (dfa, &utf8_classes[i]); 1750 } 1751 1752 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above. 1753 The token buffer is in reverse Polish order, so we get 1754 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K 1755 C CAT OR C CAT OR C CAT OR". */ 1756 addtok (dfa, dfa->utf8_anychar_classes[A]); 1757 addtok (dfa, dfa->utf8_anychar_classes[B]); 1758 addtok (dfa, D_token); 1759 addtok (dfa, dfa->utf8_anychar_classes[E]); 1760 addtok (dfa, CAT); 1761 addtok (dfa, OR); 1762 addtok (dfa, G_token); 1763 addtok (dfa, dfa->utf8_anychar_classes[H]); 1764 addtok (dfa, CAT); 1765 addtok (dfa, OR); 1766 addtok (dfa, dfa->utf8_anychar_classes[F]); 1767 addtok (dfa, I_token); 1768 addtok (dfa, dfa->utf8_anychar_classes[J]); 1769 addtok (dfa, CAT); 1770 addtok (dfa, OR); 1771 addtok (dfa, L_token); 1772 addtok (dfa, dfa->utf8_anychar_classes[M]); 1773 addtok (dfa, CAT); 1774 addtok (dfa, OR); 1775 addtok (dfa, dfa->utf8_anychar_classes[K]); 1776 for (int i = 0; i < 3; i++) 1777 { 1778 addtok (dfa, dfa->utf8_anychar_classes[C]); 1779 addtok (dfa, CAT); 1780 addtok (dfa, OR); 1781 } 1782 } 1783 1784 /* The grammar understood by the parser is as follows. 1785 1786 regexp: 1787 regexp OR branch 1788 branch 1789 1790 branch: 1791 branch closure 1792 closure 1793 1794 closure: 1795 closure QMARK 1796 closure STAR 1797 closure PLUS 1798 closure REPMN 1799 atom 1800 1801 atom: 1802 <normal character> 1803 <multibyte character> 1804 ANYCHAR 1805 MBCSET 1806 CSET 1807 BACKREF 1808 BEGLINE 1809 ENDLINE 1810 BEGWORD 1811 ENDWORD 1812 LIMWORD 1813 NOTLIMWORD 1814 LPAREN regexp RPAREN 1815 <empty> 1816 1817 The parser builds a parse tree in postfix form in an array of tokens. */ 1818 1819 static void 1820 atom (struct dfa *dfa) 1821 { 1822 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR) 1823 || dfa->parse.tok >= CSET 1824 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF 1825 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE 1826 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD 1827 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD 1828 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET) 1829 { 1830 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) 1831 { 1832 /* For UTF-8 expand the period to a series of CSETs that define a 1833 valid UTF-8 character. This avoids using the slow multibyte 1834 path. I'm pretty sure it would be both profitable and correct to 1835 do it for any encoding; however, the optimization must be done 1836 manually as it is done above in add_utf8_anychar. So, let's 1837 start with UTF-8: it is the most used, and the structure of the 1838 encoding makes the correctness more obvious. */ 1839 add_utf8_anychar (dfa); 1840 } 1841 else 1842 addtok (dfa, dfa->parse.tok); 1843 dfa->parse.tok = lex (dfa); 1844 } 1845 else if (dfa->parse.tok == WCHAR) 1846 { 1847 if (dfa->lex.wctok == WEOF) 1848 addtok (dfa, BACKREF); 1849 else 1850 { 1851 addtok_wc (dfa, dfa->lex.wctok); 1852 1853 if (dfa->syntax.case_fold) 1854 { 1855 wchar_t folded[CASE_FOLDED_BUFSIZE]; 1856 int n = case_folded_counterparts (dfa->lex.wctok, folded); 1857 for (int i = 0; i < n; i++) 1858 { 1859 addtok_wc (dfa, folded[i]); 1860 addtok (dfa, OR); 1861 } 1862 } 1863 } 1864 1865 dfa->parse.tok = lex (dfa); 1866 } 1867 else if (dfa->parse.tok == LPAREN) 1868 { 1869 dfa->parse.tok = lex (dfa); 1870 regexp (dfa); 1871 if (dfa->parse.tok != RPAREN) 1872 dfaerror (_("unbalanced (")); 1873 dfa->parse.tok = lex (dfa); 1874 } 1875 else 1876 addtok (dfa, EMPTY); 1877 } 1878 1879 /* Return the number of tokens in the given subexpression. */ 1880 static idx_t _GL_ATTRIBUTE_PURE 1881 nsubtoks (struct dfa const *dfa, idx_t tindex) 1882 { 1883 switch (dfa->tokens[tindex - 1]) 1884 { 1885 default: 1886 return 1; 1887 case QMARK: 1888 case STAR: 1889 case PLUS: 1890 return 1 + nsubtoks (dfa, tindex - 1); 1891 case CAT: 1892 case OR: 1893 { 1894 idx_t ntoks1 = nsubtoks (dfa, tindex - 1); 1895 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1); 1896 } 1897 } 1898 } 1899 1900 /* Copy the given subexpression to the top of the tree. */ 1901 static void 1902 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens) 1903 { 1904 if (dfa->localeinfo.multibyte) 1905 for (idx_t i = 0; i < ntokens; i++) 1906 addtok_mb (dfa, dfa->tokens[tindex + i], 1907 dfa->multibyte_prop[tindex + i]); 1908 else 1909 for (idx_t i = 0; i < ntokens; i++) 1910 addtok_mb (dfa, dfa->tokens[tindex + i], 3); 1911 } 1912 1913 static void 1914 closure (struct dfa *dfa) 1915 { 1916 atom (dfa); 1917 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR 1918 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) 1919 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) 1920 { 1921 idx_t ntokens = nsubtoks (dfa, dfa->tindex); 1922 idx_t tindex = dfa->tindex - ntokens; 1923 if (dfa->lex.maxrep < 0) 1924 addtok (dfa, PLUS); 1925 if (dfa->lex.minrep == 0) 1926 addtok (dfa, QMARK); 1927 int i; 1928 for (i = 1; i < dfa->lex.minrep; i++) 1929 { 1930 copytoks (dfa, tindex, ntokens); 1931 addtok (dfa, CAT); 1932 } 1933 for (; i < dfa->lex.maxrep; i++) 1934 { 1935 copytoks (dfa, tindex, ntokens); 1936 addtok (dfa, QMARK); 1937 addtok (dfa, CAT); 1938 } 1939 dfa->parse.tok = lex (dfa); 1940 } 1941 else if (dfa->parse.tok == REPMN) 1942 { 1943 dfa->tindex -= nsubtoks (dfa, dfa->tindex); 1944 dfa->parse.tok = lex (dfa); 1945 closure (dfa); 1946 } 1947 else 1948 { 1949 addtok (dfa, dfa->parse.tok); 1950 dfa->parse.tok = lex (dfa); 1951 } 1952 } 1953 1954 static void 1955 branch (struct dfa* dfa) 1956 { 1957 closure (dfa); 1958 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR 1959 && dfa->parse.tok >= 0) 1960 { 1961 closure (dfa); 1962 addtok (dfa, CAT); 1963 } 1964 } 1965 1966 static void 1967 regexp (struct dfa *dfa) 1968 { 1969 branch (dfa); 1970 while (dfa->parse.tok == OR) 1971 { 1972 dfa->parse.tok = lex (dfa); 1973 branch (dfa); 1974 addtok (dfa, OR); 1975 } 1976 } 1977 1978 /* Parse a string S of length LEN into D. S can include NUL characters. 1979 This is the main entry point for the parser. */ 1980 void 1981 dfaparse (char const *s, idx_t len, struct dfa *d) 1982 { 1983 d->lex.ptr = s; 1984 d->lex.left = len; 1985 d->lex.lasttok = END; 1986 d->lex.laststart = true; 1987 1988 if (!d->syntax.syntax_bits_set) 1989 dfaerror (_("no syntax specified")); 1990 1991 if (!d->nregexps) 1992 addtok (d, BEG); 1993 1994 d->parse.tok = lex (d); 1995 d->parse.depth = d->depth; 1996 1997 regexp (d); 1998 1999 if (d->parse.tok != END) 2000 dfaerror (_("unbalanced )")); 2001 2002 addtok (d, END - d->nregexps); 2003 addtok (d, CAT); 2004 2005 if (d->nregexps) 2006 addtok (d, OR); 2007 2008 ++d->nregexps; 2009 } 2010 2011 /* Some primitives for operating on sets of positions. */ 2012 2013 /* Copy one set to another. */ 2014 static void 2015 copy (position_set const *src, position_set *dst) 2016 { 2017 if (dst->alloc < src->nelem) 2018 { 2019 free (dst->elems); 2020 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1, 2021 sizeof *dst->elems); 2022 } 2023 dst->nelem = src->nelem; 2024 if (src->nelem != 0) 2025 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems); 2026 } 2027 2028 static void 2029 alloc_position_set (position_set *s, idx_t size) 2030 { 2031 s->elems = xnmalloc (size, sizeof *s->elems); 2032 s->alloc = size; 2033 s->nelem = 0; 2034 } 2035 2036 /* Insert position P in set S. S is maintained in sorted order on 2037 decreasing index. If there is already an entry in S with P.index 2038 then merge (logically-OR) P's constraints into the one in S. 2039 S->elems must point to an array large enough to hold the resulting set. */ 2040 static void 2041 insert (position p, position_set *s) 2042 { 2043 idx_t count = s->nelem; 2044 idx_t lo = 0, hi = count; 2045 while (lo < hi) 2046 { 2047 idx_t mid = (lo + hi) >> 1; 2048 if (s->elems[mid].index < p.index) 2049 lo = mid + 1; 2050 else if (s->elems[mid].index == p.index) 2051 { 2052 s->elems[mid].constraint |= p.constraint; 2053 return; 2054 } 2055 else 2056 hi = mid; 2057 } 2058 2059 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems); 2060 for (idx_t i = count; i > lo; i--) 2061 s->elems[i] = s->elems[i - 1]; 2062 s->elems[lo] = p; 2063 ++s->nelem; 2064 } 2065 2066 static void 2067 append (position p, position_set *s) 2068 { 2069 idx_t count = s->nelem; 2070 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems); 2071 s->elems[s->nelem++] = p; 2072 } 2073 2074 /* Merge S1 and S2 (with the additional constraint C2) into M. The 2075 result is as if the positions of S1, and of S2 with the additional 2076 constraint C2, were inserted into an initially empty set. */ 2077 static void 2078 merge_constrained (position_set const *s1, position_set const *s2, 2079 unsigned int c2, position_set *m) 2080 { 2081 idx_t i = 0, j = 0; 2082 2083 if (m->alloc - s1->nelem < s2->nelem) 2084 { 2085 free (m->elems); 2086 m->alloc = s1->nelem; 2087 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems); 2088 } 2089 m->nelem = 0; 2090 while (i < s1->nelem || j < s2->nelem) 2091 if (! (j < s2->nelem) 2092 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index)) 2093 { 2094 unsigned int c = ((i < s1->nelem && j < s2->nelem 2095 && s1->elems[i].index == s2->elems[j].index) 2096 ? s2->elems[j++].constraint & c2 2097 : 0); 2098 m->elems[m->nelem].index = s1->elems[i].index; 2099 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c; 2100 } 2101 else 2102 { 2103 if (s2->elems[j].constraint & c2) 2104 { 2105 m->elems[m->nelem].index = s2->elems[j].index; 2106 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2; 2107 } 2108 j++; 2109 } 2110 } 2111 2112 /* Merge two sets of positions into a third. The result is exactly as if 2113 the positions of both sets were inserted into an initially empty set. */ 2114 static void 2115 merge (position_set const *s1, position_set const *s2, position_set *m) 2116 { 2117 merge_constrained (s1, s2, -1, m); 2118 } 2119 2120 static void 2121 merge2 (position_set *dst, position_set const *src, position_set *m) 2122 { 2123 if (src->nelem < 4) 2124 { 2125 for (idx_t i = 0; i < src->nelem; i++) 2126 insert (src->elems[i], dst); 2127 } 2128 else 2129 { 2130 merge (src, dst, m); 2131 copy (m, dst); 2132 } 2133 } 2134 2135 /* Delete a position from a set. Return the nonzero constraint of the 2136 deleted position, or zero if there was no such position. */ 2137 static unsigned int 2138 delete (idx_t del, position_set *s) 2139 { 2140 idx_t count = s->nelem; 2141 idx_t lo = 0, hi = count; 2142 while (lo < hi) 2143 { 2144 idx_t mid = (lo + hi) >> 1; 2145 if (s->elems[mid].index < del) 2146 lo = mid + 1; 2147 else if (s->elems[mid].index == del) 2148 { 2149 unsigned int c = s->elems[mid].constraint; 2150 idx_t i; 2151 for (i = mid; i + 1 < count; i++) 2152 s->elems[i] = s->elems[i + 1]; 2153 s->nelem = i; 2154 return c; 2155 } 2156 else 2157 hi = mid; 2158 } 2159 return 0; 2160 } 2161 2162 /* Replace a position with the followed set. */ 2163 static void 2164 replace (position_set *dst, idx_t del, position_set *add, 2165 unsigned int constraint, position_set *tmp) 2166 { 2167 unsigned int c = delete (del, dst) & constraint; 2168 2169 if (c) 2170 { 2171 copy (dst, tmp); 2172 merge_constrained (tmp, add, c, dst); 2173 } 2174 } 2175 2176 /* Find the index of the state corresponding to the given position set with 2177 the given preceding context, or create a new state if there is no such 2178 state. Context tells whether we got here on a newline or letter. */ 2179 static state_num 2180 state_index (struct dfa *d, position_set const *s, int context) 2181 { 2182 size_t hash = 0; 2183 int constraint = 0; 2184 state_num i; 2185 token first_end = 0; 2186 2187 for (i = 0; i < s->nelem; ++i) 2188 { 2189 size_t ind = s->elems[i].index; 2190 hash ^= ind + s->elems[i].constraint; 2191 } 2192 2193 /* Try to find a state that exactly matches the proposed one. */ 2194 for (i = 0; i < d->sindex; ++i) 2195 { 2196 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem 2197 || context != d->states[i].context) 2198 continue; 2199 state_num j; 2200 for (j = 0; j < s->nelem; ++j) 2201 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint 2202 || s->elems[j].index != d->states[i].elems.elems[j].index) 2203 break; 2204 if (j == s->nelem) 2205 return i; 2206 } 2207 2208 #ifdef DEBUG 2209 fprintf (stderr, "new state %td\n nextpos:", i); 2210 for (state_num j = 0; j < s->nelem; j++) 2211 { 2212 fprintf (stderr, " %td:", s->elems[j].index); 2213 prtok (d->tokens[s->elems[j].index]); 2214 } 2215 fprintf (stderr, "\n context:"); 2216 if (context ^ CTX_ANY) 2217 { 2218 if (context & CTX_NONE) 2219 fprintf (stderr, " CTX_NONE"); 2220 if (context & CTX_LETTER) 2221 fprintf (stderr, " CTX_LETTER"); 2222 if (context & CTX_NEWLINE) 2223 fprintf (stderr, " CTX_NEWLINE"); 2224 } 2225 else 2226 fprintf (stderr, " CTX_ANY"); 2227 fprintf (stderr, "\n"); 2228 #endif 2229 2230 for (state_num j = 0; j < s->nelem; j++) 2231 { 2232 int c = d->constraints[s->elems[j].index]; 2233 2234 if (c != 0) 2235 { 2236 if (succeeds_in_context (c, context, CTX_ANY)) 2237 constraint |= c; 2238 if (!first_end) 2239 first_end = d->tokens[s->elems[j].index]; 2240 } 2241 else if (d->tokens[s->elems[j].index] == BACKREF) 2242 constraint = NO_CONSTRAINT; 2243 } 2244 2245 2246 /* Create a new state. */ 2247 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1, 2248 sizeof *d->states); 2249 d->states[i].hash = hash; 2250 alloc_position_set (&d->states[i].elems, s->nelem); 2251 copy (s, &d->states[i].elems); 2252 d->states[i].context = context; 2253 d->states[i].constraint = constraint; 2254 d->states[i].first_end = first_end; 2255 d->states[i].mbps.nelem = 0; 2256 d->states[i].mbps.elems = NULL; 2257 d->states[i].mb_trindex = -1; 2258 2259 ++d->sindex; 2260 2261 return i; 2262 } 2263 2264 /* Find the epsilon closure of a set of positions. If any position of the set 2265 contains a symbol that matches the empty string in some context, replace 2266 that position with the elements of its follow labeled with an appropriate 2267 constraint. Repeat exhaustively until no funny positions are left. 2268 S->elems must be large enough to hold the result. */ 2269 static void 2270 epsclosure (struct dfa const *d) 2271 { 2272 position_set tmp; 2273 alloc_position_set (&tmp, d->nleaves); 2274 for (idx_t i = 0; i < d->tindex; i++) 2275 if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR 2276 && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR 2277 && d->tokens[i] != MBCSET && d->tokens[i] < CSET) 2278 { 2279 unsigned int constraint; 2280 switch (d->tokens[i]) 2281 { 2282 case BEGLINE: 2283 constraint = BEGLINE_CONSTRAINT; 2284 break; 2285 case ENDLINE: 2286 constraint = ENDLINE_CONSTRAINT; 2287 break; 2288 case BEGWORD: 2289 constraint = BEGWORD_CONSTRAINT; 2290 break; 2291 case ENDWORD: 2292 constraint = ENDWORD_CONSTRAINT; 2293 break; 2294 case LIMWORD: 2295 constraint = LIMWORD_CONSTRAINT; 2296 break; 2297 case NOTLIMWORD: 2298 constraint = NOTLIMWORD_CONSTRAINT; 2299 break; 2300 default: 2301 constraint = NO_CONSTRAINT; 2302 break; 2303 } 2304 2305 delete (i, &d->follows[i]); 2306 2307 for (idx_t j = 0; j < d->tindex; j++) 2308 if (i != j && d->follows[j].nelem > 0) 2309 replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); 2310 } 2311 free (tmp.elems); 2312 } 2313 2314 /* Returns the set of contexts for which there is at least one 2315 character included in C. */ 2316 2317 static int 2318 charclass_context (struct dfa const *dfa, charclass const *c) 2319 { 2320 int context = 0; 2321 2322 for (int j = 0; j < CHARCLASS_WORDS; j++) 2323 { 2324 if (c->w[j] & dfa->syntax.newline.w[j]) 2325 context |= CTX_NEWLINE; 2326 if (c->w[j] & dfa->syntax.letters.w[j]) 2327 context |= CTX_LETTER; 2328 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j])) 2329 context |= CTX_NONE; 2330 } 2331 2332 return context; 2333 } 2334 2335 /* Returns the contexts on which the position set S depends. Each context 2336 in the set of returned contexts (let's call it SC) may have a different 2337 follow set than other contexts in SC, and also different from the 2338 follow set of the complement set (sc ^ CTX_ANY). However, all contexts 2339 in the complement set will have the same follow set. */ 2340 2341 static int _GL_ATTRIBUTE_PURE 2342 state_separate_contexts (struct dfa *d, position_set const *s) 2343 { 2344 int separate_contexts = 0; 2345 2346 for (idx_t j = 0; j < s->nelem; j++) 2347 separate_contexts |= d->separates[s->elems[j].index]; 2348 2349 return separate_contexts; 2350 } 2351 2352 enum 2353 { 2354 /* Single token is repeated. It is distinguished from non-repeated. */ 2355 OPT_REPEAT = (1 << 0), 2356 2357 /* Multiple tokens are repeated. This flag is on at head of tokens. The 2358 node is not merged. */ 2359 OPT_LPAREN = (1 << 1), 2360 2361 /* Multiple branches are joined. The node is not merged. */ 2362 OPT_RPAREN = (1 << 2), 2363 2364 /* The node is walked. If the node is found in walking again, OPT_RPAREN 2365 flag is turned on. */ 2366 OPT_WALKED = (1 << 3), 2367 2368 /* The node is queued. The node is not queued again. */ 2369 OPT_QUEUED = (1 << 4) 2370 }; 2371 2372 static void 2373 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags, 2374 position_set *merged) 2375 { 2376 position_set *follows = d->follows; 2377 idx_t nelem = 0; 2378 2379 d->constraints[tindex] = 0; 2380 2381 for (idx_t i = 0; i < follows[tindex].nelem; i++) 2382 { 2383 idx_t sindex = follows[tindex].elems[i].index; 2384 2385 /* Skip the node as pruned in future. */ 2386 unsigned int iconstraint = follows[tindex].elems[i].constraint; 2387 if (iconstraint == 0) 2388 continue; 2389 2390 if (d->tokens[follows[tindex].elems[i].index] <= END) 2391 { 2392 d->constraints[tindex] |= follows[tindex].elems[i].constraint; 2393 continue; 2394 } 2395 2396 if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN))) 2397 { 2398 idx_t j; 2399 2400 for (j = 0; j < nelem; j++) 2401 { 2402 idx_t dindex = follows[tindex].elems[j].index; 2403 2404 if (follows[tindex].elems[j].constraint != iconstraint) 2405 continue; 2406 2407 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN)) 2408 continue; 2409 2410 if (d->tokens[sindex] != d->tokens[dindex]) 2411 continue; 2412 2413 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT) 2414 continue; 2415 2416 if (flags[sindex] & OPT_REPEAT) 2417 delete (sindex, &follows[sindex]); 2418 2419 merge2 (&follows[dindex], &follows[sindex], merged); 2420 2421 break; 2422 } 2423 2424 if (j < nelem) 2425 continue; 2426 } 2427 2428 follows[tindex].elems[nelem++] = follows[tindex].elems[i]; 2429 flags[sindex] |= OPT_QUEUED; 2430 } 2431 2432 follows[tindex].nelem = nelem; 2433 } 2434 2435 static int 2436 compare (const void *a, const void *b) 2437 { 2438 position const *p = a, *q = b; 2439 return p->index < q->index ? -1 : p->index > q->index; 2440 } 2441 2442 static void 2443 reorder_tokens (struct dfa *d) 2444 { 2445 idx_t nleaves; 2446 ptrdiff_t *map; 2447 token *tokens; 2448 position_set *follows; 2449 int *constraints; 2450 char *multibyte_prop; 2451 2452 nleaves = 0; 2453 2454 map = xnmalloc (d->tindex, sizeof *map); 2455 2456 map[0] = nleaves++; 2457 2458 for (idx_t i = 1; i < d->tindex; i++) 2459 map[i] = -1; 2460 2461 tokens = xnmalloc (d->nleaves, sizeof *tokens); 2462 follows = xnmalloc (d->nleaves, sizeof *follows); 2463 constraints = xnmalloc (d->nleaves, sizeof *constraints); 2464 2465 if (d->localeinfo.multibyte) 2466 multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop); 2467 else 2468 multibyte_prop = NULL; 2469 2470 for (idx_t i = 0; i < d->tindex; i++) 2471 { 2472 if (map[i] == -1) 2473 { 2474 free (d->follows[i].elems); 2475 d->follows[i].elems = NULL; 2476 d->follows[i].nelem = 0; 2477 continue; 2478 } 2479 2480 tokens[map[i]] = d->tokens[i]; 2481 follows[map[i]] = d->follows[i]; 2482 constraints[map[i]] = d->constraints[i]; 2483 2484 if (multibyte_prop != NULL) 2485 multibyte_prop[map[i]] = d->multibyte_prop[i]; 2486 2487 for (idx_t j = 0; j < d->follows[i].nelem; j++) 2488 { 2489 if (map[d->follows[i].elems[j].index] == -1) 2490 map[d->follows[i].elems[j].index] = nleaves++; 2491 2492 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index]; 2493 } 2494 2495 qsort (d->follows[i].elems, d->follows[i].nelem, 2496 sizeof *d->follows[i].elems, compare); 2497 } 2498 2499 for (idx_t i = 0; i < nleaves; i++) 2500 { 2501 d->tokens[i] = tokens[i]; 2502 d->follows[i] = follows[i]; 2503 d->constraints[i] = constraints[i]; 2504 2505 if (multibyte_prop != NULL) 2506 d->multibyte_prop[i] = multibyte_prop[i]; 2507 } 2508 2509 d->tindex = d->nleaves = nleaves; 2510 2511 free (tokens); 2512 free (follows); 2513 free (constraints); 2514 free (multibyte_prop); 2515 free (map); 2516 } 2517 2518 static void 2519 dfaoptimize (struct dfa *d) 2520 { 2521 char *flags = xzalloc (d->tindex); 2522 2523 for (idx_t i = 0; i < d->tindex; i++) 2524 { 2525 for (idx_t j = 0; j < d->follows[i].nelem; j++) 2526 { 2527 if (d->follows[i].elems[j].index == i) 2528 flags[d->follows[i].elems[j].index] |= OPT_REPEAT; 2529 else if (d->follows[i].elems[j].index < i) 2530 flags[d->follows[i].elems[j].index] |= OPT_LPAREN; 2531 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED) 2532 flags[d->follows[i].elems[j].index] |= OPT_RPAREN; 2533 else 2534 flags[d->follows[i].elems[j].index] |= OPT_WALKED; 2535 } 2536 } 2537 2538 flags[0] |= OPT_QUEUED; 2539 2540 position_set merged0; 2541 position_set *merged = &merged0; 2542 alloc_position_set (merged, d->nleaves); 2543 2544 d->constraints = xnmalloc (d->tindex, sizeof *d->constraints); 2545 2546 for (idx_t i = 0; i < d->tindex; i++) 2547 if (flags[i] & OPT_QUEUED) 2548 merge_nfa_state (d, i, flags, merged); 2549 2550 reorder_tokens (d); 2551 2552 free (merged->elems); 2553 free (flags); 2554 } 2555 2556 /* Perform bottom-up analysis on the parse tree, computing various functions. 2557 Note that at this point, we're pretending constructs like \< are real 2558 characters rather than constraints on what can follow them. 2559 2560 Nullable: A node is nullable if it is at the root of a regexp that can 2561 match the empty string. 2562 * EMPTY leaves are nullable. 2563 * No other leaf is nullable. 2564 * A QMARK or STAR node is nullable. 2565 * A PLUS node is nullable if its argument is nullable. 2566 * A CAT node is nullable if both its arguments are nullable. 2567 * An OR node is nullable if either argument is nullable. 2568 2569 Firstpos: The firstpos of a node is the set of positions (nonempty leaves) 2570 that could correspond to the first character of a string matching the 2571 regexp rooted at the given node. 2572 * EMPTY leaves have empty firstpos. 2573 * The firstpos of a nonempty leaf is that leaf itself. 2574 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its 2575 argument. 2576 * The firstpos of a CAT node is the firstpos of the left argument, union 2577 the firstpos of the right if the left argument is nullable. 2578 * The firstpos of an OR node is the union of firstpos of each argument. 2579 2580 Lastpos: The lastpos of a node is the set of positions that could 2581 correspond to the last character of a string matching the regexp at 2582 the given node. 2583 * EMPTY leaves have empty lastpos. 2584 * The lastpos of a nonempty leaf is that leaf itself. 2585 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its 2586 argument. 2587 * The lastpos of a CAT node is the lastpos of its right argument, union 2588 the lastpos of the left if the right argument is nullable. 2589 * The lastpos of an OR node is the union of the lastpos of each argument. 2590 2591 Follow: The follow of a position is the set of positions that could 2592 correspond to the character following a character matching the node in 2593 a string matching the regexp. At this point we consider special symbols 2594 that match the empty string in some context to be just normal characters. 2595 Later, if we find that a special symbol is in a follow set, we will 2596 replace it with the elements of its follow, labeled with an appropriate 2597 constraint. 2598 * Every node in the firstpos of the argument of a STAR or PLUS node is in 2599 the follow of every node in the lastpos. 2600 * Every node in the firstpos of the second argument of a CAT node is in 2601 the follow of every node in the lastpos of the first argument. 2602 2603 Because of the postfix representation of the parse tree, the depth-first 2604 analysis is conveniently done by a linear scan with the aid of a stack. 2605 Sets are stored as arrays of the elements, obeying a stack-like allocation 2606 scheme; the number of elements in each set deeper in the stack can be 2607 used to determine the address of a particular set's array. */ 2608 static void 2609 dfaanalyze (struct dfa *d, bool searchflag) 2610 { 2611 /* Array allocated to hold position sets. */ 2612 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc); 2613 /* Firstpos and lastpos elements. */ 2614 position *firstpos = posalloc; 2615 position *lastpos = firstpos + d->nleaves; 2616 position pos; 2617 position_set tmp; 2618 2619 /* Stack for element counts and nullable flags. */ 2620 struct 2621 { 2622 /* Whether the entry is nullable. */ 2623 bool nullable; 2624 2625 /* Counts of firstpos and lastpos sets. */ 2626 idx_t nfirstpos; 2627 idx_t nlastpos; 2628 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc; 2629 2630 position_set merged; /* Result of merging sets. */ 2631 2632 addtok (d, CAT); 2633 2634 #ifdef DEBUG 2635 fprintf (stderr, "dfaanalyze:\n"); 2636 for (idx_t i = 0; i < d->tindex; i++) 2637 { 2638 fprintf (stderr, " %td:", i); 2639 prtok (d->tokens[i]); 2640 } 2641 putc ('\n', stderr); 2642 #endif 2643 2644 d->searchflag = searchflag; 2645 alloc_position_set (&merged, d->nleaves); 2646 d->follows = xcalloc (d->tindex, sizeof *d->follows); 2647 2648 for (idx_t i = 0; i < d->tindex; i++) 2649 { 2650 switch (d->tokens[i]) 2651 { 2652 case EMPTY: 2653 /* The empty set is nullable. */ 2654 stk->nullable = true; 2655 2656 /* The firstpos and lastpos of the empty leaf are both empty. */ 2657 stk->nfirstpos = stk->nlastpos = 0; 2658 stk++; 2659 break; 2660 2661 case STAR: 2662 case PLUS: 2663 /* Every element in the firstpos of the argument is in the follow 2664 of every element in the lastpos. */ 2665 { 2666 tmp.elems = firstpos - stk[-1].nfirstpos; 2667 tmp.nelem = stk[-1].nfirstpos; 2668 position *p = lastpos - stk[-1].nlastpos; 2669 for (idx_t j = 0; j < stk[-1].nlastpos; j++) 2670 { 2671 merge (&tmp, &d->follows[p[j].index], &merged); 2672 copy (&merged, &d->follows[p[j].index]); 2673 } 2674 } 2675 FALLTHROUGH; 2676 case QMARK: 2677 /* A QMARK or STAR node is automatically nullable. */ 2678 if (d->tokens[i] != PLUS) 2679 stk[-1].nullable = true; 2680 break; 2681 2682 case CAT: 2683 /* Every element in the firstpos of the second argument is in the 2684 follow of every element in the lastpos of the first argument. */ 2685 { 2686 tmp.nelem = stk[-1].nfirstpos; 2687 tmp.elems = firstpos - stk[-1].nfirstpos; 2688 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; 2689 for (idx_t j = 0; j < stk[-2].nlastpos; j++) 2690 { 2691 merge (&tmp, &d->follows[p[j].index], &merged); 2692 copy (&merged, &d->follows[p[j].index]); 2693 } 2694 } 2695 2696 /* The firstpos of a CAT node is the firstpos of the first argument, 2697 union that of the second argument if the first is nullable. */ 2698 if (stk[-2].nullable) 2699 stk[-2].nfirstpos += stk[-1].nfirstpos; 2700 else 2701 firstpos -= stk[-1].nfirstpos; 2702 2703 /* The lastpos of a CAT node is the lastpos of the second argument, 2704 union that of the first argument if the second is nullable. */ 2705 if (stk[-1].nullable) 2706 stk[-2].nlastpos += stk[-1].nlastpos; 2707 else 2708 { 2709 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; 2710 for (idx_t j = 0; j < stk[-1].nlastpos; j++) 2711 p[j] = p[j + stk[-2].nlastpos]; 2712 lastpos -= stk[-2].nlastpos; 2713 stk[-2].nlastpos = stk[-1].nlastpos; 2714 } 2715 2716 /* A CAT node is nullable if both arguments are nullable. */ 2717 stk[-2].nullable &= stk[-1].nullable; 2718 stk--; 2719 break; 2720 2721 case OR: 2722 /* The firstpos is the union of the firstpos of each argument. */ 2723 stk[-2].nfirstpos += stk[-1].nfirstpos; 2724 2725 /* The lastpos is the union of the lastpos of each argument. */ 2726 stk[-2].nlastpos += stk[-1].nlastpos; 2727 2728 /* An OR node is nullable if either argument is nullable. */ 2729 stk[-2].nullable |= stk[-1].nullable; 2730 stk--; 2731 break; 2732 2733 default: 2734 /* Anything else is a nonempty position. (Note that special 2735 constructs like \< are treated as nonempty strings here; 2736 an "epsilon closure" effectively makes them nullable later. 2737 Backreferences have to get a real position so we can detect 2738 transitions on them later. But they are nullable. */ 2739 stk->nullable = d->tokens[i] == BACKREF; 2740 2741 /* This position is in its own firstpos and lastpos. */ 2742 stk->nfirstpos = stk->nlastpos = 1; 2743 stk++; 2744 2745 firstpos->index = lastpos->index = i; 2746 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; 2747 firstpos++, lastpos++; 2748 2749 break; 2750 } 2751 #ifdef DEBUG 2752 /* ... balance the above nonsyntactic #ifdef goo... */ 2753 fprintf (stderr, "node %td:", i); 2754 prtok (d->tokens[i]); 2755 putc ('\n', stderr); 2756 fprintf (stderr, 2757 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n"); 2758 fprintf (stderr, " firstpos:"); 2759 for (idx_t j = 0; j < stk[-1].nfirstpos; j++) 2760 { 2761 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index); 2762 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]); 2763 } 2764 fprintf (stderr, "\n lastpos:"); 2765 for (idx_t j = 0; j < stk[-1].nlastpos; j++) 2766 { 2767 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index); 2768 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]); 2769 } 2770 putc ('\n', stderr); 2771 #endif 2772 } 2773 2774 /* For each follow set that is the follow set of a real position, replace 2775 it with its epsilon closure. */ 2776 epsclosure (d); 2777 2778 dfaoptimize (d); 2779 2780 #ifdef DEBUG 2781 for (idx_t i = 0; i < d->tindex; i++) 2782 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR 2783 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR 2784 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET) 2785 { 2786 fprintf (stderr, "follows(%td:", i); 2787 prtok (d->tokens[i]); 2788 fprintf (stderr, "):"); 2789 for (idx_t j = 0; j < d->follows[i].nelem; j++) 2790 { 2791 fprintf (stderr, " %td:", d->follows[i].elems[j].index); 2792 prtok (d->tokens[d->follows[i].elems[j].index]); 2793 } 2794 putc ('\n', stderr); 2795 } 2796 #endif 2797 2798 pos.index = 0; 2799 pos.constraint = NO_CONSTRAINT; 2800 2801 alloc_position_set (&tmp, 1); 2802 2803 append (pos, &tmp); 2804 2805 d->separates = xnmalloc (d->tindex, sizeof *d->separates); 2806 2807 for (idx_t i = 0; i < d->tindex; i++) 2808 { 2809 d->separates[i] = 0; 2810 2811 if (prev_newline_dependent (d->constraints[i])) 2812 d->separates[i] |= CTX_NEWLINE; 2813 if (prev_letter_dependent (d->constraints[i])) 2814 d->separates[i] |= CTX_LETTER; 2815 2816 for (idx_t j = 0; j < d->follows[i].nelem; j++) 2817 { 2818 if (prev_newline_dependent (d->follows[i].elems[j].constraint)) 2819 d->separates[i] |= CTX_NEWLINE; 2820 if (prev_letter_dependent (d->follows[i].elems[j].constraint)) 2821 d->separates[i] |= CTX_LETTER; 2822 } 2823 } 2824 2825 /* Context wanted by some position. */ 2826 int separate_contexts = state_separate_contexts (d, &tmp); 2827 2828 /* Build the initial state. */ 2829 if (separate_contexts & CTX_NEWLINE) 2830 state_index (d, &tmp, CTX_NEWLINE); 2831 d->initstate_notbol = d->min_trcount 2832 = state_index (d, &tmp, separate_contexts ^ CTX_ANY); 2833 if (separate_contexts & CTX_LETTER) 2834 d->min_trcount = state_index (d, &tmp, CTX_LETTER); 2835 d->min_trcount++; 2836 d->trcount = 0; 2837 2838 free (posalloc); 2839 free (stkalloc); 2840 free (merged.elems); 2841 free (tmp.elems); 2842 } 2843 2844 /* Make sure D's state arrays are large enough to hold NEW_STATE. */ 2845 static void 2846 realloc_trans_if_necessary (struct dfa *d) 2847 { 2848 state_num oldalloc = d->tralloc; 2849 if (oldalloc < d->sindex) 2850 { 2851 state_num **realtrans = d->trans ? d->trans - 2 : NULL; 2852 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0; 2853 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc, 2854 -1, sizeof *realtrans); 2855 realtrans[0] = realtrans[1] = NULL; 2856 d->trans = realtrans + 2; 2857 idx_t newalloc = d->tralloc = newalloc1 - 2; 2858 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); 2859 d->success = xnrealloc (d->success, newalloc, sizeof *d->success); 2860 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); 2861 if (d->localeinfo.multibyte) 2862 { 2863 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL; 2864 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); 2865 if (oldalloc == 0) 2866 realtrans[0] = realtrans[1] = NULL; 2867 d->mb_trans = realtrans + 2; 2868 } 2869 for (; oldalloc < newalloc; oldalloc++) 2870 { 2871 d->trans[oldalloc] = NULL; 2872 d->fails[oldalloc] = NULL; 2873 if (d->localeinfo.multibyte) 2874 d->mb_trans[oldalloc] = NULL; 2875 } 2876 } 2877 } 2878 2879 /* 2880 Calculate the transition table for a new state derived from state s 2881 for a compiled dfa d after input character uc, and return the new 2882 state number. 2883 2884 Do not worry about all possible input characters; calculate just the group 2885 of positions that match uc. Label it with the set of characters that 2886 every position in the group matches (taking into account, if necessary, 2887 preceding context information of s). Then find the union 2888 of these positions' follows, i.e., the set of positions of the 2889 new state. For each character in the group's label, set the transition 2890 on this character to be to a state corresponding to the set's positions, 2891 and its associated backward context information, if necessary. 2892 2893 When building a searching matcher, include the positions of state 2894 0 in every state. 2895 2896 The group is constructed by building an equivalence-class 2897 partition of the positions of s. 2898 2899 For each position, find the set of characters C that it matches. Eliminate 2900 any characters from C that fail on grounds of backward context. 2901 2902 Check whether the group's label L has nonempty 2903 intersection with C. If L - C is nonempty, create a new group labeled 2904 L - C and having the same positions as the current group, and set L to 2905 the intersection of L and C. Insert the position in the group, set 2906 C = C - L, and resume scanning. 2907 2908 If after comparing with every group there are characters remaining in C, 2909 create a new group labeled with the characters of C and insert this 2910 position in that group. */ 2911 2912 static state_num 2913 build_state (state_num s, struct dfa *d, unsigned char uc) 2914 { 2915 position_set follows; /* Union of the follows for each 2916 position of the current state. */ 2917 position_set group; /* Positions that match the input char. */ 2918 position_set tmp; /* Temporary space for merging sets. */ 2919 state_num state; /* New state. */ 2920 state_num state_newline; /* New state on a newline transition. */ 2921 state_num state_letter; /* New state on a letter transition. */ 2922 2923 #ifdef DEBUG 2924 fprintf (stderr, "build state %td\n", s); 2925 #endif 2926 2927 /* A pointer to the new transition table, and the table itself. */ 2928 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s; 2929 state_num *trans = *ptrans; 2930 2931 if (!trans) 2932 { 2933 /* MAX_TRCOUNT is an arbitrary upper limit on the number of 2934 transition tables that can exist at once, other than for 2935 initial states. Often-used transition tables are quickly 2936 rebuilt, whereas rarely-used ones are cleared away. */ 2937 if (MAX_TRCOUNT <= d->trcount) 2938 { 2939 for (state_num i = d->min_trcount; i < d->tralloc; i++) 2940 { 2941 free (d->trans[i]); 2942 free (d->fails[i]); 2943 d->trans[i] = d->fails[i] = NULL; 2944 } 2945 d->trcount = 0; 2946 } 2947 2948 d->trcount++; 2949 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans); 2950 2951 /* Fill transition table with a default value which means that the 2952 transited state has not been calculated yet. */ 2953 for (int i = 0; i < NOTCHAR; i++) 2954 trans[i] = -2; 2955 } 2956 2957 /* Set up the success bits for this state. */ 2958 d->success[s] = 0; 2959 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d)) 2960 d->success[s] |= CTX_NEWLINE; 2961 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d)) 2962 d->success[s] |= CTX_LETTER; 2963 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d)) 2964 d->success[s] |= CTX_NONE; 2965 2966 alloc_position_set (&follows, d->nleaves); 2967 2968 /* Find the union of the follows of the positions of the group. 2969 This is a hideously inefficient loop. Fix it someday. */ 2970 for (idx_t j = 0; j < d->states[s].elems.nelem; j++) 2971 for (idx_t k = 0; 2972 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k) 2973 insert (d->follows[d->states[s].elems.elems[j].index].elems[k], 2974 &follows); 2975 2976 /* Positions that match the input char. */ 2977 alloc_position_set (&group, d->nleaves); 2978 2979 /* The group's label. */ 2980 charclass label; 2981 fillset (&label); 2982 2983 for (idx_t i = 0; i < follows.nelem; i++) 2984 { 2985 charclass matches; /* Set of matching characters. */ 2986 position pos = follows.elems[i]; 2987 bool matched = false; 2988 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) 2989 { 2990 zeroset (&matches); 2991 setbit (d->tokens[pos.index], &matches); 2992 if (d->tokens[pos.index] == uc) 2993 matched = true; 2994 } 2995 else if (d->tokens[pos.index] >= CSET) 2996 { 2997 matches = d->charclasses[d->tokens[pos.index] - CSET]; 2998 if (tstbit (uc, &matches)) 2999 matched = true; 3000 } 3001 else if (d->tokens[pos.index] == ANYCHAR) 3002 { 3003 matches = d->charclasses[d->canychar]; 3004 if (tstbit (uc, &matches)) 3005 matched = true; 3006 3007 /* ANYCHAR must match with a single character, so we must put 3008 it to D->states[s].mbps which contains the positions which 3009 can match with a single character not a byte. If all 3010 positions which has ANYCHAR does not depend on context of 3011 next character, we put the follows instead of it to 3012 D->states[s].mbps to optimize. */ 3013 if (succeeds_in_context (pos.constraint, d->states[s].context, 3014 CTX_NONE)) 3015 { 3016 if (d->states[s].mbps.nelem == 0) 3017 alloc_position_set (&d->states[s].mbps, 1); 3018 insert (pos, &d->states[s].mbps); 3019 } 3020 } 3021 else 3022 continue; 3023 3024 /* Some characters may need to be eliminated from matches because 3025 they fail in the current context. */ 3026 if (pos.constraint != NO_CONSTRAINT) 3027 { 3028 if (!succeeds_in_context (pos.constraint, 3029 d->states[s].context, CTX_NEWLINE)) 3030 for (int j = 0; j < CHARCLASS_WORDS; j++) 3031 matches.w[j] &= ~d->syntax.newline.w[j]; 3032 if (!succeeds_in_context (pos.constraint, 3033 d->states[s].context, CTX_LETTER)) 3034 for (int j = 0; j < CHARCLASS_WORDS; ++j) 3035 matches.w[j] &= ~d->syntax.letters.w[j]; 3036 if (!succeeds_in_context (pos.constraint, 3037 d->states[s].context, CTX_NONE)) 3038 for (int j = 0; j < CHARCLASS_WORDS; ++j) 3039 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j]; 3040 3041 /* If there are no characters left, there's no point in going on. */ 3042 if (emptyset (&matches)) 3043 continue; 3044 3045 /* If we have reset the bit that made us declare "matched", reset 3046 that indicator, too. This is required to avoid an infinite loop 3047 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */ 3048 if (!tstbit (uc, &matches)) 3049 matched = false; 3050 } 3051 3052 #ifdef DEBUG 3053 fprintf (stderr, " nextpos %td:", pos.index); 3054 prtok (d->tokens[pos.index]); 3055 fprintf (stderr, " of"); 3056 for (unsigned j = 0; j < NOTCHAR; j++) 3057 if (tstbit (j, &matches)) 3058 fprintf (stderr, " 0x%02x", j); 3059 fprintf (stderr, "\n"); 3060 #endif 3061 3062 if (matched) 3063 { 3064 for (int k = 0; k < CHARCLASS_WORDS; ++k) 3065 label.w[k] &= matches.w[k]; 3066 append (pos, &group); 3067 } 3068 else 3069 { 3070 for (int k = 0; k < CHARCLASS_WORDS; ++k) 3071 label.w[k] &= ~matches.w[k]; 3072 } 3073 } 3074 3075 alloc_position_set (&tmp, d->nleaves); 3076 3077 if (group.nelem > 0) 3078 { 3079 /* If we are building a searching matcher, throw in the positions 3080 of state 0 as well, if possible. */ 3081 if (d->searchflag) 3082 { 3083 /* If a token in follows.elems is not 1st byte of a multibyte 3084 character, or the states of follows must accept the bytes 3085 which are not 1st byte of the multibyte character. 3086 Then, if a state of follows encounters a byte, it must not be 3087 a 1st byte of a multibyte character nor a single byte character. 3088 In this case, do not add state[0].follows to next state, because 3089 state[0] must accept 1st-byte. 3090 3091 For example, suppose <sb a> is a certain single byte character, 3092 <mb A> is a certain multibyte character, and the codepoint of 3093 <sb a> equals the 2nd byte of the codepoint of <mb A>. When 3094 state[0] accepts <sb a>, state[i] transits to state[i+1] by 3095 accepting the 1st byte of <mb A>, and state[i+1] accepts the 3096 2nd byte of <mb A>, if state[i+1] encounters the codepoint of 3097 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do 3098 not add state[0]. */ 3099 3100 bool mergeit = !d->localeinfo.multibyte; 3101 if (!mergeit) 3102 { 3103 mergeit = true; 3104 for (idx_t j = 0; mergeit && j < group.nelem; j++) 3105 mergeit &= d->multibyte_prop[group.elems[j].index]; 3106 } 3107 if (mergeit) 3108 { 3109 merge (&d->states[0].elems, &group, &tmp); 3110 copy (&tmp, &group); 3111 } 3112 } 3113 3114 /* Find out if the new state will want any context information, 3115 by calculating possible contexts that the group can match, 3116 and separate contexts that the new state wants to know. */ 3117 int possible_contexts = charclass_context (d, &label); 3118 int separate_contexts = state_separate_contexts (d, &group); 3119 3120 /* Find the state(s) corresponding to the union of the follows. */ 3121 if (possible_contexts & ~separate_contexts) 3122 state = state_index (d, &group, separate_contexts ^ CTX_ANY); 3123 else 3124 state = -1; 3125 if (separate_contexts & possible_contexts & CTX_NEWLINE) 3126 state_newline = state_index (d, &group, CTX_NEWLINE); 3127 else 3128 state_newline = state; 3129 if (separate_contexts & possible_contexts & CTX_LETTER) 3130 state_letter = state_index (d, &group, CTX_LETTER); 3131 else 3132 state_letter = state; 3133 3134 /* Reallocate now, to reallocate any newline transition properly. */ 3135 realloc_trans_if_necessary (d); 3136 } 3137 3138 /* If we are a searching matcher, the default transition is to a state 3139 containing the positions of state 0, otherwise the default transition 3140 is to fail miserably. */ 3141 else if (d->searchflag) 3142 { 3143 state_newline = 0; 3144 state_letter = d->min_trcount - 1; 3145 state = d->initstate_notbol; 3146 } 3147 else 3148 { 3149 state_newline = -1; 3150 state_letter = -1; 3151 state = -1; 3152 } 3153 3154 /* Set the transitions for each character in the label. */ 3155 for (int i = 0; i < NOTCHAR; i++) 3156 if (tstbit (i, &label)) 3157 switch (d->syntax.sbit[i]) 3158 { 3159 case CTX_NEWLINE: 3160 trans[i] = state_newline; 3161 break; 3162 case CTX_LETTER: 3163 trans[i] = state_letter; 3164 break; 3165 default: 3166 trans[i] = state; 3167 break; 3168 } 3169 3170 #ifdef DEBUG 3171 fprintf (stderr, "trans table %td", s); 3172 for (int i = 0; i < NOTCHAR; ++i) 3173 { 3174 if (!(i & 0xf)) 3175 fprintf (stderr, "\n"); 3176 fprintf (stderr, " %2td", trans[i]); 3177 } 3178 fprintf (stderr, "\n"); 3179 #endif 3180 3181 free (group.elems); 3182 free (follows.elems); 3183 free (tmp.elems); 3184 3185 /* Keep the newline transition in a special place so we can use it as 3186 a sentinel. */ 3187 if (tstbit (d->syntax.eolbyte, &label)) 3188 { 3189 d->newlines[s] = trans[d->syntax.eolbyte]; 3190 trans[d->syntax.eolbyte] = -1; 3191 } 3192 3193 return trans[uc]; 3194 } 3195 3196 /* Multibyte character handling sub-routines for dfaexec. */ 3197 3198 /* Consume a single byte and transit state from 's' to '*next_state'. 3199 This function is almost same as the state transition routin in dfaexec. 3200 But state transition is done just once, otherwise matching succeed or 3201 reach the end of the buffer. */ 3202 static state_num 3203 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) 3204 { 3205 state_num *t; 3206 3207 if (d->trans[s]) 3208 t = d->trans[s]; 3209 else if (d->fails[s]) 3210 t = d->fails[s]; 3211 else 3212 { 3213 build_state (s, d, **pp); 3214 if (d->trans[s]) 3215 t = d->trans[s]; 3216 else 3217 { 3218 t = d->fails[s]; 3219 assert (t); 3220 } 3221 } 3222 3223 if (t[**pp] == -2) 3224 build_state (s, d, **pp); 3225 3226 return t[*(*pp)++]; 3227 } 3228 3229 /* Transit state from s, then return new state and update the pointer of 3230 the buffer. This function is for a period operator which can match a 3231 multi-byte character. */ 3232 static state_num 3233 transit_state (struct dfa *d, state_num s, unsigned char const **pp, 3234 unsigned char const *end) 3235 { 3236 wint_t wc; 3237 3238 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); 3239 3240 /* This state has some operators which can match a multibyte character. */ 3241 d->mb_follows.nelem = 0; 3242 3243 /* Calculate the state which can be reached from the state 's' by 3244 consuming 'mbclen' single bytes from the buffer. */ 3245 state_num s1 = s; 3246 int mbci; 3247 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++) 3248 s = transit_state_singlebyte (d, s, pp); 3249 *pp += mbclen - mbci; 3250 3251 if (wc == WEOF) 3252 { 3253 /* It is an invalid character, so ANYCHAR is not accepted. */ 3254 return s; 3255 } 3256 3257 /* If all positions which have ANYCHAR do not depend on the context 3258 of the next character, calculate the next state with 3259 pre-calculated follows and cache the result. */ 3260 if (d->states[s1].mb_trindex < 0) 3261 { 3262 if (MAX_TRCOUNT <= d->mb_trcount) 3263 { 3264 state_num s3; 3265 for (s3 = -1; s3 < d->tralloc; s3++) 3266 { 3267 free (d->mb_trans[s3]); 3268 d->mb_trans[s3] = NULL; 3269 } 3270 3271 for (state_num i = 0; i < d->sindex; i++) 3272 d->states[i].mb_trindex = -1; 3273 d->mb_trcount = 0; 3274 } 3275 d->states[s1].mb_trindex = d->mb_trcount++; 3276 } 3277 3278 if (! d->mb_trans[s]) 3279 { 3280 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] }; 3281 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE }; 3282 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE); 3283 for (int i = 0; i < MAX_TRCOUNT; i++) 3284 d->mb_trans[s][i] = -1; 3285 } 3286 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) 3287 return d->mb_trans[s][d->states[s1].mb_trindex]; 3288 3289 if (s == -1) 3290 copy (&d->states[s1].mbps, &d->mb_follows); 3291 else 3292 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); 3293 3294 int separate_contexts = state_separate_contexts (d, &d->mb_follows); 3295 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); 3296 realloc_trans_if_necessary (d); 3297 3298 d->mb_trans[s][d->states[s1].mb_trindex] = s2; 3299 3300 return s2; 3301 } 3302 3303 /* The initial state may encounter a byte which is not a single byte character 3304 nor the first byte of a multibyte character. But it is incorrect for the 3305 initial state to accept such a byte. For example, in Shift JIS the regular 3306 expression "\\" accepts the codepoint 0x5c, but should not accept the second 3307 byte of the codepoint 0x815c. Then the initial state must skip the bytes 3308 that are not a single byte character nor the first byte of a multibyte 3309 character. 3310 3311 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches 3312 or exceeds P, and return the advanced MBP. If WCP is non-NULL and 3313 the result is greater than P, set *WCP to the final wide character 3314 processed, or to WEOF if no wide character is processed. Otherwise, 3315 if WCP is non-NULL, *WCP may or may not be updated. 3316 3317 Both P and MBP must be no larger than END. */ 3318 static unsigned char const * 3319 skip_remains_mb (struct dfa *d, unsigned char const *p, 3320 unsigned char const *mbp, char const *end) 3321 { 3322 if (d->syntax.never_trail[*p]) 3323 return p; 3324 while (mbp < p) 3325 { 3326 wint_t wc; 3327 mbp += mbs_to_wchar (&wc, (char const *) mbp, 3328 end - (char const *) mbp, d); 3329 } 3330 return mbp; 3331 } 3332 3333 /* Search through a buffer looking for a match to the struct dfa *D. 3334 Find the first occurrence of a string matching the regexp in the 3335 buffer, and the shortest possible version thereof. Return a pointer to 3336 the first character after the match, or NULL if none is found. BEGIN 3337 points to the beginning of the buffer, and END points to the first byte 3338 after its end. Note however that we store a sentinel byte (usually 3339 newline) in *END, so the actual buffer must be one byte longer. 3340 When ALLOW_NL, newlines may appear in the matching string. 3341 If COUNT is non-NULL, increment *COUNT once for each newline processed. 3342 If MULTIBYTE, the input consists of multibyte characters and/or 3343 encoding-error bytes. Otherwise, it consists of single-byte characters. 3344 Here is the list of features that make this DFA matcher punt: 3345 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z] 3346 - [^...] in non-simple locale 3347 - [[=foo=]] or [[.foo.]] 3348 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK) 3349 - back-reference: (.)\1 3350 - word-delimiter in multibyte locale: \<, \>, \b, \B 3351 See struct localeinfo.simple for the definition of "simple locale". */ 3352 3353 static inline char * 3354 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, 3355 ptrdiff_t *count, bool multibyte) 3356 { 3357 if (MAX_TRCOUNT <= d->sindex) 3358 { 3359 for (state_num s = d->min_trcount; s < d->sindex; s++) 3360 { 3361 free (d->states[s].elems.elems); 3362 free (d->states[s].mbps.elems); 3363 } 3364 d->sindex = d->min_trcount; 3365 3366 if (d->trans) 3367 { 3368 for (state_num s = 0; s < d->tralloc; s++) 3369 { 3370 free (d->trans[s]); 3371 free (d->fails[s]); 3372 d->trans[s] = d->fails[s] = NULL; 3373 } 3374 d->trcount = 0; 3375 } 3376 3377 if (d->localeinfo.multibyte && d->mb_trans) 3378 { 3379 for (state_num s = -1; s < d->tralloc; s++) 3380 { 3381 free (d->mb_trans[s]); 3382 d->mb_trans[s] = NULL; 3383 } 3384 for (state_num s = 0; s < d->min_trcount; s++) 3385 d->states[s].mb_trindex = -1; 3386 d->mb_trcount = 0; 3387 } 3388 } 3389 3390 if (!d->tralloc) 3391 realloc_trans_if_necessary (d); 3392 3393 /* Current state. */ 3394 state_num s = 0, s1 = 0; 3395 3396 /* Current input character. */ 3397 unsigned char const *p = (unsigned char const *) begin; 3398 unsigned char const *mbp = p; 3399 3400 /* Copy of d->trans so it can be optimized into a register. */ 3401 state_num **trans = d->trans; 3402 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */ 3403 unsigned char saved_end = *(unsigned char *) end; 3404 *end = eol; 3405 3406 if (multibyte) 3407 { 3408 memset (&d->mbs, 0, sizeof d->mbs); 3409 if (d->mb_follows.alloc == 0) 3410 alloc_position_set (&d->mb_follows, d->nleaves); 3411 } 3412 3413 idx_t nlcount = 0; 3414 for (;;) 3415 { 3416 state_num *t; 3417 while ((t = trans[s]) != NULL) 3418 { 3419 if (s < d->min_trcount) 3420 { 3421 if (!multibyte || d->states[s].mbps.nelem == 0) 3422 { 3423 while (t[*p] == s) 3424 p++; 3425 } 3426 if (multibyte) 3427 p = mbp = skip_remains_mb (d, p, mbp, end); 3428 } 3429 3430 if (multibyte) 3431 { 3432 s1 = s; 3433 3434 if (d->states[s].mbps.nelem == 0 3435 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) 3436 { 3437 /* If an input character does not match ANYCHAR, do it 3438 like a single-byte character. */ 3439 s = t[*p++]; 3440 } 3441 else 3442 { 3443 s = transit_state (d, s, &p, (unsigned char *) end); 3444 mbp = p; 3445 trans = d->trans; 3446 } 3447 } 3448 else 3449 { 3450 s1 = t[*p++]; 3451 t = trans[s1]; 3452 if (! t) 3453 { 3454 state_num tmp = s; 3455 s = s1; 3456 s1 = tmp; /* swap */ 3457 break; 3458 } 3459 if (s < d->min_trcount) 3460 { 3461 while (t[*p] == s1) 3462 p++; 3463 } 3464 s = t[*p++]; 3465 } 3466 } 3467 3468 if (s < 0) 3469 { 3470 if (s == -2) 3471 { 3472 s = build_state (s1, d, p[-1]); 3473 trans = d->trans; 3474 } 3475 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1]) 3476 { 3477 /* The previous character was a newline. Count it, and skip 3478 checking of multibyte character boundary until here. */ 3479 nlcount++; 3480 mbp = p; 3481 3482 s = (allow_nl ? d->newlines[s1] 3483 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0 3484 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1 3485 : d->initstate_notbol); 3486 } 3487 else 3488 { 3489 p = NULL; 3490 goto done; 3491 } 3492 } 3493 else if (d->fails[s]) 3494 { 3495 if ((d->success[s] & d->syntax.sbit[*p]) 3496 || ((char *) p == end 3497 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s, 3498 d))) 3499 goto done; 3500 3501 if (multibyte && s < d->min_trcount) 3502 p = mbp = skip_remains_mb (d, p, mbp, end); 3503 3504 s1 = s; 3505 if (!multibyte || d->states[s].mbps.nelem == 0 3506 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) 3507 { 3508 /* If a input character does not match ANYCHAR, do it 3509 like a single-byte character. */ 3510 s = d->fails[s][*p++]; 3511 } 3512 else 3513 { 3514 s = transit_state (d, s, &p, (unsigned char *) end); 3515 mbp = p; 3516 trans = d->trans; 3517 } 3518 } 3519 else 3520 { 3521 build_state (s, d, p[0]); 3522 trans = d->trans; 3523 } 3524 } 3525 3526 done: 3527 if (count) 3528 *count += nlcount; 3529 *end = saved_end; 3530 return (char *) p; 3531 } 3532 3533 /* Specialized versions of dfaexec for multibyte and single-byte cases. 3534 This is for performance, as dfaexec_main is an inline function. */ 3535 3536 static char * 3537 dfaexec_mb (struct dfa *d, char const *begin, char *end, 3538 bool allow_nl, ptrdiff_t *count, bool *backref) 3539 { 3540 return dfaexec_main (d, begin, end, allow_nl, count, true); 3541 } 3542 3543 static char * 3544 dfaexec_sb (struct dfa *d, char const *begin, char *end, 3545 bool allow_nl, ptrdiff_t *count, bool *backref) 3546 { 3547 return dfaexec_main (d, begin, end, allow_nl, count, false); 3548 } 3549 3550 /* Always set *BACKREF and return BEGIN. Use this wrapper for 3551 any regexp that uses a construct not supported by this code. */ 3552 static char * 3553 dfaexec_noop (struct dfa *d, char const *begin, char *end, 3554 bool allow_nl, ptrdiff_t *count, bool *backref) 3555 { 3556 *backref = true; 3557 return (char *) begin; 3558 } 3559 3560 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), 3561 but faster and set *BACKREF if the DFA code does not support this 3562 regexp usage. */ 3563 3564 char * 3565 dfaexec (struct dfa *d, char const *begin, char *end, 3566 bool allow_nl, ptrdiff_t *count, bool *backref) 3567 { 3568 return d->dfaexec (d, begin, end, allow_nl, count, backref); 3569 } 3570 3571 struct dfa * 3572 dfasuperset (struct dfa const *d) 3573 { 3574 return d->superset; 3575 } 3576 3577 bool 3578 dfaisfast (struct dfa const *d) 3579 { 3580 return d->fast; 3581 } 3582 3583 static void 3584 free_mbdata (struct dfa *d) 3585 { 3586 free (d->multibyte_prop); 3587 free (d->lex.brack.chars); 3588 free (d->mb_follows.elems); 3589 3590 if (d->mb_trans) 3591 { 3592 state_num s; 3593 for (s = -1; s < d->tralloc; s++) 3594 free (d->mb_trans[s]); 3595 free (d->mb_trans - 2); 3596 } 3597 } 3598 3599 /* Return true if every construct in D is supported by this DFA matcher. */ 3600 static bool _GL_ATTRIBUTE_PURE 3601 dfa_supported (struct dfa const *d) 3602 { 3603 for (idx_t i = 0; i < d->tindex; i++) 3604 { 3605 switch (d->tokens[i]) 3606 { 3607 case BEGWORD: 3608 case ENDWORD: 3609 case LIMWORD: 3610 case NOTLIMWORD: 3611 if (!d->localeinfo.multibyte) 3612 continue; 3613 FALLTHROUGH; 3614 case BACKREF: 3615 case MBCSET: 3616 return false; 3617 } 3618 } 3619 return true; 3620 } 3621 3622 /* Disable use of the superset DFA if it is not likely to help 3623 performance. */ 3624 static void 3625 maybe_disable_superset_dfa (struct dfa *d) 3626 { 3627 if (!d->localeinfo.using_utf8) 3628 return; 3629 3630 bool have_backref = false; 3631 for (idx_t i = 0; i < d->tindex; i++) 3632 { 3633 switch (d->tokens[i]) 3634 { 3635 case ANYCHAR: 3636 /* Lowered. */ 3637 abort (); 3638 case BACKREF: 3639 have_backref = true; 3640 break; 3641 case MBCSET: 3642 /* Requires multi-byte algorithm. */ 3643 return; 3644 default: 3645 break; 3646 } 3647 } 3648 3649 if (!have_backref && d->superset) 3650 { 3651 /* The superset DFA is not likely to be much faster, so remove it. */ 3652 dfafree (d->superset); 3653 free (d->superset); 3654 d->superset = NULL; 3655 } 3656 3657 free_mbdata (d); 3658 d->localeinfo.multibyte = false; 3659 d->dfaexec = dfaexec_sb; 3660 d->fast = true; 3661 } 3662 3663 static void 3664 dfassbuild (struct dfa *d) 3665 { 3666 struct dfa *sup = dfaalloc (); 3667 3668 *sup = *d; 3669 sup->localeinfo.multibyte = false; 3670 sup->dfaexec = dfaexec_sb; 3671 sup->multibyte_prop = NULL; 3672 sup->superset = NULL; 3673 sup->states = NULL; 3674 sup->sindex = 0; 3675 sup->constraints = NULL; 3676 sup->separates = NULL; 3677 sup->follows = NULL; 3678 sup->tralloc = 0; 3679 sup->trans = NULL; 3680 sup->fails = NULL; 3681 sup->success = NULL; 3682 sup->newlines = NULL; 3683 3684 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); 3685 if (d->cindex) 3686 { 3687 memcpy (sup->charclasses, d->charclasses, 3688 d->cindex * sizeof *sup->charclasses); 3689 } 3690 3691 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); 3692 sup->talloc = d->tindex * 2; 3693 3694 bool have_achar = false; 3695 bool have_nchar = false; 3696 idx_t j; 3697 for (idx_t i = j = 0; i < d->tindex; i++) 3698 { 3699 switch (d->tokens[i]) 3700 { 3701 case ANYCHAR: 3702 case MBCSET: 3703 case BACKREF: 3704 { 3705 charclass ccl; 3706 fillset (&ccl); 3707 sup->tokens[j++] = CSET + charclass_index (sup, &ccl); 3708 sup->tokens[j++] = STAR; 3709 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR 3710 || d->tokens[i + 1] == PLUS) 3711 i++; 3712 have_achar = true; 3713 } 3714 break; 3715 case BEGWORD: 3716 case ENDWORD: 3717 case LIMWORD: 3718 case NOTLIMWORD: 3719 if (d->localeinfo.multibyte) 3720 { 3721 /* These constraints aren't supported in a multibyte locale. 3722 Ignore them in the superset DFA. */ 3723 sup->tokens[j++] = EMPTY; 3724 break; 3725 } 3726 FALLTHROUGH; 3727 default: 3728 sup->tokens[j++] = d->tokens[i]; 3729 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR) 3730 || d->tokens[i] >= CSET) 3731 have_nchar = true; 3732 break; 3733 } 3734 } 3735 sup->tindex = j; 3736 3737 if (have_nchar && (have_achar || d->localeinfo.multibyte)) 3738 d->superset = sup; 3739 else 3740 { 3741 dfafree (sup); 3742 free (sup); 3743 } 3744 } 3745 3746 /* Parse a string S of length LEN into D (but skip this step if S is null). 3747 Then analyze D and build a matcher for it. 3748 SEARCHFLAG says whether to build a searching or an exact matcher. */ 3749 void 3750 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag) 3751 { 3752 if (s != NULL) 3753 dfaparse (s, len, d); 3754 3755 dfassbuild (d); 3756 3757 if (dfa_supported (d)) 3758 { 3759 maybe_disable_superset_dfa (d); 3760 dfaanalyze (d, searchflag); 3761 } 3762 else 3763 { 3764 d->dfaexec = dfaexec_noop; 3765 } 3766 3767 if (d->superset) 3768 { 3769 d->fast = true; 3770 dfaanalyze (d->superset, searchflag); 3771 } 3772 } 3773 3774 /* Free the storage held by the components of a dfa. */ 3775 void 3776 dfafree (struct dfa *d) 3777 { 3778 free (d->charclasses); 3779 free (d->tokens); 3780 3781 if (d->localeinfo.multibyte) 3782 free_mbdata (d); 3783 3784 free (d->constraints); 3785 free (d->separates); 3786 3787 for (idx_t i = 0; i < d->sindex; i++) 3788 { 3789 free (d->states[i].elems.elems); 3790 free (d->states[i].mbps.elems); 3791 } 3792 free (d->states); 3793 3794 if (d->follows) 3795 { 3796 for (idx_t i = 0; i < d->tindex; i++) 3797 free (d->follows[i].elems); 3798 free (d->follows); 3799 } 3800 3801 if (d->trans) 3802 { 3803 for (idx_t i = 0; i < d->tralloc; i++) 3804 { 3805 free (d->trans[i]); 3806 free (d->fails[i]); 3807 } 3808 3809 free (d->trans - 2); 3810 free (d->fails); 3811 free (d->newlines); 3812 free (d->success); 3813 } 3814 3815 if (d->superset) 3816 { 3817 dfafree (d->superset); 3818 free (d->superset); 3819 } 3820 } 3821 3822 /* Having found the postfix representation of the regular expression, 3823 try to find a long sequence of characters that must appear in any line 3824 containing the r.e. 3825 Finding a "longest" sequence is beyond the scope here; 3826 we take an easy way out and hope for the best. 3827 (Take "(ab|a)b"--please.) 3828 3829 We do a bottom-up calculation of sequences of characters that must appear 3830 in matches of r.e.'s represented by trees rooted at the nodes of the postfix 3831 representation: 3832 sequences that must appear at the left of the match ("left") 3833 sequences that must appear at the right of the match ("right") 3834 lists of sequences that must appear somewhere in the match ("in") 3835 sequences that must constitute the match ("is") 3836 3837 When we get to the root of the tree, we use one of the longest of its 3838 calculated "in" sequences as our answer. 3839 3840 The sequences calculated for the various types of node (in pseudo ANSI c) 3841 are shown below. "p" is the operand of unary operators (and the left-hand 3842 operand of binary operators); "q" is the right-hand operand of binary 3843 operators. 3844 3845 "ZERO" means "a zero-length sequence" below. 3846 3847 Type left right is in 3848 ---- ---- ----- -- -- 3849 char c # c # c # c # c 3850 3851 ANYCHAR ZERO ZERO ZERO ZERO 3852 3853 MBCSET ZERO ZERO ZERO ZERO 3854 3855 CSET ZERO ZERO ZERO ZERO 3856 3857 STAR ZERO ZERO ZERO ZERO 3858 3859 QMARK ZERO ZERO ZERO ZERO 3860 3861 PLUS p->left p->right ZERO p->in 3862 3863 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus 3864 p->left : q->right : q->is!=ZERO) ? q->in plus 3865 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left 3866 ZERO 3867 3868 OR longest common longest common (do p->is and substrings common 3869 leading trailing to q->is have same p->in and 3870 (sub)sequence (sub)sequence q->in length and content) ? 3871 of p->left of p->right 3872 and q->left and q->right p->is : NULL 3873 3874 If there's anything else we recognize in the tree, all four sequences get set 3875 to zero-length sequences. If there's something we don't recognize in the 3876 tree, we just return a zero-length sequence. 3877 3878 Break ties in favor of infrequent letters (choosing 'zzz' in preference to 3879 'aaa')? 3880 3881 And ... is it here or someplace that we might ponder "optimizations" such as 3882 egrep 'psi|epsilon' -> egrep 'psi' 3883 egrep 'pepsi|epsilon' -> egrep 'epsi' 3884 (Yes, we now find "epsi" as a "string 3885 that must occur", but we might also 3886 simplify the *entire* r.e. being sought) 3887 grep '[c]' -> grep 'c' 3888 grep '(ab|a)b' -> grep 'ab' 3889 grep 'ab*' -> grep 'a' 3890 grep 'a*b' -> grep 'b' 3891 3892 There are several issues: 3893 3894 Is optimization easy (enough)? 3895 3896 Does optimization actually accomplish anything, 3897 or is the automaton you get from "psi|epsilon" (for example) 3898 the same as the one you get from "psi" (for example)? 3899 3900 Are optimizable r.e.'s likely to be used in real-life situations 3901 (something like 'ab*' is probably unlikely; something like is 3902 'psi|epsilon' is likelier)? */ 3903 3904 static char * 3905 icatalloc (char *old, char const *new) 3906 { 3907 idx_t newsize = strlen (new); 3908 if (newsize == 0) 3909 return old; 3910 idx_t oldsize = strlen (old); 3911 char *result = xrealloc (old, oldsize + newsize + 1); 3912 memcpy (result + oldsize, new, newsize + 1); 3913 return result; 3914 } 3915 3916 static void 3917 freelist (char **cpp) 3918 { 3919 while (*cpp) 3920 free (*cpp++); 3921 } 3922 3923 static char ** 3924 enlist (char **cpp, char *new, idx_t len) 3925 { 3926 new = memcpy (xmalloc (len + 1), new, len); 3927 new[len] = '\0'; 3928 /* Is there already something in the list that's new (or longer)? */ 3929 idx_t i; 3930 for (i = 0; cpp[i] != NULL; i++) 3931 if (strstr (cpp[i], new) != NULL) 3932 { 3933 free (new); 3934 return cpp; 3935 } 3936 /* Eliminate any obsoleted strings. */ 3937 for (idx_t j = 0; cpp[j] != NULL; ) 3938 if (strstr (new, cpp[j]) == NULL) 3939 ++j; 3940 else 3941 { 3942 free (cpp[j]); 3943 if (--i == j) 3944 break; 3945 cpp[j] = cpp[i]; 3946 cpp[i] = NULL; 3947 } 3948 /* Add the new string. */ 3949 cpp = xnrealloc (cpp, i + 2, sizeof *cpp); 3950 cpp[i] = new; 3951 cpp[i + 1] = NULL; 3952 return cpp; 3953 } 3954 3955 /* Given pointers to two strings, return a pointer to an allocated 3956 list of their distinct common substrings. */ 3957 static char ** 3958 comsubs (char *left, char const *right) 3959 { 3960 char **cpp = xzalloc (sizeof *cpp); 3961 3962 for (char *lcp = left; *lcp != '\0'; lcp++) 3963 { 3964 idx_t len = 0; 3965 char *rcp = strchr (right, *lcp); 3966 while (rcp != NULL) 3967 { 3968 idx_t i; 3969 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) 3970 continue; 3971 if (i > len) 3972 len = i; 3973 rcp = strchr (rcp + 1, *lcp); 3974 } 3975 if (len != 0) 3976 cpp = enlist (cpp, lcp, len); 3977 } 3978 return cpp; 3979 } 3980 3981 static char ** 3982 addlists (char **old, char **new) 3983 { 3984 for (; *new; new++) 3985 old = enlist (old, *new, strlen (*new)); 3986 return old; 3987 } 3988 3989 /* Given two lists of substrings, return a new list giving substrings 3990 common to both. */ 3991 static char ** 3992 inboth (char **left, char **right) 3993 { 3994 char **both = xzalloc (sizeof *both); 3995 3996 for (idx_t lnum = 0; left[lnum] != NULL; lnum++) 3997 { 3998 for (idx_t rnum = 0; right[rnum] != NULL; rnum++) 3999 { 4000 char **temp = comsubs (left[lnum], right[rnum]); 4001 both = addlists (both, temp); 4002 freelist (temp); 4003 free (temp); 4004 } 4005 } 4006 return both; 4007 } 4008 4009 typedef struct must must; 4010 4011 struct must 4012 { 4013 char **in; 4014 char *left; 4015 char *right; 4016 char *is; 4017 bool begline; 4018 bool endline; 4019 must *prev; 4020 }; 4021 4022 static must * 4023 allocmust (must *mp, idx_t size) 4024 { 4025 must *new_mp = xmalloc (sizeof *new_mp); 4026 new_mp->in = xzalloc (sizeof *new_mp->in); 4027 new_mp->left = xzalloc (size); 4028 new_mp->right = xzalloc (size); 4029 new_mp->is = xzalloc (size); 4030 new_mp->begline = false; 4031 new_mp->endline = false; 4032 new_mp->prev = mp; 4033 return new_mp; 4034 } 4035 4036 static void 4037 resetmust (must *mp) 4038 { 4039 freelist (mp->in); 4040 mp->in[0] = NULL; 4041 mp->left[0] = mp->right[0] = mp->is[0] = '\0'; 4042 mp->begline = false; 4043 mp->endline = false; 4044 } 4045 4046 static void 4047 freemust (must *mp) 4048 { 4049 freelist (mp->in); 4050 free (mp->in); 4051 free (mp->left); 4052 free (mp->right); 4053 free (mp->is); 4054 free (mp); 4055 } 4056 4057 struct dfamust * 4058 dfamust (struct dfa const *d) 4059 { 4060 must *mp = NULL; 4061 char const *result = ""; 4062 bool exact = false; 4063 bool begline = false; 4064 bool endline = false; 4065 bool need_begline = false; 4066 bool need_endline = false; 4067 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte; 4068 4069 for (idx_t ri = 1; ri + 1 < d->tindex; ri++) 4070 { 4071 token t = d->tokens[ri]; 4072 switch (t) 4073 { 4074 case BEGLINE: 4075 mp = allocmust (mp, 2); 4076 mp->begline = true; 4077 need_begline = true; 4078 break; 4079 case ENDLINE: 4080 mp = allocmust (mp, 2); 4081 mp->endline = true; 4082 need_endline = true; 4083 break; 4084 case LPAREN: 4085 case RPAREN: 4086 assert (!"neither LPAREN nor RPAREN may appear here"); 4087 4088 case EMPTY: 4089 case BEGWORD: 4090 case ENDWORD: 4091 case LIMWORD: 4092 case NOTLIMWORD: 4093 case BACKREF: 4094 case ANYCHAR: 4095 case MBCSET: 4096 mp = allocmust (mp, 2); 4097 break; 4098 4099 case STAR: 4100 case QMARK: 4101 resetmust (mp); 4102 break; 4103 4104 case OR: 4105 { 4106 char **new; 4107 must *rmp = mp; 4108 must *lmp = mp = mp->prev; 4109 idx_t j, ln, rn, n; 4110 4111 /* Guaranteed to be. Unlikely, but ... */ 4112 if (streq (lmp->is, rmp->is)) 4113 { 4114 lmp->begline &= rmp->begline; 4115 lmp->endline &= rmp->endline; 4116 } 4117 else 4118 { 4119 lmp->is[0] = '\0'; 4120 lmp->begline = false; 4121 lmp->endline = false; 4122 } 4123 /* Left side--easy */ 4124 idx_t i = 0; 4125 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) 4126 ++i; 4127 lmp->left[i] = '\0'; 4128 /* Right side */ 4129 ln = strlen (lmp->right); 4130 rn = strlen (rmp->right); 4131 n = ln; 4132 if (n > rn) 4133 n = rn; 4134 for (i = 0; i < n; ++i) 4135 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) 4136 break; 4137 for (j = 0; j < i; ++j) 4138 lmp->right[j] = lmp->right[(ln - i) + j]; 4139 lmp->right[j] = '\0'; 4140 new = inboth (lmp->in, rmp->in); 4141 freelist (lmp->in); 4142 free (lmp->in); 4143 lmp->in = new; 4144 freemust (rmp); 4145 } 4146 break; 4147 4148 case PLUS: 4149 mp->is[0] = '\0'; 4150 break; 4151 4152 case END: 4153 assert (!mp->prev); 4154 for (idx_t i = 0; mp->in[i] != NULL; i++) 4155 if (strlen (mp->in[i]) > strlen (result)) 4156 result = mp->in[i]; 4157 if (streq (result, mp->is)) 4158 { 4159 if ((!need_begline || mp->begline) && (!need_endline 4160 || mp->endline)) 4161 exact = true; 4162 begline = mp->begline; 4163 endline = mp->endline; 4164 } 4165 goto done; 4166 4167 case CAT: 4168 { 4169 must *rmp = mp; 4170 must *lmp = mp = mp->prev; 4171 4172 /* In. Everything in left, plus everything in 4173 right, plus concatenation of 4174 left's right and right's left. */ 4175 lmp->in = addlists (lmp->in, rmp->in); 4176 if (lmp->right[0] != '\0' && rmp->left[0] != '\0') 4177 { 4178 idx_t lrlen = strlen (lmp->right); 4179 idx_t rllen = strlen (rmp->left); 4180 char *tp = xmalloc (lrlen + rllen); 4181 memcpy (tp, lmp->right, lrlen); 4182 memcpy (tp + lrlen, rmp->left, rllen); 4183 lmp->in = enlist (lmp->in, tp, lrlen + rllen); 4184 free (tp); 4185 } 4186 /* Left-hand */ 4187 if (lmp->is[0] != '\0') 4188 lmp->left = icatalloc (lmp->left, rmp->left); 4189 /* Right-hand */ 4190 if (rmp->is[0] == '\0') 4191 lmp->right[0] = '\0'; 4192 lmp->right = icatalloc (lmp->right, rmp->right); 4193 /* Guaranteed to be */ 4194 if ((lmp->is[0] != '\0' || lmp->begline) 4195 && (rmp->is[0] != '\0' || rmp->endline)) 4196 { 4197 lmp->is = icatalloc (lmp->is, rmp->is); 4198 lmp->endline = rmp->endline; 4199 } 4200 else 4201 { 4202 lmp->is[0] = '\0'; 4203 lmp->begline = false; 4204 lmp->endline = false; 4205 } 4206 freemust (rmp); 4207 } 4208 break; 4209 4210 case '\0': 4211 /* Not on *my* shift. */ 4212 goto done; 4213 4214 default: 4215 if (CSET <= t) 4216 { 4217 /* If T is a singleton, or if case-folding in a unibyte 4218 locale and T's members all case-fold to the same char, 4219 convert T to one of its members. Otherwise, do 4220 nothing further with T. */ 4221 charclass *ccl = &d->charclasses[t - CSET]; 4222 int j; 4223 for (j = 0; j < NOTCHAR; j++) 4224 if (tstbit (j, ccl)) 4225 break; 4226 if (! (j < NOTCHAR)) 4227 { 4228 mp = allocmust (mp, 2); 4229 break; 4230 } 4231 t = j; 4232 while (++j < NOTCHAR) 4233 if (tstbit (j, ccl) 4234 && ! (case_fold_unibyte 4235 && toupper (j) == toupper (t))) 4236 break; 4237 if (j < NOTCHAR) 4238 { 4239 mp = allocmust (mp, 2); 4240 break; 4241 } 4242 } 4243 4244 idx_t rj = ri + 2; 4245 if (d->tokens[ri + 1] == CAT) 4246 { 4247 for (; rj < d->tindex - 1; rj += 2) 4248 { 4249 if ((rj != ri && (d->tokens[rj] <= 0 4250 || NOTCHAR <= d->tokens[rj])) 4251 || d->tokens[rj + 1] != CAT) 4252 break; 4253 } 4254 } 4255 mp = allocmust (mp, ((rj - ri) >> 1) + 1); 4256 mp->is[0] = mp->left[0] = mp->right[0] 4257 = case_fold_unibyte ? toupper (t) : t; 4258 4259 idx_t i; 4260 for (i = 1; ri + 2 < rj; i++) 4261 { 4262 ri += 2; 4263 t = d->tokens[ri]; 4264 mp->is[i] = mp->left[i] = mp->right[i] 4265 = case_fold_unibyte ? toupper (t) : t; 4266 } 4267 mp->is[i] = mp->left[i] = mp->right[i] = '\0'; 4268 mp->in = enlist (mp->in, mp->is, i); 4269 break; 4270 } 4271 } 4272 done:; 4273 4274 struct dfamust *dm = NULL; 4275 if (*result) 4276 { 4277 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1)); 4278 dm->exact = exact; 4279 dm->begline = begline; 4280 dm->endline = endline; 4281 strcpy (dm->must, result); 4282 } 4283 4284 while (mp) 4285 { 4286 must *prev = mp->prev; 4287 freemust (mp); 4288 mp = prev; 4289 } 4290 4291 return dm; 4292 } 4293 4294 void 4295 dfamustfree (struct dfamust *dm) 4296 { 4297 free (dm); 4298 } 4299 4300 struct dfa * 4301 dfaalloc (void) 4302 { 4303 return xmalloc (sizeof (struct dfa)); 4304 } 4305 4306 /* Initialize DFA. */ 4307 void 4308 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, 4309 reg_syntax_t bits, int dfaopts) 4310 { 4311 memset (dfa, 0, offsetof (struct dfa, dfaexec)); 4312 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; 4313 dfa->localeinfo = *linfo; 4314 4315 dfa->fast = !dfa->localeinfo.multibyte; 4316 4317 dfa->canychar = -1; 4318 dfa->syntax.syntax_bits_set = true; 4319 dfa->syntax.case_fold = (bits & RE_ICASE) != 0; 4320 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; 4321 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; 4322 dfa->syntax.syntax_bits = bits; 4323 4324 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i) 4325 { 4326 unsigned char uc = i; 4327 4328 dfa->syntax.sbit[uc] = char_context (dfa, uc); 4329 switch (dfa->syntax.sbit[uc]) 4330 { 4331 case CTX_LETTER: 4332 setbit (uc, &dfa->syntax.letters); 4333 break; 4334 case CTX_NEWLINE: 4335 setbit (uc, &dfa->syntax.newline); 4336 break; 4337 } 4338 4339 /* POSIX requires that the five bytes in "\n\r./" (including the 4340 terminating NUL) cannot occur inside a multibyte character. */ 4341 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 4342 ? (uc & 0xc0) != 0x80 4343 : strchr ("\n\r./", uc) != NULL); 4344 } 4345 } 4346 4347 /* Initialize TO by copying FROM's syntax settings. */ 4348 void 4349 dfacopysyntax (struct dfa *to, struct dfa const *from) 4350 { 4351 memset (to, 0, offsetof (struct dfa, syntax)); 4352 to->canychar = -1; 4353 to->fast = from->fast; 4354 to->syntax = from->syntax; 4355 to->dfaexec = from->dfaexec; 4356 to->localeinfo = from->localeinfo; 4357 } 4358 4359 /* vim:set shiftwidth=2: */ 4360