1 /* Extended regular expression matching and search library. 2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 17 18 19 // This is a translation into C++ of regex.c, the GNU regexp package. 20 21 /* To test, compile with -Dtest. This Dtestable feature turns this into 22 a self-contained program which reads a pattern, describes how it 23 compiles, then reads a string and searches for it. 24 25 On the other hand, if you compile with both -Dtest and -Dcanned you 26 can run some tests we've already thought of. */ 27 28 /* AIX requires the alloca decl to be the first thing in the file. */ 29 #ifdef __GNUC__ 30 #define alloca alloca 31 #else 32 #ifdef sparc 33 #include <alloca.h> 34 #else 35 #ifdef _AIX 36 #pragma alloca 37 #else 38 char *alloca (); 39 #endif 40 #endif 41 #endif 42 43 #ifdef emacs 44 45 /* The `emacs' switch turns on certain special matching commands 46 that make sense only in emacs. */ 47 48 #include "config.h" 49 #include "lisp.h" 50 #include "buffer.h" 51 #include "syntax.h" 52 53 #else /* not emacs */ 54 55 #include <_G_config.h> 56 #include <string.h> 57 #include <stdlib.h> 58 59 /* Define the syntax stuff, so we can do the \<, \>, etc. */ 60 61 /* This must be nonzero for the wordchar and notwordchar pattern 62 commands in re_match_2. */ 63 #ifndef Sword 64 #define Sword 1 65 #endif 66 67 #define SYNTAX(c) re_syntax_table[c] 68 69 70 #ifdef SYNTAX_TABLE 71 72 char *re_syntax_table; 73 74 #else /* not SYNTAX_TABLE */ 75 76 static char re_syntax_table[256]; 77 78 79 static void 80 init_syntax_once () 81 { 82 register int c; 83 static int done = 0; 84 85 if (done) 86 return; 87 88 memset (re_syntax_table, 0, sizeof re_syntax_table); 89 90 for (c = 'a'; c <= 'z'; c++) 91 re_syntax_table[c] = Sword; 92 93 for (c = 'A'; c <= 'Z'; c++) 94 re_syntax_table[c] = Sword; 95 96 for (c = '0'; c <= '9'; c++) 97 re_syntax_table[c] = Sword; 98 99 done = 1; 100 } 101 102 #endif /* SYNTAX_TABLE */ 103 #endif /* emacs */ 104 105 /* We write fatal error messages on standard error. */ 106 #include <stdio.h> 107 108 /* isalpha(3) etc. are used for the character classes. */ 109 #include <ctype.h> 110 /* Sequents are missing isgraph. */ 111 #ifndef isgraph 112 #define isgraph(c) (isprint((c)) && !isspace((c))) 113 #endif 114 115 /* Get the interface, including the syntax bits. */ 116 #include "regex.h" 117 118 119 /* These are the command codes that appear in compiled regular 120 expressions, one per byte. Some command codes are followed by 121 argument bytes. A command code can specify any interpretation 122 whatsoever for its arguments. Zero-bytes may appear in the compiled 123 regular expression. 124 125 The value of `exactn' is needed in search.c (search_buffer) in emacs. 126 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of 127 `exactn' we use here must also be 1. */ 128 129 enum regexpcode 130 { 131 unused=0, 132 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */ 133 begline, /* Fail unless at beginning of line. */ 134 endline, /* Fail unless at end of line. */ 135 jump, /* Followed by two bytes giving relative address to jump to. */ 136 on_failure_jump, /* Followed by two bytes giving relative address of 137 place to resume at in case of failure. */ 138 finalize_jump, /* Throw away latest failure point and then jump to 139 address. */ 140 maybe_finalize_jump, /* Like jump but finalize if safe to do so. 141 This is used to jump back to the beginning 142 of a repeat. If the command that follows 143 this jump is clearly incompatible with the 144 one at the beginning of the repeat, such that 145 we can be sure that there is no use backtracking 146 out of repetitions already completed, 147 then we finalize. */ 148 dummy_failure_jump, /* Jump, and push a dummy failure point. This 149 failure point will be thrown away if an attempt 150 is made to use it for a failure. A + construct 151 makes this before the first repeat. Also 152 use it as an intermediary kind of jump when 153 compiling an or construct. */ 154 succeed_n, /* Used like on_failure_jump except has to succeed n times; 155 then gets turned into an on_failure_jump. The relative 156 address following it is useless until then. The 157 address is followed by two bytes containing n. */ 158 jump_n, /* Similar to jump, but jump n times only; also the relative 159 address following is in turn followed by yet two more bytes 160 containing n. */ 161 set_number_at, /* Set the following relative location to the 162 subsequent number. */ 163 anychar, /* Matches any (more or less) one character. */ 164 charset, /* Matches any one char belonging to specified set. 165 First following byte is number of bitmap bytes. 166 Then come bytes for a bitmap saying which chars are in. 167 Bits in each byte are ordered low-bit-first. 168 A character is in the set if its bit is 1. 169 A character too large to have a bit in the map 170 is automatically not in the set. */ 171 charset_not, /* Same parameters as charset, but match any character 172 that is not one of those specified. */ 173 start_memory, /* Start remembering the text that is matched, for 174 storing in a memory register. Followed by one 175 byte containing the register number. Register numbers 176 must be in the range 0 through RE_NREGS. */ 177 stop_memory, /* Stop remembering the text that is matched 178 and store it in a memory register. Followed by 179 one byte containing the register number. Register 180 numbers must be in the range 0 through RE_NREGS. */ 181 duplicate, /* Match a duplicate of something remembered. 182 Followed by one byte containing the index of the memory 183 register. */ 184 #ifdef emacs 185 before_dot, /* Succeeds if before point. */ 186 at_dot, /* Succeeds if at point. */ 187 after_dot, /* Succeeds if after point. */ 188 #endif 189 begbuf, /* Succeeds if at beginning of buffer. */ 190 endbuf, /* Succeeds if at end of buffer. */ 191 wordchar, /* Matches any word-constituent character. */ 192 notwordchar, /* Matches any char that is not a word-constituent. */ 193 wordbeg, /* Succeeds if at word beginning. */ 194 wordend, /* Succeeds if at word end. */ 195 wordbound, /* Succeeds if at a word boundary. */ 196 notwordbound,/* Succeeds if not at a word boundary. */ 197 #ifdef emacs 198 syntaxspec, /* Matches any character whose syntax is specified. 199 followed by a byte which contains a syntax code, 200 e.g., Sword. */ 201 notsyntaxspec /* Matches any character whose syntax differs from 202 that specified. */ 203 #endif 204 }; 205 206 207 /* Number of failure points to allocate space for initially, 208 when matching. If this number is exceeded, more space is allocated, 209 so it is not a hard limit. */ 210 211 #ifndef NFAILURES 212 #define NFAILURES 80 213 #endif 214 215 216 #ifndef SIGN_EXTEND_CHAR 217 #ifdef __STDC__ 218 #define SIGN_EXTEND_CHAR(c) ((signed char)(c)) 219 #else 220 #define SIGN_EXTEND_CHAR(c) (((c)^128) - 128) /* As in Harbison and Steele. */ 221 #endif 222 #endif /* not SIGN_EXTEND_CHAR */ 223 224 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 225 #define STORE_NUMBER(destination, number) \ 226 { (destination)[0] = (number) & 0377; \ 227 (destination)[1] = (number) >> 8; } 228 229 /* Same as STORE_NUMBER, except increment the destination pointer to 230 the byte after where the number is stored. Watch out that values for 231 DESTINATION such as p + 1 won't work, whereas p will. */ 232 #define STORE_NUMBER_AND_INCR(destination, number) \ 233 { STORE_NUMBER(destination, number); \ 234 (destination) += 2; } 235 236 237 /* Put into DESTINATION a number stored in two contingous bytes starting 238 at SOURCE. */ 239 #define EXTRACT_NUMBER(destination, source) \ 240 { (destination) = *(source) & 0377; \ 241 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; } 242 243 /* Same as EXTRACT_NUMBER, except increment the pointer for source to 244 point to second byte of SOURCE. Note that SOURCE has to be a value 245 such as p, not, e.g., p + 1. */ 246 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 247 { EXTRACT_NUMBER (destination, source); \ 248 (source) += 2; } 249 250 251 /* Specify the precise syntax of regexps for compilation. This provides 252 for compatibility for various utilities which historically have 253 different, incompatible syntaxes. 254 255 The argument SYNTAX is a bit-mask comprised of the various bits 256 defined in regex.h. */ 257 258 int 259 re_set_syntax (int syntax) 260 { 261 int ret; 262 263 ret = obscure_syntax; 264 obscure_syntax = syntax; 265 return ret; 266 } 267 268 /* Set by re_set_syntax to the current regexp syntax to recognize. */ 269 int obscure_syntax = 0; 270 271 272 273 /* Macros for re_compile_pattern, which is found below these definitions. */ 274 275 #define CHAR_CLASS_MAX_LENGTH 6 276 277 /* Fetch the next character in the uncompiled pattern, translating it if 278 necessary. */ 279 #define PATFETCH(c) \ 280 {if (p == pend) goto end_of_pattern; \ 281 c = * (const unsigned char *) p++; \ 282 if (translate) c = translate[c]; } 283 284 /* Fetch the next character in the uncompiled pattern, with no 285 translation. */ 286 #define PATFETCH_RAW(c) \ 287 {if (p == pend) goto end_of_pattern; \ 288 c = * (const unsigned char *) p++; } 289 290 #define PATUNFETCH p-- 291 292 293 /* If the buffer isn't allocated when it comes in, use this. */ 294 #define INIT_BUF_SIZE 28 295 296 /* Make sure we have at least N more bytes of space in buffer. */ 297 #define GET_BUFFER_SPACE(n) \ 298 { \ 299 while (b - bufp->buffer + (n) >= bufp->allocated) \ 300 EXTEND_BUFFER; \ 301 } 302 303 /* Make sure we have one more byte of buffer space and then add CH to it. */ 304 #define BUFPUSH(ch) \ 305 { \ 306 GET_BUFFER_SPACE (1); \ 307 *b++ = (char) (ch); \ 308 } 309 310 /* Extend the buffer by twice its current size via reallociation and 311 reset the pointers that pointed into the old allocation to point to 312 the correct places in the new allocation. If extending the buffer 313 results in it being larger than 1 << 16, then flag memory exhausted. */ 314 #define EXTEND_BUFFER \ 315 { char *old_buffer = bufp->buffer; \ 316 if (bufp->allocated == (1L<<16)) goto too_big; \ 317 bufp->allocated *= 2; \ 318 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \ 319 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \ 320 if (bufp->buffer == 0) \ 321 goto memory_exhausted; \ 322 b = (b - old_buffer) + bufp->buffer; \ 323 if (fixup_jump) \ 324 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \ 325 if (laststart) \ 326 laststart = (laststart - old_buffer) + bufp->buffer; \ 327 begalt = (begalt - old_buffer) + bufp->buffer; \ 328 if (pending_exact) \ 329 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 330 } 331 332 /* Set the bit for character C in a character set list. */ 333 #define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH)) 334 335 /* Get the next unsigned number in the uncompiled pattern. */ 336 #define GET_UNSIGNED_NUMBER(num) \ 337 { if (p != pend) \ 338 { \ 339 PATFETCH (c); \ 340 while (isdigit (c)) \ 341 { \ 342 if (num < 0) \ 343 num = 0; \ 344 num = num * 10 + c - '0'; \ 345 if (p == pend) \ 346 break; \ 347 PATFETCH (c); \ 348 } \ 349 } \ 350 } 351 352 /* Subroutines for re_compile_pattern. */ 353 static void store_jump (char *from, char opcode, char *to); 354 static void insert_jump (char op, char *from, char *to, char *current_end); 355 static void store_jump_n (char *from, char opcode, char *to, unsigned n); 356 static void insert_jump_n (char, char *, char *, char *, unsigned); 357 static void insert_op_2 (char, char *, char *_end, int, int); 358 359 360 /* re_compile_pattern takes a regular-expression string 361 and converts it into a buffer full of byte commands for matching. 362 363 PATTERN is the address of the pattern string 364 SIZE is the length of it. 365 BUFP is a struct re_pattern_buffer * which points to the info 366 on where to store the byte commands. 367 This structure contains a char * which points to the 368 actual space, which should have been obtained with malloc. 369 re_compile_pattern may use realloc to grow the buffer space. 370 371 The number of bytes of commands can be found out by looking in 372 the `struct re_pattern_buffer' that bufp pointed to, after 373 re_compile_pattern returns. */ 374 375 char * 376 re_compile_pattern (const char *pattern, int size, struct re_pattern_buffer *bufp) 377 { 378 register char *b = bufp->buffer; 379 register const char *p = pattern; 380 const char *pend = pattern + size; 381 register unsigned c, c1; 382 const char *p1; 383 unsigned char *translate = (unsigned char *) bufp->translate; 384 385 /* Address of the count-byte of the most recently inserted `exactn' 386 command. This makes it possible to tell whether a new exact-match 387 character can be added to that command or requires a new `exactn' 388 command. */ 389 390 char *pending_exact = 0; 391 392 /* Address of the place where a forward-jump should go to the end of 393 the containing expression. Each alternative of an `or', except the 394 last, ends with a forward-jump of this sort. */ 395 396 char *fixup_jump = 0; 397 398 /* Address of start of the most recently finished expression. 399 This tells postfix * where to find the start of its operand. */ 400 401 char *laststart = 0; 402 403 /* In processing a repeat, 1 means zero matches is allowed. */ 404 405 char zero_times_ok; 406 407 /* In processing a repeat, 1 means many matches is allowed. */ 408 409 char many_times_ok; 410 411 /* Address of beginning of regexp, or inside of last \(. */ 412 413 char *begalt = b; 414 415 /* In processing an interval, at least this many matches must be made. */ 416 int lower_bound; 417 418 /* In processing an interval, at most this many matches can be made. */ 419 int upper_bound; 420 421 /* Place in pattern (i.e., the {) to which to go back if the interval 422 is invalid. */ 423 const char *beg_interval = 0; 424 425 /* Stack of information saved by \( and restored by \). 426 Four stack elements are pushed by each \(: 427 First, the value of b. 428 Second, the value of fixup_jump. 429 Third, the value of regnum. 430 Fourth, the value of begalt. */ 431 432 int stackb[40]; 433 int *stackp = stackb; 434 int *stacke = stackb + 40; 435 int *stackt; 436 437 /* Counts \('s as they are encountered. Remembered for the matching \), 438 where it becomes the register number to put in the stop_memory 439 command. */ 440 441 unsigned regnum = 1; 442 443 bufp->fastmap_accurate = 0; 444 445 #ifndef emacs 446 #ifndef SYNTAX_TABLE 447 /* Initialize the syntax table. */ 448 init_syntax_once(); 449 #endif 450 #endif 451 452 if (bufp->allocated == 0) 453 { 454 bufp->allocated = INIT_BUF_SIZE; 455 if (bufp->buffer) 456 /* EXTEND_BUFFER loses when bufp->allocated is 0. */ 457 bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE); 458 else 459 /* Caller did not allocate a buffer. Do it for them. */ 460 bufp->buffer = (char *) malloc (INIT_BUF_SIZE); 461 if (!bufp->buffer) goto memory_exhausted; 462 begalt = b = bufp->buffer; 463 } 464 465 while (p != pend) 466 { 467 PATFETCH (c); 468 469 switch (c) 470 { 471 case '$': 472 { 473 const char *p1 = p; 474 /* When testing what follows the $, 475 look past the \-constructs that don't consume anything. */ 476 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) 477 while (p1 != pend) 478 { 479 if (*p1 == '\\' && p1 + 1 != pend 480 && (p1[1] == '<' || p1[1] == '>' 481 || p1[1] == '`' || p1[1] == '\'' 482 #ifdef emacs 483 || p1[1] == '=' 484 #endif 485 || p1[1] == 'b' || p1[1] == 'B')) 486 p1 += 2; 487 else 488 break; 489 } 490 if (obscure_syntax & RE_TIGHT_VBAR) 491 { 492 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend) 493 goto normal_char; 494 /* Make operand of last vbar end before this `$'. */ 495 if (fixup_jump) 496 store_jump (fixup_jump, jump, b); 497 fixup_jump = 0; 498 BUFPUSH (endline); 499 break; 500 } 501 /* $ means succeed if at end of line, but only in special contexts. 502 If validly in the middle of a pattern, it is a normal character. */ 503 504 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend) 505 goto invalid_pattern; 506 if (p1 == pend || *p1 == '\n' 507 || (obscure_syntax & RE_CONTEXT_INDEP_OPS) 508 || (obscure_syntax & RE_NO_BK_PARENS 509 ? *p1 == ')' 510 : *p1 == '\\' && p1[1] == ')') 511 || (obscure_syntax & RE_NO_BK_VBAR 512 ? *p1 == '|' 513 : *p1 == '\\' && p1[1] == '|')) 514 { 515 BUFPUSH (endline); 516 break; 517 } 518 goto normal_char; 519 } 520 case '^': 521 /* ^ means succeed if at beg of line, but only if no preceding 522 pattern. */ 523 524 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart) 525 goto invalid_pattern; 526 if (laststart && p - 2 >= pattern && p[-2] != '\n' 527 && !(obscure_syntax & RE_CONTEXT_INDEP_OPS)) 528 goto normal_char; 529 if (obscure_syntax & RE_TIGHT_VBAR) 530 { 531 if (p != pattern + 1 532 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) 533 goto normal_char; 534 BUFPUSH (begline); 535 begalt = b; 536 } 537 else 538 BUFPUSH (begline); 539 break; 540 541 case '+': 542 case '?': 543 if ((obscure_syntax & RE_BK_PLUS_QM) 544 || (obscure_syntax & RE_LIMITED_OPS)) 545 goto normal_char; 546 handle_plus: 547 case '*': 548 /* If there is no previous pattern, char not special. */ 549 if (!laststart) 550 { 551 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) 552 goto invalid_pattern; 553 else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) 554 goto normal_char; 555 } 556 /* If there is a sequence of repetition chars, 557 collapse it down to just one. */ 558 zero_times_ok = 0; 559 many_times_ok = 0; 560 while (1) 561 { 562 zero_times_ok |= c != '+'; 563 many_times_ok |= c != '?'; 564 if (p == pend) 565 break; 566 PATFETCH (c); 567 if (c == '*') 568 ; 569 else if (!(obscure_syntax & RE_BK_PLUS_QM) 570 && (c == '+' || c == '?')) 571 ; 572 else if ((obscure_syntax & RE_BK_PLUS_QM) 573 && c == '\\') 574 { 575 int c1; 576 PATFETCH (c1); 577 if (!(c1 == '+' || c1 == '?')) 578 { 579 PATUNFETCH; 580 PATUNFETCH; 581 break; 582 } 583 c = c1; 584 } 585 else 586 { 587 PATUNFETCH; 588 break; 589 } 590 } 591 592 /* Star, etc. applied to an empty pattern is equivalent 593 to an empty pattern. */ 594 if (!laststart) 595 break; 596 597 /* Now we know whether or not zero matches is allowed 598 and also whether or not two or more matches is allowed. */ 599 if (many_times_ok) 600 { 601 /* If more than one repetition is allowed, put in at the 602 end a backward relative jump from b to before the next 603 jump we're going to put in below (which jumps from 604 laststart to after this jump). */ 605 GET_BUFFER_SPACE (3); 606 store_jump (b, maybe_finalize_jump, laststart - 3); 607 b += 3; /* Because store_jump put stuff here. */ 608 } 609 /* On failure, jump from laststart to b + 3, which will be the 610 end of the buffer after this jump is inserted. */ 611 GET_BUFFER_SPACE (3); 612 insert_jump (on_failure_jump, laststart, b + 3, b); 613 pending_exact = 0; 614 b += 3; 615 if (!zero_times_ok) 616 { 617 /* At least one repetition is required, so insert a 618 dummy-failure before the initial on-failure-jump 619 instruction of the loop. This effects a skip over that 620 instruction the first time we hit that loop. */ 621 GET_BUFFER_SPACE (6); 622 insert_jump (dummy_failure_jump, laststart, laststart + 6, b); 623 b += 3; 624 } 625 break; 626 627 case '.': 628 laststart = b; 629 BUFPUSH (anychar); 630 break; 631 632 case '[': 633 if (p == pend) 634 goto invalid_pattern; 635 while (b - bufp->buffer 636 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH) 637 EXTEND_BUFFER; 638 639 laststart = b; 640 if (*p == '^') 641 { 642 BUFPUSH (charset_not); 643 p++; 644 } 645 else 646 BUFPUSH (charset); 647 p1 = p; 648 649 BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 650 /* Clear the whole map */ 651 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH); 652 653 if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not) 654 SET_LIST_BIT ('\n'); 655 656 657 /* Read in characters and ranges, setting map bits. */ 658 while (1) 659 { 660 /* Don't translate while fetching, in case it's a range bound. 661 When we set the bit for the character, we translate it. */ 662 PATFETCH_RAW (c); 663 664 /* If set, \ escapes characters when inside [...]. */ 665 if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\') 666 { 667 PATFETCH(c1); 668 SET_LIST_BIT (c1); 669 continue; 670 } 671 if (c == ']') 672 { 673 if (p == p1 + 1) 674 { 675 /* If this is an empty bracket expression. */ 676 if ((obscure_syntax & RE_NO_EMPTY_BRACKETS) 677 && p == pend) 678 goto invalid_pattern; 679 } 680 else 681 /* Stop if this isn't merely a ] inside a bracket 682 expression, but rather the end of a bracket 683 expression. */ 684 break; 685 } 686 /* Get a range. */ 687 if (p[0] == '-' && p[1] != ']') 688 { 689 PATFETCH (c1); 690 /* Don't translate the range bounds while fetching them. */ 691 PATFETCH_RAW (c1); 692 693 if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1) 694 goto invalid_pattern; 695 696 if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END) 697 && c1 == '-' && *p != ']') 698 goto invalid_pattern; 699 700 while (c <= c1) 701 { 702 /* Translate each char that's in the range. */ 703 if (translate) 704 SET_LIST_BIT (translate[c]); 705 else 706 SET_LIST_BIT (c); 707 c++; 708 } 709 } 710 else if ((obscure_syntax & RE_CHAR_CLASSES) 711 && c == '[' && p[0] == ':') 712 { 713 /* Longest valid character class word has six characters. */ 714 char str[CHAR_CLASS_MAX_LENGTH]; 715 PATFETCH (c); 716 c1 = 0; 717 /* If no ] at end. */ 718 if (p == pend) 719 goto invalid_pattern; 720 while (1) 721 { 722 /* Don't translate the ``character class'' characters. */ 723 PATFETCH_RAW (c); 724 if (c == ':' || c == ']' || p == pend 725 || c1 == CHAR_CLASS_MAX_LENGTH) 726 break; 727 str[c1++] = c; 728 } 729 str[c1] = '\0'; 730 if (p == pend 731 || c == ']' /* End of the bracket expression. */ 732 || p[0] != ']' 733 || p + 1 == pend 734 || (strcmp (str, "alpha") != 0 735 && strcmp (str, "upper") != 0 736 && strcmp (str, "lower") != 0 737 && strcmp (str, "digit") != 0 738 && strcmp (str, "alnum") != 0 739 && strcmp (str, "xdigit") != 0 740 && strcmp (str, "space") != 0 741 && strcmp (str, "print") != 0 742 && strcmp (str, "punct") != 0 743 && strcmp (str, "graph") != 0 744 && strcmp (str, "cntrl") != 0)) 745 { 746 /* Undo the ending character, the letters, and leave 747 the leading : and [ (but set bits for them). */ 748 c1++; 749 while (c1--) 750 PATUNFETCH; 751 SET_LIST_BIT ('['); 752 SET_LIST_BIT (':'); 753 } 754 else 755 { 756 /* The ] at the end of the character class. */ 757 PATFETCH (c); 758 if (c != ']') 759 goto invalid_pattern; 760 for (c = 0; c < (1 << BYTEWIDTH); c++) 761 { 762 if ((strcmp (str, "alpha") == 0 && isalpha (c)) 763 || (strcmp (str, "upper") == 0 && isupper (c)) 764 || (strcmp (str, "lower") == 0 && islower (c)) 765 || (strcmp (str, "digit") == 0 && isdigit (c)) 766 || (strcmp (str, "alnum") == 0 && isalnum (c)) 767 || (strcmp (str, "xdigit") == 0 && isxdigit (c)) 768 || (strcmp (str, "space") == 0 && isspace (c)) 769 || (strcmp (str, "print") == 0 && isprint (c)) 770 || (strcmp (str, "punct") == 0 && ispunct (c)) 771 || (strcmp (str, "graph") == 0 && isgraph (c)) 772 || (strcmp (str, "cntrl") == 0 && iscntrl (c))) 773 SET_LIST_BIT (c); 774 } 775 } 776 } 777 else if (translate) 778 SET_LIST_BIT (translate[c]); 779 else 780 SET_LIST_BIT (c); 781 } 782 783 /* Discard any character set/class bitmap bytes that are all 784 0 at the end of the map. Decrement the map-length byte too. */ 785 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 786 b[-1]--; 787 b += b[-1]; 788 break; 789 790 case '(': 791 if (! (obscure_syntax & RE_NO_BK_PARENS)) 792 goto normal_char; 793 else 794 goto handle_open; 795 796 case ')': 797 if (! (obscure_syntax & RE_NO_BK_PARENS)) 798 goto normal_char; 799 else 800 goto handle_close; 801 802 case '\n': 803 if (! (obscure_syntax & RE_NEWLINE_OR)) 804 goto normal_char; 805 else 806 goto handle_bar; 807 808 case '|': 809 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) 810 && (! laststart || p == pend)) 811 goto invalid_pattern; 812 else if (! (obscure_syntax & RE_NO_BK_VBAR)) 813 goto normal_char; 814 else 815 goto handle_bar; 816 817 case '{': 818 if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES) 819 && (obscure_syntax & RE_INTERVALS))) 820 goto normal_char; 821 else 822 goto handle_interval; 823 824 case '\\': 825 if (p == pend) goto invalid_pattern; 826 PATFETCH_RAW (c); 827 switch (c) 828 { 829 case '(': 830 if (obscure_syntax & RE_NO_BK_PARENS) 831 goto normal_backsl; 832 handle_open: 833 if (stackp == stacke) goto nesting_too_deep; 834 835 /* Laststart should point to the start_memory that we are about 836 to push (unless the pattern has RE_NREGS or more ('s). */ 837 *stackp++ = b - bufp->buffer; 838 if (regnum < RE_NREGS) 839 { 840 BUFPUSH (start_memory); 841 BUFPUSH (regnum); 842 } 843 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0; 844 *stackp++ = regnum++; 845 *stackp++ = begalt - bufp->buffer; 846 fixup_jump = 0; 847 laststart = 0; 848 begalt = b; 849 break; 850 851 case ')': 852 if (obscure_syntax & RE_NO_BK_PARENS) 853 goto normal_backsl; 854 handle_close: 855 if (stackp == stackb) goto unmatched_close; 856 begalt = *--stackp + bufp->buffer; 857 if (fixup_jump) 858 store_jump (fixup_jump, jump, b); 859 if (stackp[-1] < RE_NREGS) 860 { 861 BUFPUSH (stop_memory); 862 BUFPUSH (stackp[-1]); 863 } 864 stackp -= 2; 865 fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0; 866 laststart = *--stackp + bufp->buffer; 867 break; 868 869 case '|': 870 if ((obscure_syntax & RE_LIMITED_OPS) 871 || (obscure_syntax & RE_NO_BK_VBAR)) 872 goto normal_backsl; 873 handle_bar: 874 if (obscure_syntax & RE_LIMITED_OPS) 875 goto normal_char; 876 /* Insert before the previous alternative a jump which 877 jumps to this alternative if the former fails. */ 878 GET_BUFFER_SPACE (6); 879 insert_jump (on_failure_jump, begalt, b + 6, b); 880 pending_exact = 0; 881 b += 3; 882 /* The alternative before the previous alternative has a 883 jump after it which gets executed if it gets matched. 884 Adjust that jump so it will jump to the previous 885 alternative's analogous jump (put in below, which in 886 turn will jump to the next (if any) alternative's such 887 jump, etc.). The last such jump jumps to the correct 888 final destination. */ 889 if (fixup_jump) 890 store_jump (fixup_jump, jump, b); 891 892 /* Leave space for a jump after previous alternative---to be 893 filled in later. */ 894 fixup_jump = b; 895 b += 3; 896 897 laststart = 0; 898 begalt = b; 899 break; 900 901 case '{': 902 if (! (obscure_syntax & RE_INTERVALS) 903 /* Let \{ be a literal. */ 904 || ((obscure_syntax & RE_INTERVALS) 905 && (obscure_syntax & RE_NO_BK_CURLY_BRACES)) 906 /* If it's the string "\{". */ 907 || (p - 2 == pattern && p == pend)) 908 goto normal_backsl; 909 handle_interval: 910 beg_interval = p - 1; /* The {. */ 911 /* If there is no previous pattern, this isn't an interval. */ 912 if (!laststart) 913 { 914 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) 915 goto invalid_pattern; 916 else 917 goto normal_backsl; 918 } 919 /* It also isn't an interval if not preceded by an re 920 matching a single character or subexpression, or if 921 the current type of intervals can't handle back 922 references and the previous thing is a back reference. */ 923 if (! (*laststart == anychar 924 || *laststart == charset 925 || *laststart == charset_not 926 || *laststart == start_memory 927 || (*laststart == exactn && laststart[1] == 1) 928 || (! (obscure_syntax & RE_NO_BK_REFS) 929 && *laststart == duplicate))) 930 { 931 if (obscure_syntax & RE_NO_BK_CURLY_BRACES) 932 goto normal_char; 933 934 /* Posix extended syntax is handled in previous 935 statement; this is for Posix basic syntax. */ 936 if (obscure_syntax & RE_INTERVALS) 937 goto invalid_pattern; 938 939 goto normal_backsl; 940 } 941 lower_bound = -1; /* So can see if are set. */ 942 upper_bound = -1; 943 GET_UNSIGNED_NUMBER (lower_bound); 944 if (c == ',') 945 { 946 GET_UNSIGNED_NUMBER (upper_bound); 947 if (upper_bound < 0) 948 upper_bound = RE_DUP_MAX; 949 } 950 if (upper_bound < 0) 951 upper_bound = lower_bound; 952 if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES)) 953 { 954 if (c != '\\') 955 goto invalid_pattern; 956 PATFETCH (c); 957 } 958 if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX 959 || lower_bound > upper_bound 960 || ((obscure_syntax & RE_NO_BK_CURLY_BRACES) 961 && p != pend && *p == '{')) 962 { 963 if (obscure_syntax & RE_NO_BK_CURLY_BRACES) 964 goto unfetch_interval; 965 else 966 goto invalid_pattern; 967 } 968 969 /* If upper_bound is zero, don't want to succeed at all; 970 jump from laststart to b + 3, which will be the end of 971 the buffer after this jump is inserted. */ 972 973 if (upper_bound == 0) 974 { 975 GET_BUFFER_SPACE (3); 976 insert_jump (jump, laststart, b + 3, b); 977 b += 3; 978 } 979 980 /* Otherwise, after lower_bound number of succeeds, jump 981 to after the jump_n which will be inserted at the end 982 of the buffer, and insert that jump_n. */ 983 else 984 { /* Set to 5 if only one repetition is allowed and 985 hence no jump_n is inserted at the current end of 986 the buffer; then only space for the succeed_n is 987 needed. Otherwise, need space for both the 988 succeed_n and the jump_n. */ 989 990 unsigned slots_needed = upper_bound == 1 ? 5 : 10; 991 992 GET_BUFFER_SPACE ((int) slots_needed); 993 /* Initialize the succeed_n to n, even though it will 994 be set by its attendant set_number_at, because 995 re_compile_fastmap will need to know it. Jump to 996 what the end of buffer will be after inserting 997 this succeed_n and possibly appending a jump_n. */ 998 insert_jump_n (succeed_n, laststart, b + slots_needed, 999 b, lower_bound); 1000 b += 5; /* Just increment for the succeed_n here. */ 1001 1002 /* More than one repetition is allowed, so put in at 1003 the end of the buffer a backward jump from b to the 1004 succeed_n we put in above. By the time we've gotten 1005 to this jump when matching, we'll have matched once 1006 already, so jump back only upper_bound - 1 times. */ 1007 1008 if (upper_bound > 1) 1009 { 1010 store_jump_n (b, jump_n, laststart, upper_bound - 1); 1011 b += 5; 1012 /* When hit this when matching, reset the 1013 preceding jump_n's n to upper_bound - 1. */ 1014 BUFPUSH (set_number_at); 1015 GET_BUFFER_SPACE (2); 1016 STORE_NUMBER_AND_INCR (b, -5); 1017 STORE_NUMBER_AND_INCR (b, upper_bound - 1); 1018 } 1019 /* When hit this when matching, set the succeed_n's n. */ 1020 GET_BUFFER_SPACE (5); 1021 insert_op_2 (set_number_at, laststart, b, 5, lower_bound); 1022 b += 5; 1023 } 1024 pending_exact = 0; 1025 beg_interval = 0; 1026 break; 1027 1028 1029 unfetch_interval: 1030 /* If an invalid interval, match the characters as literals. */ 1031 if (beg_interval) 1032 p = beg_interval; 1033 else 1034 { 1035 fprintf (stderr, 1036 "regex: no interval beginning to which to backtrack.\n"); 1037 exit (1); 1038 } 1039 1040 beg_interval = 0; 1041 PATFETCH (c); /* normal_char expects char in `c'. */ 1042 goto normal_char; 1043 break; 1044 1045 #ifdef emacs 1046 case '=': 1047 BUFPUSH (at_dot); 1048 break; 1049 1050 case 's': 1051 laststart = b; 1052 BUFPUSH (syntaxspec); 1053 PATFETCH (c); 1054 BUFPUSH (syntax_spec_code[c]); 1055 break; 1056 1057 case 'S': 1058 laststart = b; 1059 BUFPUSH (notsyntaxspec); 1060 PATFETCH (c); 1061 BUFPUSH (syntax_spec_code[c]); 1062 break; 1063 #endif /* emacs */ 1064 1065 case 'w': 1066 laststart = b; 1067 BUFPUSH (wordchar); 1068 break; 1069 1070 case 'W': 1071 laststart = b; 1072 BUFPUSH (notwordchar); 1073 break; 1074 1075 case '<': 1076 BUFPUSH (wordbeg); 1077 break; 1078 1079 case '>': 1080 BUFPUSH (wordend); 1081 break; 1082 1083 case 'b': 1084 BUFPUSH (wordbound); 1085 break; 1086 1087 case 'B': 1088 BUFPUSH (notwordbound); 1089 break; 1090 1091 case '`': 1092 BUFPUSH (begbuf); 1093 break; 1094 1095 case '\'': 1096 BUFPUSH (endbuf); 1097 break; 1098 1099 case '1': 1100 case '2': 1101 case '3': 1102 case '4': 1103 case '5': 1104 case '6': 1105 case '7': 1106 case '8': 1107 case '9': 1108 if (obscure_syntax & RE_NO_BK_REFS) 1109 goto normal_char; 1110 c1 = c - '0'; 1111 if (c1 >= regnum) 1112 { 1113 if (obscure_syntax & RE_NO_EMPTY_BK_REF) 1114 goto invalid_pattern; 1115 else 1116 goto normal_char; 1117 } 1118 /* Can't back reference to a subexpression if inside of it. */ 1119 for (stackt = stackp - 2; stackt > stackb; stackt -= 4) 1120 if (*stackt == c1) 1121 goto normal_char; 1122 laststart = b; 1123 BUFPUSH (duplicate); 1124 BUFPUSH (c1); 1125 break; 1126 1127 case '+': 1128 case '?': 1129 if (obscure_syntax & RE_BK_PLUS_QM) 1130 goto handle_plus; 1131 else 1132 goto normal_backsl; 1133 break; 1134 1135 default: 1136 normal_backsl: 1137 /* You might think it would be useful for \ to mean 1138 not to translate; but if we don't translate it 1139 it will never match anything. */ 1140 if (translate) c = translate[c]; 1141 goto normal_char; 1142 } 1143 break; 1144 1145 default: 1146 normal_char: /* Expects the character in `c'. */ 1147 if (!pending_exact || pending_exact + *pending_exact + 1 != b 1148 || *pending_exact == 0177 || *p == '*' || *p == '^' 1149 || ((obscure_syntax & RE_BK_PLUS_QM) 1150 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 1151 : (*p == '+' || *p == '?')) 1152 || ((obscure_syntax & RE_INTERVALS) 1153 && ((obscure_syntax & RE_NO_BK_CURLY_BRACES) 1154 ? *p == '{' 1155 : (p[0] == '\\' && p[1] == '{')))) 1156 { 1157 laststart = b; 1158 BUFPUSH (exactn); 1159 pending_exact = b; 1160 BUFPUSH (0); 1161 } 1162 BUFPUSH (c); 1163 (*pending_exact)++; 1164 } 1165 } 1166 1167 if (fixup_jump) 1168 store_jump (fixup_jump, jump, b); 1169 1170 if (stackp != stackb) goto unmatched_open; 1171 1172 bufp->used = b - bufp->buffer; 1173 return 0; 1174 1175 invalid_pattern: 1176 return "Invalid regular expression"; 1177 1178 unmatched_open: 1179 return "Unmatched \\("; 1180 1181 unmatched_close: 1182 return "Unmatched \\)"; 1183 1184 end_of_pattern: 1185 return "Premature end of regular expression"; 1186 1187 nesting_too_deep: 1188 return "Nesting too deep"; 1189 1190 too_big: 1191 return "Regular expression too big"; 1192 1193 memory_exhausted: 1194 return "Memory exhausted"; 1195 } 1196 1197 1198 /* Store a jump of the form <OPCODE> <relative address>. 1199 Store in the location FROM a jump operation to jump to relative 1200 address FROM - TO. OPCODE is the opcode to store. */ 1201 1202 static void 1203 store_jump (char *from, char opcode, char *to) 1204 { 1205 from[0] = opcode; 1206 STORE_NUMBER(from + 1, to - (from + 3)); 1207 } 1208 1209 1210 /* Open up space before char FROM, and insert there a jump to TO. 1211 CURRENT_END gives the end of the storage not in use, so we know 1212 how much data to copy up. OP is the opcode of the jump to insert. 1213 1214 If you call this function, you must zero out pending_exact. */ 1215 1216 static void 1217 insert_jump (char op, char *from, char *to, char *current_end) 1218 { 1219 register char *pfrom = current_end; /* Copy from here... */ 1220 register char *pto = current_end + 3; /* ...to here. */ 1221 1222 while (pfrom != from) 1223 *--pto = *--pfrom; 1224 store_jump (from, op, to); 1225 } 1226 1227 1228 /* Store a jump of the form <opcode> <relative address> <n> . 1229 1230 Store in the location FROM a jump operation to jump to relative 1231 address FROM - TO. OPCODE is the opcode to store, N is a number the 1232 jump uses, say, to decide how many times to jump. 1233 1234 If you call this function, you must zero out pending_exact. */ 1235 1236 static void 1237 store_jump_n (char *from, char opcode, char *to, unsigned n) 1238 { 1239 from[0] = opcode; 1240 STORE_NUMBER (from + 1, to - (from + 3)); 1241 STORE_NUMBER (from + 3, n); 1242 } 1243 1244 1245 /* Similar to insert_jump, but handles a jump which needs an extra 1246 number to handle minimum and maximum cases. Open up space at 1247 location FROM, and insert there a jump to TO. CURRENT_END gives the 1248 end of the storage in use, so we know how much data to copy up. OP is 1249 the opcode of the jump to insert. 1250 1251 If you call this function, you must zero out pending_exact. */ 1252 1253 static void 1254 insert_jump_n (char op, char *from, char *to, char *current_end, unsigned n) 1255 { 1256 register char *pfrom = current_end; /* Copy from here... */ 1257 register char *pto = current_end + 5; /* ...to here. */ 1258 1259 while (pfrom != from) 1260 *--pto = *--pfrom; 1261 store_jump_n (from, op, to, n); 1262 } 1263 1264 1265 /* Open up space at location THERE, and insert operation OP followed by 1266 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so 1267 we know how much data to copy up. 1268 1269 If you call this function, you must zero out pending_exact. */ 1270 1271 static void 1272 insert_op_2 (char op, char *there, char *current_end, int num_1, int num_2) 1273 { 1274 register char *pfrom = current_end; /* Copy from here... */ 1275 register char *pto = current_end + 5; /* ...to here. */ 1276 1277 while (pfrom != there) 1278 *--pto = *--pfrom; 1279 1280 there[0] = op; 1281 STORE_NUMBER (there + 1, num_1); 1282 STORE_NUMBER (there + 3, num_2); 1283 } 1284 1285 1286 1287 /* Given a pattern, compute a fastmap from it. The fastmap records 1288 which of the (1 << BYTEWIDTH) possible characters can start a string 1289 that matches the pattern. This fastmap is used by re_search to skip 1290 quickly over totally implausible text. 1291 1292 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 1293 area as bufp->fastmap. 1294 The other components of bufp describe the pattern to be used. */ 1295 1296 void 1297 re_compile_fastmap (struct re_pattern_buffer *bufp) 1298 { 1299 unsigned char *pattern = (unsigned char *) bufp->buffer; 1300 int size = bufp->used; 1301 register char *fastmap = bufp->fastmap; 1302 register unsigned char *p = pattern; 1303 register unsigned char *pend = pattern + size; 1304 register int j, k; 1305 unsigned char *translate = (unsigned char *) bufp->translate; 1306 1307 unsigned char *stackb[NFAILURES]; 1308 unsigned char **stackp = stackb; 1309 1310 unsigned is_a_succeed_n; 1311 1312 memset (fastmap, 0, (1 << BYTEWIDTH)); 1313 bufp->fastmap_accurate = 1; 1314 bufp->can_be_null = 0; 1315 1316 while (p) 1317 { 1318 is_a_succeed_n = 0; 1319 if (p == pend) 1320 { 1321 bufp->can_be_null = 1; 1322 break; 1323 } 1324 #ifdef SWITCH_ENUM_BUG 1325 switch ((int) ((enum regexpcode) *p++)) 1326 #else 1327 switch ((enum regexpcode) *p++) 1328 #endif 1329 { 1330 case exactn: 1331 if (translate) 1332 fastmap[translate[p[1]]] = 1; 1333 else 1334 fastmap[p[1]] = 1; 1335 break; 1336 1337 case unused: 1338 case begline: 1339 #ifdef emacs 1340 case before_dot: 1341 case at_dot: 1342 case after_dot: 1343 #endif 1344 case begbuf: 1345 case endbuf: 1346 case wordbound: 1347 case notwordbound: 1348 case wordbeg: 1349 case wordend: 1350 continue; 1351 1352 case endline: 1353 if (translate) 1354 fastmap[translate['\n']] = 1; 1355 else 1356 fastmap['\n'] = 1; 1357 1358 if (bufp->can_be_null != 1) 1359 bufp->can_be_null = 2; 1360 break; 1361 1362 case jump_n: 1363 case finalize_jump: 1364 case maybe_finalize_jump: 1365 case jump: 1366 case dummy_failure_jump: 1367 EXTRACT_NUMBER_AND_INCR (j, p); 1368 p += j; 1369 if (j > 0) 1370 continue; 1371 /* Jump backward reached implies we just went through 1372 the body of a loop and matched nothing. 1373 Opcode jumped to should be an on_failure_jump. 1374 Just treat it like an ordinary jump. 1375 For a * loop, it has pushed its failure point already; 1376 If so, discard that as redundant. */ 1377 1378 if ((enum regexpcode) *p != on_failure_jump 1379 && (enum regexpcode) *p != succeed_n) 1380 continue; 1381 p++; 1382 EXTRACT_NUMBER_AND_INCR (j, p); 1383 p += j; 1384 if (stackp != stackb && *stackp == p) 1385 stackp--; 1386 continue; 1387 1388 case on_failure_jump: 1389 handle_on_failure_jump: 1390 EXTRACT_NUMBER_AND_INCR (j, p); 1391 *++stackp = p + j; 1392 if (is_a_succeed_n) 1393 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 1394 continue; 1395 1396 case succeed_n: 1397 is_a_succeed_n = 1; 1398 /* Get to the number of times to succeed. */ 1399 p += 2; 1400 /* Increment p past the n for when k != 0. */ 1401 EXTRACT_NUMBER_AND_INCR (k, p); 1402 if (k == 0) 1403 { 1404 p -= 4; 1405 goto handle_on_failure_jump; 1406 } 1407 continue; 1408 1409 case set_number_at: 1410 p += 4; 1411 continue; 1412 1413 case start_memory: 1414 case stop_memory: 1415 p++; 1416 continue; 1417 1418 case duplicate: 1419 bufp->can_be_null = 1; 1420 fastmap['\n'] = 1; 1421 case anychar: 1422 for (j = 0; j < (1 << BYTEWIDTH); j++) 1423 if (j != '\n') 1424 fastmap[j] = 1; 1425 if (bufp->can_be_null) 1426 return; 1427 /* Don't return; check the alternative paths 1428 so we can set can_be_null if appropriate. */ 1429 break; 1430 1431 case wordchar: 1432 for (j = 0; j < (1 << BYTEWIDTH); j++) 1433 if (SYNTAX (j) == Sword) 1434 fastmap[j] = 1; 1435 break; 1436 1437 case notwordchar: 1438 for (j = 0; j < (1 << BYTEWIDTH); j++) 1439 if (SYNTAX (j) != Sword) 1440 fastmap[j] = 1; 1441 break; 1442 1443 #ifdef emacs 1444 case syntaxspec: 1445 k = *p++; 1446 for (j = 0; j < (1 << BYTEWIDTH); j++) 1447 if (SYNTAX (j) == (enum syntaxcode) k) 1448 fastmap[j] = 1; 1449 break; 1450 1451 case notsyntaxspec: 1452 k = *p++; 1453 for (j = 0; j < (1 << BYTEWIDTH); j++) 1454 if (SYNTAX (j) != (enum syntaxcode) k) 1455 fastmap[j] = 1; 1456 break; 1457 #endif /* not emacs */ 1458 1459 case charset: 1460 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 1461 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 1462 { 1463 if (translate) 1464 fastmap[translate[j]] = 1; 1465 else 1466 fastmap[j] = 1; 1467 } 1468 break; 1469 1470 case charset_not: 1471 /* Chars beyond end of map must be allowed */ 1472 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 1473 if (translate) 1474 fastmap[translate[j]] = 1; 1475 else 1476 fastmap[j] = 1; 1477 1478 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 1479 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 1480 { 1481 if (translate) 1482 fastmap[translate[j]] = 1; 1483 else 1484 fastmap[j] = 1; 1485 } 1486 break; 1487 } 1488 1489 /* Get here means we have successfully found the possible starting 1490 characters of one path of the pattern. We need not follow this 1491 path any farther. Instead, look at the next alternative 1492 remembered in the stack. */ 1493 if (stackp != stackb) 1494 p = *stackp--; 1495 else 1496 break; 1497 } 1498 } 1499 1500 1501 1502 /* Like re_search_2, below, but only one string is specified, and 1503 doesn't let you say where to stop matching. */ 1504 1505 int 1506 re_search (struct re_pattern_buffer *pbufp, 1507 char *string, 1508 int size, 1509 int startpos, 1510 int range, 1511 struct re_registers *regs) 1512 { 1513 return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range, 1514 regs, size); 1515 } 1516 1517 1518 /* Using the compiled pattern in PBUFP->buffer, first tries to match the 1519 virtual concatenation of STRING1 and STRING2, starting first at index 1520 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of 1521 places to try before giving up. If RANGE is negative, it searches 1522 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS 1523 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively. 1524 In REGS, return the indices of the virtual concatenation of STRING1 1525 and STRING2 that matched the entire PBUFP->buffer and its contained 1526 subexpressions. Do not consider matching one past the index MSTOP in 1527 the virtual concatenation of STRING1 and STRING2. 1528 1529 The value returned is the position in the strings at which the match 1530 was found, or -1 if no match was found, or -2 if error (such as 1531 failure stack overflow). */ 1532 1533 int 1534 re_search_2 (struct re_pattern_buffer *pbufp, 1535 char *string1, int size1, 1536 char *string2, int size2, 1537 int startpos, 1538 register int range, 1539 struct re_registers *regs, 1540 int mstop) 1541 { 1542 register char *fastmap = pbufp->fastmap; 1543 register unsigned char *translate = (unsigned char *) pbufp->translate; 1544 int total_size = size1 + size2; 1545 int endpos = startpos + range; 1546 int val; 1547 1548 /* Check for out-of-range starting position. */ 1549 if (startpos < 0 || startpos > total_size) 1550 return -1; 1551 1552 /* Fix up range if it would eventually take startpos outside of the 1553 virtual concatenation of string1 and string2. */ 1554 if (endpos < -1) 1555 range = -1 - startpos; 1556 else if (endpos > total_size) 1557 range = total_size - startpos; 1558 1559 /* Update the fastmap now if not correct already. */ 1560 if (fastmap && !pbufp->fastmap_accurate) 1561 re_compile_fastmap (pbufp); 1562 1563 /* If the search isn't to be a backwards one, don't waste time in a 1564 long search for a pattern that says it is anchored. */ 1565 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf 1566 && range > 0) 1567 { 1568 if (startpos > 0) 1569 return -1; 1570 else 1571 range = 1; 1572 } 1573 1574 while (1) 1575 { 1576 /* If a fastmap is supplied, skip quickly over characters that 1577 cannot possibly be the start of a match. Note, however, that 1578 if the pattern can possibly match the null string, we must 1579 test it at each starting point so that we take the first null 1580 string we get. */ 1581 1582 if (fastmap && startpos < total_size && pbufp->can_be_null != 1) 1583 { 1584 if (range > 0) /* Searching forwards. */ 1585 { 1586 register int lim = 0; 1587 register unsigned char *p; 1588 int irange = range; 1589 if (startpos < size1 && startpos + range >= size1) 1590 lim = range - (size1 - startpos); 1591 1592 p = ((unsigned char *) 1593 &(startpos >= size1 ? string2 - size1 : string1)[startpos]); 1594 1595 while (range > lim && !fastmap[translate 1596 ? translate[*p++] 1597 : *p++]) 1598 range--; 1599 startpos += irange - range; 1600 } 1601 else /* Searching backwards. */ 1602 { 1603 register unsigned char c; 1604 1605 if (string1 == 0 || startpos >= size1) 1606 c = string2[startpos - size1]; 1607 else 1608 c = string1[startpos]; 1609 1610 c &= 0xff; 1611 if (translate ? !fastmap[translate[c]] : !fastmap[c]) 1612 goto advance; 1613 } 1614 } 1615 1616 if (range >= 0 && startpos == total_size 1617 && fastmap && pbufp->can_be_null == 0) 1618 return -1; 1619 1620 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, 1621 regs, mstop); 1622 if (val >= 0) 1623 return startpos; 1624 if (val == -2) 1625 return -2; 1626 1627 #ifdef C_ALLOCA 1628 alloca (0); 1629 #endif /* C_ALLOCA */ 1630 1631 advance: 1632 if (!range) 1633 break; 1634 else if (range > 0) 1635 { 1636 range--; 1637 startpos++; 1638 } 1639 else 1640 { 1641 range++; 1642 startpos--; 1643 } 1644 } 1645 return -1; 1646 } 1647 1648 1649 1650 #ifndef emacs /* emacs never uses this. */ 1651 int 1652 re_match (struct re_pattern_buffer *pbufp, 1653 char *string, 1654 int size, 1655 int pos, 1656 struct re_registers *regs) 1657 { 1658 return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size); 1659 } 1660 #endif /* not emacs */ 1661 1662 1663 /* The following are used for re_match_2, defined below: */ 1664 1665 /* Roughly the maximum number of failure points on the stack. Would be 1666 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */ 1667 1668 int re_max_failures = 2000; 1669 1670 /* Routine used by re_match_2. */ 1671 static int bcmp_translate (char *, char *, int, unsigned char *); 1672 1673 1674 /* Structure and accessing macros used in re_match_2: */ 1675 1676 struct register_info 1677 { 1678 unsigned is_active : 1; 1679 unsigned matched_something : 1; 1680 }; 1681 1682 #define IS_ACTIVE(R) ((R).is_active) 1683 #define MATCHED_SOMETHING(R) ((R).matched_something) 1684 1685 1686 /* Macros used by re_match_2: */ 1687 1688 1689 /* I.e., regstart, regend, and reg_info. */ 1690 1691 #define NUM_REG_ITEMS 3 1692 1693 /* We push at most this many things on the stack whenever we 1694 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are 1695 arguments to the PUSH_FAILURE_POINT macro. */ 1696 1697 #define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2) 1698 1699 1700 /* We push this many things on the stack whenever we fail. */ 1701 1702 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2) 1703 1704 1705 /* This pushes most of the information about the current state we will want 1706 if we ever fail back to it. */ 1707 1708 #define PUSH_FAILURE_POINT(pattern_place, string_place) \ 1709 { \ 1710 short last_used_reg, this_reg; \ 1711 \ 1712 /* Find out how many registers are active or have been matched. \ 1713 (Aside from register zero, which is only set at the end.) */ \ 1714 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\ 1715 if (regstart[last_used_reg] != (unsigned char *) -1) \ 1716 break; \ 1717 \ 1718 if (stacke - stackp < NUM_FAILURE_ITEMS) \ 1719 { \ 1720 unsigned char **stackx; \ 1721 int len = stacke - stackb; \ 1722 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \ 1723 return -2; \ 1724 \ 1725 /* Roughly double the size of the stack. */ \ 1726 stackx = (unsigned char **) alloca (2 * len \ 1727 * sizeof (unsigned char *));\ 1728 /* Only copy what is in use. */ \ 1729 memcpy (stackx, stackb, len * sizeof (char *)); \ 1730 stackp = stackx + (stackp - stackb); \ 1731 stackb = stackx; \ 1732 stacke = stackb + 2 * len; \ 1733 } \ 1734 \ 1735 /* Now push the info for each of those registers. */ \ 1736 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \ 1737 { \ 1738 *stackp++ = regstart[this_reg]; \ 1739 *stackp++ = regend[this_reg]; \ 1740 *stackp++ = (unsigned char *) ®_info[this_reg]; \ 1741 } \ 1742 \ 1743 /* Push how many registers we saved. */ \ 1744 *stackp++ = (unsigned char *) last_used_reg; \ 1745 \ 1746 *stackp++ = pattern_place; \ 1747 *stackp++ = string_place; \ 1748 } 1749 1750 1751 /* This pops what PUSH_FAILURE_POINT pushes. */ 1752 1753 #define POP_FAILURE_POINT() \ 1754 { \ 1755 int temp; \ 1756 stackp -= 2; /* Remove failure points. */ \ 1757 temp = (int) *--stackp; /* How many regs pushed. */ \ 1758 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \ 1759 stackp -= temp; /* Remove the register info. */ \ 1760 } 1761 1762 1763 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 1764 1765 /* Is true if there is a first string and if PTR is pointing anywhere 1766 inside it or just past the end. */ 1767 1768 #define IS_IN_FIRST_STRING(ptr) \ 1769 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 1770 1771 /* Call before fetching a character with *d. This switches over to 1772 string2 if necessary. */ 1773 1774 #define PREFETCH \ 1775 while (d == dend) \ 1776 { \ 1777 /* end of string2 => fail. */ \ 1778 if (dend == end_match_2) \ 1779 goto fail; \ 1780 /* end of string1 => advance to string2. */ \ 1781 d = string2; \ 1782 dend = end_match_2; \ 1783 } 1784 1785 1786 /* Call this when have matched something; it sets `matched' flags for the 1787 registers corresponding to the subexpressions of which we currently 1788 are inside. */ 1789 #define SET_REGS_MATCHED \ 1790 { unsigned this_reg; \ 1791 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \ 1792 { \ 1793 if (IS_ACTIVE(reg_info[this_reg])) \ 1794 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \ 1795 else \ 1796 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \ 1797 } \ 1798 } 1799 1800 /* Test if at very beginning or at very end of the virtual concatenation 1801 of string1 and string2. If there is only one string, we've put it in 1802 string2. */ 1803 1804 #define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2) 1805 #define AT_STRINGS_END (d == end2) 1806 1807 #define AT_WORD_BOUNDARY \ 1808 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d)) 1809 1810 /* We have two special cases to check for: 1811 1) if we're past the end of string1, we have to look at the first 1812 character in string2; 1813 2) if we're before the beginning of string2, we have to look at the 1814 last character in string1; we assume there is a string1, so use 1815 this in conjunction with AT_STRINGS_BEG. */ 1816 #define IS_A_LETTER(d) \ 1817 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\ 1818 == Sword) 1819 1820 1821 /* Match the pattern described by PBUFP against the virtual 1822 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2, 1823 respectively. Start the match at index POS in the virtual 1824 concatenation of STRING1 and STRING2. In REGS, return the indices of 1825 the virtual concatenation of STRING1 and STRING2 that matched the 1826 entire PBUFP->buffer and its contained subexpressions. Do not 1827 consider matching one past the index MSTOP in the virtual 1828 concatenation of STRING1 and STRING2. 1829 1830 If pbufp->fastmap is nonzero, then it had better be up to date. 1831 1832 The reason that the data to match are specified as two components 1833 which are to be regarded as concatenated is so this function can be 1834 used directly on the contents of an Emacs buffer. 1835 1836 -1 is returned if there is no match. -2 is returned if there is an 1837 error (such as match stack overflow). Otherwise the value is the 1838 length of the substring which was matched. */ 1839 1840 int 1841 re_match_2 (struct re_pattern_buffer *pbufp, 1842 char *string1_arg, int size1, 1843 char *string2_arg, int size2, 1844 int pos, 1845 struct re_registers *regs, 1846 int mstop) 1847 { 1848 register unsigned char *p = (unsigned char *) pbufp->buffer; 1849 1850 /* Pointer to beyond end of buffer. */ 1851 register unsigned char *pend = p + pbufp->used; 1852 1853 unsigned char *string1 = (unsigned char *) string1_arg; 1854 unsigned char *string2 = (unsigned char *) string2_arg; 1855 unsigned char *end1; /* Just past end of first string. */ 1856 unsigned char *end2; /* Just past end of second string. */ 1857 1858 /* Pointers into string1 and string2, just past the last characters in 1859 each to consider matching. */ 1860 unsigned char *end_match_1, *end_match_2; 1861 1862 register unsigned char *d, *dend; 1863 register int mcnt; /* Multipurpose. */ 1864 unsigned char *translate = (unsigned char *) pbufp->translate; 1865 unsigned is_a_jump_n = 0; 1866 1867 /* Failure point stack. Each place that can handle a failure further 1868 down the line pushes a failure point on this stack. It consists of 1869 restart, regend, and reg_info for all registers corresponding to the 1870 subexpressions we're currently inside, plus the number of such 1871 registers, and, finally, two char *'s. The first char * is where to 1872 resume scanning the pattern; the second one is where to resume 1873 scanning the strings. If the latter is zero, the failure point is a 1874 ``dummy''; if a failure happens and the failure point is a dummy, it 1875 gets discarded and the next next one is tried. */ 1876 1877 unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES]; 1878 unsigned char **stackb = initial_stack; 1879 unsigned char **stackp = stackb; 1880 unsigned char **stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES]; 1881 1882 1883 /* Information on the contents of registers. These are pointers into 1884 the input strings; they record just what was matched (on this 1885 attempt) by a subexpression part of the pattern, that is, the 1886 regnum-th regstart pointer points to where in the pattern we began 1887 matching and the regnum-th regend points to right after where we 1888 stopped matching the regnum-th subexpression. (The zeroth register 1889 keeps track of what the whole pattern matches.) */ 1890 1891 unsigned char *regstart[RE_NREGS]; 1892 unsigned char *regend[RE_NREGS]; 1893 1894 /* The is_active field of reg_info helps us keep track of which (possibly 1895 nested) subexpressions we are currently in. The matched_something 1896 field of reg_info[reg_num] helps us tell whether or not we have 1897 matched any of the pattern so far this time through the reg_num-th 1898 subexpression. These two fields get reset each time through any 1899 loop their register is in. */ 1900 1901 struct register_info reg_info[RE_NREGS]; 1902 1903 1904 /* The following record the register info as found in the above 1905 variables when we find a match better than any we've seen before. 1906 This happens as we backtrack through the failure points, which in 1907 turn happens only if we have not yet matched the entire string. */ 1908 1909 unsigned best_regs_set = 0; 1910 unsigned char *best_regstart[RE_NREGS]; 1911 unsigned char *best_regend[RE_NREGS]; 1912 1913 /* Initialize subexpression text positions to -1 to mark ones that no 1914 \( or ( and \) or ) has been seen for. Also set all registers to 1915 inactive and mark them as not having matched anything or ever 1916 failed. */ 1917 for (mcnt = 0; mcnt < RE_NREGS; mcnt++) 1918 { 1919 regstart[mcnt] = regend[mcnt] = (unsigned char *) -1; 1920 IS_ACTIVE (reg_info[mcnt]) = 0; 1921 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 1922 } 1923 1924 if (regs) 1925 for (mcnt = 0; mcnt < RE_NREGS; mcnt++) 1926 regs->start[mcnt] = regs->end[mcnt] = -1; 1927 1928 /* Set up pointers to ends of strings. 1929 Don't allow the second string to be empty unless both are empty. */ 1930 if (size2 == 0) 1931 { 1932 string2 = string1; 1933 size2 = size1; 1934 string1 = 0; 1935 size1 = 0; 1936 } 1937 end1 = string1 + size1; 1938 end2 = string2 + size2; 1939 1940 /* Compute where to stop matching, within the two strings. */ 1941 if (mstop <= size1) 1942 { 1943 end_match_1 = string1 + mstop; 1944 end_match_2 = string2; 1945 } 1946 else 1947 { 1948 end_match_1 = end1; 1949 end_match_2 = string2 + mstop - size1; 1950 } 1951 1952 /* `p' scans through the pattern as `d' scans through the data. `dend' 1953 is the end of the input string that `d' points within. `d' is 1954 advanced into the following input string whenever necessary, but 1955 this happens before fetching; therefore, at the beginning of the 1956 loop, `d' can be pointing at the end of a string, but it cannot 1957 equal string2. */ 1958 1959 if (size1 != 0 && pos <= size1) 1960 d = string1 + pos, dend = end_match_1; 1961 else 1962 d = string2 + pos - size1, dend = end_match_2; 1963 1964 1965 /* This loops over pattern commands. It exits by returning from the 1966 function if match is complete, or it drops through if match fails 1967 at this starting point in the input data. */ 1968 1969 while (1) 1970 { 1971 is_a_jump_n = 0; 1972 /* End of pattern means we might have succeeded. */ 1973 if (p == pend) 1974 { 1975 /* If not end of string, try backtracking. Otherwise done. */ 1976 if (d != end_match_2) 1977 { 1978 if (stackp != stackb) 1979 { 1980 /* More failure points to try. */ 1981 1982 unsigned in_same_string = 1983 IS_IN_FIRST_STRING (best_regend[0]) 1984 == MATCHING_IN_FIRST_STRING; 1985 1986 /* If exceeds best match so far, save it. */ 1987 if (! best_regs_set 1988 || (in_same_string && d > best_regend[0]) 1989 || (! in_same_string && ! MATCHING_IN_FIRST_STRING)) 1990 { 1991 best_regs_set = 1; 1992 best_regend[0] = d; /* Never use regstart[0]. */ 1993 1994 for (mcnt = 1; mcnt < RE_NREGS; mcnt++) 1995 { 1996 best_regstart[mcnt] = regstart[mcnt]; 1997 best_regend[mcnt] = regend[mcnt]; 1998 } 1999 } 2000 goto fail; 2001 } 2002 /* If no failure points, don't restore garbage. */ 2003 else if (best_regs_set) 2004 { 2005 restore_best_regs: 2006 /* Restore best match. */ 2007 d = best_regend[0]; 2008 2009 for (mcnt = 0; mcnt < RE_NREGS; mcnt++) 2010 { 2011 regstart[mcnt] = best_regstart[mcnt]; 2012 regend[mcnt] = best_regend[mcnt]; 2013 } 2014 } 2015 } 2016 2017 /* If caller wants register contents data back, convert it 2018 to indices. */ 2019 if (regs) 2020 { 2021 regs->start[0] = pos; 2022 if (MATCHING_IN_FIRST_STRING) 2023 regs->end[0] = d - string1; 2024 else 2025 regs->end[0] = d - string2 + size1; 2026 for (mcnt = 1; mcnt < RE_NREGS; mcnt++) 2027 { 2028 if (regend[mcnt] == (unsigned char *) -1) 2029 { 2030 regs->start[mcnt] = -1; 2031 regs->end[mcnt] = -1; 2032 continue; 2033 } 2034 if (IS_IN_FIRST_STRING (regstart[mcnt])) 2035 regs->start[mcnt] = regstart[mcnt] - string1; 2036 else 2037 regs->start[mcnt] = regstart[mcnt] - string2 + size1; 2038 2039 if (IS_IN_FIRST_STRING (regend[mcnt])) 2040 regs->end[mcnt] = regend[mcnt] - string1; 2041 else 2042 regs->end[mcnt] = regend[mcnt] - string2 + size1; 2043 } 2044 } 2045 return d - pos - (MATCHING_IN_FIRST_STRING 2046 ? string1 2047 : string2 - size1); 2048 } 2049 2050 /* Otherwise match next pattern command. */ 2051 #ifdef SWITCH_ENUM_BUG 2052 switch ((int) ((enum regexpcode) *p++)) 2053 #else 2054 switch ((enum regexpcode) *p++) 2055 #endif 2056 { 2057 2058 /* \( [or `(', as appropriate] is represented by start_memory, 2059 \) by stop_memory. Both of those commands are followed by 2060 a register number in the next byte. The text matched 2061 within the \( and \) is recorded under that number. */ 2062 case start_memory: 2063 regstart[*p] = d; 2064 IS_ACTIVE (reg_info[*p]) = 1; 2065 MATCHED_SOMETHING (reg_info[*p]) = 0; 2066 p++; 2067 break; 2068 2069 case stop_memory: 2070 regend[*p] = d; 2071 IS_ACTIVE (reg_info[*p]) = 0; 2072 2073 /* If just failed to match something this time around with a sub- 2074 expression that's in a loop, try to force exit from the loop. */ 2075 if ((! MATCHED_SOMETHING (reg_info[*p]) 2076 || (enum regexpcode) p[-3] == start_memory) 2077 && (p + 1) != pend) 2078 { 2079 register unsigned char *p2 = p + 1; 2080 mcnt = 0; 2081 switch (*p2++) 2082 { 2083 case jump_n: 2084 is_a_jump_n = 1; 2085 case finalize_jump: 2086 case maybe_finalize_jump: 2087 case jump: 2088 case dummy_failure_jump: 2089 EXTRACT_NUMBER_AND_INCR (mcnt, p2); 2090 if (is_a_jump_n) 2091 p2 += 2; 2092 break; 2093 } 2094 p2 += mcnt; 2095 2096 /* If the next operation is a jump backwards in the pattern 2097 to an on_failure_jump, exit from the loop by forcing a 2098 failure after pushing on the stack the on_failure_jump's 2099 jump in the pattern, and d. */ 2100 if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump) 2101 { 2102 EXTRACT_NUMBER_AND_INCR (mcnt, p2); 2103 PUSH_FAILURE_POINT (p2 + mcnt, d); 2104 goto fail; 2105 } 2106 } 2107 p++; 2108 break; 2109 2110 /* \<digit> has been turned into a `duplicate' command which is 2111 followed by the numeric value of <digit> as the register number. */ 2112 case duplicate: 2113 { 2114 int regno = *p++; /* Get which register to match against */ 2115 register unsigned char *d2, *dend2; 2116 2117 /* Where in input to try to start matching. */ 2118 d2 = regstart[regno]; 2119 2120 /* Where to stop matching; if both the place to start and 2121 the place to stop matching are in the same string, then 2122 set to the place to stop, otherwise, for now have to use 2123 the end of the first string. */ 2124 2125 dend2 = ((IS_IN_FIRST_STRING (regstart[regno]) 2126 == IS_IN_FIRST_STRING (regend[regno])) 2127 ? regend[regno] : end_match_1); 2128 while (1) 2129 { 2130 /* If necessary, advance to next segment in register 2131 contents. */ 2132 while (d2 == dend2) 2133 { 2134 if (dend2 == end_match_2) break; 2135 if (dend2 == regend[regno]) break; 2136 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */ 2137 } 2138 /* At end of register contents => success */ 2139 if (d2 == dend2) break; 2140 2141 /* If necessary, advance to next segment in data. */ 2142 PREFETCH; 2143 2144 /* How many characters left in this segment to match. */ 2145 mcnt = dend - d; 2146 2147 /* Want how many consecutive characters we can match in 2148 one shot, so, if necessary, adjust the count. */ 2149 if (mcnt > dend2 - d2) 2150 mcnt = dend2 - d2; 2151 2152 /* Compare that many; failure if mismatch, else move 2153 past them. */ 2154 if (translate 2155 ? bcmp_translate ((char*)d, (char*)d2, mcnt, translate) 2156 : memcmp (d, d2, mcnt)) 2157 goto fail; 2158 d += mcnt, d2 += mcnt; 2159 } 2160 } 2161 break; 2162 2163 case anychar: 2164 PREFETCH; /* Fetch a data character. */ 2165 /* Match anything but a newline, maybe even a null. */ 2166 if ((translate ? translate[*d] : *d) == '\n' 2167 || ((obscure_syntax & RE_DOT_NOT_NULL) 2168 && (translate ? translate[*d] : *d) == '\000')) 2169 goto fail; 2170 SET_REGS_MATCHED; 2171 d++; 2172 break; 2173 2174 case charset: 2175 case charset_not: 2176 { 2177 int not = 0; /* Nonzero for charset_not. */ 2178 register int c; 2179 if (*(p - 1) == (unsigned char) charset_not) 2180 not = 1; 2181 2182 PREFETCH; /* Fetch a data character. */ 2183 2184 if (translate) 2185 c = translate[*d]; 2186 else 2187 c = *d; 2188 2189 if (c < *p * BYTEWIDTH 2190 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 2191 not = !not; 2192 2193 p += 1 + *p; 2194 2195 if (!not) goto fail; 2196 SET_REGS_MATCHED; 2197 d++; 2198 break; 2199 } 2200 2201 case begline: 2202 if ((size1 != 0 && d == string1) 2203 || (size1 == 0 && size2 != 0 && d == string2) 2204 || (d && d[-1] == '\n') 2205 || (size1 == 0 && size2 == 0)) 2206 break; 2207 else 2208 goto fail; 2209 2210 case endline: 2211 if (d == end2 2212 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n')) 2213 break; 2214 goto fail; 2215 2216 /* `or' constructs are handled by starting each alternative with 2217 an on_failure_jump that points to the start of the next 2218 alternative. Each alternative except the last ends with a 2219 jump to the joining point. (Actually, each jump except for 2220 the last one really jumps to the following jump, because 2221 tensioning the jumps is a hassle.) */ 2222 2223 /* The start of a stupid repeat has an on_failure_jump that points 2224 past the end of the repeat text. This makes a failure point so 2225 that on failure to match a repetition, matching restarts past 2226 as many repetitions have been found with no way to fail and 2227 look for another one. */ 2228 2229 /* A smart repeat is similar but loops back to the on_failure_jump 2230 so that each repetition makes another failure point. */ 2231 2232 case on_failure_jump: 2233 on_failure: 2234 EXTRACT_NUMBER_AND_INCR (mcnt, p); 2235 PUSH_FAILURE_POINT (p + mcnt, d); 2236 break; 2237 2238 /* The end of a smart repeat has a maybe_finalize_jump back. 2239 Change it either to a finalize_jump or an ordinary jump. */ 2240 case maybe_finalize_jump: 2241 EXTRACT_NUMBER_AND_INCR (mcnt, p); 2242 { 2243 register unsigned char *p2 = p; 2244 /* Compare what follows with the beginning of the repeat. 2245 If we can establish that there is nothing that they would 2246 both match, we can change to finalize_jump. */ 2247 while (p2 + 1 != pend 2248 && (*p2 == (unsigned char) stop_memory 2249 || *p2 == (unsigned char) start_memory)) 2250 p2 += 2; /* Skip over reg number. */ 2251 if (p2 == pend) 2252 p[-3] = (unsigned char) finalize_jump; 2253 else if (*p2 == (unsigned char) exactn 2254 || *p2 == (unsigned char) endline) 2255 { 2256 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2]; 2257 register unsigned char *p1 = p + mcnt; 2258 /* p1[0] ... p1[2] are an on_failure_jump. 2259 Examine what follows that. */ 2260 if (p1[3] == (unsigned char) exactn && p1[5] != c) 2261 p[-3] = (unsigned char) finalize_jump; 2262 else if (p1[3] == (unsigned char) charset 2263 || p1[3] == (unsigned char) charset_not) 2264 { 2265 int not = p1[3] == (unsigned char) charset_not; 2266 if (c < p1[4] * BYTEWIDTH 2267 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 2268 not = !not; 2269 /* `not' is 1 if c would match. */ 2270 /* That means it is not safe to finalize. */ 2271 if (!not) 2272 p[-3] = (unsigned char) finalize_jump; 2273 } 2274 } 2275 } 2276 p -= 2; /* Point at relative address again. */ 2277 if (p[-1] != (unsigned char) finalize_jump) 2278 { 2279 p[-1] = (unsigned char) jump; 2280 goto nofinalize; 2281 } 2282 /* Note fall through. */ 2283 2284 /* The end of a stupid repeat has a finalize_jump back to the 2285 start, where another failure point will be made which will 2286 point to after all the repetitions found so far. */ 2287 2288 /* Take off failure points put on by matching on_failure_jump 2289 because didn't fail. Also remove the register information 2290 put on by the on_failure_jump. */ 2291 case finalize_jump: 2292 POP_FAILURE_POINT (); 2293 /* Note fall through. */ 2294 2295 /* Jump without taking off any failure points. */ 2296 case jump: 2297 nofinalize: 2298 EXTRACT_NUMBER_AND_INCR (mcnt, p); 2299 p += mcnt; 2300 break; 2301 2302 case dummy_failure_jump: 2303 /* Normally, the on_failure_jump pushes a failure point, which 2304 then gets popped at finalize_jump. We will end up at 2305 finalize_jump, also, and with a pattern of, say, `a+', we 2306 are skipping over the on_failure_jump, so we have to push 2307 something meaningless for finalize_jump to pop. */ 2308 PUSH_FAILURE_POINT (0, 0); 2309 goto nofinalize; 2310 2311 2312 /* Have to succeed matching what follows at least n times. Then 2313 just handle like an on_failure_jump. */ 2314 case succeed_n: 2315 EXTRACT_NUMBER (mcnt, p + 2); 2316 /* Originally, this is how many times we HAVE to succeed. */ 2317 if (mcnt) 2318 { 2319 mcnt--; 2320 p += 2; 2321 STORE_NUMBER_AND_INCR (p, mcnt); 2322 } 2323 else if (mcnt == 0) 2324 { 2325 p[2] = unused; 2326 p[3] = unused; 2327 goto on_failure; 2328 } 2329 else 2330 { 2331 fprintf (stderr, "regex: the succeed_n's n is not set.\n"); 2332 exit (1); 2333 } 2334 break; 2335 2336 case jump_n: 2337 EXTRACT_NUMBER (mcnt, p + 2); 2338 /* Originally, this is how many times we CAN jump. */ 2339 if (mcnt) 2340 { 2341 mcnt--; 2342 STORE_NUMBER(p + 2, mcnt); 2343 goto nofinalize; /* Do the jump without taking off 2344 any failure points. */ 2345 } 2346 /* If don't have to jump any more, skip over the rest of command. */ 2347 else 2348 p += 4; 2349 break; 2350 2351 case set_number_at: 2352 { 2353 register unsigned char *p1; 2354 2355 EXTRACT_NUMBER_AND_INCR (mcnt, p); 2356 p1 = p + mcnt; 2357 EXTRACT_NUMBER_AND_INCR (mcnt, p); 2358 STORE_NUMBER (p1, mcnt); 2359 break; 2360 } 2361 2362 /* Ignore these. Used to ignore the n of succeed_n's which 2363 currently have n == 0. */ 2364 case unused: 2365 break; 2366 2367 case wordbound: 2368 if (AT_WORD_BOUNDARY) 2369 break; 2370 goto fail; 2371 2372 case notwordbound: 2373 if (AT_WORD_BOUNDARY) 2374 goto fail; 2375 break; 2376 2377 case wordbeg: 2378 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */ 2379 if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1))) 2380 break; 2381 goto fail; 2382 2383 case wordend: 2384 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */ 2385 if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1) 2386 && (!IS_A_LETTER (d) || AT_STRINGS_END)) 2387 break; 2388 goto fail; 2389 2390 #ifdef emacs 2391 case before_dot: 2392 if (PTR_CHAR_POS (d) >= point) 2393 goto fail; 2394 break; 2395 2396 case at_dot: 2397 if (PTR_CHAR_POS (d) != point) 2398 goto fail; 2399 break; 2400 2401 case after_dot: 2402 if (PTR_CHAR_POS (d) <= point) 2403 goto fail; 2404 break; 2405 2406 case wordchar: 2407 mcnt = (int) Sword; 2408 goto matchsyntax; 2409 2410 case syntaxspec: 2411 mcnt = *p++; 2412 matchsyntax: 2413 PREFETCH; 2414 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail; 2415 SET_REGS_MATCHED; 2416 break; 2417 2418 case notwordchar: 2419 mcnt = (int) Sword; 2420 goto matchnotsyntax; 2421 2422 case notsyntaxspec: 2423 mcnt = *p++; 2424 matchnotsyntax: 2425 PREFETCH; 2426 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail; 2427 SET_REGS_MATCHED; 2428 break; 2429 2430 #else /* not emacs */ 2431 2432 case wordchar: 2433 PREFETCH; 2434 if (!IS_A_LETTER (d)) 2435 goto fail; 2436 SET_REGS_MATCHED; 2437 break; 2438 2439 case notwordchar: 2440 PREFETCH; 2441 if (IS_A_LETTER (d)) 2442 goto fail; 2443 SET_REGS_MATCHED; 2444 break; 2445 2446 #endif /* not emacs */ 2447 2448 case begbuf: 2449 if (AT_STRINGS_BEG) 2450 break; 2451 goto fail; 2452 2453 case endbuf: 2454 if (AT_STRINGS_END) 2455 break; 2456 goto fail; 2457 2458 case exactn: 2459 /* Match the next few pattern characters exactly. 2460 mcnt is how many characters to match. */ 2461 mcnt = *p++; 2462 /* This is written out as an if-else so we don't waste time 2463 testing `translate' inside the loop. */ 2464 if (translate) 2465 { 2466 do 2467 { 2468 PREFETCH; 2469 if (translate[*d++] != *p++) goto fail; 2470 } 2471 while (--mcnt); 2472 } 2473 else 2474 { 2475 do 2476 { 2477 PREFETCH; 2478 if (*d++ != *p++) goto fail; 2479 } 2480 while (--mcnt); 2481 } 2482 SET_REGS_MATCHED; 2483 break; 2484 } 2485 continue; /* Successfully executed one pattern command; keep going. */ 2486 2487 /* Jump here if any matching operation fails. */ 2488 fail: 2489 if (stackp != stackb) 2490 /* A restart point is known. Restart there and pop it. */ 2491 { 2492 short last_used_reg, this_reg; 2493 2494 /* If this failure point is from a dummy_failure_point, just 2495 skip it. */ 2496 if (!stackp[-2]) 2497 { 2498 POP_FAILURE_POINT (); 2499 goto fail; 2500 } 2501 2502 d = *--stackp; 2503 p = *--stackp; 2504 if (d >= string1 && d <= end1) 2505 dend = end_match_1; 2506 /* Restore register info. */ 2507 last_used_reg = (short) *--stackp; 2508 2509 /* Make the ones that weren't saved -1 or 0 again. */ 2510 for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--) 2511 { 2512 regend[this_reg] = (unsigned char *) -1; 2513 regstart[this_reg] = (unsigned char *) -1; 2514 IS_ACTIVE (reg_info[this_reg]) = 0; 2515 MATCHED_SOMETHING (reg_info[this_reg]) = 0; 2516 } 2517 2518 /* And restore the rest from the stack. */ 2519 for ( ; this_reg > 0; this_reg--) 2520 { 2521 reg_info[this_reg] = *(struct register_info *) *--stackp; 2522 regend[this_reg] = *--stackp; 2523 regstart[this_reg] = *--stackp; 2524 } 2525 } 2526 else 2527 break; /* Matching at this starting point really fails. */ 2528 } 2529 2530 if (best_regs_set) 2531 goto restore_best_regs; 2532 return -1; /* Failure to match. */ 2533 } 2534 2535 2536 static int 2537 bcmp_translate (char *s1, char *s2, int len, unsigned char *translate) 2538 { 2539 register unsigned char *p1 = (unsigned char*)s1; 2540 register unsigned char *p2 = (unsigned char*)s2; 2541 while (len) 2542 { 2543 if (translate [*p1++] != translate [*p2++]) return 1; 2544 len--; 2545 } 2546 return 0; 2547 } 2548 2549 2550 2551 /* Entry points compatible with 4.2 BSD regex library. */ 2552 2553 #ifndef emacs 2554 2555 static struct re_pattern_buffer re_comp_buf; 2556 2557 char * 2558 re_comp (const char *s) 2559 { 2560 if (!s) 2561 { 2562 if (!re_comp_buf.buffer) 2563 return "No previous regular expression"; 2564 return 0; 2565 } 2566 2567 if (!re_comp_buf.buffer) 2568 { 2569 if (!(re_comp_buf.buffer = (char *) malloc (200))) 2570 return "Memory exhausted"; 2571 re_comp_buf.allocated = 200; 2572 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH))) 2573 return "Memory exhausted"; 2574 } 2575 return re_compile_pattern (s, strlen (s), &re_comp_buf); 2576 } 2577 2578 int 2579 re_exec (const char *s) 2580 { 2581 int len = strlen (s); 2582 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 2583 (struct re_registers *) 0); 2584 } 2585 #endif /* not emacs */ 2586 2587 2588 2589 #ifdef test 2590 2591 #include <stdio.h> 2592 2593 /* Indexed by a character, gives the upper case equivalent of the 2594 character. */ 2595 2596 char upcase[0400] = 2597 { 000, 001, 002, 003, 004, 005, 006, 007, 2598 010, 011, 012, 013, 014, 015, 016, 017, 2599 020, 021, 022, 023, 024, 025, 026, 027, 2600 030, 031, 032, 033, 034, 035, 036, 037, 2601 040, 041, 042, 043, 044, 045, 046, 047, 2602 050, 051, 052, 053, 054, 055, 056, 057, 2603 060, 061, 062, 063, 064, 065, 066, 067, 2604 070, 071, 072, 073, 074, 075, 076, 077, 2605 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107, 2606 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117, 2607 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127, 2608 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137, 2609 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107, 2610 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117, 2611 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127, 2612 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177, 2613 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207, 2614 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, 2615 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227, 2616 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, 2617 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247, 2618 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, 2619 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267, 2620 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, 2621 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307, 2622 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, 2623 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327, 2624 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, 2625 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347, 2626 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, 2627 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367, 2628 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377 2629 }; 2630 2631 #ifdef canned 2632 2633 #include "tests.h" 2634 2635 typedef enum { extended_test, basic_test } test_type; 2636 2637 /* Use this to run the tests we've thought of. */ 2638 2639 void 2640 main () 2641 { 2642 test_type t = extended_test; 2643 2644 if (t == basic_test) 2645 { 2646 printf ("Running basic tests:\n\n"); 2647 test_posix_basic (); 2648 } 2649 else if (t == extended_test) 2650 { 2651 printf ("Running extended tests:\n\n"); 2652 test_posix_extended (); 2653 } 2654 } 2655 2656 #else /* not canned */ 2657 2658 /* Use this to run interactive tests. */ 2659 2660 void 2661 main (int argc, char **argv) 2662 { 2663 char pat[80]; 2664 struct re_pattern_buffer buf; 2665 int i; 2666 char c; 2667 char fastmap[(1 << BYTEWIDTH)]; 2668 2669 /* Allow a command argument to specify the style of syntax. */ 2670 if (argc > 1) 2671 obscure_syntax = atoi (argv[1]); 2672 2673 buf.allocated = 40; 2674 buf.buffer = (char *) malloc (buf.allocated); 2675 buf.fastmap = fastmap; 2676 buf.translate = upcase; 2677 2678 while (1) 2679 { 2680 gets (pat); 2681 2682 if (*pat) 2683 { 2684 re_compile_pattern (pat, strlen(pat), &buf); 2685 2686 for (i = 0; i < buf.used; i++) 2687 printchar (buf.buffer[i]); 2688 2689 putchar ('\n'); 2690 2691 printf ("%d allocated, %d used.\n", buf.allocated, buf.used); 2692 2693 re_compile_fastmap (&buf); 2694 printf ("Allowed by fastmap: "); 2695 for (i = 0; i < (1 << BYTEWIDTH); i++) 2696 if (fastmap[i]) printchar (i); 2697 putchar ('\n'); 2698 } 2699 2700 gets (pat); /* Now read the string to match against */ 2701 2702 i = re_match (&buf, pat, strlen (pat), 0, 0); 2703 printf ("Match value %d.\n", i); 2704 } 2705 } 2706 2707 #endif 2708 2709 2710 #ifdef NOTDEF 2711 void 2712 print_buf (struct re_pattern_buffer *bufpbufp) 2713 { 2714 int i; 2715 2716 printf ("buf is :\n----------------\n"); 2717 for (i = 0; i < bufp->used; i++) 2718 printchar (bufp->buffer[i]); 2719 2720 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used); 2721 2722 printf ("Allowed by fastmap: "); 2723 for (i = 0; i < (1 << BYTEWIDTH); i++) 2724 if (bufp->fastmap[i]) 2725 printchar (i); 2726 printf ("\nAllowed by translate: "); 2727 if (bufp->translate) 2728 for (i = 0; i < (1 << BYTEWIDTH); i++) 2729 if (bufp->translate[i]) 2730 printchar (i); 2731 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't"); 2732 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not"); 2733 } 2734 #endif /* NOTDEF */ 2735 2736 void 2737 printchar (char c) 2738 { 2739 if (c < 040 || c >= 0177) 2740 { 2741 putchar ('\\'); 2742 putchar (((c >> 6) & 3) + '0'); 2743 putchar (((c >> 3) & 7) + '0'); 2744 putchar ((c & 7) + '0'); 2745 } 2746 else 2747 putchar (c); 2748 } 2749 2750 void 2751 error (char *string) 2752 { 2753 puts (string); 2754 exit (1); 2755 } 2756 #endif /* test */ 2757