1 #if !defined(RXH) || defined(RX_WANT_SE_DEFS) 2 #define RXH 3 4 /* Copyright (C) 1992, 1993 Free Software Foundation, Inc. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this software; see the file COPYING. If not, write to 18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ 19 /* t. lord Wed Sep 23 18:20:57 1992 */ 20 21 22 23 24 #ifndef RX_WANT_SE_DEFS 25 26 /* This page: Bitsets */ 27 28 #ifndef RX_subset 29 typedef unsigned int RX_subset; 30 #define RX_subset_bits (32) 31 #define RX_subset_mask (RX_subset_bits - 1) 32 #endif 33 34 typedef RX_subset * rx_Bitset; 35 36 #ifdef __STDC__ 37 typedef void (*rx_bitset_iterator) (void *, int member_index); 38 #else 39 typedef void (*rx_bitset_iterator) (); 40 #endif 41 42 #define rx_bitset_subset(N) ((N) / RX_subset_bits) 43 #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) 44 #define RX_bitset_access(B,N,OP) \ 45 ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) 46 #define RX_bitset_member(B,N) RX_bitset_access(B, N, &) 47 #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) 48 #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) 49 #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) 50 #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) 51 #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) 52 53 54 55 /* This page: Splay trees. */ 56 57 #ifdef __STDC__ 58 typedef int (*rx_sp_comparer) (void * a, void * b); 59 #else 60 typedef int (*rx_sp_comparer) (); 61 #endif 62 63 struct rx_sp_node 64 { 65 void * key; 66 void * data; 67 struct rx_sp_node * kids[2]; 68 }; 69 70 #ifdef __STDC__ 71 typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); 72 #else 73 typedef void (*rx_sp_key_data_freer) (); 74 #endif 75 76 77 /* giant inflatable hash trees */ 78 79 struct rx_hash_item 80 { 81 struct rx_hash_item * next_same_hash; 82 struct rx_hash * table; 83 unsigned long hash; 84 void * data; 85 void * binding; 86 }; 87 88 struct rx_hash 89 { 90 struct rx_hash * parent; 91 int refs; 92 struct rx_hash * children[13]; 93 struct rx_hash_item * buckets [13]; 94 int bucket_size [13]; 95 }; 96 97 struct rx_hash_rules; 98 99 #ifdef __STDC__ 100 /* should return like == */ 101 typedef int (*rx_hash_eq)(void *, void *); 102 typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); 103 typedef void (*rx_free_hash)(struct rx_hash *, 104 struct rx_hash_rules *); 105 typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, 106 void *); 107 typedef void (*rx_free_hash_item)(struct rx_hash_item *, 108 struct rx_hash_rules *); 109 #else 110 typedef int (*rx_hash_eq)(); 111 typedef struct rx_hash * (*rx_alloc_hash)(); 112 typedef void (*rx_free_hash)(); 113 typedef struct rx_hash_item * (*rx_alloc_hash_item)(); 114 typedef void (*rx_free_hash_item)(); 115 #endif 116 117 struct rx_hash_rules 118 { 119 rx_hash_eq eq; 120 rx_alloc_hash hash_alloc; 121 rx_free_hash free_hash; 122 rx_alloc_hash_item hash_item_alloc; 123 rx_free_hash_item free_hash_item; 124 }; 125 126 127 128 /* Matchers decide what to do by examining a series of these. 129 * Instruction types are described below. 130 */ 131 struct rx_inx 132 { 133 void * inx; 134 void * data; 135 void * data_2; 136 void * fnord; 137 }; 138 139 /* Struct RX holds a compiled regular expression - that is, an nfa ready to be 140 * converted on demand to a more efficient nfa. This is for the low level interface. 141 * The high-level interface incloses this in a `struct re_pattern_buffer'. 142 */ 143 struct rx_cache; 144 #ifdef __STDC__ 145 struct rx_se_list; 146 struct rx; 147 typedef int (*rx_se_list_order) (struct rx *, 148 struct rx_se_list *, struct rx_se_list *); 149 #else 150 typedef int (*rx_se_list_order) (); 151 #endif 152 153 struct rx_superset; 154 155 struct rx 156 { 157 int rx_id; /* Every edition numbered and signed by eclose_nfa. */ 158 159 struct rx_cache * cache; /* Where superstates come from */ 160 161 /* Every regex defines the size of its own character set. */ 162 int local_cset_size; 163 164 void * buffer; /* Malloced memory for the nfa. */ 165 unsigned long allocated; /* Size of that memory. */ 166 167 /* How much buffer space to save for external uses. After compilation, 168 * this space will be available at (buffer + allocated - reserved) 169 */ 170 unsigned long reserved; 171 172 /* --------- The remaining fields are for internal use only. --------- */ 173 /* --------- But! they should be initialized to 0. --------- */ 174 /* NODEC is the number of nodes in the NFA with non-epsilon 175 * orx transitions. 176 */ 177 int nodec; 178 179 /* EPSNODEC is the number of nodes with only epsilon (orx) transitions. */ 180 int epsnodec; 181 182 /* The sum of NODEC & EPSNODEC is the total number of states in the 183 * compiled NFA. 184 */ 185 186 /* side_effect_progs temporarily holds a tree of side effect lists. */ 187 struct rx_hash se_list_memo; 188 189 /* A memo for sets of states in the possible_future lists of an nfa: */ 190 struct rx_hash set_list_memo; 191 192 /* The instruction table is indexed by the enum of instructions defined in 193 * rxrun.h. The values in the table are used to fill in the `inx' 194 * slot of instruction frames (see rxrun.h). 195 */ 196 void ** instruction_table; 197 struct rx_nfa_state *nfa_states; 198 struct rx_nfa_state *start; 199 200 /* This orders the search through super-nfa paths. */ 201 rx_se_list_order se_list_cmp; 202 203 struct rx_superset * start_set; 204 }; 205 206 /* An RX NFA may contain epsilon edges labeled with side effects. 207 * These side effects represent match actions that can not normally be 208 * defined in a `pure' NFA; for example, recording the location at 209 * which a paren is crossed in a register structure. 210 * 211 * A matcher is supposed to find a particular path 212 * through the NFA (such as leftmost-longest), and then to execute the 213 * side effects along that path. Operationally, the space of paths is 214 * searched and side effects are carried out incrementally, and with 215 * backtracking. 216 * 217 * As the NFA is manipulated during matching sets of side effects. 218 * Simple lists are used to hold side effect lists. 219 */ 220 221 typedef void * rx_side_effect; 222 223 struct rx_se_list 224 { 225 rx_side_effect car; 226 struct rx_se_list * cdr; 227 }; 228 229 230 231 /* Struct rexp_node holds an expression tree that represents a regexp. 232 * In this expression tree, every node has a type, and some parameters 233 * appropriate to that type. 234 */ 235 236 enum rexp_node_type 237 { 238 r_cset, /* Match from a character set. `a' or `[a-z]'*/ 239 r_concat, /* Concat two regexps. `ab' */ 240 r_alternate, /* Choose one of two regexps. `a\|b' */ 241 r_opt, /* Optional regexp. `a?' */ 242 r_star, /* Repeated regexp. `a*' */ 243 r_2phase_star, /* hard to explain */ 244 r_side_effect, /* Matches the empty string, but 245 * implies that a side effect must 246 * take place. These nodes are used 247 * by the parser to implement parens, 248 * backreferences etc. 249 */ 250 251 r_data /* R_DATA is soley for the convenience 252 * of parsers or other rexp 253 * transformers that want to 254 * (temporarily) introduce new node 255 * types in rexp structures. These 256 * must be eliminated 257 * by the time build_nfa is called. 258 */ 259 }; 260 261 struct rexp_node 262 { 263 enum rexp_node_type type; 264 union 265 { 266 rx_Bitset cset; 267 rx_side_effect side_effect; 268 struct 269 { 270 struct rexp_node *left; 271 struct rexp_node *right; 272 } pair; 273 void * data; 274 } params; 275 }; 276 277 278 279 /* This defines the structure of the NFA into which rexps are compiled. */ 280 281 struct rx_nfa_state 282 { 283 int id; 284 struct rx_nfa_edge *edges; 285 struct rx_possible_future *futures; 286 unsigned int is_final:1; 287 unsigned int is_start:1; 288 unsigned int eclosure_needed:1; 289 struct rx_nfa_state *next; 290 unsigned int mark:1; 291 }; 292 293 enum rx_nfa_etype 294 { 295 ne_cset, 296 ne_epsilon, 297 ne_side_effect /* A special kind of epsilon. */ 298 }; 299 300 struct rx_nfa_edge 301 { 302 struct rx_nfa_edge *next; 303 enum rx_nfa_etype type; 304 struct rx_nfa_state *dest; 305 union 306 { 307 rx_Bitset cset; 308 rx_side_effect side_effect; 309 } params; 310 }; 311 312 struct rx_nfa_state_set 313 { 314 struct rx_nfa_state * car; 315 struct rx_nfa_state_set * cdr; 316 }; 317 318 struct rx_possible_future 319 { 320 struct rx_possible_future *next; 321 struct rx_se_list * effects; 322 struct rx_nfa_state_set * destset; 323 }; 324 325 326 327 enum rx_opcode 328 { 329 /* 330 * BACKTRACK_POINT is invoked when a transition results in more 331 * than one possible future. 332 * 333 * There is one occurence of this instruction per transition_class 334 * structure; that occurence is only ever executed if the 335 * transition_class contains a list of more than 1 edge. 336 */ 337 rx_backtrack_point = 0, /* data is (struct transition_class *) */ 338 339 /* 340 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. 341 * There is one occurence of this instruction per rx_distinct_future. 342 * This instruction is skipped if a rx_distinct_future has no side effects. 343 */ 344 rx_do_side_effects = rx_backtrack_point + 1, 345 /* data is (struct rx_distinct_future *) */ 346 347 /* 348 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose 349 * destination superstate has been reclaimed (or was never built). 350 * It recomputes the destination superstate. 351 * RX_CACHE_MISS is also stored in a superstate transition table before 352 * any of its edges have been built. 353 */ 354 rx_cache_miss = rx_do_side_effects + 1, 355 /* data is (struct rx_distinct_future *) */ 356 357 /* 358 * RX_NEXT_CHAR is called to consume the next character and take the 359 * corresponding transition. This is the only instruction that uses 360 * the DATA field of the instruction frame instead of DATA_2. 361 * (see EXPLORE_FUTURE in regex.c). 362 */ 363 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ 364 365 /* RX_BACKTRACK indicates that a transition fails. 366 */ 367 rx_backtrack = rx_next_char + 1, /* no data */ 368 369 /* 370 * RX_ERROR_INX is stored only in places that should never be executed. 371 */ 372 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ 373 374 rx_num_instructions = rx_error_inx + 1 375 }; 376 377 /* An id_instruction_table holds the values stored in instruction 378 * frames. The table is indexed by the enums declared above. 379 */ 380 extern void * rx_id_instruction_table[rx_num_instructions]; 381 382 #if 0 /* Already declared way above. */ 383 /* If the instruction is `rx_next_char' then data is valid. Otherwise it's 0 384 * and data_2 is valid. 385 */ 386 struct rx_inx 387 { 388 void * inx; 389 void * data; 390 void * data_2; 391 }; 392 #endif 393 394 395 #ifndef RX_TAIL_ARRAY 396 #define RX_TAIL_ARRAY 1 397 #endif 398 399 /* A superstate corresponds to a set of nfa states. Those sets are 400 * represented by STRUCT RX_SUPERSET. The constructors 401 * guarantee that only one (shared) structure is created for a given set. 402 */ 403 struct rx_superset 404 { 405 int refs; 406 struct rx_nfa_state * car; /* May or may not be a valid addr. */ 407 int id; /* == car->id for the initial value of *car */ 408 struct rx_superset * cdr; /* May be NULL or a live or semifreed super*/ 409 410 /* If the corresponding superstate exists: */ 411 struct rx_superstate * superstate; 412 413 /* If this is a starting state (as built by re_search_2) 414 * this points to the `struct rx'. The memory for these objects 415 * is typed -- so even after they are freed it is safe to look 416 * at this field (to check, in fact, if this was freed.) 417 */ 418 struct rx * starts_for; 419 420 struct rx_hash_item hash_item; 421 }; 422 423 #define rx_protect_superset(RX,CON) (++(CON)->refs) 424 425 /* Every character occurs in at most one super edge per super-state. 426 * But, that edge might have more than one option, indicating a point 427 * of non-determinism. 428 */ 429 struct rx_super_edge 430 { 431 struct rx_super_edge *next; 432 struct rx_inx rx_backtrack_frame; 433 int cset_size; 434 rx_Bitset cset; 435 struct rx_distinct_future *options; 436 }; 437 438 /* A superstate is a set of nfa states (RX_SUPERSET) along 439 * with a transition table. Superstates are built on demand and reclaimed 440 * without warning. To protect a superstate, use LOCK_SUPERSTATE. 441 * 442 * Joe Keane thought of calling these superstates and several people 443 * have commented on what a good name it is for what they do. 444 */ 445 struct rx_superstate 446 { 447 int rx_id; 448 int locks; 449 struct rx_superstate * next_recyclable; 450 struct rx_superstate * prev_recyclable; 451 struct rx_distinct_future * transition_refs; 452 struct rx_superset * contents; 453 struct rx_super_edge * edges; 454 int is_semifree; 455 int trans_size; 456 struct rx_inx transitions[RX_TAIL_ARRAY]; /* cset sized */ 457 }; 458 459 struct rx_distinct_future 460 { 461 struct rx_distinct_future * next_same_super_edge[2]; 462 struct rx_distinct_future * next_same_dest; 463 struct rx_distinct_future * prev_same_dest; 464 struct rx_superstate * present; /* source state */ 465 struct rx_superstate * future; /* destination state */ 466 struct rx_super_edge * edge; 467 struct rx_inx future_frame; 468 struct rx_inx side_effects_frame; 469 struct rx_se_list * effects; 470 }; 471 472 #define rx_lock_superstate(R,S) ((S)->locks++) 473 #define rx_unlock_superstate(R,S) (--(S)->locks) 474 475 476 /* This page destined for rx.h */ 477 478 struct rx_blocklist 479 { 480 struct rx_blocklist * next; 481 int bytes; 482 }; 483 484 struct rx_freelist 485 { 486 struct rx_freelist * next; 487 }; 488 489 struct rx_cache; 490 491 #ifdef __STDC__ 492 typedef void (*rx_morecore_fn)(struct rx_cache *); 493 #else 494 typedef void (*rx_morecore_fn)(); 495 #endif 496 497 struct rx_cache 498 { 499 struct rx_hash_rules superset_hash_rules; 500 501 /* Objects are allocated by incrementing a pointer that 502 * scans across rx_blocklists. 503 */ 504 struct rx_blocklist * memory; 505 struct rx_blocklist * memory_pos; 506 int bytes_left; 507 char * memory_addr; 508 rx_morecore_fn morecore; 509 510 /* Freelists. */ 511 struct rx_freelist * free_superstates; 512 struct rx_freelist * free_transition_classes; 513 struct rx_freelist * free_discernable_futures; 514 struct rx_freelist * free_supersets; 515 struct rx_freelist * free_hash; 516 517 /* Two sets of superstates -- those that are semifreed, and those 518 * that are being used. 519 */ 520 struct rx_superstate * lru_superstate; 521 struct rx_superstate * semifree_superstate; 522 523 struct rx_superset * empty_superset; 524 525 int superstates; 526 int semifree_superstates; 527 int hits; 528 int misses; 529 int superstates_allowed; 530 531 int local_cset_size; 532 void ** instruction_table; 533 534 struct rx_hash superset_table; 535 }; 536 537 538 539 /* regex.h 540 * 541 * The remaining declarations replace regex.h. 542 */ 543 544 /* This is an array of error messages corresponding to the error codes. 545 */ 546 extern const char *re_error_msg[]; 547 548 /* If any error codes are removed, changed, or added, update the 549 `re_error_msg' table in regex.c. */ 550 typedef enum 551 { 552 REG_NOERROR = 0, /* Success. */ 553 REG_NOMATCH, /* Didn't find a match (for regexec). */ 554 555 /* POSIX regcomp return error codes. (In the order listed in the 556 standard.) */ 557 REG_BADPAT, /* Invalid pattern. */ 558 REG_ECOLLATE, /* Not implemented. */ 559 REG_ECTYPE, /* Invalid character class name. */ 560 REG_EESCAPE, /* Trailing backslash. */ 561 REG_ESUBREG, /* Invalid back reference. */ 562 REG_EBRACK, /* Unmatched left bracket. */ 563 REG_EPAREN, /* Parenthesis imbalance. */ 564 REG_EBRACE, /* Unmatched \{. */ 565 REG_BADBR, /* Invalid contents of \{\}. */ 566 REG_ERANGE, /* Invalid range end. */ 567 REG_ESPACE, /* Ran out of memory. */ 568 REG_BADRPT, /* No preceding re for repetition op. */ 569 570 /* Error codes we've added. */ 571 REG_EEND, /* Premature end. */ 572 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ 573 REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ 574 } reg_errcode_t; 575 576 /* The regex.c support, as a client of rx, defines a set of possible 577 * side effects that can be added to the edge lables of nfa edges. 578 * Here is the list of sidef effects in use. 579 */ 580 581 enum re_side_effects 582 { 583 #define RX_WANT_SE_DEFS 1 584 #undef RX_DEF_SE 585 #undef RX_DEF_CPLX_SE 586 #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE, 587 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE, 588 #include "rx.h" 589 #undef RX_DEF_SE 590 #undef RX_DEF_CPLX_SE 591 #undef RX_WANT_SE_DEFS 592 re_floogle_flap = 65533 593 }; 594 595 /* These hold paramaters for the kinds of side effects that are possible 596 * in the supported pattern languages. These include things like the 597 * numeric bounds of {} operators and the index of paren registers for 598 * subexpression measurement or backreferencing. 599 */ 600 struct re_se_params 601 { 602 enum re_side_effects se; 603 int op1; 604 int op2; 605 }; 606 607 typedef unsigned reg_syntax_t; 608 609 struct re_pattern_buffer 610 { 611 struct rx rx; 612 reg_syntax_t syntax; /* See below for syntax bit definitions. */ 613 614 unsigned int no_sub:1; /* If set, don't return register offsets. */ 615 unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */ 616 unsigned int not_eol:1; /* match at the ends of the string. */ 617 unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/ 618 unsigned int least_subs:1; /* If set, and returning registers, return 619 * as few values as possible. Only 620 * backreferenced groups and group 0 (the whole 621 * match) will be returned. 622 */ 623 624 /* If true, this says that the matcher should keep registers on its 625 * backtracking stack. For many patterns, we can easily determine that 626 * this isn't necessary. 627 */ 628 unsigned int match_regs_on_stack:1; 629 unsigned int search_regs_on_stack:1; 630 631 /* is_anchored and begbuf_only are filled in by rx_compile. */ 632 unsigned int is_anchored:1; /* Anchorded by ^? */ 633 unsigned int begbuf_only:1; /* Anchored to char position 0? */ 634 635 636 /* If REGS_UNALLOCATED, allocate space in the `regs' structure 637 * for `max (RE_NREGS, re_nsub + 1)' groups. 638 * If REGS_REALLOCATE, reallocate space if necessary. 639 * If REGS_FIXED, use what's there. 640 */ 641 #define REGS_UNALLOCATED 0 642 #define REGS_REALLOCATE 1 643 #define REGS_FIXED 2 644 unsigned int regs_allocated:2; 645 646 647 /* Either a translate table to apply to all characters before 648 * comparing them, or zero for no translation. The translation 649 * is applied to a pattern when it is compiled and to a string 650 * when it is matched. 651 */ 652 char * translate; 653 654 /* If this is a valid pointer, it tells rx not to store the extents of 655 * certain subexpressions (those corresponding to non-zero entries). 656 * Passing 0x1 is the same as passing an array of all ones. Passing 0x0 657 * is the same as passing an array of all zeros. 658 * The array should contain as many entries as their are subexps in the 659 * regexp. 660 */ 661 char * syntax_parens; 662 663 /* Number of subexpressions found by the compiler. */ 664 size_t re_nsub; 665 666 void * buffer; /* Malloced memory for the nfa. */ 667 unsigned long allocated; /* Size of that memory. */ 668 669 /* Pointer to a fastmap, if any, otherwise zero. re_search uses 670 * the fastmap, if there is one, to skip over impossible 671 * starting points for matches. */ 672 char *fastmap; 673 674 unsigned int fastmap_accurate:1; /* These three are internal. */ 675 unsigned int can_match_empty:1; 676 struct rx_nfa_state * start; /* The nfa starting state. */ 677 678 /* This is the list of iterator bounds for {lo,hi} constructs. 679 * The memory pointed to is part of the rx->buffer. 680 */ 681 struct re_se_params *se_params; 682 683 /* This is a bitset representation of the fastmap. 684 * This is a true fastmap that already takes the translate 685 * table into account. 686 */ 687 rx_Bitset fastset; 688 }; 689 690 /* Type for byte offsets within the string. POSIX mandates this. */ 691 typedef int regoff_t; 692 693 /* This is the structure we store register match data in. See 694 regex.texinfo for a full description of what registers match. */ 695 struct re_registers 696 { 697 unsigned num_regs; 698 regoff_t *start; 699 regoff_t *end; 700 }; 701 702 typedef struct re_pattern_buffer regex_t; 703 704 /* POSIX specification for registers. Aside from the different names than 705 `re_registers', POSIX uses an array of structures, instead of a 706 structure of arrays. */ 707 typedef struct 708 { 709 regoff_t rm_so; /* Byte offset from string's start to substring's start. */ 710 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ 711 } regmatch_t; 712 713 714 /* The following bits are used to determine the regexp syntax we 715 recognize. The set/not-set meanings are chosen so that Emacs syntax 716 remains the value 0. The bits are given in alphabetical order, and 717 the definitions shifted by one from the previous bit; thus, when we 718 add or remove a bit, only one other definition need change. */ 719 720 /* If this bit is not set, then \ inside a bracket expression is literal. 721 If set, then such a \ quotes the following character. */ 722 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) 723 724 /* If this bit is not set, then + and ? are operators, and \+ and \? are 725 literals. 726 If set, then \+ and \? are operators and + and ? are literals. */ 727 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) 728 729 /* If this bit is set, then character classes are supported. They are: 730 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 731 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. 732 If not set, then character classes are not supported. */ 733 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) 734 735 /* If this bit is set, then ^ and $ are always anchors (outside bracket 736 expressions, of course). 737 If this bit is not set, then it depends: 738 ^ is an anchor if it is at the beginning of a regular 739 expression or after an open-group or an alternation operator; 740 $ is an anchor if it is at the end of a regular expression, or 741 before a close-group or an alternation operator. 742 743 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because 744 POSIX draft 11.2 says that * etc. in leading positions is undefined. 745 We already implemented a previous draft which made those constructs 746 invalid, though, so we haven't changed the code back. */ 747 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) 748 749 /* If this bit is set, then special characters are always special 750 regardless of where they are in the pattern. 751 If this bit is not set, then special characters are special only in 752 some contexts; otherwise they are ordinary. Specifically, 753 * + ? and intervals are only special when not after the beginning, 754 open-group, or alternation operator. */ 755 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) 756 757 /* If this bit is set, then *, +, ?, and { cannot be first in an re or 758 immediately after an alternation or begin-group operator. */ 759 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) 760 761 /* If this bit is set, then . matches newline. 762 If not set, then it doesn't. */ 763 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) 764 765 /* If this bit is set, then . doesn't match NUL. 766 If not set, then it does. */ 767 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) 768 769 /* If this bit is set, nonmatching lists [^...] do not match newline. 770 If not set, they do. */ 771 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) 772 773 /* If this bit is set, either \{...\} or {...} defines an 774 interval, depending on RE_NO_BK_BRACES. 775 If not set, \{, \}, {, and } are literals. */ 776 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 777 778 /* If this bit is set, +, ? and | aren't recognized as operators. 779 If not set, they are. */ 780 #define RE_LIMITED_OPS (RE_INTERVALS << 1) 781 782 /* If this bit is set, newline is an alternation operator. 783 If not set, newline is literal. */ 784 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) 785 786 /* If this bit is set, then `{...}' defines an interval, and \{ and \} 787 are literals. 788 If not set, then `\{...\}' defines an interval. */ 789 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) 790 791 /* If this bit is set, (...) defines a group, and \( and \) are literals. 792 If not set, \(...\) defines a group, and ( and ) are literals. */ 793 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) 794 795 /* If this bit is set, then \<digit> matches <digit>. 796 If not set, then \<digit> is a back-reference. */ 797 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) 798 799 /* If this bit is set, then | is an alternation operator, and \| is literal. 800 If not set, then \| is an alternation operator, and | is literal. */ 801 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) 802 803 /* If this bit is set, then an ending range point collating higher 804 than the starting range point, as in [z-a], is invalid. 805 If not set, then when ending range point collates higher than the 806 starting range point, the range is ignored. */ 807 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) 808 809 /* If this bit is set, then an unmatched ) is ordinary. 810 If not set, then an unmatched ) is invalid. */ 811 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) 812 813 /* This global variable defines the particular regexp syntax to use (for 814 some interfaces). When a regexp is compiled, the syntax used is 815 stored in the pattern buffer, so changing this does not affect 816 already-compiled regexps. */ 817 extern reg_syntax_t re_syntax_options; 818 819 /* Define combinations of the above bits for the standard possibilities. 820 (The [[[ comments delimit what gets put into the Texinfo file, so 821 don't delete them!) */ 822 /* [[[begin syntaxes]]] */ 823 #define RE_SYNTAX_EMACS 0 824 825 #define RE_SYNTAX_AWK \ 826 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ 827 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 828 | RE_NO_BK_VAR | RE_NO_EMPTY_RANGES \ 829 | RE_UNMATCHED_RIGHT_PAREN_ORD) 830 831 #define RE_SYNTAX_POSIX_AWK \ 832 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) 833 834 #define RE_SYNTAX_GREP \ 835 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ 836 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ 837 | RE_NEWLINE_ALT) 838 839 #define RE_SYNTAX_EGREP \ 840 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ 841 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ 842 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ 843 | RE_NO_BK_VBAR) 844 845 #define RE_SYNTAX_POSIX_EGREP \ 846 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) 847 848 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC 849 850 /* Syntax bits common to both basic and extended POSIX regex syntax. */ 851 #define _RE_SYNTAX_POSIX_COMMON \ 852 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ 853 | RE_INTERVALS | RE_NO_EMPTY_RANGES) 854 855 #define RE_SYNTAX_POSIX_BASIC \ 856 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) 857 858 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes 859 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this 860 isn't minimal, since other operators, such as \`, aren't disabled. */ 861 #define RE_SYNTAX_POSIX_MINIMAL_BASIC \ 862 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) 863 864 #define RE_SYNTAX_POSIX_EXTENDED \ 865 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 866 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ 867 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ 868 | RE_UNMATCHED_RIGHT_PAREN_ORD) 869 870 /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 871 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 872 #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ 873 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ 874 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ 875 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 876 | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) 877 /* [[[end syntaxes]]] */ 878 879 /* Maximum number of duplicates an interval can allow. Some systems 880 (erroneously) define this in other header files, but we want our 881 value, so remove any previous define. */ 882 #ifdef RE_DUP_MAX 883 #undef RE_DUP_MAX 884 #endif 885 #define RE_DUP_MAX ((1 << 15) - 1) 886 887 888 889 /* POSIX `cflags' bits (i.e., information for `regcomp'). */ 890 891 /* If this bit is set, then use extended regular expression syntax. 892 If not set, then use basic regular expression syntax. */ 893 #define REG_EXTENDED 1 894 895 /* If this bit is set, then ignore case when matching. 896 If not set, then case is significant. */ 897 #define REG_ICASE (REG_EXTENDED << 1) 898 899 /* If this bit is set, then anchors do not match at newline 900 characters in the string. 901 If not set, then anchors do match at newlines. */ 902 #define REG_NEWLINE (REG_ICASE << 1) 903 904 /* If this bit is set, then report only success or fail in regexec. 905 If not set, then returns differ between not matching and errors. */ 906 #define REG_NOSUB (REG_NEWLINE << 1) 907 908 909 /* POSIX `eflags' bits (i.e., information for regexec). */ 910 911 /* If this bit is set, then the beginning-of-line operator doesn't match 912 the beginning of the string (presumably because it's not the 913 beginning of a line). 914 If not set, then the beginning-of-line operator does match the 915 beginning of the string. */ 916 #define REG_NOTBOL 1 917 918 /* Like REG_NOTBOL, except for the end-of-line. */ 919 #define REG_NOTEOL (1 << 1) 920 921 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, 922 * `re_match_2' returns information about at least this many registers 923 * the first time a `regs' structure is passed. 924 * 925 * Also, this is the greatest number of backreferenced subexpressions 926 * allowed in a pattern being matched without caller-supplied registers. 927 */ 928 #ifndef RE_NREGS 929 #define RE_NREGS 30 930 #endif 931 932 extern int rx_cache_bound; 933 934 #ifdef __STDC__ 935 extern reg_errcode_t rx_compile (const char *pattern, int size, 936 reg_syntax_t syntax, 937 struct re_pattern_buffer * rxb) ; 938 extern int re_search_2 (struct re_pattern_buffer *rxb, 939 const char * string1, int size1, 940 const char * string2, int size2, 941 int startpos, int range, 942 struct re_registers *regs, 943 int stop); 944 extern int re_search (struct re_pattern_buffer * rxb, const char *string, 945 int size, int startpos, int range, 946 struct re_registers *regs); 947 extern int re_match_2 (struct re_pattern_buffer * rxb, 948 const char * string1, int size1, 949 const char * string2, int size2, 950 int pos, struct re_registers *regs, int stop); 951 extern int re_match (struct re_pattern_buffer * rxb, 952 const char * string, 953 int size, int pos, 954 struct re_registers *regs); 955 extern reg_syntax_t re_set_syntax (reg_syntax_t syntax); 956 extern void re_set_registers (struct re_pattern_buffer *bufp, 957 struct re_registers *regs, 958 unsigned num_regs, 959 regoff_t * starts, regoff_t * ends); 960 extern const char * re_compile_pattern (const char *pattern, 961 int length, 962 struct re_pattern_buffer * rxb); 963 extern int re_compile_fastmap (struct re_pattern_buffer * rxb); 964 extern char * re_comp (const char *s); 965 extern int rx_exec (const char *s); 966 extern int regcomp (regex_t * preg, const char * pattern, int cflags); 967 extern int regexec (const regex_t *preg, const char *string, 968 size_t nmatch, regmatch_t pmatch[], 969 int eflags); 970 extern size_t regerror (int errcode, const regex_t *preg, 971 char *errbuf, size_t errbuf_size); 972 extern void regfree (regex_t *preg); 973 974 #else 975 extern reg_errcode_t rx_compile (); 976 extern int re_search_2 (); 977 extern int re_search (); 978 extern int re_match_2 (); 979 extern int re_match (); 980 extern reg_syntax_t re_set_syntax (); 981 extern void re_set_registers (); 982 extern const char * re_compile_pattern (); 983 extern int re_compile_fastmap (); 984 extern char * re_comp (); 985 extern int rx_exec (); 986 extern int regcomp (); 987 extern int regexec (); 988 extern size_t regerror (); 989 extern void regfree (); 990 991 #endif 992 993 994 #else /* RX_WANT_SE_DEFS */ 995 /* Integers are used to represent side effects. 996 * 997 * Simple side effects are given negative integer names by these enums. 998 * 999 * Non-negative names are reserved for complex effects. 1000 * 1001 * Complex effects are those that take arguments. For example, 1002 * a register assignment associated with a group is complex because 1003 * it requires an argument to tell which group is being matched. 1004 * 1005 * The integer name of a complex effect is an index into rxb->se_params. 1006 */ 1007 1008 RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */ 1009 1010 RX_DEF_SE(0, re_se_pushback, = re_se_try - 1) 1011 RX_DEF_SE(0, re_se_push0, = re_se_pushback -1) 1012 RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1) 1013 RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1) 1014 RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1) 1015 1016 RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */ 1017 RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */ 1018 RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */ 1019 1020 RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */ 1021 RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */ 1022 1023 RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1) 1024 RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1) 1025 RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1) 1026 1027 RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1) 1028 RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1) 1029 1030 /* This fails except at the end of a line. 1031 * It deserves to go here since it is typicly one of the last steps 1032 * in a match. 1033 */ 1034 RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1) 1035 1036 /* Simple effects: */ 1037 RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1) 1038 1039 /* Complex effects. These are used in the 'se' field of 1040 * a struct re_se_params. Indexes into the se array 1041 * are stored as instructions on nfa edges. 1042 */ 1043 RX_DEF_CPLX_SE(1, re_se_win, = 0) 1044 RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1) 1045 RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1) 1046 RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1) 1047 RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1) 1048 RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1) 1049 RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1) 1050 1051 #endif 1052 1053 #endif 1054