xref: /386bsd/usr/src/usr.bin/sed/rx.h (revision a2142627)
1 #if !defined(RXH) || defined(RX_WANT_SE_DEFS)
2 #define RXH
3 
4 /*	Copyright (C) 1992, 1993 Free Software Foundation, Inc.
5 
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with this software; see the file COPYING.  If not, write to
18 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
19 /*  t. lord	Wed Sep 23 18:20:57 1992	*/
20 
21 
22 
23 
24 #ifndef RX_WANT_SE_DEFS
25 
26 /* This page: Bitsets */
27 
28 #ifndef RX_subset
29 typedef unsigned int RX_subset;
30 #define RX_subset_bits	(32)
31 #define RX_subset_mask	(RX_subset_bits - 1)
32 #endif
33 
34 typedef RX_subset * rx_Bitset;
35 
36 #ifdef __STDC__
37 typedef void (*rx_bitset_iterator) (void *, int member_index);
38 #else
39 typedef void (*rx_bitset_iterator) ();
40 #endif
41 
42 #define rx_bitset_subset(N)  ((N) / RX_subset_bits)
43 #define rx_bitset_subset_val(B,N)  ((B)[rx_bitset_subset(N)])
44 #define RX_bitset_access(B,N,OP) \
45   ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
46 #define RX_bitset_member(B,N)   RX_bitset_access(B, N, &)
47 #define RX_bitset_enjoin(B,N)   RX_bitset_access(B, N, |=)
48 #define RX_bitset_remove(B,N)   RX_bitset_access(B, N, &= ~)
49 #define RX_bitset_toggle(B,N)   RX_bitset_access(B, N, ^= )
50 #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
51 #define rx_sizeof_bitset(N)	(rx_bitset_numb_subsets(N) * sizeof(RX_subset))
52 
53 
54 
55 /* This page: Splay trees. */
56 
57 #ifdef __STDC__
58 typedef int (*rx_sp_comparer) (void * a, void * b);
59 #else
60 typedef int (*rx_sp_comparer) ();
61 #endif
62 
63 struct rx_sp_node
64 {
65   void * key;
66   void * data;
67   struct rx_sp_node * kids[2];
68 };
69 
70 #ifdef __STDC__
71 typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
72 #else
73 typedef void (*rx_sp_key_data_freer) ();
74 #endif
75 
76 
77 /* giant inflatable hash trees */
78 
79 struct rx_hash_item
80 {
81   struct rx_hash_item * next_same_hash;
82   struct rx_hash * table;
83   unsigned long hash;
84   void * data;
85   void * binding;
86 };
87 
88 struct rx_hash
89 {
90   struct rx_hash * parent;
91   int refs;
92   struct rx_hash * children[13];
93   struct rx_hash_item * buckets [13];
94   int bucket_size [13];
95 };
96 
97 struct rx_hash_rules;
98 
99 #ifdef __STDC__
100 /* should return like == */
101 typedef int (*rx_hash_eq)(void *, void *);
102 typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
103 typedef void (*rx_free_hash)(struct rx_hash *,
104 			    struct rx_hash_rules *);
105 typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
106 						    void *);
107 typedef void (*rx_free_hash_item)(struct rx_hash_item *,
108 				 struct rx_hash_rules *);
109 #else
110 typedef int (*rx_hash_eq)();
111 typedef struct rx_hash * (*rx_alloc_hash)();
112 typedef void (*rx_free_hash)();
113 typedef struct rx_hash_item * (*rx_alloc_hash_item)();
114 typedef void (*rx_free_hash_item)();
115 #endif
116 
117 struct rx_hash_rules
118 {
119   rx_hash_eq eq;
120   rx_alloc_hash hash_alloc;
121   rx_free_hash free_hash;
122   rx_alloc_hash_item hash_item_alloc;
123   rx_free_hash_item free_hash_item;
124 };
125 
126 
127 
128 /* Matchers decide what to do by examining a series of these.
129  * Instruction types are described below.
130  */
131 struct rx_inx
132 {
133   void * inx;
134   void * data;
135   void * data_2;
136   void * fnord;
137 };
138 
139 /* Struct RX holds a compiled regular expression - that is, an nfa ready to be
140  * converted on demand to a more efficient nfa.  This is for the low level interface.
141  * The high-level interface incloses this in a `struct re_pattern_buffer'.
142  */
143 struct rx_cache;
144 #ifdef __STDC__
145 struct rx_se_list;
146 struct rx;
147 typedef int (*rx_se_list_order) (struct rx *,
148 				 struct rx_se_list *, struct rx_se_list *);
149 #else
150 typedef int (*rx_se_list_order) ();
151 #endif
152 
153 struct rx_superset;
154 
155 struct rx
156 {
157   int rx_id;		/* Every edition numbered and signed by eclose_nfa. */
158 
159   struct rx_cache * cache;  /* Where superstates come from */
160 
161   /* Every regex defines the size of its own character set. */
162   int local_cset_size;
163 
164   void * buffer;		/* Malloced memory for the nfa. */
165   unsigned long allocated;	/* Size of that memory. */
166 
167   /* How much buffer space to save for external uses.  After compilation,
168    * this space will be available at (buffer + allocated - reserved)
169    */
170   unsigned long reserved;
171 
172   /* --------- The remaining fields are for internal use only. --------- */
173   /* --------- But! they should be initialized to 0.	       --------- */
174   /* NODEC is the number of nodes in the NFA with non-epsilon
175    * orx transitions.
176    */
177   int nodec;
178 
179   /* EPSNODEC is the number of nodes with only epsilon (orx) transitions. */
180   int epsnodec;
181 
182   /* The sum of NODEC & EPSNODEC is the total number of states in the
183    * compiled NFA.
184    */
185 
186   /* side_effect_progs temporarily holds a tree of side effect lists. */
187   struct rx_hash se_list_memo;
188 
189   /* A memo for sets of states in the possible_future lists of an nfa: */
190   struct rx_hash set_list_memo;
191 
192   /* The instruction table is indexed by the enum of instructions defined in
193    * rxrun.h.  The values in the table are used to fill in the `inx'
194    * slot of instruction frames (see rxrun.h).
195    */
196   void ** instruction_table;
197   struct rx_nfa_state *nfa_states;
198   struct rx_nfa_state *start;
199 
200   /* This orders the search through super-nfa paths. */
201   rx_se_list_order se_list_cmp;
202 
203   struct rx_superset * start_set;
204 };
205 
206 /* An RX NFA may contain epsilon edges labeled with side effects.
207  * These side effects represent match actions that can not normally be
208  * defined in a `pure' NFA; for example, recording the location at
209  * which a paren is crossed in a register structure.
210  *
211  * A matcher is supposed to find a particular path
212  * through the NFA (such as leftmost-longest), and then to execute the
213  * side effects along that path.  Operationally, the space of paths is
214  * searched and side effects are carried out incrementally, and with
215  * backtracking.
216  *
217  * As the NFA is manipulated during matching sets of side effects.
218  * Simple lists are used to hold side effect lists.
219  */
220 
221 typedef void * rx_side_effect;
222 
223 struct rx_se_list
224 {
225   rx_side_effect car;
226   struct rx_se_list * cdr;
227 };
228 
229 
230 
231 /* Struct rexp_node holds an expression tree that represents a regexp.
232  * In this expression tree, every node has a type, and some parameters
233  * appropriate to that type.
234  */
235 
236 enum rexp_node_type
237 {
238   r_cset,			/* Match from a character set. `a' or `[a-z]'*/
239   r_concat,			/* Concat two regexps.   `ab' */
240   r_alternate,			/* Choose one of two regexps. `a\|b' */
241   r_opt,			/* Optional regexp. `a?' */
242   r_star,			/* Repeated regexp. `a*' */
243   r_2phase_star,		/* hard to explain */
244   r_side_effect,		/* Matches the empty string, but
245 				 * implies that a side effect must
246 				 * take place.  These nodes are used
247 				 * by the parser to implement parens,
248 				 * backreferences etc.
249 				 */
250 
251   r_data			/* R_DATA is soley for the convenience
252 				 * of parsers or other rexp
253 				 * transformers that want to
254 				 * (temporarily) introduce new node
255 				 * types in rexp structures.  These
256 				 * must be eliminated
257 			    	 * by the time build_nfa is called.
258 			  	 */
259 };
260 
261 struct rexp_node
262 {
263   enum rexp_node_type type;
264   union
265   {
266     rx_Bitset cset;
267     rx_side_effect side_effect;
268     struct
269       {
270 	struct rexp_node *left;
271 	struct rexp_node *right;
272       } pair;
273     void * data;
274   } params;
275 };
276 
277 
278 
279 /* This defines the structure of the NFA into which rexps are compiled. */
280 
281 struct rx_nfa_state
282 {
283   int id;
284   struct rx_nfa_edge *edges;
285   struct rx_possible_future *futures;
286   unsigned int is_final:1;
287   unsigned int is_start:1;
288   unsigned int eclosure_needed:1;
289   struct rx_nfa_state *next;
290   unsigned int mark:1;
291 };
292 
293 enum rx_nfa_etype
294 {
295   ne_cset,
296   ne_epsilon,
297   ne_side_effect		/* A special kind of epsilon. */
298 };
299 
300 struct rx_nfa_edge
301 {
302   struct rx_nfa_edge *next;
303   enum rx_nfa_etype type;
304   struct rx_nfa_state *dest;
305   union
306   {
307     rx_Bitset cset;
308     rx_side_effect side_effect;
309   } params;
310 };
311 
312 struct rx_nfa_state_set
313 {
314   struct rx_nfa_state * car;
315   struct rx_nfa_state_set * cdr;
316 };
317 
318 struct rx_possible_future
319 {
320   struct rx_possible_future *next;
321   struct rx_se_list * effects;
322   struct rx_nfa_state_set * destset;
323 };
324 
325 
326 
327 enum rx_opcode
328 {
329   /*
330    * BACKTRACK_POINT is invoked when a transition results in more
331    * than one possible future.
332    *
333    * There is one occurence of this instruction per transition_class
334    * structure; that occurence is only ever executed if the
335    * transition_class contains a list of more than 1 edge.
336    */
337   rx_backtrack_point = 0,	/* data is (struct transition_class *) */
338 
339   /*
340    * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
341    * There is one occurence of this instruction per rx_distinct_future.
342    * This instruction is skipped if a rx_distinct_future has no side effects.
343    */
344   rx_do_side_effects = rx_backtrack_point + 1,
345   /* data is (struct rx_distinct_future *) */
346 
347   /*
348    * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
349    * destination superstate has been reclaimed (or was never built).
350    * It recomputes the destination superstate.
351    * RX_CACHE_MISS is also stored in a superstate transition table before
352    * any of its edges have been built.
353    */
354   rx_cache_miss = rx_do_side_effects + 1,
355   /* data is (struct rx_distinct_future *) */
356 
357   /*
358    * RX_NEXT_CHAR is called to consume the next character and take the
359    * corresponding transition.  This is the only instruction that uses
360    * the DATA field of the instruction frame instead of DATA_2.
361    * (see EXPLORE_FUTURE in regex.c).
362    */
363   rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
364 
365   /* RX_BACKTRACK indicates that a transition fails.
366    */
367   rx_backtrack = rx_next_char + 1, /* no data */
368 
369   /*
370    * RX_ERROR_INX is stored only in places that should never be executed.
371    */
372   rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
373 
374   rx_num_instructions = rx_error_inx + 1
375 };
376 
377 /* An id_instruction_table holds the values stored in instruction
378  * frames.  The table is indexed by the enums declared above.
379  */
380 extern void * rx_id_instruction_table[rx_num_instructions];
381 
382 #if 0				/* Already declared way above. */
383 /*  If the instruction is `rx_next_char' then data is valid.  Otherwise it's 0
384  *  and data_2 is valid.
385  */
386 struct rx_inx
387 {
388   void * inx;
389   void * data;
390   void * data_2;
391 };
392 #endif
393 
394 
395 #ifndef RX_TAIL_ARRAY
396 #define RX_TAIL_ARRAY  1
397 #endif
398 
399 /* A superstate corresponds to a set of nfa states.  Those sets are
400  * represented by STRUCT RX_SUPERSET.  The constructors
401  * guarantee that only one (shared) structure is created for a given set.
402  */
403 struct rx_superset
404 {
405   int refs;
406   struct rx_nfa_state * car;	/* May or may not be a valid addr. */
407   int id;			/* == car->id for the initial value of *car */
408   struct rx_superset * cdr;	/* May be NULL or a live or semifreed super*/
409 
410   /* If the corresponding superstate exists: */
411   struct rx_superstate * superstate;
412 
413   /* If this is a starting state (as built by re_search_2)
414    * this points to the `struct rx'.  The memory for these objects
415    * is typed -- so even after they are freed it is safe to look
416    * at this field (to check, in fact, if this was freed.)
417    */
418   struct rx * starts_for;
419 
420   struct rx_hash_item hash_item;
421 };
422 
423 #define rx_protect_superset(RX,CON) (++(CON)->refs)
424 
425 /* Every character occurs in at most one super edge per super-state.
426  * But, that edge might have more than one option, indicating a point
427  * of non-determinism.
428  */
429 struct rx_super_edge
430 {
431   struct rx_super_edge *next;
432   struct rx_inx rx_backtrack_frame;
433   int cset_size;
434   rx_Bitset cset;
435   struct rx_distinct_future *options;
436 };
437 
438 /* A superstate is a set of nfa states (RX_SUPERSET) along
439  * with a transition table.  Superstates are built on demand and reclaimed
440  * without warning.  To protect a superstate, use LOCK_SUPERSTATE.
441  *
442  * Joe Keane thought of calling these superstates and several people
443  * have commented on what a good name it is for what they do.
444  */
445 struct rx_superstate
446 {
447   int rx_id;
448   int locks;
449   struct rx_superstate * next_recyclable;
450   struct rx_superstate * prev_recyclable;
451   struct rx_distinct_future * transition_refs;
452   struct rx_superset * contents;
453   struct rx_super_edge * edges;
454   int is_semifree;
455   int trans_size;
456   struct rx_inx transitions[RX_TAIL_ARRAY]; /* cset sized */
457 };
458 
459 struct rx_distinct_future
460 {
461   struct rx_distinct_future * next_same_super_edge[2];
462   struct rx_distinct_future * next_same_dest;
463   struct rx_distinct_future * prev_same_dest;
464   struct rx_superstate * present;	/* source state */
465   struct rx_superstate * future;	/* destination state */
466   struct rx_super_edge * edge;
467   struct rx_inx future_frame;
468   struct rx_inx side_effects_frame;
469   struct rx_se_list * effects;
470 };
471 
472 #define rx_lock_superstate(R,S)  ((S)->locks++)
473 #define rx_unlock_superstate(R,S) (--(S)->locks)
474 
475 
476 /* This page destined for rx.h */
477 
478 struct rx_blocklist
479 {
480   struct rx_blocklist * next;
481   int bytes;
482 };
483 
484 struct rx_freelist
485 {
486   struct rx_freelist * next;
487 };
488 
489 struct rx_cache;
490 
491 #ifdef __STDC__
492 typedef void (*rx_morecore_fn)(struct rx_cache *);
493 #else
494 typedef void (*rx_morecore_fn)();
495 #endif
496 
497 struct rx_cache
498 {
499   struct rx_hash_rules superset_hash_rules;
500 
501   /* Objects are allocated by incrementing a pointer that
502    * scans across rx_blocklists.
503    */
504   struct rx_blocklist * memory;
505   struct rx_blocklist * memory_pos;
506   int bytes_left;
507   char * memory_addr;
508   rx_morecore_fn morecore;
509 
510   /* Freelists. */
511   struct rx_freelist * free_superstates;
512   struct rx_freelist * free_transition_classes;
513   struct rx_freelist * free_discernable_futures;
514   struct rx_freelist * free_supersets;
515   struct rx_freelist * free_hash;
516 
517   /* Two sets of superstates -- those that are semifreed, and those
518    * that are being used.
519    */
520   struct rx_superstate * lru_superstate;
521   struct rx_superstate * semifree_superstate;
522 
523   struct rx_superset * empty_superset;
524 
525   int superstates;
526   int semifree_superstates;
527   int hits;
528   int misses;
529   int superstates_allowed;
530 
531   int local_cset_size;
532   void ** instruction_table;
533 
534   struct rx_hash superset_table;
535 };
536 
537 
538 
539 /* regex.h
540  *
541  * The remaining declarations replace regex.h.
542  */
543 
544 /* This is an array of error messages corresponding to the error codes.
545  */
546 extern const char *re_error_msg[];
547 
548 /* If any error codes are removed, changed, or added, update the
549    `re_error_msg' table in regex.c.  */
550 typedef enum
551 {
552   REG_NOERROR = 0,	/* Success.  */
553   REG_NOMATCH,		/* Didn't find a match (for regexec).  */
554 
555   /* POSIX regcomp return error codes.  (In the order listed in the
556      standard.)  */
557   REG_BADPAT,		/* Invalid pattern.  */
558   REG_ECOLLATE,		/* Not implemented.  */
559   REG_ECTYPE,		/* Invalid character class name.  */
560   REG_EESCAPE,		/* Trailing backslash.  */
561   REG_ESUBREG,		/* Invalid back reference.  */
562   REG_EBRACK,		/* Unmatched left bracket.  */
563   REG_EPAREN,		/* Parenthesis imbalance.  */
564   REG_EBRACE,		/* Unmatched \{.  */
565   REG_BADBR,		/* Invalid contents of \{\}.  */
566   REG_ERANGE,		/* Invalid range end.  */
567   REG_ESPACE,		/* Ran out of memory.  */
568   REG_BADRPT,		/* No preceding re for repetition op.  */
569 
570   /* Error codes we've added.  */
571   REG_EEND,		/* Premature end.  */
572   REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
573   REG_ERPAREN		/* Unmatched ) or \); not returned from regcomp.  */
574 } reg_errcode_t;
575 
576 /* The regex.c support, as a client of rx, defines a set of possible
577  * side effects that can be added to the edge lables of nfa edges.
578  * Here is the list of sidef effects in use.
579  */
580 
581 enum re_side_effects
582 {
583 #define RX_WANT_SE_DEFS 1
584 #undef RX_DEF_SE
585 #undef RX_DEF_CPLX_SE
586 #define RX_DEF_SE(IDEM, NAME, VALUE)	      NAME VALUE,
587 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)     NAME VALUE,
588 #include "rx.h"
589 #undef RX_DEF_SE
590 #undef RX_DEF_CPLX_SE
591 #undef RX_WANT_SE_DEFS
592    re_floogle_flap = 65533
593 };
594 
595 /* These hold paramaters for the kinds of side effects that are possible
596  * in the supported pattern languages.  These include things like the
597  * numeric bounds of {} operators and the index of paren registers for
598  * subexpression measurement or backreferencing.
599  */
600 struct re_se_params
601 {
602   enum re_side_effects se;
603   int op1;
604   int op2;
605 };
606 
607 typedef unsigned reg_syntax_t;
608 
609 struct re_pattern_buffer
610 {
611   struct rx rx;
612   reg_syntax_t syntax;		/* See below for syntax bit definitions. */
613 
614   unsigned int no_sub:1;	/* If set, don't  return register offsets. */
615   unsigned int not_bol:1;	/* If set, the anchors ('^' and '$') don't */
616   unsigned int not_eol:1;	/*     match at the ends of the string.  */
617   unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/
618   unsigned int least_subs:1;	/* If set, and returning registers, return
619 				 * as few values as possible.  Only
620 				 * backreferenced groups and group 0 (the whole
621 				 * match) will be returned.
622 				 */
623 
624   /* If true, this says that the matcher should keep registers on its
625    * backtracking stack.  For many patterns, we can easily determine that
626    * this isn't necessary.
627    */
628   unsigned int match_regs_on_stack:1;
629   unsigned int search_regs_on_stack:1;
630 
631   /* is_anchored and begbuf_only are filled in by rx_compile. */
632   unsigned int is_anchored:1;	/* Anchorded by ^? */
633   unsigned int begbuf_only:1;	/* Anchored to char position 0? */
634 
635 
636   /* If REGS_UNALLOCATED, allocate space in the `regs' structure
637    * for `max (RE_NREGS, re_nsub + 1)' groups.
638    * If REGS_REALLOCATE, reallocate space if necessary.
639    * If REGS_FIXED, use what's there.
640    */
641 #define REGS_UNALLOCATED 0
642 #define REGS_REALLOCATE 1
643 #define REGS_FIXED 2
644   unsigned int regs_allocated:2;
645 
646 
647   /* Either a translate table to apply to all characters before
648    * comparing them, or zero for no translation.  The translation
649    * is applied to a pattern when it is compiled and to a string
650    * when it is matched.
651    */
652   char * translate;
653 
654   /* If this is a valid pointer, it tells rx not to store the extents of
655    * certain subexpressions (those corresponding to non-zero entries).
656    * Passing 0x1 is the same as passing an array of all ones.  Passing 0x0
657    * is the same as passing an array of all zeros.
658    * The array should contain as many entries as their are subexps in the
659    * regexp.
660    */
661   char * syntax_parens;
662 
663 	/* Number of subexpressions found by the compiler.  */
664   size_t re_nsub;
665 
666   void * buffer;		/* Malloced memory for the nfa. */
667   unsigned long allocated;	/* Size of that memory. */
668 
669   /* Pointer to a fastmap, if any, otherwise zero.  re_search uses
670    * the fastmap, if there is one, to skip over impossible
671    * starting points for matches.  */
672   char *fastmap;
673 
674   unsigned int fastmap_accurate:1; /* These three are internal. */
675   unsigned int can_match_empty:1;
676   struct rx_nfa_state * start;	/* The nfa starting state. */
677 
678   /* This is the list of iterator bounds for {lo,hi} constructs.
679    * The memory pointed to is part of the rx->buffer.
680    */
681   struct re_se_params *se_params;
682 
683   /* This is a bitset representation of the fastmap.
684    * This is a true fastmap that already takes the translate
685    * table into account.
686    */
687   rx_Bitset fastset;
688 };
689 
690 /* Type for byte offsets within the string.  POSIX mandates this.  */
691 typedef int regoff_t;
692 
693 /* This is the structure we store register match data in.  See
694    regex.texinfo for a full description of what registers match.  */
695 struct re_registers
696 {
697   unsigned num_regs;
698   regoff_t *start;
699   regoff_t *end;
700 };
701 
702 typedef struct re_pattern_buffer regex_t;
703 
704 /* POSIX specification for registers.  Aside from the different names than
705    `re_registers', POSIX uses an array of structures, instead of a
706    structure of arrays.  */
707 typedef struct
708 {
709   regoff_t rm_so;  /* Byte offset from string's start to substring's start.  */
710   regoff_t rm_eo;  /* Byte offset from string's start to substring's end.  */
711 } regmatch_t;
712 
713 
714 /* The following bits are used to determine the regexp syntax we
715    recognize.  The set/not-set meanings are chosen so that Emacs syntax
716    remains the value 0.  The bits are given in alphabetical order, and
717    the definitions shifted by one from the previous bit; thus, when we
718    add or remove a bit, only one other definition need change.  */
719 
720 /* If this bit is not set, then \ inside a bracket expression is literal.
721    If set, then such a \ quotes the following character.  */
722 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
723 
724 /* If this bit is not set, then + and ? are operators, and \+ and \? are
725      literals.
726    If set, then \+ and \? are operators and + and ? are literals.  */
727 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
728 
729 /* If this bit is set, then character classes are supported.  They are:
730      [:alpha:], [:upper:], [:lower:],  [:digit:], [:alnum:], [:xdigit:],
731      [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
732    If not set, then character classes are not supported.  */
733 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
734 
735 /* If this bit is set, then ^ and $ are always anchors (outside bracket
736      expressions, of course).
737    If this bit is not set, then it depends:
738         ^  is an anchor if it is at the beginning of a regular
739            expression or after an open-group or an alternation operator;
740         $  is an anchor if it is at the end of a regular expression, or
741            before a close-group or an alternation operator.
742 
743    This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
744    POSIX draft 11.2 says that * etc. in leading positions is undefined.
745    We already implemented a previous draft which made those constructs
746    invalid, though, so we haven't changed the code back.  */
747 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
748 
749 /* If this bit is set, then special characters are always special
750      regardless of where they are in the pattern.
751    If this bit is not set, then special characters are special only in
752      some contexts; otherwise they are ordinary.  Specifically,
753      * + ? and intervals are only special when not after the beginning,
754      open-group, or alternation operator.  */
755 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
756 
757 /* If this bit is set, then *, +, ?, and { cannot be first in an re or
758      immediately after an alternation or begin-group operator.  */
759 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
760 
761 /* If this bit is set, then . matches newline.
762    If not set, then it doesn't.  */
763 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
764 
765 /* If this bit is set, then . doesn't match NUL.
766    If not set, then it does.  */
767 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
768 
769 /* If this bit is set, nonmatching lists [^...] do not match newline.
770    If not set, they do.  */
771 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
772 
773 /* If this bit is set, either \{...\} or {...} defines an
774      interval, depending on RE_NO_BK_BRACES.
775    If not set, \{, \}, {, and } are literals.  */
776 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
777 
778 /* If this bit is set, +, ? and | aren't recognized as operators.
779    If not set, they are.  */
780 #define RE_LIMITED_OPS (RE_INTERVALS << 1)
781 
782 /* If this bit is set, newline is an alternation operator.
783    If not set, newline is literal.  */
784 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
785 
786 /* If this bit is set, then `{...}' defines an interval, and \{ and \}
787      are literals.
788   If not set, then `\{...\}' defines an interval.  */
789 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
790 
791 /* If this bit is set, (...) defines a group, and \( and \) are literals.
792    If not set, \(...\) defines a group, and ( and ) are literals.  */
793 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
794 
795 /* If this bit is set, then \<digit> matches <digit>.
796    If not set, then \<digit> is a back-reference.  */
797 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
798 
799 /* If this bit is set, then | is an alternation operator, and \| is literal.
800    If not set, then \| is an alternation operator, and | is literal.  */
801 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
802 
803 /* If this bit is set, then an ending range point collating higher
804      than the starting range point, as in [z-a], is invalid.
805    If not set, then when ending range point collates higher than the
806      starting range point, the range is ignored.  */
807 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
808 
809 /* If this bit is set, then an unmatched ) is ordinary.
810    If not set, then an unmatched ) is invalid.  */
811 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
812 
813 /* This global variable defines the particular regexp syntax to use (for
814    some interfaces).  When a regexp is compiled, the syntax used is
815    stored in the pattern buffer, so changing this does not affect
816    already-compiled regexps.  */
817 extern reg_syntax_t re_syntax_options;
818 
819 /* Define combinations of the above bits for the standard possibilities.
820    (The [[[ comments delimit what gets put into the Texinfo file, so
821    don't delete them!)  */
822 /* [[[begin syntaxes]]] */
823 #define RE_SYNTAX_EMACS 0
824 
825 #define RE_SYNTAX_AWK							\
826   (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL			\
827    | RE_NO_BK_PARENS            | RE_NO_BK_REFS				\
828    | RE_NO_BK_VAR               | RE_NO_EMPTY_RANGES			\
829    | RE_UNMATCHED_RIGHT_PAREN_ORD)
830 
831 #define RE_SYNTAX_POSIX_AWK 						\
832   (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
833 
834 #define RE_SYNTAX_GREP							\
835   (RE_BK_PLUS_QM              | RE_CHAR_CLASSES				\
836    | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS				\
837    | RE_NEWLINE_ALT)
838 
839 #define RE_SYNTAX_EGREP							\
840   (RE_CHAR_CLASSES        | RE_CONTEXT_INDEP_ANCHORS			\
841    | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE			\
842    | RE_NEWLINE_ALT       | RE_NO_BK_PARENS				\
843    | RE_NO_BK_VBAR)
844 
845 #define RE_SYNTAX_POSIX_EGREP						\
846   (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
847 
848 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
849 
850 /* Syntax bits common to both basic and extended POSIX regex syntax.  */
851 #define _RE_SYNTAX_POSIX_COMMON						\
852   (RE_CHAR_CLASSES | RE_DOT_NEWLINE      | RE_DOT_NOT_NULL		\
853    | RE_INTERVALS  | RE_NO_EMPTY_RANGES)
854 
855 #define RE_SYNTAX_POSIX_BASIC						\
856   (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
857 
858 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
859    RE_LIMITED_OPS, i.e., \? \+ \| are not recognized.  Actually, this
860    isn't minimal, since other operators, such as \`, aren't disabled.  */
861 #define RE_SYNTAX_POSIX_MINIMAL_BASIC					\
862   (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
863 
864 #define RE_SYNTAX_POSIX_EXTENDED					\
865   (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS			\
866    | RE_CONTEXT_INDEP_OPS  | RE_NO_BK_BRACES				\
867    | RE_NO_BK_PARENS       | RE_NO_BK_VBAR				\
868    | RE_UNMATCHED_RIGHT_PAREN_ORD)
869 
870 /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
871    replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added.  */
872 #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED				\
873   (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS			\
874    | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES				\
875    | RE_NO_BK_PARENS        | RE_NO_BK_REFS				\
876    | RE_NO_BK_VBAR	    | RE_UNMATCHED_RIGHT_PAREN_ORD)
877 /* [[[end syntaxes]]] */
878 
879 /* Maximum number of duplicates an interval can allow.  Some systems
880    (erroneously) define this in other header files, but we want our
881    value, so remove any previous define.  */
882 #ifdef RE_DUP_MAX
883 #undef RE_DUP_MAX
884 #endif
885 #define RE_DUP_MAX ((1 << 15) - 1)
886 
887 
888 
889 /* POSIX `cflags' bits (i.e., information for `regcomp').  */
890 
891 /* If this bit is set, then use extended regular expression syntax.
892    If not set, then use basic regular expression syntax.  */
893 #define REG_EXTENDED 1
894 
895 /* If this bit is set, then ignore case when matching.
896    If not set, then case is significant.  */
897 #define REG_ICASE (REG_EXTENDED << 1)
898 
899 /* If this bit is set, then anchors do not match at newline
900      characters in the string.
901    If not set, then anchors do match at newlines.  */
902 #define REG_NEWLINE (REG_ICASE << 1)
903 
904 /* If this bit is set, then report only success or fail in regexec.
905    If not set, then returns differ between not matching and errors.  */
906 #define REG_NOSUB (REG_NEWLINE << 1)
907 
908 
909 /* POSIX `eflags' bits (i.e., information for regexec).  */
910 
911 /* If this bit is set, then the beginning-of-line operator doesn't match
912      the beginning of the string (presumably because it's not the
913      beginning of a line).
914    If not set, then the beginning-of-line operator does match the
915      beginning of the string.  */
916 #define REG_NOTBOL 1
917 
918 /* Like REG_NOTBOL, except for the end-of-line.  */
919 #define REG_NOTEOL (1 << 1)
920 
921 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
922  * `re_match_2' returns information about at least this many registers
923  * the first time a `regs' structure is passed.
924  *
925  * Also, this is the greatest number of backreferenced subexpressions
926  * allowed in a pattern being matched without caller-supplied registers.
927  */
928 #ifndef RE_NREGS
929 #define RE_NREGS 30
930 #endif
931 
932 extern int rx_cache_bound;
933 
934 #ifdef __STDC__
935 extern reg_errcode_t rx_compile (const char *pattern, int size,
936 	    reg_syntax_t syntax,
937 	    struct re_pattern_buffer * rxb) ;
938 extern int re_search_2 (struct re_pattern_buffer *rxb,
939 	     const char * string1, int size1,
940 	     const char * string2, int size2,
941 	     int startpos, int range,
942 	     struct re_registers *regs,
943 	     int stop);
944 extern int re_search (struct re_pattern_buffer * rxb, const char *string,
945 	   int size, int startpos, int range,
946 	   struct re_registers *regs);
947 extern int re_match_2 (struct re_pattern_buffer * rxb,
948 	    const char * string1, int size1,
949 	    const char * string2, int size2,
950 	    int pos, struct re_registers *regs, int stop);
951 extern int re_match (struct re_pattern_buffer * rxb,
952 	  const char * string,
953 	  int size, int pos,
954 	  struct re_registers *regs);
955 extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
956 extern void re_set_registers (struct re_pattern_buffer *bufp,
957 		  struct re_registers *regs,
958 		  unsigned num_regs,
959 		  regoff_t * starts, regoff_t * ends);
960 extern const char * re_compile_pattern (const char *pattern,
961 		    int length,
962 		    struct re_pattern_buffer * rxb);
963 extern int re_compile_fastmap (struct re_pattern_buffer * rxb);
964 extern char * re_comp (const char *s);
965 extern int rx_exec (const char *s);
966 extern int regcomp (regex_t * preg, const char * pattern, int cflags);
967 extern int regexec (const regex_t *preg, const char *string,
968 	 size_t nmatch, regmatch_t pmatch[],
969 	 int eflags);
970 extern size_t regerror (int errcode, const regex_t *preg,
971 	  char *errbuf, size_t errbuf_size);
972 extern void regfree (regex_t *preg);
973 
974 #else
975 extern reg_errcode_t rx_compile ();
976 extern int re_search_2 ();
977 extern int re_search ();
978 extern int re_match_2 ();
979 extern int re_match ();
980 extern reg_syntax_t re_set_syntax ();
981 extern void re_set_registers ();
982 extern const char * re_compile_pattern ();
983 extern int re_compile_fastmap ();
984 extern char * re_comp ();
985 extern int rx_exec ();
986 extern int regcomp ();
987 extern int regexec ();
988 extern size_t regerror ();
989 extern void regfree ();
990 
991 #endif
992 
993 
994 #else /* RX_WANT_SE_DEFS */
995   /* Integers are used to represent side effects.
996    *
997    * Simple side effects are given negative integer names by these enums.
998    *
999    * Non-negative names are reserved for complex effects.
1000    *
1001    * Complex effects are those that take arguments.  For example,
1002    * a register assignment associated with a group is complex because
1003    * it requires an argument to tell which group is being matched.
1004    *
1005    * The integer name of a complex effect is an index into rxb->se_params.
1006    */
1007 
1008   RX_DEF_SE(1, re_se_try, = -1)		/* Epsilon from start state */
1009 
1010   RX_DEF_SE(0, re_se_pushback, = re_se_try - 1)
1011   RX_DEF_SE(0, re_se_push0, = re_se_pushback -1)
1012   RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1)
1013   RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1)
1014   RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1)
1015 
1016   RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1)	/* Emacs only */
1017   RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */
1018   RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */
1019 
1020   RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */
1021   RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */
1022 
1023   RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1)
1024   RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1)
1025   RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1)
1026 
1027   RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1)
1028   RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1)
1029 
1030   /* This fails except at the end of a line.
1031    * It deserves to go here since it is typicly one of the last steps
1032    * in a match.
1033    */
1034   RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1)
1035 
1036   /* Simple effects: */
1037   RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1)
1038 
1039   /* Complex effects.  These are used in the 'se' field of
1040    * a struct re_se_params.  Indexes into the se array
1041    * are stored as instructions on nfa edges.
1042    */
1043   RX_DEF_CPLX_SE(1, re_se_win, = 0)
1044   RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1)
1045   RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1)
1046   RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1)
1047   RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1)
1048   RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1)
1049   RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1)
1050 
1051 #endif
1052 
1053 #endif
1054