1 /* tree.h - parse tree decls for regexps
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 
12 #ifndef INCLUDE__RX__TREE_H
13 #define INCLUDE__RX__TREE_H
14 
15 
16 
17 #include "hackerlab/machine/types.h"
18 #include "hackerlab/uni/coding.h"
19 #include "hackerlab/bitsets/bits.h"
20 #include "hackerlab/rx/bits-tree-rules.h"
21 
22 
23 
24 /*(h1 "The Expression Tree Structure")
25  *
26  */
27 
28 
29 /*(c #s"enum rx_exp_node_type" :category type)
30  * enum rx_exp_node_type;
31  *
32  * Every expression node is tagged with a type that describes how
33  * the node should be interpreted.
34  *
35  * The Rx representation of regexp expression trees contains a simple
36  * optimization: a sub-tree which consists entirely of concatenations
37  * of singleton character set nodes can be replaced by a single
38  * node of type `r_string'.
39  *
40  *
41  insert*/
42 #define RX_EXP_NODE_TYPES(FN) \
43   /* Match from a character set. `a' or `[a-z]'			\
44    */								\
45   RX_EXP_NODE_TYPE_##FN(r_cset)					\
46 								\
47   /* Match two subexpressions in order.   `ab'			\
48    */								\
49   RX_EXP_NODE_TYPE_##FN(r_concat)				\
50 								\
51   /* Match two subexpressions in order.   -no syntax-		\
52    * However, maximize the length of the _right_ subexpression	\
53    * not the _left_ subexpression.  This is used to implement +	\
54    * and {n,}							\
55    */								\
56   RX_EXP_NODE_TYPE_##FN(r_right_concat)				\
57 								\
58   /* Choose one of two subexpressions. `a\|b'			\
59    */								\
60   RX_EXP_NODE_TYPE_##FN(r_alternate)				\
61 								\
62   /* Match the subexpression any number of times. `a*'		\
63    */								\
64   RX_EXP_NODE_TYPE_##FN(r_star)					\
65 								\
66   /* Shorthand for a concatenation of characters		\
67    */								\
68   RX_EXP_NODE_TYPE_##FN(r_string)				\
69 								\
70   /* Generates a tagged, final nfa state.			\
71    */								\
72   RX_EXP_NODE_TYPE_##FN(r_cut)					\
73 								\
74   /* Counted subexpression.  `a{4, 1000}'			\
75    */								\
76   RX_EXP_NODE_TYPE_##FN(r_interval)				\
77 								\
78   /* Parenthesized subexpression				\
79    */								\
80   RX_EXP_NODE_TYPE_##FN(r_parens)				\
81 								\
82   /* Context-sensative operator such as "^" and "\1"		\
83    */								\
84   RX_EXP_NODE_TYPE_##FN(r_context)
85 
86 
87 #define RX_EXP_NODE_TYPE_ENUM(X)	X,
88 
89 enum rx_exp_node_type
90 {
91   RX_EXP_NODE_TYPES(ENUM)
92 };
93 /*end-insert
94  */
95 
96 
97 /*(c #s"struct rx_exp_node" :category type)
98  * struct rx_exp_node;
99  *
100  * This is the type of one node of a regexp expression tree.
101  *
102  insert*/
103 
104 struct rx_exp_node
105 {
106   /* This is a reference counted structure type.
107    * See `rx_save_exp' and `rx_free_exp'.
108    */
109   int refs;
110 
111   /* The expression type of this node.
112    */
113   enum rx_exp_node_type type;
114 
115   /* If the node is of type `r_cset', these describe the character set
116    * matched.
117    */
118   int cset_size;
119   bits cset;
120 
121 
122   /* If the node is of type `r_interval' ("a{x,y}"), then these
123    * describe the range of the interval (`intval' is `x' and `intval2'
124    * is `y').
125    *
126    * If the node is of type `r_cut', `intval' is the state label
127    * generated by the cut.
128    *
129    * If the node is of type `r_parens', `intval' is the expression
130    * number (for backreferences and `pmatch' data from `regexec') or
131    * 0, if the expression is an anonymous subexpression (formed by
132    * `[[:(...):]]'.)
133    *
134    * If the node is of type `r_context', `invtval' is the context
135    * operator.  Valid operators are '$' and '^' (anchors), and '0'
136    * .. '9' (backreferences).
137    *
138    */
139   long intval;
140   long intval2;
141 
142   /* If the node is of type `r_concat', `r_right_concat' or
143    * `r_alternate', these are the left and right children of
144    * the node.
145    *
146    * If the node is of type `r_star', `r_interval' or `r_parens', then
147    * `left' is the child of the node.
148    */
149   struct rx_exp_node *left;
150   struct rx_exp_node *right;
151 
152   /* If the node is of type `r_string', this is the contents of the
153    * string.  This string is not 0-terminated.
154    */
155   t_uchar * str;
156   size_t str_len;
157   enum uni_encoding_scheme encoding;
158 
159   /* Intervals, parentheses and context operators are special because
160    * they are not expressible as regular expressions.  Also, any
161    * composite expression with a subexpression which is not a regular
162    * expression is itself not a regular expression.
163    *
164    * `rx_analyze_rexp' fills in this field with a non-zero value for
165    * expression nodes which are "not a regular expression".
166    *
167    * If this field is not 0, a backtracking search may be necessay
168    * when comparing this regexp to a string.
169    */
170   int observed;
171 
172   /* If an `observed' regexp contains only parenthesized
173    * subexpressions and no other non-regular-expression operators
174    * (anchors or backreferences) then backtracking search is only
175    * necessary if the caller of the regexp comparison function wants
176    * to know the positions of matching subexpressions (the
177    * `regmatch_t' data in Posix interfaces).  If the caller only wants
178    * to know the start and end positions of the overall match, and the
179    * pattern contains no anchors or backreferences, then a DFA
180    * (non-backtracking) algorithm can be used even if `observed' is
181    * non-0.
182    *
183    * If `observed' is not 0, then `observation_contingent' is
184    * useful.  If `observation_contingent' is 1, the pattern
185    * contains no anchors or backreferences implying that a fast
186    * DFA search may be possible.
187    */
188   int observation_contingent;
189 
190   /* We have two strategies available for matching:
191    *
192    * 	the NFA technique:	potentially slow, but space efficient.
193    *	the NFA->DFA technique: more reliable throughput but higher
194    *				latency; much less space efficient.
195    *
196    * `small_advised_p' is set to a non-0 value by `rx_analyze_rexp' if
197    * it seems that the NFA technique will not be too slow.
198    */
199   int small_advised_p;
200 
201   /* Some expressions match only strings of one particular length.
202    * Knowing that length, if it is defined, leads to some easy and
203    * rewarding optimizations.
204    *
205    * `rx_analyze_rexp' fills in this field with that length, or -1 if
206    * no such length can be computed for this expression.
207    */
208   long len;
209 
210   /* In the tree rooted at this node, what is the maximum and
211    * minimum subexpression number of an enclosed subexpression.
212    *
213    * This is needed during matching when matching a parenthesized
214    * subexpression.  For trees with no numbered subexpressions,
215    * these values are 0.
216    */
217   int max_enclosed_paren;
218   int min_enclosed_paren;
219 
220   /* These fields are used to cache results computed by
221    * `rx_simplify_rexp' and `rx_unfa'.
222    */
223   struct rx_exp_node * simplified;
224 
225 
226   struct rx_cached_rexp * cr;
227   struct rx_exp_node * next_same_nfa;
228   struct rx_exp_node * prev_same_nfa;
229 };
230 /*end-insert
231  */
232 
233 
234 /* automatically generated __STDC__ prototypes */
235 extern struct rx_exp_node * rx_exp_node (enum rx_exp_node_type type);
236 extern struct rx_exp_node * rx_mk_r_cset (enum rx_exp_node_type type, int size, bits b);
237 extern struct rx_exp_node * rx_mk_r_cset_take (enum rx_exp_node_type type, int size, bits b);
238 extern struct rx_exp_node * rx_mk_r_binop (enum rx_exp_node_type type,
239 					   struct rx_exp_node * a,
240 					   struct rx_exp_node * b);
241 extern struct rx_exp_node * rx_mk_r_monop (enum rx_exp_node_type type,
242 					   struct rx_exp_node * a);
243 extern struct rx_exp_node * rx_mk_r_str (enum rx_exp_node_type type,
244 					 const t_uchar * s,
245 					 size_t len,
246 					 enum uni_encoding_scheme encoding);
247 extern struct rx_exp_node * rx_mk_r_int (enum rx_exp_node_type type,
248 					 int intval);
249 extern struct rx_exp_node * rx_mk_r_subexp_int (enum rx_exp_node_type type,
250 						struct rx_exp_node * subexp,
251 						int intval);
252 extern struct rx_exp_node * rx_mk_r_int2 (enum rx_exp_node_type type,
253 					  int intval,
254 					  int intval2);
255 extern struct rx_exp_node * rx_mk_r_subexp_int2 (enum rx_exp_node_type type,
256 						 struct rx_exp_node * subexp,
257 						 int intval,
258 						 int intval2);
259 extern void rx_save_exp (struct rx_exp_node * node);
260 extern void rx_free_exp (struct rx_exp_node * node);
261 extern int rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b);
262 extern unsigned long rx_exp_hash (struct rx_exp_node * node);
263 #endif  /* INCLUDE__RX__TREE_H */
264