1 /* tree.h - parse tree decls for regexps 2 * 3 **************************************************************** 4 * Copyright (C) 1998, 2000 Thomas Lord 5 * 6 * See the file "COPYING" for further information about 7 * the copyright and warranty status of this work. 8 */ 9 10 11 12 #ifndef INCLUDE__RX__TREE_H 13 #define INCLUDE__RX__TREE_H 14 15 16 17 #include "hackerlab/machine/types.h" 18 #include "hackerlab/uni/coding.h" 19 #include "hackerlab/bitsets/bits.h" 20 #include "hackerlab/rx/bits-tree-rules.h" 21 22 23 24 /*(h1 "The Expression Tree Structure") 25 * 26 */ 27 28 29 /*(c #s"enum rx_exp_node_type" :category type) 30 * enum rx_exp_node_type; 31 * 32 * Every expression node is tagged with a type that describes how 33 * the node should be interpreted. 34 * 35 * The Rx representation of regexp expression trees contains a simple 36 * optimization: a sub-tree which consists entirely of concatenations 37 * of singleton character set nodes can be replaced by a single 38 * node of type `r_string'. 39 * 40 * 41 insert*/ 42 #define RX_EXP_NODE_TYPES(FN) \ 43 /* Match from a character set. `a' or `[a-z]' \ 44 */ \ 45 RX_EXP_NODE_TYPE_##FN(r_cset) \ 46 \ 47 /* Match two subexpressions in order. `ab' \ 48 */ \ 49 RX_EXP_NODE_TYPE_##FN(r_concat) \ 50 \ 51 /* Match two subexpressions in order. -no syntax- \ 52 * However, maximize the length of the _right_ subexpression \ 53 * not the _left_ subexpression. This is used to implement + \ 54 * and {n,} \ 55 */ \ 56 RX_EXP_NODE_TYPE_##FN(r_right_concat) \ 57 \ 58 /* Choose one of two subexpressions. `a\|b' \ 59 */ \ 60 RX_EXP_NODE_TYPE_##FN(r_alternate) \ 61 \ 62 /* Match the subexpression any number of times. `a*' \ 63 */ \ 64 RX_EXP_NODE_TYPE_##FN(r_star) \ 65 \ 66 /* Shorthand for a concatenation of characters \ 67 */ \ 68 RX_EXP_NODE_TYPE_##FN(r_string) \ 69 \ 70 /* Generates a tagged, final nfa state. \ 71 */ \ 72 RX_EXP_NODE_TYPE_##FN(r_cut) \ 73 \ 74 /* Counted subexpression. `a{4, 1000}' \ 75 */ \ 76 RX_EXP_NODE_TYPE_##FN(r_interval) \ 77 \ 78 /* Parenthesized subexpression \ 79 */ \ 80 RX_EXP_NODE_TYPE_##FN(r_parens) \ 81 \ 82 /* Context-sensative operator such as "^" and "\1" \ 83 */ \ 84 RX_EXP_NODE_TYPE_##FN(r_context) 85 86 87 #define RX_EXP_NODE_TYPE_ENUM(X) X, 88 89 enum rx_exp_node_type 90 { 91 RX_EXP_NODE_TYPES(ENUM) 92 }; 93 /*end-insert 94 */ 95 96 97 /*(c #s"struct rx_exp_node" :category type) 98 * struct rx_exp_node; 99 * 100 * This is the type of one node of a regexp expression tree. 101 * 102 insert*/ 103 104 struct rx_exp_node 105 { 106 /* This is a reference counted structure type. 107 * See `rx_save_exp' and `rx_free_exp'. 108 */ 109 int refs; 110 111 /* The expression type of this node. 112 */ 113 enum rx_exp_node_type type; 114 115 /* If the node is of type `r_cset', these describe the character set 116 * matched. 117 */ 118 int cset_size; 119 bits cset; 120 121 122 /* If the node is of type `r_interval' ("a{x,y}"), then these 123 * describe the range of the interval (`intval' is `x' and `intval2' 124 * is `y'). 125 * 126 * If the node is of type `r_cut', `intval' is the state label 127 * generated by the cut. 128 * 129 * If the node is of type `r_parens', `intval' is the expression 130 * number (for backreferences and `pmatch' data from `regexec') or 131 * 0, if the expression is an anonymous subexpression (formed by 132 * `[[:(...):]]'.) 133 * 134 * If the node is of type `r_context', `invtval' is the context 135 * operator. Valid operators are '$' and '^' (anchors), and '0' 136 * .. '9' (backreferences). 137 * 138 */ 139 long intval; 140 long intval2; 141 142 /* If the node is of type `r_concat', `r_right_concat' or 143 * `r_alternate', these are the left and right children of 144 * the node. 145 * 146 * If the node is of type `r_star', `r_interval' or `r_parens', then 147 * `left' is the child of the node. 148 */ 149 struct rx_exp_node *left; 150 struct rx_exp_node *right; 151 152 /* If the node is of type `r_string', this is the contents of the 153 * string. This string is not 0-terminated. 154 */ 155 t_uchar * str; 156 size_t str_len; 157 enum uni_encoding_scheme encoding; 158 159 /* Intervals, parentheses and context operators are special because 160 * they are not expressible as regular expressions. Also, any 161 * composite expression with a subexpression which is not a regular 162 * expression is itself not a regular expression. 163 * 164 * `rx_analyze_rexp' fills in this field with a non-zero value for 165 * expression nodes which are "not a regular expression". 166 * 167 * If this field is not 0, a backtracking search may be necessay 168 * when comparing this regexp to a string. 169 */ 170 int observed; 171 172 /* If an `observed' regexp contains only parenthesized 173 * subexpressions and no other non-regular-expression operators 174 * (anchors or backreferences) then backtracking search is only 175 * necessary if the caller of the regexp comparison function wants 176 * to know the positions of matching subexpressions (the 177 * `regmatch_t' data in Posix interfaces). If the caller only wants 178 * to know the start and end positions of the overall match, and the 179 * pattern contains no anchors or backreferences, then a DFA 180 * (non-backtracking) algorithm can be used even if `observed' is 181 * non-0. 182 * 183 * If `observed' is not 0, then `observation_contingent' is 184 * useful. If `observation_contingent' is 1, the pattern 185 * contains no anchors or backreferences implying that a fast 186 * DFA search may be possible. 187 */ 188 int observation_contingent; 189 190 /* We have two strategies available for matching: 191 * 192 * the NFA technique: potentially slow, but space efficient. 193 * the NFA->DFA technique: more reliable throughput but higher 194 * latency; much less space efficient. 195 * 196 * `small_advised_p' is set to a non-0 value by `rx_analyze_rexp' if 197 * it seems that the NFA technique will not be too slow. 198 */ 199 int small_advised_p; 200 201 /* Some expressions match only strings of one particular length. 202 * Knowing that length, if it is defined, leads to some easy and 203 * rewarding optimizations. 204 * 205 * `rx_analyze_rexp' fills in this field with that length, or -1 if 206 * no such length can be computed for this expression. 207 */ 208 long len; 209 210 /* In the tree rooted at this node, what is the maximum and 211 * minimum subexpression number of an enclosed subexpression. 212 * 213 * This is needed during matching when matching a parenthesized 214 * subexpression. For trees with no numbered subexpressions, 215 * these values are 0. 216 */ 217 int max_enclosed_paren; 218 int min_enclosed_paren; 219 220 /* These fields are used to cache results computed by 221 * `rx_simplify_rexp' and `rx_unfa'. 222 */ 223 struct rx_exp_node * simplified; 224 225 226 struct rx_cached_rexp * cr; 227 struct rx_exp_node * next_same_nfa; 228 struct rx_exp_node * prev_same_nfa; 229 }; 230 /*end-insert 231 */ 232 233 234 /* automatically generated __STDC__ prototypes */ 235 extern struct rx_exp_node * rx_exp_node (enum rx_exp_node_type type); 236 extern struct rx_exp_node * rx_mk_r_cset (enum rx_exp_node_type type, int size, bits b); 237 extern struct rx_exp_node * rx_mk_r_cset_take (enum rx_exp_node_type type, int size, bits b); 238 extern struct rx_exp_node * rx_mk_r_binop (enum rx_exp_node_type type, 239 struct rx_exp_node * a, 240 struct rx_exp_node * b); 241 extern struct rx_exp_node * rx_mk_r_monop (enum rx_exp_node_type type, 242 struct rx_exp_node * a); 243 extern struct rx_exp_node * rx_mk_r_str (enum rx_exp_node_type type, 244 const t_uchar * s, 245 size_t len, 246 enum uni_encoding_scheme encoding); 247 extern struct rx_exp_node * rx_mk_r_int (enum rx_exp_node_type type, 248 int intval); 249 extern struct rx_exp_node * rx_mk_r_subexp_int (enum rx_exp_node_type type, 250 struct rx_exp_node * subexp, 251 int intval); 252 extern struct rx_exp_node * rx_mk_r_int2 (enum rx_exp_node_type type, 253 int intval, 254 int intval2); 255 extern struct rx_exp_node * rx_mk_r_subexp_int2 (enum rx_exp_node_type type, 256 struct rx_exp_node * subexp, 257 int intval, 258 int intval2); 259 extern void rx_save_exp (struct rx_exp_node * node); 260 extern void rx_free_exp (struct rx_exp_node * node); 261 extern int rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b); 262 extern unsigned long rx_exp_hash (struct rx_exp_node * node); 263 #endif /* INCLUDE__RX__TREE_H */ 264