1 /* nfa.h - decls for manipulating regexps as NFAs 2 * 3 **************************************************************** 4 * Copyright (C) 1998, 2000 Thomas Lord 5 * 6 * See the file "COPYING" for further information about 7 * the copyright and warranty status of this work. 8 */ 9 10 11 #ifndef INCLUDE__RX__NFA_H 12 #define INCLUDE__RX__NFA_H 13 14 struct rx_nfa; 15 struct rx_nfa_state; 16 struct rx_superset; 17 18 #include "hackerlab/hash/hashtree.h" 19 #include "hackerlab/bitsets/bits.h" 20 #include "hackerlab/rx/bits-tree-rules.h" 21 #include "hackerlab/rx/tree.h" 22 #include "hackerlab/rx/nfa-cache.h" 23 #include "hackerlab/rx/super.h" 24 25 26 /************************************************************************ 27 *(h1 "The NFA Data Types") 28 * 29 * 30 * 31 */ 32 33 /*(c #s"struct rx_nfa" :category type) 34 * 35 * This structure type is used to hold a non-deterministic 36 * finite automata. 37 * 38 insert*/ 39 struct rx_nfa 40 { 41 unsigned long rx_id; 42 /* Each NFA has a unique ID assigned by `rx_nfa_xalloc'. */ 43 44 int local_cset_size; 45 /* Either 256 (8-bit ascii) or 1<<21 (Unicode). */ 46 47 struct bits_tree_rule * bitset_rule; 48 49 struct rx_nfa_state * nfa_states; 50 /* The list of states in this NFA, linked by the field `next'. 51 This is filled in by `rx_nfa_state'. */ 52 53 int nfa_state_id; 54 /* NFA states are assigned sequential ids. This is the id of the 55 next state.*/ 56 57 struct rx_nfa_state * start_nfa_state; 58 /* The starting state of the NFA. This field must be filled in 59 by a call to `rx_set_start_state' once the start state has 60 been created. */ 61 62 struct hashtree set_list_memo; 63 /* A hash table used to allocate NFA state sets with the property 64 that equal sets are `=='. */ 65 }; 66 /*end-insert 67 */ 68 69 70 /*(c #s"struct rx_nfa_state" :category type) 71 * 72 * This structure type holds one state of an NFA. 73 * 74 insert*/ 75 struct rx_nfa_state 76 { 77 unsigned int id; 78 /* NFA states are assigned sequential ids.*/ 79 80 long state_label; 81 /* Each state has a label. If the label is non-0, the state is a 82 final state. In the DFA, a superstate is given a state label 83 which is the smallest magnitude of the non-0 state labels of 84 the NFA states it contains. If all of the NFA states have a 85 state label 0, then so does the DFA state. */ 86 87 struct rx_nfa_edge *edges; 88 /* A list of NFA edges originating in this state. 89 The edges are linked by the field `next'. */ 90 91 t_uchar is_start; 92 /* Set to `1' by `rx_set_start_state' and is otherwise 0.*/ 93 94 t_uchar has_cset_edges; 95 /* Set to `1' by `rx_nfa_edge' and is otherwise 0. This is used 96 during DFA matching to recognize dead-end states.*/ 97 98 t_uchar closure_computed; 99 /* Set to `1' by `rx_state_closure' and is otherwise 0.*/ 100 101 struct rx_nfa_state_set * closure; 102 /* This is set to the state's epsilon closure by 103 `rx_state_closure' and is otherwise 0.*/ 104 105 struct rx_superset * superstate_set; 106 /* This field is set by `rx_nfa_state_to_superstate' to cache 107 the superstate set of the eclosure of this NFA state. 108 109 This field does not have a reference count to the set. 110 Instead, if the set is freed, this field is set to 0. 111 112 The superstate set points back to this state in the field 113 `nfa_state'. */ 114 115 t_uchar mark; 116 /* This is used by various graph traversal algorithms. */ 117 118 struct rx_nfa_state *next; 119 /* This link is to the next state of the same NFA in the list 120 of all states starting at `nfa->nfa_states'. */ 121 }; 122 /*end-insert 123 */ 124 125 126 /*(c #s"struct rx_nfa_edge" :other-terms "enum rx_nfa_etype" :category type) 127 * enum rx_nfa_etype; 128 * struct rx_nfa_edge; 129 * 130 * This structure type holds one edge of an NFA. 131 * 132 insert*/ 133 134 /* enum rx_nfa_etype; 135 * 136 * There are two types of NFA edges: character sets and "epsilon". An 137 * epsilon edge represents a transition that can be taken immediately 138 * whenever the source state is reached. A character set transition 139 * can be taken only by matching character in the set. 140 */ 141 enum rx_nfa_etype 142 { 143 ne_cset, 144 ne_epsilon 145 }; 146 147 /* struct rx_nfa_edge; 148 * 149 * One NFA edge. 150 */ 151 struct rx_nfa_edge 152 { 153 enum rx_nfa_etype type; /* Which type of edge? */ 154 struct rx_nfa_state *dest; /* What is the destination state? */ 155 156 bits cset; 157 /* If the edge is a character set edge (`ne_cset'), 158 this is the set of characters it matches. */ 159 160 struct rx_nfa_edge *next; 161 /* Next edge for the same source node. */ 162 }; 163 /*end-insert 164 */ 165 166 /*(c #s"struct rx_nfa_state_set" :category type) 167 * 168 * This structure type holds a set of NFA states, represented as a 169 * list. It is used to represent the epsilon closure of an NFA node. 170 * Any two NFA sets with the same elements are equal in the sense of 171 * `==' if they were returned by `rx_state_closure'. 172 * 173 insert*/ 174 struct rx_nfa_state_set 175 { 176 struct rx_nfa_state * car; 177 struct rx_nfa_state_set * cdr; 178 }; 179 /*end-insert 180 */ 181 182 183 184 /************************************************************************ 185 *(h1 "Shared NFA Data Structures") 186 * 187 * 188 * We have a structure type which represents a cached NFA: 189 */ 190 191 /*(c #s"struct rx_unfa" :category type) 192 * struct rx_unfa; 193 * 194 * This structure holds a regexp expression tree and an NFA for that 195 * expression. There is at most one of these structures for each 196 * expression. 197 * 198 insert*/ 199 struct rx_unfa 200 { 201 int refs; /* A reference count. */ 202 struct rx_exp_node * exp; /* A queue of regexps with the same NFA. */ 203 struct rx_nfa * nfa; /* The NFA. */ 204 }; 205 /*end-insert 206 */ 207 208 209 /* automatically generated __STDC__ prototypes */ 210 extern int rx_build_nfa (struct rx_nfa *rx, 211 struct rx_exp_node *rexp, 212 struct rx_nfa_state **start, 213 struct rx_nfa_state **end); 214 extern void rx__nfa_cache_statistics (size_t * threshold, 215 size_t * failure_pt, 216 size_t * in_use, 217 size_t * high_water_mark, 218 int * hits, 219 int * misses, 220 int * saves); 221 extern struct rx_unfa * rx_unfa (struct rx_exp_node * exp, 222 int cset_size); 223 extern void rx_save_unfa (struct rx_unfa * unfa); 224 extern void rx_free_unfa (struct rx_unfa * unfa); 225 extern int rx__really_free_unfa (void); 226 extern struct rx_nfa * rx_nfa_xalloc (int cset_size); 227 extern void rx_free_nfa (struct rx_nfa * rx); 228 extern struct rx_nfa_state * rx_nfa_state (struct rx_nfa *rx); 229 extern struct rx_nfa_edge * rx_nfa_edge (struct rx_nfa *rx, 230 enum rx_nfa_etype type, 231 struct rx_nfa_state *start, 232 struct rx_nfa_state *dest); 233 extern struct rx_nfa_edge * rx_nfa_cset_edge (struct rx_nfa *rx, 234 enum rx_nfa_etype type, 235 bits cset, 236 struct rx_nfa_state *start, 237 struct rx_nfa_state *dest); 238 extern void rx_set_start_state (struct rx_nfa * rx, struct rx_nfa_state * n); 239 extern void rx_set_state_label (struct rx_nfa * rx, 240 struct rx_nfa_state * n, 241 int label); 242 extern struct rx_nfa_state_set * rx_state_closure (struct rx_nfa * rx, 243 struct rx_nfa_state * n); 244 #endif /* INCLUDE__RX__NFA_H */ 245