1 /* nfa.h - decls for manipulating regexps as NFAs
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 #ifndef INCLUDE__RX__NFA_H
12 #define INCLUDE__RX__NFA_H
13 
14 struct rx_nfa;
15 struct rx_nfa_state;
16 struct rx_superset;
17 
18 #include "hackerlab/hash/hashtree.h"
19 #include "hackerlab/bitsets/bits.h"
20 #include "hackerlab/rx/bits-tree-rules.h"
21 #include "hackerlab/rx/tree.h"
22 #include "hackerlab/rx/nfa-cache.h"
23 #include "hackerlab/rx/super.h"
24 
25 
26 /************************************************************************
27  *(h1 "The NFA Data Types")
28  *
29  *
30  *
31  */
32 
33 /*(c #s"struct rx_nfa" :category type)
34  *
35  * This structure type is used to hold a non-deterministic
36  * finite automata.
37  *
38  insert*/
39 struct rx_nfa
40 {
41   unsigned long rx_id;
42   /*	Each NFA has a unique ID assigned by `rx_nfa_xalloc'. */
43 
44   int local_cset_size;
45   /* 	Either 256 (8-bit ascii) or 1<<21 (Unicode). */
46 
47   struct bits_tree_rule * bitset_rule;
48 
49   struct rx_nfa_state * nfa_states;
50   /* 	The list of states in this NFA, linked by the field `next'.
51     	This is filled in by `rx_nfa_state'. */
52 
53   int nfa_state_id;
54   /*   	NFA states are assigned sequential ids.  This is the id of the
55      	next state.*/
56 
57   struct rx_nfa_state * start_nfa_state;
58   /* 	The starting state of the NFA.  This field must be filled in
59  	by a call to `rx_set_start_state' once the start state has
60  	been created. */
61 
62   struct hashtree set_list_memo;
63   /*	A hash table used to allocate NFA state sets with the property
64 	that equal sets are `=='. */
65 };
66 /*end-insert
67  */
68 
69 
70 /*(c #s"struct rx_nfa_state" :category type)
71  *
72  * This structure type holds one state of an NFA.
73  *
74  insert*/
75 struct rx_nfa_state
76 {
77   unsigned int id;
78   /*    NFA states are assigned sequential ids.*/
79 
80   long state_label;
81   /* 	Each state has a label.  If the label is non-0, the state is a
82 	final state.  In the DFA, a superstate is given a state label
83 	which is the smallest magnitude of the non-0 state labels of
84 	the NFA states it contains.  If all of the NFA states have a
85 	state label 0, then so does the DFA state. */
86 
87   struct rx_nfa_edge *edges;
88   /*  	A list of NFA edges originating in this state.
89 	The edges are linked by the field `next'. */
90 
91   t_uchar is_start;
92   /*	Set to `1' by `rx_set_start_state' and is otherwise 0.*/
93 
94   t_uchar has_cset_edges;
95   /*	Set to `1' by `rx_nfa_edge' and is otherwise 0.  This is used
96 	during DFA matching to recognize dead-end states.*/
97 
98   t_uchar closure_computed;
99   /*   	Set to `1' by `rx_state_closure' and is otherwise 0.*/
100 
101   struct rx_nfa_state_set * closure;
102   /*	This is set to the state's epsilon closure by
103 	`rx_state_closure' and is otherwise 0.*/
104 
105   struct rx_superset * superstate_set;
106   /* 	This field is set by `rx_nfa_state_to_superstate' to cache
107     	the superstate set of the eclosure of this NFA state.
108 
109 	This field does not have a reference count to the set.
110 	Instead, if the set is freed, this field is set to 0.
111 
112 	The superstate set points back to this state in the field
113 	`nfa_state'. */
114 
115   t_uchar mark;
116   /*   	This is used by various graph traversal algorithms. */
117 
118   struct rx_nfa_state *next;
119   /*	This link is to the next state of the same NFA in the list
120  	of all states starting at `nfa->nfa_states'. */
121 };
122 /*end-insert
123  */
124 
125 
126 /*(c #s"struct rx_nfa_edge" :other-terms "enum rx_nfa_etype" :category type)
127  * enum rx_nfa_etype;
128  * struct rx_nfa_edge;
129  *
130  * This structure type holds one edge of an NFA.
131  *
132  insert*/
133 
134 /* enum rx_nfa_etype;
135  *
136  * There are two types of NFA edges: character sets and "epsilon".  An
137  * epsilon edge represents a transition that can be taken immediately
138  * whenever the source state is reached.  A character set transition
139  * can be taken only by matching character in the set.
140  */
141 enum rx_nfa_etype
142 {
143   ne_cset,
144   ne_epsilon
145 };
146 
147 /* struct rx_nfa_edge;
148  *
149  * One NFA edge.
150  */
151 struct rx_nfa_edge
152 {
153   enum rx_nfa_etype type;	/* Which type of edge? */
154   struct rx_nfa_state *dest;	/* What is the destination state? */
155 
156   bits cset;
157   /*	If the edge is a character set edge (`ne_cset'),
158 	this is the set of characters it matches. */
159 
160   struct rx_nfa_edge *next;
161   /*	Next edge for the same source node. */
162 };
163 /*end-insert
164  */
165 
166 /*(c #s"struct rx_nfa_state_set" :category type)
167  *
168  * This structure type holds a set of NFA states, represented as a
169  * list.  It is used to represent the epsilon closure of an NFA node.
170  * Any two NFA sets with the same elements are equal in the sense of
171  * `==' if they were returned by `rx_state_closure'.
172  *
173  insert*/
174 struct rx_nfa_state_set
175 {
176   struct rx_nfa_state * car;
177   struct rx_nfa_state_set * cdr;
178 };
179 /*end-insert
180  */
181 
182 
183 
184 /************************************************************************
185  *(h1 "Shared NFA Data Structures")
186  *
187  *
188  * We have a structure type which represents a cached NFA:
189  */
190 
191 /*(c #s"struct rx_unfa" :category type)
192  * struct rx_unfa;
193  *
194  * This structure holds a regexp expression tree and an NFA for that
195  * expression.  There is at most one of these structures for each
196  * expression.
197  *
198  insert*/
199 struct rx_unfa
200 {
201   int refs;			/* A reference count. */
202   struct rx_exp_node * exp;	/* A queue of regexps with the same NFA. */
203   struct rx_nfa * nfa;		/* The NFA. */
204 };
205 /*end-insert
206  */
207 
208 
209 /* automatically generated __STDC__ prototypes */
210 extern int rx_build_nfa (struct rx_nfa *rx,
211 			 struct rx_exp_node *rexp,
212 			 struct rx_nfa_state **start,
213 			 struct rx_nfa_state **end);
214 extern void rx__nfa_cache_statistics (size_t * threshold,
215 				      size_t * failure_pt,
216 				      size_t * in_use,
217 				      size_t * high_water_mark,
218 				      int * hits,
219 				      int * misses,
220 				      int * saves);
221 extern struct rx_unfa * rx_unfa (struct rx_exp_node * exp,
222 				 int cset_size);
223 extern void rx_save_unfa (struct rx_unfa * unfa);
224 extern void rx_free_unfa (struct rx_unfa * unfa);
225 extern int rx__really_free_unfa (void);
226 extern struct rx_nfa * rx_nfa_xalloc (int cset_size);
227 extern void rx_free_nfa (struct rx_nfa * rx);
228 extern struct rx_nfa_state * rx_nfa_state (struct rx_nfa *rx);
229 extern struct rx_nfa_edge * rx_nfa_edge (struct rx_nfa *rx,
230 					 enum rx_nfa_etype type,
231 					 struct rx_nfa_state *start,
232 					 struct rx_nfa_state *dest);
233 extern struct rx_nfa_edge * rx_nfa_cset_edge (struct rx_nfa *rx,
234 					      enum rx_nfa_etype type,
235 					      bits cset,
236 					      struct rx_nfa_state *start,
237 					      struct rx_nfa_state *dest);
238 extern void rx_set_start_state (struct rx_nfa * rx, struct rx_nfa_state * n);
239 extern void rx_set_state_label (struct rx_nfa * rx,
240 				struct rx_nfa_state * n,
241 				int label);
242 extern struct rx_nfa_state_set * rx_state_closure (struct rx_nfa * rx,
243 						   struct rx_nfa_state * n);
244 #endif  /* INCLUDE__RX__NFA_H */
245