1 /* dfa.c - functions that manipulate regexps as DFAs
2 *
3 ****************************************************************
4 * Copyright (C) 1998, 2000 Thomas Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/mem/mem.h"
14 #include "hackerlab/rx/escape.h"
15 #include "hackerlab/rx/dfa.h"
16
17
18 /************************************************************************
19 *(h0 "DFA String Comparisons"
20 * :includes ("rx/dfa.h"))
21 *
22 *
23 * A common use for regular expressions is to compile them to DFA and
24 * compare them to strings using a loop that advances through DFA
25 * states.
26 *
27 * In Rx, the DFA data structure has been heavily optimized for such
28 * loops. The functions in this chapter implement the most common
29 * kinds of DFA loop, taking full advantage of the Rx optimizations.
30 */
31 /*(menu)
32 */
33
34
35 /*(include-documentation "dfa.h")
36 */
37
38 /************************************************************************
39 *(h1 "DFA Allocation Functions")
40 *
41 */
42
43 /*(c rx_dfa_alloc)
44 * struct rx_dfa * rx_dfa_alloc (alloc_limits limits);
45 *
46 * Allocate a new DFA, initially not pointing to any NFA or DFA state.
47 *
48 */
49 struct rx_dfa *
rx_dfa_alloc(alloc_limits limits)50 rx_dfa_alloc (alloc_limits limits)
51 {
52 struct rx_dfa * a;
53 a = (struct rx_dfa *)lim_malloc (limits, sizeof (*a));
54 mem_set0 ((t_uchar *)a, sizeof (*a));
55 return a;
56 }
57
58
59 /*(c rx_dfa_free)
60 * void rx_free_dfa (alloc_limits limits, struct rx_dfa * dfa);
61 *
62 * Release all storage associated with `dfa'.
63 */
64 void
rx_dfa_free(alloc_limits limits,struct rx_dfa * dfa)65 rx_dfa_free (alloc_limits limits, struct rx_dfa * dfa)
66 {
67 if (dfa->rx)
68 rx_clear_dfa_state (dfa);
69 lim_free (limits, dfa);
70 }
71
72
73
74 /************************************************************************
75 *(h1 "Initializing a DFA")
76 *
77 */
78
79
80 /*(c rx_init_dfa_from_nfa)
81 * void rx_init_dfa_from_nfa (struct rx_dfa * frame, struct rx_nfa * rx);
82 *
83 * Make `rx' the NFA of DFA machine `frame'.
84 *
85 * The DFA machine must not already have a current DFA state (must be
86 * in state 0). See xref:"rx_clear_dfa_state".
87 *
88 * This function sets the DFA used by `frame', but does not set the
89 * current DFA state of `frame'.
90 */
91 void
rx_init_dfa_from_nfa(struct rx_dfa * frame,struct rx_nfa * rx)92 rx_init_dfa_from_nfa (struct rx_dfa * frame, struct rx_nfa * rx)
93 {
94 frame->rx = rx;
95 frame->state = 0;
96 frame->final_tag = 0;
97 }
98
99
100 /*(c rx_init_dfa_from_dfa)
101 * void rx_init_dfa_from_dfa (struct rx_dfa * dest, struct rx_dfa * src);
102 *
103 * Make the NFA and DFA state of `src' the NFA and DFA state of
104 * `dest'. After a call to this function (which is inexpensive), you
105 * have two DFA machines in equivalent states.
106 *
107 * The DFA machine `dest' must not already have a current DFA state
108 * (`state' must be in state 0). See xref:"rx_clear_dfa_state".
109 */
110 void
rx_init_dfa_from_dfa(struct rx_dfa * dest,struct rx_dfa * src)111 rx_init_dfa_from_dfa (struct rx_dfa * dest, struct rx_dfa * src)
112 {
113 dest->rx = src->rx;
114 dest->state = src->state;
115 dest->final_tag = src->final_tag;
116 if (dest->state)
117 rx_lock_superstate (dest->rx, dest->state);
118 }
119
120
121 /*(c rx_clear_dfa_state)
122 * void rx_clear_dfa_state (struct rx_dfa * frame);
123 *
124 * Clear the DFA state of DFA machine `frame'.
125 */
126 void
rx_clear_dfa_state(struct rx_dfa * frame)127 rx_clear_dfa_state (struct rx_dfa * frame)
128 {
129 if (frame->state)
130 {
131 rx_unlock_superstate (frame->rx, frame->state);
132 frame->state = 0;
133 frame->final_tag = 0;
134 }
135 }
136
137 /* rx_dfa_goto_start_superstate
138 *
139 * Return (or initialize) a DFA to its start state.
140 */
141
142 /*(c rx_dfa_goto_start_superstate)
143 * void rx_dfa_goto_start_superstate (struct rx_dfa * frame,
144 * int storage_unit_size);
145 *
146 * Return state machine `frame' to its starting state.
147 *
148 * The starting state is a DFA state built from the epsilon closure of
149 * the starting state of the NFA. See xref:"rx_set_start_state".
150 */
151 int
rx_dfa_goto_start_superstate(struct rx_dfa * frame,int storage_unit_size)152 rx_dfa_goto_start_superstate (struct rx_dfa * frame,
153 int storage_unit_size)
154 {
155 struct rx_superstate * start_state;
156 start_state = rx_nfa_state_to_superstate (frame->rx, frame->rx->start_nfa_state, storage_unit_size);
157 if (!start_state)
158 return -1;
159 if (frame->state)
160 rx_unlock_superstate (frame->rx, frame->state);
161 frame->state = start_state;
162 frame->final_tag = start_state->members->state_label;
163 rx_lock_superstate (frame->rx, frame->state);
164 return 0;
165 }
166
167
168
169 /************************************************************************
170 *(h1 "DFA Comparison Functions")
171 *
172 * The functions in this section compare an input string to a regular
173 * expression by advancing through DFA states according to the
174 * characters in the input string.
175 */
176
177 /*(c rx_dfa_can_continue)
178 * int rx_dfa_can_continue (struct rx_dfa * frame);
179 *
180 * Return a non-zero value if there exist characters for which the
181 * current state of DFA machine `frame' has transitions defined. If
182 * this function returns 0, then the machine is in a dead-end state.
183 */
184 int
rx_dfa_can_continue(struct rx_dfa * frame)185 rx_dfa_can_continue (struct rx_dfa * frame)
186 {
187 return ( frame->state
188 && frame->state->members->has_cset_edges);
189 }
190
191
192 int
rx_dfa_tag(struct rx_dfa * frame)193 rx_dfa_tag (struct rx_dfa * frame)
194 {
195 return frame->final_tag;
196 }
197
198 /*(c rx_dfa_fits)
199 * int rx_dfa_fits (int * label,
200 * struct rx_dfa * frame,
201 * const t_uchar * burst,
202 * size_t len);
203 *
204 *
205 * Compare a DFA to string: is the entire string matched by the DFA?
206 * Return -1 if an error occurs, 0 otherwise.
207 *
208 * The final state label reached is returned in `*label': 0 if
209 * if the string does not match, non-0 if it does.
210 *
211 * It is possible to asynchronously abort a call to this function.
212 * See xref:"Exiting Long-running Matches".
213 */
214 int
rx_dfa_fits(int * label,struct rx_dfa * frame,const t_uchar * burst,size_t len)215 rx_dfa_fits (int * label,
216 struct rx_dfa * frame,
217 const t_uchar * burst,
218 size_t len)
219 {
220 int adv;
221 adv = rx_dfa_advance (frame, burst, len);
222 if (adv < 0)
223 return -1;
224 else if (!adv)
225 {
226 *label = 0;
227 return 0;
228 }
229 else
230 {
231 *label = frame->final_tag;
232 return 0;
233 }
234 }
235
236
237
238
239 /*(c rx_dfa_advance)
240 * int rx_dfa_advance (struct rx_dfa * frame,
241 * const t_uchar * burst,
242 * size_t len);
243 *
244 *
245 * Advance a DFA, reading characters from the input string. Stop at
246 * the end of the string, returning 1 or when a character is
247 * encountered for which no transition is defined, returning 0.
248 * -1 == ESPACE.
249 *
250 * This is similar to `rx_dfa_fits', except that in this case, we
251 * don't care about the state label of the final state.
252 *
253 * It is possible to asynchronously abort a call to this function.
254 * See xref:"Exiting Long-running Matches".
255 */
256 int
rx_dfa_advance(struct rx_dfa * frame,const t_uchar * burst,size_t len)257 rx_dfa_advance (struct rx_dfa * frame,
258 const t_uchar * burst,
259 size_t len)
260 {
261 rx_transition_table inx_table;
262
263 if (!len)
264 return 1;
265
266 inx_table = frame->state->transitions;
267 rx_unlock_superstate (frame->rx, frame->state);
268 frame->state = 0;
269
270 while (len--)
271 {
272 struct rx_inx * inx;
273 rx_transition_table next_table;
274
275 if (rx_poll)
276 (*rx_poll)();
277
278 inx = rx_transition8 (inx_table, *burst);
279 next_table = (rx_transition_table)inx->data;
280 while (!next_table)
281 {
282 struct rx_superstate * state;
283 state = rx_transitions_to_suprestate (inx_table);
284
285 switch ((long)inx->inx)
286 {
287 case rx_backtrack:
288 /* RX_BACKTRACK means that we've reached the empty
289 * superstate, indicating that match can't succeed
290 * from this point.
291 */
292 frame->state = 0;
293 frame->final_tag = 0;
294 return 0;
295
296 case rx_cache_miss:
297 /* Because the superstate NFA is lazily constructed,
298 * and in fact may erode from underneath us, we sometimes
299 * have to construct the next instruction from the hard way.
300 * This invokes one step in the lazy-conversion.
301 */
302 inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
303 if (!inx)
304 {
305 frame->state = 0;
306 frame->final_tag = 0;
307 return -1;
308 }
309 next_table = (rx_transition_table)inx->data;
310 continue;
311
312
313 /* No other instructions are legal here.
314 */
315 default:
316 panic ("unrecognized instruction in rx_dfa_advance");
317 }
318 }
319 inx_table = next_table;
320 ++burst;
321 }
322
323 frame->state = rx_transitions_to_suprestate (inx_table);
324 frame->final_tag = frame->state->members->state_label;
325 rx_lock_superstate (frame->rx, frame->state);
326 return 1;
327 }
328
329
330 /*(c rx_dfa_advance_to_final)
331 * size_t rx_dfa_advance_to_final (struct rx_dfa * frame,
332 * const t_uchar * burst,
333 * size_t len);
334 *
335 * Advance a DFA, reading characters from a string.
336 *
337 * Stop at the end of the string, a character with no transition, or
338 * when a superstate is encountered with a non-0 label. Return the
339 * number of characters read from the string.
340 *
341 * 0 == backtrack, 1 == found final state or ran out of characters, -1 == ESPACE
342 *
343 * This function stops on a transition *into* a state with a non-0
344 * state label. It doesn't matter if the machine is initially in a
345 * state with a non-0 label: the machine will consume the first input
346 * character regardless. That means that if your regular expression
347 * can match the empty string, you must detect this condition before
348 * calling `rx_dfa_advance_to_final' by checking `dfa->final_tag'
349 * after setting the start state of the DFA.
350 *
351 * If the match stopped in a final state, `dfa->final_tag' contains
352 * the non-0 state label of the final state, otherwise, it contains 0.
353 * If the match stopped on an illegal character, `dfa->state' is 0,
354 * otherwise it is non-0.
355 *
356 * It is possible to asynchronously abort a call to this function.
357 * See xref:"Exiting Long-running Matches".
358 */
359 int
rx_dfa_advance_to_final(size_t * amt,struct rx_dfa * frame,const t_uchar * burst,size_t len)360 rx_dfa_advance_to_final (size_t * amt,
361 struct rx_dfa * frame,
362 const t_uchar * burst,
363 size_t len)
364 {
365 size_t initial_len;
366 rx_transition_table inx_table;
367
368 if (!len)
369 {
370 *amt = 0;
371 return 1;
372 }
373
374 initial_len = len;
375 inx_table = frame->state->transitions;
376 rx_unlock_superstate (frame->rx, frame->state);
377 frame->state = 0;
378
379 while (len--)
380 {
381 struct rx_inx * inx;
382 rx_transition_table next_table;
383
384 if (rx_poll)
385 (*rx_poll)();
386
387 inx = rx_transition8 (inx_table, *burst);
388 next_table = (rx_transition_table)inx->data;
389
390 while (!next_table)
391 {
392 struct rx_superstate * state;
393
394 state = rx_transitions_to_suprestate (inx_table);
395
396 switch ((long)inx->inx)
397 {
398 case rx_backtrack:
399 /* RX_BACKTRACK means that we've reached the empty
400 * superstate, indicating that match can't succeed
401 * from this point.
402 */
403 frame->state = 0;
404 frame->final_tag = 0;
405 *amt = (initial_len - len) - 1;
406 return 0;
407
408 case rx_cache_miss:
409 /* Because the superstate NFA is lazily constructed,
410 * and in fact may erode from underneath us, we sometimes
411 * have to construct the next instruction from the hard way.
412 * This invokes one step in the lazy-conversion.
413 */
414 inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
415 if (!inx)
416 {
417 frame->state = 0;
418 frame->final_tag = 0;
419 return -1;
420 }
421 next_table = (rx_transition_table)inx->data;
422 continue;
423
424 /* No other instructions are legal here.
425 */
426 default:
427 while (1)
428 panic ("unrecognized instruction in rx_dfa_advance_to_final");
429 }
430 }
431
432 if (inx->data_2)
433 {
434 frame->state = rx_transitions_to_suprestate (next_table);
435 rx_lock_superstate (frame->rx, frame->state);
436 frame->final_tag = (long)inx->data_2;
437 *amt = (initial_len - len);
438 return 1;
439 }
440 inx_table = next_table;
441 ++burst;
442 }
443
444 /* Consumed all of the characters. */
445 frame->state = rx_transitions_to_suprestate (inx_table);
446 rx_lock_superstate (frame->rx, frame->state);
447 frame->final_tag = 0;
448 *amt = initial_len;
449 return 1;
450 }
451
452