1 /* dfa.c - functions that manipulate regexps as DFAs
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/mem/mem.h"
14 #include "hackerlab/rx/escape.h"
15 #include "hackerlab/rx/dfa.h"
16 
17 
18 /************************************************************************
19  *(h0 "DFA String Comparisons"
20  *    :includes ("rx/dfa.h"))
21  *
22  *
23  * A common use for regular expressions is to compile them to DFA and
24  * compare them to strings using a loop that advances through DFA
25  * states.
26  *
27  * In Rx, the DFA data structure has been heavily optimized for such
28  * loops.  The functions in this chapter implement the most common
29  * kinds of DFA loop, taking full advantage of the Rx optimizations.
30  */
31 /*(menu)
32  */
33 
34 
35 /*(include-documentation "dfa.h")
36  */
37 
38 /************************************************************************
39  *(h1 "DFA Allocation Functions")
40  *
41  */
42 
43 /*(c rx_dfa_alloc)
44  * struct rx_dfa * rx_dfa_alloc (alloc_limits limits);
45  *
46  * Allocate a new DFA, initially not pointing to any NFA or DFA state.
47  *
48  */
49 struct rx_dfa *
rx_dfa_alloc(alloc_limits limits)50 rx_dfa_alloc (alloc_limits limits)
51 {
52   struct rx_dfa * a;
53   a = (struct rx_dfa *)lim_malloc (limits, sizeof (*a));
54   mem_set0 ((t_uchar *)a, sizeof (*a));
55   return a;
56 }
57 
58 
59 /*(c rx_dfa_free)
60  * void rx_free_dfa (alloc_limits limits, struct rx_dfa * dfa);
61  *
62  * Release all storage associated with `dfa'.
63  */
64 void
rx_dfa_free(alloc_limits limits,struct rx_dfa * dfa)65 rx_dfa_free (alloc_limits limits, struct rx_dfa * dfa)
66 {
67   if (dfa->rx)
68     rx_clear_dfa_state (dfa);
69   lim_free (limits, dfa);
70 }
71 
72 
73 
74 /************************************************************************
75  *(h1 "Initializing a DFA")
76  *
77  */
78 
79 
80 /*(c rx_init_dfa_from_nfa)
81  * void rx_init_dfa_from_nfa (struct rx_dfa * frame, struct rx_nfa * rx);
82  *
83  * Make `rx' the NFA of DFA machine `frame'.
84  *
85  * The DFA machine must not already have a current DFA state (must be
86  * in state 0).  See xref:"rx_clear_dfa_state".
87  *
88  * This function sets the DFA used by `frame', but does not set the
89  * current DFA state of `frame'.
90  */
91 void
rx_init_dfa_from_nfa(struct rx_dfa * frame,struct rx_nfa * rx)92 rx_init_dfa_from_nfa (struct rx_dfa * frame, struct rx_nfa * rx)
93 {
94   frame->rx = rx;
95   frame->state = 0;
96   frame->final_tag = 0;
97 }
98 
99 
100 /*(c rx_init_dfa_from_dfa)
101  * void rx_init_dfa_from_dfa (struct rx_dfa * dest, struct rx_dfa * src);
102  *
103  * Make the NFA and DFA state of `src' the NFA and DFA state of
104  * `dest'.  After a call to this function (which is inexpensive), you
105  * have two DFA machines in equivalent states.
106  *
107  * The DFA machine `dest' must not already have a current DFA state
108  * (`state' must be in state 0).  See xref:"rx_clear_dfa_state".
109  */
110 void
rx_init_dfa_from_dfa(struct rx_dfa * dest,struct rx_dfa * src)111 rx_init_dfa_from_dfa (struct rx_dfa * dest, struct rx_dfa * src)
112 {
113   dest->rx = src->rx;
114   dest->state = src->state;
115   dest->final_tag = src->final_tag;
116   if (dest->state)
117     rx_lock_superstate (dest->rx, dest->state);
118 }
119 
120 
121 /*(c rx_clear_dfa_state)
122  * void rx_clear_dfa_state (struct rx_dfa * frame);
123  *
124  * Clear the DFA state of DFA machine `frame'.
125  */
126 void
rx_clear_dfa_state(struct rx_dfa * frame)127 rx_clear_dfa_state (struct rx_dfa * frame)
128 {
129   if (frame->state)
130     {
131       rx_unlock_superstate (frame->rx, frame->state);
132       frame->state = 0;
133       frame->final_tag = 0;
134     }
135 }
136 
137 /* rx_dfa_goto_start_superstate
138  *
139  * Return (or initialize) a DFA to its start state.
140  */
141 
142 /*(c rx_dfa_goto_start_superstate)
143  * void rx_dfa_goto_start_superstate (struct rx_dfa * frame,
144  *                                    int storage_unit_size);
145  *
146  * Return state machine `frame' to its starting state.
147  *
148  * The starting state is a DFA state built from the epsilon closure of
149  * the starting state of the NFA.  See xref:"rx_set_start_state".
150  */
151 int
rx_dfa_goto_start_superstate(struct rx_dfa * frame,int storage_unit_size)152 rx_dfa_goto_start_superstate (struct rx_dfa * frame,
153 			      int storage_unit_size)
154 {
155   struct rx_superstate * start_state;
156   start_state = rx_nfa_state_to_superstate (frame->rx, frame->rx->start_nfa_state, storage_unit_size);
157   if (!start_state)
158     return -1;
159   if (frame->state)
160     rx_unlock_superstate (frame->rx, frame->state);
161   frame->state = start_state;
162   frame->final_tag = start_state->members->state_label;
163   rx_lock_superstate (frame->rx, frame->state);
164   return 0;
165 }
166 
167 
168 
169 /************************************************************************
170  *(h1 "DFA Comparison Functions")
171  *
172  * The functions in this section compare an input string to a regular
173  * expression by advancing through DFA states according to the
174  * characters in the input string.
175  */
176 
177 /*(c rx_dfa_can_continue)
178  * int rx_dfa_can_continue (struct rx_dfa * frame);
179  *
180  * Return a non-zero value if there exist characters for which the
181  * current state of DFA machine `frame' has transitions defined.  If
182  * this function returns 0, then the machine is in a dead-end state.
183  */
184 int
rx_dfa_can_continue(struct rx_dfa * frame)185 rx_dfa_can_continue (struct rx_dfa * frame)
186 {
187   return (   frame->state
188 	  && frame->state->members->has_cset_edges);
189 }
190 
191 
192 int
rx_dfa_tag(struct rx_dfa * frame)193 rx_dfa_tag (struct rx_dfa * frame)
194 {
195   return frame->final_tag;
196 }
197 
198 /*(c rx_dfa_fits)
199  * int rx_dfa_fits (int * label,
200  *                  struct rx_dfa * frame,
201  *                  const t_uchar * burst,
202  *                  size_t len);
203  *
204  *
205  * Compare a DFA to string: is the entire string matched by the DFA?
206  * Return -1 if an error occurs, 0 otherwise.
207  *
208  * The final state label reached is returned in `*label': 0 if
209  * if the string does not match, non-0 if it does.
210  *
211  * It is possible to asynchronously abort a call to this function.
212  * See xref:"Exiting Long-running Matches".
213  */
214 int
rx_dfa_fits(int * label,struct rx_dfa * frame,const t_uchar * burst,size_t len)215 rx_dfa_fits (int * label,
216 	     struct rx_dfa * frame,
217 	     const t_uchar * burst,
218 	     size_t len)
219 {
220   int adv;
221   adv = rx_dfa_advance (frame, burst, len);
222   if (adv < 0)
223     return -1;
224   else if (!adv)
225     {
226       *label = 0;
227       return 0;
228     }
229   else
230     {
231       *label = frame->final_tag;
232       return 0;
233     }
234 }
235 
236 
237 
238 
239 /*(c rx_dfa_advance)
240  * int rx_dfa_advance (struct rx_dfa * frame,
241  *		       const t_uchar * burst,
242  *		       size_t len);
243  *
244  *
245  * Advance a DFA, reading characters from the input string.  Stop at
246  * the end of the string, returning 1 or when a character is
247  * encountered for which no transition is defined, returning 0.
248  * -1 == ESPACE.
249  *
250  * This is similar to `rx_dfa_fits', except that in this case, we
251  * don't care about the state label of the final state.
252  *
253  * It is possible to asynchronously abort a call to this function.
254  * See xref:"Exiting Long-running Matches".
255  */
256 int
rx_dfa_advance(struct rx_dfa * frame,const t_uchar * burst,size_t len)257 rx_dfa_advance (struct rx_dfa * frame,
258 		const t_uchar * burst,
259 		size_t len)
260 {
261   rx_transition_table inx_table;
262 
263   if (!len)
264     return 1;
265 
266   inx_table = frame->state->transitions;
267   rx_unlock_superstate (frame->rx, frame->state);
268   frame->state = 0;
269 
270   while (len--)
271     {
272       struct rx_inx * inx;
273       rx_transition_table next_table;
274 
275       if (rx_poll)
276 	(*rx_poll)();
277 
278       inx = rx_transition8 (inx_table, *burst);
279       next_table = (rx_transition_table)inx->data;
280       while (!next_table)
281 	{
282 	  struct rx_superstate * state;
283 	  state = rx_transitions_to_suprestate (inx_table);
284 
285 	  switch ((long)inx->inx)
286 	    {
287 	    case rx_backtrack:
288 	      /* RX_BACKTRACK means that we've reached the empty
289 	       * superstate, indicating that match can't succeed
290 	       * from this point.
291 	       */
292 	      frame->state = 0;
293 	      frame->final_tag = 0;
294 	      return 0;
295 
296 	    case rx_cache_miss:
297 	      /* Because the superstate NFA is lazily constructed,
298 	       * and in fact may erode from underneath us, we sometimes
299 	       * have to construct the next instruction from the hard way.
300 	       * This invokes one step in the lazy-conversion.
301 	       */
302 	      inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
303 	      if (!inx)
304 		{
305 		  frame->state = 0;
306 		  frame->final_tag = 0;
307 		  return -1;
308 		}
309 	      next_table = (rx_transition_table)inx->data;
310 	      continue;
311 
312 
313 	      /* No other instructions are legal here.
314 	       */
315 	    default:
316 	      panic ("unrecognized instruction in rx_dfa_advance");
317 	  }
318 	}
319       inx_table = next_table;
320       ++burst;
321     }
322 
323   frame->state = rx_transitions_to_suprestate (inx_table);
324   frame->final_tag = frame->state->members->state_label;
325   rx_lock_superstate (frame->rx, frame->state);
326   return 1;
327 }
328 
329 
330 /*(c rx_dfa_advance_to_final)
331  * size_t rx_dfa_advance_to_final (struct rx_dfa * frame,
332  *				   const t_uchar * burst,
333  *				   size_t len);
334  *
335  * Advance a DFA, reading characters from a string.
336  *
337  * Stop at the end of the string, a character with no transition, or
338  * when a superstate is encountered with a non-0 label.  Return the
339  * number of characters read from the string.
340  *
341  * 0 == backtrack, 1 == found final state or ran out of characters, -1 == ESPACE
342  *
343  * This function stops on a transition *into* a state with a non-0
344  * state label.  It doesn't matter if the machine is initially in a
345  * state with a non-0 label: the machine will consume the first input
346  * character regardless.  That means that if your regular expression
347  * can match the empty string, you must detect this condition before
348  * calling `rx_dfa_advance_to_final' by checking `dfa->final_tag'
349  * after setting the start state of the DFA.
350  *
351  * If the match stopped in a final state, `dfa->final_tag' contains
352  * the non-0 state label of the final state, otherwise, it contains 0.
353  * If the match stopped on an illegal character, `dfa->state' is 0,
354  * otherwise it is non-0.
355  *
356  * It is possible to asynchronously abort a call to this function.
357  * See xref:"Exiting Long-running Matches".
358  */
359 int
rx_dfa_advance_to_final(size_t * amt,struct rx_dfa * frame,const t_uchar * burst,size_t len)360 rx_dfa_advance_to_final (size_t * amt,
361 			 struct rx_dfa * frame,
362 			 const t_uchar * burst,
363 			 size_t len)
364 {
365   size_t initial_len;
366   rx_transition_table inx_table;
367 
368   if (!len)
369     {
370       *amt = 0;
371       return 1;
372     }
373 
374   initial_len = len;
375   inx_table = frame->state->transitions;
376   rx_unlock_superstate (frame->rx, frame->state);
377   frame->state = 0;
378 
379   while (len--)
380     {
381       struct rx_inx * inx;
382       rx_transition_table next_table;
383 
384       if (rx_poll)
385 	(*rx_poll)();
386 
387       inx = rx_transition8 (inx_table, *burst);
388       next_table = (rx_transition_table)inx->data;
389 
390       while (!next_table)
391 	{
392 	  struct rx_superstate * state;
393 
394 	  state = rx_transitions_to_suprestate (inx_table);
395 
396 	  switch ((long)inx->inx)
397 	    {
398 	    case rx_backtrack:
399 	      /* RX_BACKTRACK means that we've reached the empty
400 	       * superstate, indicating that match can't succeed
401 	       * from this point.
402 	       */
403 	      frame->state = 0;
404 	      frame->final_tag = 0;
405 	      *amt = (initial_len - len) - 1;
406 	      return 0;
407 
408 	    case rx_cache_miss:
409 	      /* Because the superstate NFA is lazily constructed,
410 	       * and in fact may erode from underneath us, we sometimes
411 	       * have to construct the next instruction from the hard way.
412 	       * This invokes one step in the lazy-conversion.
413 	       */
414 	      inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
415 	      if (!inx)
416 		{
417 		  frame->state = 0;
418 		  frame->final_tag = 0;
419 		  return -1;
420 		}
421 	      next_table = (rx_transition_table)inx->data;
422 	      continue;
423 
424 	      /* No other instructions are legal here.
425 	       */
426 	    default:
427 	      while (1)
428 		panic ("unrecognized instruction in rx_dfa_advance_to_final");
429 	  }
430 	}
431 
432       if (inx->data_2)
433 	{
434 	  frame->state = rx_transitions_to_suprestate (next_table);
435 	  rx_lock_superstate (frame->rx, frame->state);
436 	  frame->final_tag = (long)inx->data_2;
437 	  *amt = (initial_len - len);
438 	  return 1;
439 	}
440       inx_table = next_table;
441       ++burst;
442     }
443 
444   /* Consumed all of the characters. */
445   frame->state = rx_transitions_to_suprestate (inx_table);
446   rx_lock_superstate (frame->rx, frame->state);
447   frame->final_tag = 0;
448   *amt = initial_len;
449   return 1;
450 }
451 
452