1 /* dfa-utf16.c - utf16 functions that manipulate regexps as DFAs
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/rx/escape.h"
14 #include "hackerlab/rx/super.h"
15 #include "hackerlab/rx/dfa-utf16.h"
16 
17 
18 /************************************************************************
19  *(h0 "DFA String Comparisons for 16-bit Character Storage Units"
20  *    :includes ("rx/dfa.h"
21  *		 "rx/dfa_utf16.h"))
22  *
23  *
24  * A common use for regular expressions is to compile them to DFA and
25  * compare them to strings using a loop that advances through DFA
26  * states.
27  *
28  * In Rx, the DFA data structure has been heavily optimized for such
29  * loops.  The functions in this chapter implement the most common
30  * kinds of DFA loop, taking full advantage of the Rx optimizations.
31  */
32 /*(menu)
33  */
34 
35 /************************************************************************
36  *(h1 "DFA Comparison Functions for 16-bit Character Storage Units")
37  *
38  * The functions in this section compare an input string to a regular
39  * expression by advancing through DFA states according to the
40  * characters in the input string.
41  */
42 
43 
44 /*(c rx_dfa_utf16_fits)
45  * int rx_dfa_utf16_fits (struct rx_dfa * frame,
46  *		      const t_uint16 * burst,
47  *		      size_t len);
48  *
49  *
50  * Compare a DFA to string: is the entire string matched by the DFA?
51  * Return a non-zero value (the state-label of the final DFA state) if
52  * the string matches, 0 otherwise.
53  *
54  * This function works by advancing the DFA through all of the
55  * characters in the input string and checking the state label of the
56  * last state reached.  If that label is not 0, then the string
57  * matches.  If that label is 0, or if an illegal input character is
58  * reached before the end of the input string, the string does not
59  * match.
60  *
61  * It is possible to asynchronously abort a call to this function.
62  * See xref:"Exiting Long-running Matches".
63  */
64 int
rx_dfa_utf16_fits(int * label,struct rx_dfa * frame,const t_uint16 * burst,size_t len)65 rx_dfa_utf16_fits (int * label,
66 		   struct rx_dfa * frame,
67 		   const t_uint16 * burst,
68 		   size_t len)
69 {
70   int adv;
71   adv = rx_dfa_utf16_advance (frame, burst, len);
72   if (adv < 0)
73     return -1;
74   else if (!adv)
75     {
76       *label = 0;
77       return 0;
78     }
79   else
80     {
81       *label = frame->final_tag;
82       return 0;
83     }
84 }
85 
86 
87 
88 
89 /*(c rx_dfa_utf16_advance)
90  * int rx_dfa_advance (struct rx_dfa * frame,
91  *		       const t_uint16 * burst,
92  *		       size_t len);
93  *
94  *
95  * Advance a DFA, reading characters from the input string.  Stop at
96  * the end of the string, returning 1 or when a character is
97  * encountered for which no transition is defined, returning 0.
98  *
99  * This is similar to `rx_dfa_fits', except that in this case, we
100  * don't care about the state label of the final state.
101  *
102  * It is possible to asynchronously abort a call to this function.
103  * See xref:"Exiting Long-running Matches".
104  */
105 int
rx_dfa_utf16_advance(struct rx_dfa * frame,const t_uint16 * burst,size_t len)106 rx_dfa_utf16_advance (struct rx_dfa * frame,
107 		      const t_uint16 * burst,
108 		      size_t len)
109 {
110   rx_transition_table inx_table;
111 
112   if (!len)
113     return 1;
114 
115   inx_table = frame->state->transitions;
116   rx_unlock_superstate (frame->rx, frame->state);
117   frame->state = 0;
118 
119   while (len--)
120     {
121       struct rx_inx * inx;
122       rx_transition_table next_table;
123 
124       if (rx_poll)
125 	(*rx_poll)();
126 
127       inx = rx_transition16 (inx_table, *burst);
128       next_table = (rx_transition_table)inx->data;
129       while (!next_table)
130 	{
131 	  struct rx_superstate * state;
132 	  state = rx_transitions_to_suprestate (inx_table);
133 
134 	  switch ((long)inx->inx)
135 	    {
136 	    case rx_huge_char:
137 	      {
138 		t_uint16 hi;
139 		t_uint16 lo;
140 		t_unicode c;
141 
142 		hi = *burst;
143 		if (!len)
144 		  goto handle_as_backtrack;
145 		--len;
146 		++burst;
147 		lo = *burst;
148 		if (!uni_is_low_surrogate (lo))
149 		  goto handle_as_backtrack;
150 
151 		c = uni_assemble_surrogates (hi, lo);
152 		inx = rx_transition21 (state->huge_char_transitions, c);
153 		next_table = (rx_transition_table)inx->data;
154 
155 		while (!next_table)
156 		  {
157 		    switch ((enum rx_opcode)inx->inx)
158 		      {
159 		      default:
160 		      case rx_huge_char:
161 			goto handle_by_panic;
162 		      case rx_backtrack:
163 			goto handle_as_backtrack;
164 		      case rx_cache_miss:
165 			inx = rx_handle_cache_miss (frame->rx, state, c, inx->data_2);
166 			if (!inx)
167 			  {
168 			    frame->state = 0;
169 			    frame->final_tag = 0;
170 			    return -1;
171 			  }
172 			next_table = (rx_transition_table)inx->data;
173 			continue;
174 		      }
175 		  }
176 		continue;
177 	      }
178 
179 	    case rx_backtrack:
180 	    handle_as_backtrack:
181 	      /* RX_BACKTRACK means that we've reached the empty
182 	       * superstate, indicating that match can't succeed
183 	       * from this point.
184 	       */
185 	      frame->state = 0;
186 	      frame->final_tag = 0;
187 	      return 0;
188 
189 	    case rx_cache_miss:
190 	      /* Because the superstate NFA is lazily constructed,
191 	       * and in fact may erode from underneath us, we sometimes
192 	       * have to construct the next instruction from the hard way.
193 	       * This invokes one step in the lazy-conversion.
194 	       */
195 	      inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
196 	      if (!inx)
197 		{
198 		  frame->state = 0;
199 		  frame->final_tag = 0;
200 		  return -1;
201 		}
202 	      next_table = (rx_transition_table)inx->data;
203 	      continue;
204 
205 
206 	      /* No other instructions are legal here.
207 	       */
208 	    default:
209 	    handle_by_panic:
210 	      panic ("unrecognized instruction in rx_dfa_advance");
211 	  }
212 	}
213       inx_table = next_table;
214       ++burst;
215     }
216 
217   frame->state = rx_transitions_to_suprestate (inx_table);
218   frame->final_tag = frame->state->members->state_label;
219   rx_lock_superstate (frame->rx, frame->state);
220   return 1;
221 }
222 
223 
224 /*(c rx_dfa_utf16_advance_to_final)
225  * size_t rx_dfa_utf16_advance_to_final (struct rx_dfa * frame,
226  *				     const t_uint16 * burst,
227  *				     size_t len);
228  *
229  * Advance a DFA, reading characters from a string.
230  *
231  * Stop at the end of the string, a character with no transition, or
232  * when a superstate is encountered with a non-0 label.  Return the
233  * number of characters read from the string.
234  *
235  * This function stops on a transition *into* a state with a non-0
236  * state label.  It doesn't matter if the machine is initially in a
237  * state with a non-0 label: the machine will consume the first input
238  * character regardless.  That means that if your regular expression
239  * can match the empty string, you must detect this condition before
240  * calling `rx_dfa_advance_to_final' by checking `dfa->final_tag'
241  * after setting the start state of the DFA.
242  *
243  * If the match stopped in a final state, `dfa->final_tag' contains
244  * the non-0 state label of the final state, otherwise, it contains 0.
245  * If the match stopped on an illegal character, `dfa->state' is 0,
246  * otherwise it is non-0.
247  *
248  * It is possible to asynchronously abort a call to this function.
249  * See xref:"Exiting Long-running Matches".
250  */
251 int
rx_dfa_utf16_advance_to_final(size_t * amt,struct rx_dfa * frame,const t_uint16 * burst,size_t len)252 rx_dfa_utf16_advance_to_final (size_t * amt,
253 			       struct rx_dfa * frame,
254 			       const t_uint16 * burst,
255 			       size_t len)
256 {
257   size_t initial_len;
258   rx_transition_table inx_table;
259 
260   if (!len)
261     {
262       *amt = 0;
263       return 1;
264     }
265 
266   initial_len = len;
267   inx_table = frame->state->transitions;
268   rx_unlock_superstate (frame->rx, frame->state);
269   frame->state = 0;
270 
271   while (len--)
272     {
273       struct rx_inx * inx;
274       rx_transition_table next_table;
275 
276       if (rx_poll)
277 	(*rx_poll)();
278 
279       inx = rx_transition16 (inx_table, *burst);
280       next_table = (rx_transition_table)inx->data;
281 
282       while (!next_table)
283 	{
284 	  struct rx_superstate * state;
285 
286 	  state = rx_transitions_to_suprestate (inx_table);
287 
288 	  switch ((enum rx_opcode)inx->inx)
289 	    {
290 	    case rx_huge_char:
291 	      {
292 		t_uint16 hi;
293 		t_uint16 lo;
294 		t_unicode c;
295 
296 		hi = *burst;
297 		if (!len)
298 		  goto handle_as_backtrack;
299 		--len;
300 		++burst;
301 		lo = *burst;
302 		if (!uni_is_low_surrogate (lo))
303 		  goto handle_as_backtrack;
304 
305 		c = uni_assemble_surrogates (hi, lo);
306 		inx = rx_transition21 (state->huge_char_transitions, c);
307 		next_table = (rx_transition_table)inx->data;
308 
309 		while (!next_table)
310 		  {
311 		    switch ((enum rx_opcode)inx->inx)
312 		      {
313 		      default:
314 		      case rx_huge_char:
315 			goto handle_by_panic;
316 		      case rx_backtrack:
317 			goto handle_as_backtrack;
318 		      case rx_cache_miss:
319 			inx = rx_handle_cache_miss (frame->rx, state, c, inx->data_2);
320 			if (!inx)
321 			  {
322 			    frame->state = 0;
323 			    frame->final_tag = 0;
324 			    return -1;
325 			  }
326 			next_table = (rx_transition_table)inx->data;
327 			continue;
328 		      }
329 		  }
330 		continue;
331 	      }
332 
333 	    case rx_backtrack:
334 	    handle_as_backtrack:
335 	      /* RX_BACKTRACK means that we've reached the empty
336 	       * superstate, indicating that match can't succeed
337 	       * from this point.
338 	       */
339 	      frame->state = 0;
340 	      frame->final_tag = 0;
341 	      *amt = (initial_len - len) - 1;
342 	      return 0;
343 
344 	    case rx_cache_miss:
345 	      /* Because the superstate NFA is lazily constructed,
346 	       * and in fact may erode from underneath us, we sometimes
347 	       * have to construct the next instruction from the hard way.
348 	       * This invokes one step in the lazy-conversion.
349 	       */
350 	      inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
351 	      if (!inx)
352 		{
353 		  frame->state = 0;
354 		  frame->final_tag = 0;
355 		  return -1;
356 		}
357 	      next_table = (rx_transition_table)inx->data;
358 	      continue;
359 
360 	      /* No other instructions are legal here.
361 	       */
362 	    default:
363 	    handle_by_panic:
364 	      while (1)
365 		panic ("unrecognized instruction in rx_dfa_advance_to_final");
366 	  }
367 	}
368 
369       if (inx->data_2)
370 	{
371 	  frame->state = rx_transitions_to_suprestate (next_table);
372 	  rx_lock_superstate (frame->rx, frame->state);
373 	  frame->final_tag = (long)inx->data_2;
374 	  *amt = (initial_len - len);
375 	  return 1;
376 	}
377       inx_table = next_table;
378       ++burst;
379     }
380 
381   /* Consumed all of the characters. */
382   frame->state = rx_transitions_to_suprestate (inx_table);
383   rx_lock_superstate (frame->rx, frame->state);
384   frame->final_tag = 0;
385   *amt = initial_len;
386   return 0;
387 }
388 
389