1 /* dfa-utf16.c - utf16 functions that manipulate regexps as DFAs
2 *
3 ****************************************************************
4 * Copyright (C) 1998, 2000 Thomas Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/rx/escape.h"
14 #include "hackerlab/rx/super.h"
15 #include "hackerlab/rx/dfa-utf16.h"
16
17
18 /************************************************************************
19 *(h0 "DFA String Comparisons for 16-bit Character Storage Units"
20 * :includes ("rx/dfa.h"
21 * "rx/dfa_utf16.h"))
22 *
23 *
24 * A common use for regular expressions is to compile them to DFA and
25 * compare them to strings using a loop that advances through DFA
26 * states.
27 *
28 * In Rx, the DFA data structure has been heavily optimized for such
29 * loops. The functions in this chapter implement the most common
30 * kinds of DFA loop, taking full advantage of the Rx optimizations.
31 */
32 /*(menu)
33 */
34
35 /************************************************************************
36 *(h1 "DFA Comparison Functions for 16-bit Character Storage Units")
37 *
38 * The functions in this section compare an input string to a regular
39 * expression by advancing through DFA states according to the
40 * characters in the input string.
41 */
42
43
44 /*(c rx_dfa_utf16_fits)
45 * int rx_dfa_utf16_fits (struct rx_dfa * frame,
46 * const t_uint16 * burst,
47 * size_t len);
48 *
49 *
50 * Compare a DFA to string: is the entire string matched by the DFA?
51 * Return a non-zero value (the state-label of the final DFA state) if
52 * the string matches, 0 otherwise.
53 *
54 * This function works by advancing the DFA through all of the
55 * characters in the input string and checking the state label of the
56 * last state reached. If that label is not 0, then the string
57 * matches. If that label is 0, or if an illegal input character is
58 * reached before the end of the input string, the string does not
59 * match.
60 *
61 * It is possible to asynchronously abort a call to this function.
62 * See xref:"Exiting Long-running Matches".
63 */
64 int
rx_dfa_utf16_fits(int * label,struct rx_dfa * frame,const t_uint16 * burst,size_t len)65 rx_dfa_utf16_fits (int * label,
66 struct rx_dfa * frame,
67 const t_uint16 * burst,
68 size_t len)
69 {
70 int adv;
71 adv = rx_dfa_utf16_advance (frame, burst, len);
72 if (adv < 0)
73 return -1;
74 else if (!adv)
75 {
76 *label = 0;
77 return 0;
78 }
79 else
80 {
81 *label = frame->final_tag;
82 return 0;
83 }
84 }
85
86
87
88
89 /*(c rx_dfa_utf16_advance)
90 * int rx_dfa_advance (struct rx_dfa * frame,
91 * const t_uint16 * burst,
92 * size_t len);
93 *
94 *
95 * Advance a DFA, reading characters from the input string. Stop at
96 * the end of the string, returning 1 or when a character is
97 * encountered for which no transition is defined, returning 0.
98 *
99 * This is similar to `rx_dfa_fits', except that in this case, we
100 * don't care about the state label of the final state.
101 *
102 * It is possible to asynchronously abort a call to this function.
103 * See xref:"Exiting Long-running Matches".
104 */
105 int
rx_dfa_utf16_advance(struct rx_dfa * frame,const t_uint16 * burst,size_t len)106 rx_dfa_utf16_advance (struct rx_dfa * frame,
107 const t_uint16 * burst,
108 size_t len)
109 {
110 rx_transition_table inx_table;
111
112 if (!len)
113 return 1;
114
115 inx_table = frame->state->transitions;
116 rx_unlock_superstate (frame->rx, frame->state);
117 frame->state = 0;
118
119 while (len--)
120 {
121 struct rx_inx * inx;
122 rx_transition_table next_table;
123
124 if (rx_poll)
125 (*rx_poll)();
126
127 inx = rx_transition16 (inx_table, *burst);
128 next_table = (rx_transition_table)inx->data;
129 while (!next_table)
130 {
131 struct rx_superstate * state;
132 state = rx_transitions_to_suprestate (inx_table);
133
134 switch ((long)inx->inx)
135 {
136 case rx_huge_char:
137 {
138 t_uint16 hi;
139 t_uint16 lo;
140 t_unicode c;
141
142 hi = *burst;
143 if (!len)
144 goto handle_as_backtrack;
145 --len;
146 ++burst;
147 lo = *burst;
148 if (!uni_is_low_surrogate (lo))
149 goto handle_as_backtrack;
150
151 c = uni_assemble_surrogates (hi, lo);
152 inx = rx_transition21 (state->huge_char_transitions, c);
153 next_table = (rx_transition_table)inx->data;
154
155 while (!next_table)
156 {
157 switch ((enum rx_opcode)inx->inx)
158 {
159 default:
160 case rx_huge_char:
161 goto handle_by_panic;
162 case rx_backtrack:
163 goto handle_as_backtrack;
164 case rx_cache_miss:
165 inx = rx_handle_cache_miss (frame->rx, state, c, inx->data_2);
166 if (!inx)
167 {
168 frame->state = 0;
169 frame->final_tag = 0;
170 return -1;
171 }
172 next_table = (rx_transition_table)inx->data;
173 continue;
174 }
175 }
176 continue;
177 }
178
179 case rx_backtrack:
180 handle_as_backtrack:
181 /* RX_BACKTRACK means that we've reached the empty
182 * superstate, indicating that match can't succeed
183 * from this point.
184 */
185 frame->state = 0;
186 frame->final_tag = 0;
187 return 0;
188
189 case rx_cache_miss:
190 /* Because the superstate NFA is lazily constructed,
191 * and in fact may erode from underneath us, we sometimes
192 * have to construct the next instruction from the hard way.
193 * This invokes one step in the lazy-conversion.
194 */
195 inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
196 if (!inx)
197 {
198 frame->state = 0;
199 frame->final_tag = 0;
200 return -1;
201 }
202 next_table = (rx_transition_table)inx->data;
203 continue;
204
205
206 /* No other instructions are legal here.
207 */
208 default:
209 handle_by_panic:
210 panic ("unrecognized instruction in rx_dfa_advance");
211 }
212 }
213 inx_table = next_table;
214 ++burst;
215 }
216
217 frame->state = rx_transitions_to_suprestate (inx_table);
218 frame->final_tag = frame->state->members->state_label;
219 rx_lock_superstate (frame->rx, frame->state);
220 return 1;
221 }
222
223
224 /*(c rx_dfa_utf16_advance_to_final)
225 * size_t rx_dfa_utf16_advance_to_final (struct rx_dfa * frame,
226 * const t_uint16 * burst,
227 * size_t len);
228 *
229 * Advance a DFA, reading characters from a string.
230 *
231 * Stop at the end of the string, a character with no transition, or
232 * when a superstate is encountered with a non-0 label. Return the
233 * number of characters read from the string.
234 *
235 * This function stops on a transition *into* a state with a non-0
236 * state label. It doesn't matter if the machine is initially in a
237 * state with a non-0 label: the machine will consume the first input
238 * character regardless. That means that if your regular expression
239 * can match the empty string, you must detect this condition before
240 * calling `rx_dfa_advance_to_final' by checking `dfa->final_tag'
241 * after setting the start state of the DFA.
242 *
243 * If the match stopped in a final state, `dfa->final_tag' contains
244 * the non-0 state label of the final state, otherwise, it contains 0.
245 * If the match stopped on an illegal character, `dfa->state' is 0,
246 * otherwise it is non-0.
247 *
248 * It is possible to asynchronously abort a call to this function.
249 * See xref:"Exiting Long-running Matches".
250 */
251 int
rx_dfa_utf16_advance_to_final(size_t * amt,struct rx_dfa * frame,const t_uint16 * burst,size_t len)252 rx_dfa_utf16_advance_to_final (size_t * amt,
253 struct rx_dfa * frame,
254 const t_uint16 * burst,
255 size_t len)
256 {
257 size_t initial_len;
258 rx_transition_table inx_table;
259
260 if (!len)
261 {
262 *amt = 0;
263 return 1;
264 }
265
266 initial_len = len;
267 inx_table = frame->state->transitions;
268 rx_unlock_superstate (frame->rx, frame->state);
269 frame->state = 0;
270
271 while (len--)
272 {
273 struct rx_inx * inx;
274 rx_transition_table next_table;
275
276 if (rx_poll)
277 (*rx_poll)();
278
279 inx = rx_transition16 (inx_table, *burst);
280 next_table = (rx_transition_table)inx->data;
281
282 while (!next_table)
283 {
284 struct rx_superstate * state;
285
286 state = rx_transitions_to_suprestate (inx_table);
287
288 switch ((enum rx_opcode)inx->inx)
289 {
290 case rx_huge_char:
291 {
292 t_uint16 hi;
293 t_uint16 lo;
294 t_unicode c;
295
296 hi = *burst;
297 if (!len)
298 goto handle_as_backtrack;
299 --len;
300 ++burst;
301 lo = *burst;
302 if (!uni_is_low_surrogate (lo))
303 goto handle_as_backtrack;
304
305 c = uni_assemble_surrogates (hi, lo);
306 inx = rx_transition21 (state->huge_char_transitions, c);
307 next_table = (rx_transition_table)inx->data;
308
309 while (!next_table)
310 {
311 switch ((enum rx_opcode)inx->inx)
312 {
313 default:
314 case rx_huge_char:
315 goto handle_by_panic;
316 case rx_backtrack:
317 goto handle_as_backtrack;
318 case rx_cache_miss:
319 inx = rx_handle_cache_miss (frame->rx, state, c, inx->data_2);
320 if (!inx)
321 {
322 frame->state = 0;
323 frame->final_tag = 0;
324 return -1;
325 }
326 next_table = (rx_transition_table)inx->data;
327 continue;
328 }
329 }
330 continue;
331 }
332
333 case rx_backtrack:
334 handle_as_backtrack:
335 /* RX_BACKTRACK means that we've reached the empty
336 * superstate, indicating that match can't succeed
337 * from this point.
338 */
339 frame->state = 0;
340 frame->final_tag = 0;
341 *amt = (initial_len - len) - 1;
342 return 0;
343
344 case rx_cache_miss:
345 /* Because the superstate NFA is lazily constructed,
346 * and in fact may erode from underneath us, we sometimes
347 * have to construct the next instruction from the hard way.
348 * This invokes one step in the lazy-conversion.
349 */
350 inx = rx_handle_cache_miss (frame->rx, state, *burst, inx->data_2);
351 if (!inx)
352 {
353 frame->state = 0;
354 frame->final_tag = 0;
355 return -1;
356 }
357 next_table = (rx_transition_table)inx->data;
358 continue;
359
360 /* No other instructions are legal here.
361 */
362 default:
363 handle_by_panic:
364 while (1)
365 panic ("unrecognized instruction in rx_dfa_advance_to_final");
366 }
367 }
368
369 if (inx->data_2)
370 {
371 frame->state = rx_transitions_to_suprestate (next_table);
372 rx_lock_superstate (frame->rx, frame->state);
373 frame->final_tag = (long)inx->data_2;
374 *amt = (initial_len - len);
375 return 1;
376 }
377 inx_table = next_table;
378 ++burst;
379 }
380
381 /* Consumed all of the characters. */
382 frame->state = rx_transitions_to_suprestate (inx_table);
383 rx_lock_superstate (frame->rx, frame->state);
384 frame->final_tag = 0;
385 *amt = initial_len;
386 return 0;
387 }
388
389