1 /* match-regexp.c - functions for comparing a string to a regexp
2 *
3 ****************************************************************
4 * Copyright (C) 1998, 2000 Thomas Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11 /* #define TRACE */
12
13 #include "hackerlab/bugs/panic.h"
14 #include "hackerlab/char/char-class.h"
15 #include "hackerlab/char/str.h"
16 #include "hackerlab/bitsets/bits.h"
17 #include "hackerlab/rx/nfa-cache.h"
18 #include "hackerlab/rx/dfa.h"
19 #include "hackerlab/rx/escape.h"
20 #ifdef TRACE
21 #include "hackerlab/vu/safe.h"
22 #include "hackerlab/rx/dbug.h"
23
24 int rx_trace = 0;
25 int rx_depth_limit = 0;
26
27 #endif
28 #include "hackerlab/rx-posix/errnorx.h"
29 #include "hackerlab/rx-posix/match-regexp.h"
30
31
32
33 static unsigned long solns_allocated;
34 static unsigned long solns_freed;
35
36
37 #define max0(N) (((N) >= 0) ? (N) : 0)
38
39
40 /* __STDC__ prototypes for static functions */
41 static int rx_str_vmfn (void * closure,
42 const t_uchar ** burstp,
43 rx_off_t * lenp,
44 rx_off_t * offsetp,
45 rx_off_t start,
46 rx_off_t end,
47 rx_off_t need);
48 static int rx_str_contextfn (void * closure,
49 struct rx_exp_node * node,
50 rx_off_t start,
51 rx_off_t end,
52 struct rx_registers * regs);
53 static int rx_solution_fit_cset_p (struct rx_solutions * solns);
54 static int rx_solution_fit_str_p (struct rx_solutions * solns);
55 static int rx_solution_fits (struct rx_solutions * solns);
56 static int rx_dfa_longest_counting (int * label,
57 size_t * match_len,
58 size_t * matches_count,
59 struct rx_dfa * dfa,
60 const t_uchar * str,
61 size_t len);
62 static int rx_longest_split_guess (int * label, struct rx_solutions * solns);
63 static int save_nested_registers (struct rx_solutions * solns);
64 static void restore_nested_registers (struct rx_solutions * solns);
65 static int rx_next_solution_internal (struct rx_solutions * solns, jmp_buf * err_escape, int depth);
66
67
68
69 /************************************************************************
70 *(h0 "Regexp Matching"
71 * :includes ("rx-posix/match-regexp.h"))
72 *
73 *
74 * In xref:"Regexp Expression Trees", we saw how to build expression
75 * trees for regexps and elsewhere how to convert those regexps which
76 * are regular expressions to DFA. In xref:"DFA String Comparisons"
77 * we saw DFA-based functions that can compare a string to a regular
78 * expression.
79 *
80 * But what about those regexps which are not regular expressions? So
81 * far, except for the quasi-standard functions `regexec' and
82 * `regnexec', we have no way to match non-regular regexps.
83 *
84 * The functions in this chapter implement regexp matching. The
85 * interface is very detailed, but there are only a few functions and
86 * they are conceptually simple.
87 *
88 */
89 /*(menu)
90 */
91
92 /************************************************************************
93 *(h1 "An Overview of the Regexp Matching Interface")
94 *
95 * Given a regexp, and an input string, a regexp comparison function
96 * should return a value that indicates whether or not the string
97 * matches, and an array of values filled in with the substring
98 * positions of elements of the input string that matched the
99 * parenthesized subexpressions in the match. (For example, see
100 * xref:"regexec".) We start out intending to build a regexp
101 * comparison function that fits the preceeding description and make
102 * two observations.
103 *
104 * First, the function needs to be recursive because we only know how
105 * to check whether or not some kinds of regexp match by checking
106 * whether their subexpressions match.
107 *
108 * Second, the function needs to return more than one set of return
109 * values because a regexp can often match a string in more than one
110 * way -- returning a different set of values for each way of
111 * matching. To find the best overall match for a regexp, we may have
112 * to consider more than one way of matching the subexpressions of
113 * that regexp. In fact, the Posix standard for regexps essentially
114 * mandates that we perform a potentially exhaustive search through
115 * the possible ways of matching subexpressions. So, our recursive
116 * comparison function has to be able to find all of the different
117 * ways of matching a particular regexp and string to facilitate that
118 * search.
119 *
120 * Fortunately, we are able to order the search through the set of
121 * subexpressions in such a way that we will often find the best
122 * answer (and know it is the best answer) early. So while our
123 * recursive comparison function has to be able to return all possible
124 * solutions, it should be free to return them in the order we will
125 * search them, and to only actually construct a particular solution
126 * if it is actually required during a search. We can usually save
127 * time by not building solutions that we don't need to search.
128 *
129 * The functions in this chapter implement such a recursive comparison
130 * function. Given a regexp and a string, the functions return a
131 * lazilly-constructed stream of all of the possible ways of matching
132 * the regexp to the string. The stream is sorted; in the solution
133 * stream of a regexp subexpression, the best match, according to the
134 * rules mandated by Posix, is always the earliest match in the stream
135 * that also satisfies the constraints of the overall match of the
136 * containing expressions.
137 *
138 * You build one of these streams of subexpression solutions by calling
139 * `rx_make_solutions', passing the regexp and input string. You
140 * read off elements of the stream by calling `rx_next_solution' which
141 * actually does the work of computing the next solution in the list.
142 *
143 * Implementing a function like `regexec' or `regnexec' is easy.
144 * `regexec' iterates over each possible starting point for a match:
145 *
146 * for (start_pos = 0; start_pos < string_length; ++start_pos)
147 *
148 * At each starting position, `regexec' iterates through each possible
149 * ending position:
150 *
151 * for (end_pos = string_length; end_pos >= start_pos; --end_pos)
152 *
153 * For each possible starting and ending position, `regexec'
154 * constructs a stream of regexp solutions for that substring (by
155 * calling `rx_make_solutions'). It tries to read the first solution
156 * on that stream by calling `rx_next_solution'. If the stream is not
157 * empty, the first solution in the stream is the best possible
158 * solution, and it is returned from `regexec'.
159 *
160 * When `regexec' calls `rx_next_solution' to find out if the stream
161 * of solutions is empty, more streams of solutions may be created for
162 * the subexpressions of the overall regexp. Those recursively
163 * created streams may have to be searched to find an overall
164 * solution. Even though `regexec' only reads one solution from the
165 * stream it creates, the recursive calls to `rx_next_solution' may
166 * read many solutions from the streams they create.
167 *
168 * As just described, you might think `regexec' and `regnexec' do a
169 * lot of work (running so many separate regexp comparisons). You
170 * would be right. Consequently, the real-world implementations of
171 * `regexec' and `regnexec' employ several tricks to reduce the number
172 * of regexp comparisons actually performed -- they try to avoid
173 * calling the functions in this chapter. In addition, the functions
174 * in this chapter employ several tricks to avoid searching
175 * recursively created streams of solutions. Still, in the general
176 * case, functions like `regexec' use the interface described here in
177 * the computationally expensive way described above. (The exact
178 * complexity of a search depends on the particular regexp but in the
179 * worst cases, it is a polynomial function (of high degree) of the
180 * length of the string being searched.)
181 *
182 */
183
184 /************************************************************************
185 *(h1 "Regexp Match Function Data Structures")
186 *
187 */
188
189 /*(c #s"struct rx_solutions" :category type)
190 * struct rx_solutions;
191 *
192 * This opaque structure type represents the stream of solutions for
193 * matching a particular regexp against a particular string.
194 */
195 struct rx_solutions
196 {
197 int small_p;
198 int step;
199
200 int cset_size;
201 struct rx_exp_node * exp;
202 struct rx_exp_node ** subexps;
203 int nsub;
204 struct rx_registers * regs;
205
206 rx_off_t start;
207 rx_off_t end;
208
209 rx_vmfn vmfn;
210 rx_contextfn contextfn;
211 void * closure;
212
213 struct rx_unfa * dfa;
214 struct rx_dfa match_engine;
215
216 int certainly_fits; /* we know in advance that rx_solution_fits returns 1 */
217 int certain_final_tag;
218
219 int no_fancy_guessing;
220 struct rx_unfa * left_dfa;
221 struct rx_dfa left_match_engine;
222
223 rx_off_t split_guess;
224 struct rx_solutions * left;
225 struct rx_solutions * right;
226
227 int interval_x;
228 int interval_from;
229 int interval_to;
230 rx_off_t * saved_rm_so;
231 rx_off_t * saved_rm_eo;
232 int final_tag;
233 };
234
235 static struct rx_solutions no_solutions;
236
237 #if 0
238 /*(c #s"struct rx_registers" :category type)
239 * struct rx_registers;
240 *
241 * This structure type holds information about the match position
242 * of one regexp subexpression in a (potentially) larger string.
243 * It includes at least these fields:
244 *
245 insert*/
246 struct rx_registers
247 {
248 rx_off_t rm_so; /* The start of the match. */
249 rx_off_t rm_eo; /* The end of the match. */
250 };
251 /*end-insert
252 */
253 #endif
254
255
256 /************************************************************************
257 * Low Level Support for Basic String Matching
258 *
259 */
260
261 struct rx_str_closure
262 {
263 struct rx_context_rules rules;
264 const t_uchar * str;
265 rx_off_t len;
266 };
267
268
269 static int
rx_str_vmfn(void * closure,const t_uchar ** burstp,rx_off_t * lenp,rx_off_t * offsetp,rx_off_t start,rx_off_t end,rx_off_t need)270 rx_str_vmfn (void * closure,
271 const t_uchar ** burstp,
272 rx_off_t * lenp,
273 rx_off_t * offsetp,
274 rx_off_t start,
275 rx_off_t end,
276 rx_off_t need)
277 {
278 struct rx_str_closure * strc;
279
280 strc = (struct rx_str_closure *)closure;
281
282 if ( (need < 0)
283 || (need > strc->len))
284 return 0;
285
286 *burstp = strc->str;
287 *lenp = strc->len;
288 *offsetp = 0;
289 return 0;
290 }
291
292
293 static int
rx_str_contextfn(void * closure,struct rx_exp_node * node,rx_off_t start,rx_off_t end,struct rx_registers * regs)294 rx_str_contextfn (void * closure,
295 struct rx_exp_node * node,
296 rx_off_t start,
297 rx_off_t end,
298 struct rx_registers * regs)
299 {
300 struct rx_str_closure * strc;
301
302 strc = (struct rx_str_closure *)closure;
303
304 switch (node->intval)
305 {
306 case '1': case '2': case '3': case '4': case '5':
307 case '6': case '7': case '8': case '9':
308 {
309 int cmp;
310 int regn;
311
312 regn = node->intval - '0';
313 if ( (regs[regn].rm_so == -1)
314 || ((end - start) != (regs[regn].rm_eo - regs[regn].rm_so)))
315 return 0;
316 else
317 {
318 if (strc->rules.case_indep)
319 cmp = str_casecmp_n ((t_uchar *)strc->str + start,
320 end - start,
321 (t_uchar *)strc->str + regs[regn].rm_so,
322 end - start);
323 else
324 cmp = str_cmp_n ((t_uchar *)strc->str + start,
325 end - start,
326 (t_uchar *)strc->str + regs[regn].rm_so,
327 end - start);
328
329 return (!cmp
330 ? 1
331 : 0);
332 }
333 }
334
335 case '^':
336 {
337 return (( (start == end)
338 && ( ((start == 0) && !strc->rules.not_bol)
339 || ( (start > 0)
340 && strc->rules.newline_anchor
341 && (strc->str[start - 1] == '\n'))))
342 ? 1
343 : 0);
344 }
345
346 case '$':
347 {
348 return (( (start == end)
349 && ( ((start == strc->len) && !strc->rules.not_eol)
350 || ( (start < strc->len)
351 && strc->rules.newline_anchor
352 && (strc->str[start] == '\n'))))
353 ? 1
354 : 0);
355 }
356
357 default:
358 while (1)
359 panic ("unrecognized context function in rx_str_contextfn");
360 }
361 }
362
363
364 /************************************************************************
365 *(h1 "Building and Freeing a Stream of Solutions -- The Simplified Common Case")
366 *
367 */
368
369 #if 0
370 /*(c rx_basic_make_solutions)
371 * struct rx_solutions *
372 * rx_basic_make_solutions (struct rx_registers * regs,
373 * struct rx_exp_node * expression,
374 * struct rx_exp_node ** subexps,
375 * int nsub,
376 * rx_off_t start,
377 * rx_off_t end,
378 * struct rx_context_rules * rules,
379 * const t_uchar * str,
380 * int small_p);
381 *
382 * Construct a stream of solutions for matching the regexp
383 * `expression' against the substring of `str' beginning at `start'
384 * and ending at `end'.
385 *
386 * Part of the output of stream, the positions of sub-expressions in a
387 * solution, will be stored in `regs' each time the next solution is
388 * read using `rx_next_solution'.
389 *
390 * The parameter `context_rules' contains flags which effect the
391 * meaning of some regexp operators.
392 *
393 insert*/
394 struct rx_context_rules
395 {
396 t_uchar newline_anchor;
397 /* If set, an anchor at a newline matches.*/
398
399 t_uchar not_bol;
400 t_uchar not_eol;
401 /* If set, the anchors do not match the beginning and end
402 of the complete input string.*/
403
404
405 t_uchar case_indep;
406 /* If set, all string comparisons should be case independent.*/
407 };
408 /*end-insert
409 *
410 * You must apply `rx_analyze_rexp' to the regexp `expression' before
411 * calling his function. `rx_analyze_rexp' computes recursively
412 * defined properties of the subexpressions of the expression and
413 * records those properties in the expression tree. During a match,
414 * `rx_next_solution' examines those properties to decide how each
415 * subexpression should be treated.
416 *
417 * `rx_analyze_rexp' will return to you an array of the parenthesized
418 * subexpressions of `expression' and tell you the total number of
419 * subexpressions. You must pass this information along to
420 * `rx_basic_make_solutions' as the `subexps' and `nsub' parameters.
421 *
422 * `rx_analyze_rexp' will also fill in the `small_advised_p' field of
423 * `expression'. You can pass this along to `rx_basic_make_solutions'
424 * as the parameter `small_p'. If not 0, `small_p' advises
425 * `rx_basic_make_solutions' that the regexp is small and simple and
426 * therefore not worthy of several high-latency optimizations that
427 * `rx_next_solution' is capable of. Setting this parameter correctly
428 * can improve performance, setting it incorrectly can ruin
429 * performance, and `rx_analyze_rexp' can only approximately compute
430 * whether or not to set this parameter (it makes a safe guess). The
431 * safest value to pass is 0, which activates the high-latency
432 * optimizations, sometimes (slightly) slowing down matches that
433 * aren't particularly expensive while (substantially) speeding up
434 * some matches that are particularly expensive.
435 *
436 * `rx_basic_make_solutions' requires that the entire input string be
437 * in contiguous memory at one time. Another function,
438 * `rx_make_solutions', can be used if the string is not contiguous in
439 * memory or is not all in memory at one time.
440 */
441 #endif
442 struct rx_solutions *
rx_basic_make_solutions(struct rx_registers * regs,struct rx_exp_node * expression,struct rx_exp_node ** subexps,int nsub,rx_off_t start,rx_off_t end,struct rx_context_rules * rules,const t_uchar * str,int small_p)443 rx_basic_make_solutions (struct rx_registers * regs,
444 struct rx_exp_node * expression,
445 struct rx_exp_node ** subexps,
446 int nsub,
447 rx_off_t start,
448 rx_off_t end,
449 struct rx_context_rules * rules,
450 const t_uchar * str,
451 int small_p)
452 {
453 struct rx_solutions * solns;
454 struct rx_str_closure * closure;
455
456 if ( expression
457 && (expression->len >= 0)
458 && (expression->len != (end - start)))
459 return &no_solutions;
460
461 closure = (struct rx_str_closure *)rx_nfa_cache_malloc (sizeof (*closure));
462 if (!closure)
463 return 0;
464
465 closure->str = str;
466 closure->len = end;
467 closure->rules = *rules;
468 solns = rx_make_solutions (regs,
469 256, expression, subexps, nsub,
470 start, end, 0,
471 rx_str_vmfn, rx_str_contextfn, (void *)closure, small_p, 0, 0);
472 if (!solns)
473 {
474 rx_nfa_cache_free ((void *)closure);
475 return 0;
476 }
477 return solns;
478 }
479
480
481 /*(c rx_basic_free_solutions)
482 * void rx_basic_free_solutions (struct rx_solutions * solns);
483 *
484 * Release all storage held by a stream of regexp solutions allocated
485 * by `rx_basic_make_solutions'.
486 */
487 void
rx_basic_free_solutions(struct rx_solutions * solns)488 rx_basic_free_solutions (struct rx_solutions * solns)
489 {
490 if (solns == 0)
491 return;
492
493 if (solns == &no_solutions)
494 return;
495
496 rx_nfa_cache_free (rx_solutions_closure (solns));
497 rx_free_solutions (solns);
498 }
499
500
501
502 /************************************************************************
503 * Low Level Support for rx_next_solution
504 *
505 *
506 *
507 */
508
509 /* static int rx_solution_fit_cset_p (struct rx_solutions * solns);
510 *
511 * The expression is a character set node and the current substring is
512 * one character long. Compare the two and return 1 if they match, 0
513 * otherwise.
514 */
515 static int
rx_solution_fit_cset_p(struct rx_solutions * solns)516 rx_solution_fit_cset_p (struct rx_solutions * solns)
517 {
518 int current_pos;
519 const t_uchar * burst;
520 rx_off_t burst_addr;
521 rx_off_t burst_len;
522 rx_off_t rel_pos_in_burst;
523 int vmstat;
524 const t_uchar * pos;
525
526 current_pos = solns->start;
527 vmstat = solns->vmfn (solns->closure,
528 &burst, &burst_len, &burst_addr,
529 current_pos, current_pos + 1, current_pos);
530
531 if (vmstat)
532 return vmstat;
533
534 rel_pos_in_burst = current_pos - burst_addr;
535 pos = burst + rel_pos_in_burst;
536 return !!bits_is_member (solns->exp->cset, *pos);
537 }
538
539
540 /* static int rx_solution_fit_str_p (struct rx_solutions * solns);
541 *
542 * The expression is a string node and the current substring is the
543 * same length. Compare the two and return 1 if they match, 0
544 * otherwise.
545 */
546 static int
rx_solution_fit_str_p(struct rx_solutions * solns)547 rx_solution_fit_str_p (struct rx_solutions * solns)
548 {
549 rx_off_t current_pos;
550 const t_uchar * burst;
551 rx_off_t burst_addr;
552 rx_off_t burst_len;
553 rx_off_t burst_end_addr;
554 rx_off_t rel_pos_in_burst;
555 int vmstat;
556 rx_off_t count;
557 t_uchar * key;
558
559
560 current_pos = solns->start;
561 count = solns->exp->str_len;
562 key = (t_uchar *)solns->exp->str;
563
564 next_burst:
565 vmstat = solns->vmfn (solns->closure,
566 &burst, &burst_len, &burst_addr,
567 current_pos, solns->end,
568 current_pos);
569
570 if (vmstat)
571 return vmstat;
572
573 rel_pos_in_burst = current_pos - burst_addr;
574 burst_end_addr = burst_addr + burst_len;
575
576 {
577 const t_uchar * pos;
578
579 pos = burst + rel_pos_in_burst;
580
581 if (burst_end_addr >= solns->end)
582 {
583 while (count)
584 {
585 if (rx_poll)
586 (*rx_poll)();
587 if (*pos != *key)
588 return 0;
589 ++pos;
590 ++key;
591 --count;
592 }
593 return 1;
594 }
595 else
596 {
597 rx_off_t part_count;
598 rx_off_t part_count_init;
599
600 part_count_init = burst_len - rel_pos_in_burst;
601 part_count = part_count_init;
602 while (part_count)
603 {
604 if (rx_poll)
605 (*rx_poll)();
606 if (*pos != *key)
607 return 0;
608 ++pos;
609 ++key;
610 --part_count;
611 }
612 count -= part_count_init;
613 current_pos += burst_len - rel_pos_in_burst;
614 goto next_burst;
615 }
616 }
617 }
618
619
620 /* static int rx_solution_fits (struct rx_solutions * solns);
621 *
622 * Compare a DFA which describes a superset of the regexp expression
623 * to the current substring and return 1 if they match, 0 otherwise.
624 * That the DFA matches is a necessary but not sufficient condition of
625 * the regexp expression itself matching. This test can be used to
626 * avoid a more expensive regexp comparison. Return -1 for ESPACE.
627 */
628 static int
rx_solution_fits(struct rx_solutions * solns)629 rx_solution_fits (struct rx_solutions * solns)
630 {
631 const t_uchar * burst;
632 rx_off_t burst_addr;
633 rx_off_t burst_len;
634 rx_off_t burst_end_addr;
635 rx_off_t rel_pos_in_burst;
636 int vmstat;
637 rx_off_t current_pos;
638
639 if (solns->certainly_fits)
640 {
641 solns->match_engine.final_tag = solns->certain_final_tag;
642 return 1;
643 }
644
645 current_pos = solns->start;
646
647 next_burst:
648 vmstat = solns->vmfn (solns->closure,
649 &burst, &burst_len, &burst_addr,
650 current_pos, solns->end,
651 current_pos);
652
653 if (vmstat)
654 return vmstat;
655
656 rel_pos_in_burst = current_pos - burst_addr;
657 burst_end_addr = burst_addr + burst_len;
658
659 if (burst_end_addr >= solns->end)
660 {
661 int fit_label;
662
663 if (rx_dfa_fits (&fit_label,
664 &solns->match_engine,
665 burst + rel_pos_in_burst,
666 solns->end - current_pos))
667 return -1; /* espace */
668 return fit_label;
669 }
670 else
671 {
672 int fit_status;
673 fit_status = rx_dfa_advance (&solns->match_engine,
674 burst + rel_pos_in_burst,
675 burst_len - rel_pos_in_burst);
676 if (fit_status < 0)
677 {
678 return -1;
679 }
680 else if (fit_status)
681 {
682 current_pos += burst_len - rel_pos_in_burst;
683 goto next_burst;
684 }
685 else
686 return 0;
687 }
688 }
689
690
691 static int
rx_dfa_longest_counting(int * label,size_t * match_len,size_t * matches_count,struct rx_dfa * dfa,const t_uchar * str,size_t len)692 rx_dfa_longest_counting (int * label,
693 size_t * match_len,
694 size_t * matches_count,
695 struct rx_dfa * dfa,
696 const t_uchar * str,
697 size_t len)
698 {
699 size_t pos;
700 size_t n_matches;
701 size_t best_len;
702 int best_label;
703
704 pos = 0;
705 n_matches = 0;
706 best_len = 0;
707 best_label = rx_dfa_tag (dfa);
708
709 if (best_label)
710 {
711 n_matches = 1;
712 }
713
714 while (pos < len)
715 {
716 int adv_stat;
717 size_t adv_amt;
718
719 adv_stat = rx_dfa_advance_to_final (&adv_amt, dfa, str + pos, len - pos);
720
721 if (adv_stat < 0)
722 return -1;
723
724 if (!adv_stat)
725 break;
726
727 pos += adv_amt;
728 if (rx_dfa_tag (dfa))
729 {
730 best_len = pos;
731 ++n_matches;
732 best_label = rx_dfa_tag (dfa);
733 }
734 }
735
736 *label = best_label;
737 *match_len = best_len;
738 *matches_count = n_matches;
739 return 0;
740 }
741
742
743 /* adjust split_guess downward to the nearest plausible
744 * position. Return 0 if none exists, 1 otherwise.
745 */
746 static int
rx_longest_split_guess(int * label,struct rx_solutions * solns)747 rx_longest_split_guess (int * label, struct rx_solutions * solns)
748 {
749 const t_uchar * burst;
750 rx_off_t burst_addr;
751 rx_off_t burst_len;
752 rx_off_t burst_end_addr;
753 rx_off_t rel_pos_in_burst;
754 int vmstat;
755 rx_off_t current_pos;
756 size_t n_matches;
757 rx_off_t best_guess;
758 int best_label;
759
760 if (solns->no_fancy_guessing)
761 {
762 *label = 0;
763 return 1;
764 }
765
766 current_pos = solns->start;
767 n_matches = 0;
768 best_guess = -1;
769 best_label = 0;
770
771 if (rx_dfa_goto_start_superstate (&solns->left_match_engine, 1))
772 return -1;
773
774 while (1)
775 {
776 int fit_label;
777 size_t matches_here;
778 size_t len_here;
779
780 vmstat = solns->vmfn (solns->closure,
781 &burst, &burst_len, &burst_addr,
782 current_pos, solns->split_guess,
783 current_pos);
784
785 if (vmstat)
786 {
787 rx_clear_dfa_state (&solns->left_match_engine);
788 return vmstat;
789 }
790
791 rel_pos_in_burst = current_pos - burst_addr;
792 burst_end_addr = burst_addr + burst_len;
793
794 if (rx_dfa_longest_counting (&fit_label,
795 &len_here,
796 &matches_here,
797 &solns->left_match_engine,
798 burst + rel_pos_in_burst,
799 ((burst_end_addr >= solns->split_guess)
800 ? solns->split_guess - current_pos
801 : burst_end_addr - current_pos)))
802 {
803 rx_clear_dfa_state (&solns->left_match_engine);
804 return -1;
805 }
806
807 if (fit_label)
808 {
809 n_matches += matches_here;
810 best_guess = len_here + rel_pos_in_burst;
811 best_label = fit_label;
812 }
813
814 if (!fit_label || (burst_end_addr >= solns->split_guess))
815 {
816 if (!n_matches)
817 {
818 rx_clear_dfa_state (&solns->left_match_engine);
819 return 0;
820 }
821
822 solns->split_guess = best_guess;
823 *label = best_label;
824 if (n_matches > (best_guess * 2))
825 solns->no_fancy_guessing = 1;
826
827 rx_clear_dfa_state (&solns->left_match_engine);
828 return 1;
829 }
830
831 current_pos += burst_len - rel_pos_in_burst;
832 }
833 }
834
835
836
837 /************************************************************************
838 *(h1 "Reading Elements From a Stream of Solutions")
839 *
840 */
841
842 /* Save the values of registers for reporting parentheses
843 * enclosed in the expression solns->exp. Initialize
844 * those registers to -1.
845 *
846 * These are stored in the `solns' structure. Only one set of register
847 * values can be saved, per `solns', at a time.
848 */
849 static int
save_nested_registers(struct rx_solutions * solns)850 save_nested_registers (struct rx_solutions * solns)
851 {
852 int first_subexp;
853 int last_subexp;
854 int n_subexps;
855 int x;
856
857 /* Save the value of the set of numbered
858 * parentheses in the tree rooted at this
859 * expression and initialized them to -1. If
860 * this expression does not match, they will
861 * be restored. If it does match, they get values
862 * from that match and not from previous matches.
863 */
864
865 first_subexp = solns->exp->min_enclosed_paren;
866 last_subexp = solns->exp->max_enclosed_paren;
867 n_subexps = 1 + (last_subexp - first_subexp);
868
869 if (!solns->saved_rm_so)
870 {
871 solns->saved_rm_so = (rx_off_t *)rx_nfa_cache_malloc (sizeof (rx_off_t) * n_subexps);
872 solns->saved_rm_eo = (rx_off_t *)rx_nfa_cache_malloc (sizeof (rx_off_t) * n_subexps);
873 if (!solns->saved_rm_so || !solns->saved_rm_eo)
874 {
875 rx_nfa_cache_free ((void *)solns->saved_rm_so);
876 rx_nfa_cache_free ((void *)solns->saved_rm_eo);
877 return -1;
878 }
879 }
880
881 for (x = 0; x < n_subexps; ++x)
882 {
883 solns->saved_rm_so[x] = solns->regs[x + first_subexp].rm_so;
884 solns->saved_rm_eo[x] = solns->regs[x + first_subexp].rm_eo;
885 solns->regs[x + first_subexp].rm_so = -1;
886 solns->regs[x + first_subexp].rm_eo = -1;
887 }
888 return 0;
889 }
890
891 /* Restore register values saved by `save_nested_registers'.
892 *
893 * This doesn't erase the stored values.
894 */
895 static void
restore_nested_registers(struct rx_solutions * solns)896 restore_nested_registers (struct rx_solutions * solns)
897 {
898 int first_subexp;
899 int last_subexp;
900 int n_subexps;
901 int x;
902
903 first_subexp = solns->exp->min_enclosed_paren;
904 last_subexp = solns->exp->max_enclosed_paren;
905 n_subexps = 1 + (last_subexp - first_subexp);
906
907 for (x = 0; x < n_subexps; ++x)
908 {
909 solns->regs[x + first_subexp].rm_so = solns->saved_rm_so[x];
910 solns->regs[x + first_subexp].rm_eo = solns->saved_rm_eo[x];
911 }
912 }
913
914 /*(c rx_next_solution)
915 * int rx_next_solution (struct rx_solutions * solns);
916 *
917 * If there are no more solutions in the stream of regexp solutions
918 * `solns', return 0. Otherwise, advance the stream to the next
919 * solution and return 1. Return -1 for ESPACE.
920 *
921 * Advancing the stream means filling in the `regs' structure that was
922 * passed to `rx_basic_make_solutions' or `rx_make_solutions' and
923 * establishing the value that will be returned by subsequent calls to
924 * `rx_solutions_final_tag'.
925 *
926 * Solutions are returned in order from best to worst according to the
927 * leftmost-longest rule (see xref:"Describing Regexps Formally").
928 *
929 * It is possible to asynchronously abort a call to this function.
930 * See xref:"Exiting Long-Running Matches".
931 */
932 int
rx_next_solution(struct rx_solutions * solns)933 rx_next_solution (struct rx_solutions * solns)
934 {
935 jmp_buf err_escape;
936
937 if (setjmp (err_escape))
938 {
939 return -1;
940 }
941 return rx_next_solution_internal (solns, &err_escape, 0);
942 }
943
944
945 static int
rx_next_solution_internal(struct rx_solutions * solns,jmp_buf * err_escape,int depth)946 rx_next_solution_internal (struct rx_solutions * solns, jmp_buf * err_escape, int depth)
947 {
948 #ifdef TRACE
949 if (rx_trace && (!rx_depth_limit || (depth < rx_depth_limit)))
950 {
951 safe_printfmt (2, "%*s================\n", depth, "");
952 safe_printfmt (2, "%*sstep: %d\n", depth, "", solns->step);
953 safe_printfmt (2, "%*s----------------\n", depth, "");
954 safe_printfmt (2, "%*sexpression:\n", depth, "");
955 safe_printfmt (2, "%*s", depth, "");
956 rx_unparse_print_rexp (2, 256, solns->exp);
957 safe_printfmt (2, "\n");
958 safe_printfmt (2, "%*s----------------\n", depth, "");
959 safe_printfmt (2, "%*sstring:\n", depth, "");
960 {
961 const t_uchar * burst;
962 rx_off_t burst_addr;
963 rx_off_t burst_len;
964 rx_off_t rel_pos_in_burst;
965 int vmstat;
966
967 vmstat = solns->vmfn (solns->closure,
968 &burst, &burst_len, &burst_addr,
969 solns->start, solns->end, solns->start);
970
971 if (vmstat)
972 panic ("vmstat failure in trace");
973
974 rel_pos_in_burst = solns->start - burst_addr;
975 safe_printfmt (2, "%*s%.*s\n", depth, "",
976 ((solns->end - solns->start) <= (burst_len - rel_pos_in_burst)
977 ? (solns->end - solns->start)
978 : (burst_len - rel_pos_in_burst)),
979 burst + rel_pos_in_burst);
980 }
981 safe_printfmt (2, "%*s----------------\n", depth, "");
982 }
983 #endif
984
985 #define KNOWN_LENGTH(EXP) (!(EXP) \
986 ? 0 \
987 : ((EXP)->len >= 0 \
988 ? (EXP)->len \
989 : ((((EXP)->type == r_context) && ((EXP)->intval >= '0') && ((EXP)->intval <= '9')) \
990 ? ((solns->regs[(EXP)->intval - '0'].rm_so >= 0) \
991 ? (solns->regs[(EXP)->intval - '0'].rm_eo - solns->regs[(EXP)->intval - '0'].rm_so) \
992 : -1) \
993 : -1)))
994
995
996 if (solns == &no_solutions)
997 {
998 return 0;
999 }
1000
1001 if (!solns->exp)
1002 {
1003 if (solns->step != 0)
1004 {
1005 return 0;
1006 }
1007 else
1008 {
1009 solns->step = 1;
1010 solns->final_tag = 1;
1011 return (solns->start == solns->end
1012 ? 1
1013 : 0);
1014 }
1015 }
1016 else if (!(solns->small_p || solns->exp->observed))
1017 {
1018 if (solns->step != 0)
1019 {
1020 return 0;
1021 }
1022 else if (solns->exp->type == r_string)
1023 {
1024 int ans;
1025 ans = rx_solution_fit_str_p (solns);
1026 solns->final_tag = 1;
1027 solns->step = -1;
1028 return ans;
1029 }
1030 else
1031 {
1032 int ans;
1033 ans = rx_solution_fits (solns);
1034 if (ans < 0)
1035 longjmp (*err_escape, 1);
1036 solns->final_tag = solns->match_engine.final_tag;
1037 solns->step = -1;
1038 return !!ans;
1039 }
1040 }
1041 else /* if (solns->small_p || solns->exp->observed) */
1042 {
1043 int fits;
1044 switch (solns->step)
1045 {
1046 case -2:
1047 if (solns->exp->min_enclosed_paren)
1048 {
1049 int first_subexp;
1050 int last_subexp;
1051 int x;
1052
1053 first_subexp = solns->exp->min_enclosed_paren;
1054 last_subexp = solns->exp->max_enclosed_paren;
1055
1056 for (x = first_subexp; x <= last_subexp; ++x)
1057 {
1058 solns->regs[x].rm_so = solns->saved_rm_so[x - first_subexp];
1059 solns->regs[x].rm_eo = solns->saved_rm_eo[x - first_subexp];
1060 }
1061 }
1062 return 0;
1063
1064 case -1:
1065 return 0;
1066
1067 case 0:
1068 if (solns->small_p)
1069 {
1070 solns->step = 1;
1071 goto resolve_fit;
1072 }
1073 else
1074 {
1075 fits = rx_solution_fits (solns);
1076 if (fits < 0)
1077 longjmp (*err_escape, 1);
1078 /* Set final_tag here because this rough fit test
1079 * may be all the matching that gets done.
1080 * For example, consider a paren node containing
1081 * a true regular expression ending with a cut
1082 * operator.
1083 */
1084 solns->final_tag = solns->match_engine.final_tag;
1085 if (fits == 0)
1086 {
1087 solns->step = -1;
1088 return fits;
1089 }
1090 else
1091 {
1092 solns->step = 1;
1093 goto resolve_fit;
1094 }
1095 }
1096
1097 default:
1098 resolve_fit:
1099 switch (solns->exp->type)
1100 {
1101 case r_string:
1102 if (!solns->small_p)
1103 {
1104 while (1)
1105 panic ("bogus regexp in rx_next_solution_internal");
1106 }
1107 else
1108 {
1109 int answer;
1110 answer = rx_solution_fit_str_p (solns);
1111 solns->final_tag = 1;
1112 solns->step = -1;
1113 return answer;
1114 }
1115
1116 case r_cset:
1117 if (!solns->small_p)
1118 {
1119 while (1)
1120 panic ("bogus regexp in rx_next_solution_internal");
1121 }
1122 else
1123 {
1124 int answer;
1125 answer = rx_solution_fit_cset_p (solns);
1126 solns->final_tag = 1;
1127 solns->step = -1;
1128 return answer;
1129 }
1130
1131 case r_cut:
1132 if (!solns->small_p)
1133 {
1134 while (1)
1135 panic ("bogus regexp in rx_next_solution_internal");
1136 }
1137 else
1138 {
1139 solns->final_tag = solns->exp->intval;
1140 solns->step = -1;
1141 return !!solns->exp->intval;
1142 }
1143
1144 case r_parens:
1145 {
1146 int paren_stat;
1147 switch (solns->step)
1148 {
1149 case 1:
1150 if (solns->exp->min_enclosed_paren)
1151 {
1152 if (save_nested_registers (solns))
1153 longjmp (*err_escape, 1);
1154 }
1155
1156 if ( !solns->small_p
1157 && ( !solns->exp->left
1158 || !solns->exp->left->observed))
1159 {
1160 /* We already proved that this (simple)
1161 * subexpression matches. Optionally, record
1162 * its position. Return success once, and
1163 * failure for all subsequent attempts to find
1164 * other matches.
1165 */
1166 if (solns->exp->intval)
1167 {
1168 solns->regs[solns->exp->intval].rm_so = solns->start;
1169 solns->regs[solns->exp->intval].rm_eo = solns->end;
1170 }
1171 solns->step = -2;
1172 /* Keep the final_tag from the fits test. */
1173 return 1;
1174 }
1175
1176 /* We must search through possible matches of the subexpression.
1177 */
1178 solns->left = rx_make_solutions (solns->regs,
1179 solns->cset_size,
1180 solns->exp->left,
1181 solns->subexps,
1182 solns->nsub,
1183 solns->start,
1184 solns->end,
1185 0,
1186 solns->vmfn,
1187 solns->contextfn,
1188 solns->closure,
1189 solns->small_p,
1190 0,
1191 0);
1192 if (!solns->left)
1193 longjmp (*err_escape, 1);
1194
1195 case 2:
1196 if (solns->exp->min_enclosed_paren)
1197 {
1198 int first_subexp;
1199 int last_subexp;
1200 int x;
1201
1202 /* Initialize the parentheses in the tree
1203 * rooted at this expression to -1.
1204 */
1205
1206 first_subexp = solns->exp->min_enclosed_paren;
1207 last_subexp = solns->exp->max_enclosed_paren;
1208
1209 for (x = first_subexp; x <= last_subexp; ++x)
1210 {
1211 solns->regs[x].rm_so = -1;
1212 solns->regs[x].rm_eo = -1;
1213 }
1214 }
1215
1216 paren_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1217 if (paren_stat < 0)
1218 longjmp (*err_escape, 1);
1219
1220 if (paren_stat == 1)
1221 {
1222 if (solns->exp->intval)
1223 {
1224 solns->regs[solns->exp->intval].rm_so = solns->start;
1225 solns->regs[solns->exp->intval].rm_eo = solns->end;
1226 }
1227 solns->final_tag = solns->left->final_tag;
1228 solns->step = 2;
1229 return 1;
1230 }
1231 else
1232 {
1233 rx_free_solutions (solns->left);
1234 solns->left = 0;
1235 if (solns->exp->min_enclosed_paren)
1236 {
1237 /* This expression didn't match. Restore the old subexpression
1238 * positions.
1239 */
1240 restore_nested_registers (solns);
1241 }
1242 solns->step = -1;
1243 return paren_stat;
1244 }
1245 }
1246 }
1247
1248 case r_alternate:
1249 {
1250 int alt_stat;
1251 switch (solns->step)
1252 {
1253 case 1:
1254 solns->left = rx_make_solutions (solns->regs,
1255 solns->cset_size,
1256 solns->exp->left,
1257 solns->subexps,
1258 solns->nsub,
1259 solns->start,
1260 solns->end,
1261 0,
1262 solns->vmfn,
1263 solns->contextfn,
1264 solns->closure,
1265 solns->small_p,
1266 0,
1267 0);
1268 if (!solns->left)
1269 longjmp (*err_escape, 1);
1270
1271 case 2:
1272 alt_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1273 if (alt_stat < 0)
1274 longjmp (*err_escape, 1);
1275
1276 if (alt_stat == 1)
1277 {
1278 solns->final_tag = solns->left->final_tag;
1279 solns->step = 2;
1280 return alt_stat;
1281 }
1282 else
1283 {
1284 rx_free_solutions (solns->left);
1285 solns->left = 0;
1286 }
1287
1288 solns->right = rx_make_solutions (solns->regs,
1289 solns->cset_size,
1290 solns->exp->right,
1291 solns->subexps,
1292 solns->nsub,
1293 solns->start,
1294 solns->end,
1295 0,
1296 solns->vmfn,
1297 solns->contextfn,
1298 solns->closure,
1299 solns->small_p,
1300 0,
1301 0);
1302 if (!solns->right)
1303 longjmp (*err_escape, 1);
1304
1305 case 4:
1306 alt_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1307 if (alt_stat < 0)
1308 longjmp (*err_escape, 1);
1309
1310 if (alt_stat == 1)
1311 {
1312 solns->final_tag = solns->right->final_tag;
1313 solns->step = 4;
1314 return alt_stat;
1315 }
1316 else
1317 {
1318 solns->step = -1;
1319 rx_free_solutions (solns->right);
1320 solns->right = 0;
1321 return alt_stat;
1322 }
1323 }
1324 }
1325
1326 case r_concat:
1327 {
1328 switch (solns->step)
1329 {
1330 int concat_stat;
1331 int split_guess_stat;
1332 int certain_label;
1333
1334 case 1:
1335
1336 if (!solns->exp->left)
1337 solns->split_guess = solns->start;
1338 else
1339 {
1340 rx_off_t known_left_length;
1341
1342 known_left_length = KNOWN_LENGTH (solns->exp->left);
1343 if (known_left_length >= 0)
1344 {
1345 if (known_left_length > (solns->end - solns->start))
1346 {
1347 solns->step = -1;
1348 return 0;
1349 }
1350 solns->split_guess = solns->start + known_left_length;
1351 }
1352 else
1353 {
1354 rx_off_t known_right_length;
1355
1356 known_right_length = KNOWN_LENGTH (solns->exp->right);
1357
1358 if (known_right_length >= 0)
1359 {
1360 if (known_right_length > (solns->end - solns->start))
1361 {
1362 solns->step = -1;
1363 return 0;
1364 }
1365 solns->split_guess = solns->end - known_right_length;
1366 }
1367 else
1368 solns->split_guess = solns->end;
1369 }
1370 }
1371
1372 concat_split_guess_loop:
1373
1374 split_guess_stat = rx_longest_split_guess (&certain_label, solns);
1375
1376 if (split_guess_stat < 0)
1377 longjmp (*err_escape, 1);
1378
1379 if (!split_guess_stat)
1380 {
1381 solns->step = -1;
1382 return 0;
1383 }
1384
1385 solns->left = rx_make_solutions (solns->regs,
1386 solns->cset_size,
1387 solns->exp->left,
1388 solns->subexps,
1389 solns->nsub,
1390 solns->start,
1391 solns->split_guess,
1392 0,
1393 solns->vmfn,
1394 solns->contextfn,
1395 solns->closure,
1396 solns->small_p,
1397 !!certain_label,
1398 certain_label);
1399 if (!solns->left)
1400 longjmp (*err_escape, 1);
1401
1402 concat_try_next_left_match:
1403
1404 concat_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1405 if (concat_stat < 0)
1406 longjmp (*err_escape, 1);
1407
1408 if (concat_stat != 1)
1409 {
1410 rx_free_solutions (solns->left);
1411 solns->left = 0;
1412 if ( (KNOWN_LENGTH (solns->exp->left) < 0)
1413 && (KNOWN_LENGTH (solns->exp->right) < 0))
1414 {
1415 --solns->split_guess;
1416 if (solns->split_guess >= solns->start)
1417 goto concat_split_guess_loop;
1418 }
1419
1420 solns->step = -1;
1421 return 0;
1422 }
1423
1424 solns->right = rx_make_solutions (solns->regs,
1425 solns->cset_size,
1426 solns->exp->right,
1427 solns->subexps,
1428 solns->nsub,
1429 solns->split_guess,
1430 solns->end,
1431 0,
1432 solns->vmfn,
1433 solns->contextfn,
1434 solns->closure,
1435 solns->small_p,
1436 0,
1437 0);
1438 if (!solns->right)
1439 longjmp (*err_escape, 1);
1440
1441 case 2:
1442 concat_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1443 if (concat_stat < 0)
1444 longjmp (*err_escape, 1);
1445
1446 if (concat_stat == 1)
1447 {
1448 solns->final_tag = solns->right->final_tag;
1449 solns->step = 2;
1450 return concat_stat;
1451 }
1452 else
1453 {
1454 rx_free_solutions (solns->right);
1455 solns->right = 0;
1456 goto concat_try_next_left_match;
1457 }
1458 }
1459 }
1460
1461 case r_right_concat:
1462 {
1463 switch (solns->step)
1464 {
1465 int concat_stat;
1466 rx_off_t known_left_length;
1467 rx_off_t known_right_length;
1468
1469 case 1:
1470 known_left_length = KNOWN_LENGTH (solns->exp->left);
1471 known_right_length = KNOWN_LENGTH (solns->exp->right);
1472 if (known_left_length >= 0)
1473 {
1474 solns->split_guess = solns->start + known_left_length;
1475 }
1476 else if (known_right_length >= 0)
1477 {
1478 solns->split_guess = solns->end - known_right_length;
1479 }
1480 else
1481 solns->split_guess = solns->start;
1482
1483
1484 right_concat_split_guess_loop:
1485 solns->left = rx_make_solutions (solns->regs,
1486 solns->cset_size,
1487 solns->exp->left,
1488 solns->subexps,
1489 solns->nsub,
1490 solns->start,
1491 solns->split_guess,
1492 0,
1493 solns->vmfn,
1494 solns->contextfn,
1495 solns->closure,
1496 solns->small_p,
1497 0,
1498 0);
1499 if (!solns->left)
1500 longjmp (*err_escape, 1);
1501
1502 right_concat_try_next_left_match:
1503
1504 concat_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1505 if (concat_stat < 0)
1506 longjmp (*err_escape, 1);
1507
1508 if (concat_stat != 1)
1509 {
1510 rx_free_solutions (solns->left);
1511 solns->left = 0;
1512 if ( (KNOWN_LENGTH (solns->exp->left) < 0)
1513 && (KNOWN_LENGTH (solns->exp->right) < 0))
1514 {
1515 ++solns->split_guess;
1516 if (solns->split_guess <= solns->end)
1517 goto right_concat_split_guess_loop;
1518 }
1519
1520 solns->step = -1;
1521 return 0;
1522 }
1523
1524 solns->right = rx_make_solutions (solns->regs,
1525 solns->cset_size,
1526 solns->exp->right,
1527 solns->subexps,
1528 solns->nsub,
1529 solns->split_guess,
1530 solns->end,
1531 0,
1532 solns->vmfn,
1533 solns->contextfn,
1534 solns->closure,
1535 solns->small_p,
1536 0,
1537 0);
1538 if (!solns->right)
1539 longjmp (*err_escape, 1);
1540
1541 case 2:
1542 concat_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1543 if (concat_stat < 0)
1544 longjmp (*err_escape, 1);
1545
1546 if (concat_stat == 1)
1547 {
1548 solns->final_tag = solns->right->final_tag;
1549 solns->step = 2;
1550 return concat_stat;
1551 }
1552 else
1553 {
1554 rx_free_solutions (solns->right);
1555 solns->right = 0;
1556 goto right_concat_try_next_left_match;
1557 }
1558 }
1559 }
1560
1561 case r_star:
1562 {
1563 int star_stat;
1564
1565 switch (solns->step)
1566 {
1567 case 1:
1568 /* Begin by trying to match the entire string
1569 * with a single iteration.
1570 */
1571 solns->left = rx_make_solutions (solns->regs,
1572 solns->cset_size,
1573 solns->exp->left,
1574 solns->subexps,
1575 solns->nsub,
1576 solns->start,
1577 solns->end,
1578 0,
1579 solns->vmfn,
1580 solns->contextfn,
1581 solns->closure,
1582 solns->small_p,
1583 0,
1584 0);
1585 if (!solns->left)
1586 longjmp (*err_escape, 1);
1587
1588
1589 case 2:
1590 star_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1591 if (star_stat < 0)
1592 longjmp (*err_escape, 1);
1593
1594 if (star_stat == 1)
1595 {
1596 /* found a single-iteration whole-string match */
1597 solns->final_tag = solns->left->final_tag;
1598 solns->step = 2;
1599 return 1;
1600 }
1601
1602 /* no more single-iteration whole-string matches */
1603 rx_free_solutions (solns->left);
1604 solns->left = 0;
1605
1606 if (solns->start == solns->end)
1607 {
1608 /* when comaring to the empty string and none of the single-iteration
1609 * matches work, try matching with zero iterations, then give up
1610 */
1611 solns->step = -1;
1612 return 1;
1613 }
1614
1615 /* when comparing to a non-empty string and none of the single-iteration
1616 * matches work, try all matches with two or more non-empty iterations.
1617 */
1618 if (!solns->exp->left || (solns->exp->left->len == 0))
1619 {
1620 solns->step = -1;
1621 return 0;
1622 }
1623 else if (solns->exp->left->len > 0)
1624 {
1625 if ( (solns->exp->left->len >= (solns->end - solns->start))
1626 || (0 != ((solns->end - solns->start) % solns->exp->left->len)))
1627 {
1628 solns->step = -1;
1629 return 0;
1630 }
1631 solns->split_guess = solns->end - solns->exp->left->len;
1632 }
1633 else
1634 solns->split_guess = solns->start + 1;
1635
1636 case 3:
1637 star_3:
1638
1639 /* Suppose the string has been split into two parts: s == s1 s2
1640 * and we are trying to match R*.
1641 *
1642 * Recurse by matching s1 against R*, and s2 against R.
1643 *
1644 * However, if s == s1 and s2 is the empty string, comparing
1645 * s1 to R* would cause an infinite recursion. So instead,
1646 * compare s1 to R and s2 to R.
1647 *
1648 * Comparing s2 to R is often much less expensive than comparing
1649 * s1 to R*. If s2 doesn't match R, time spent comparing s1 to R*
1650 * is wasted. So, before trying s1 against R*, make sure that
1651 * s2 can match R.
1652 *
1653 * Registers are subtle. There are three possibilities:
1654 *
1655 * [A] R contains no reporting parens
1656 * [B] R contains reporting parens
1657 *
1658 * In case A, matching s1 against R* won't change any registers,
1659 * so it is safe to match s to R first.
1660 *
1661 * In the second case, either
1662 *
1663 * [B1] R is a reporting parenthesized expression
1664 * [B2] R is a non-reporting parenthesized expression
1665 *
1666 * In either of those two cases, the first step of
1667 * comparing s2 to R is to set all of the
1668 * registers for nested reporting parentheses to
1669 * -1. As a result, again, s1 against R* has no (observable) effect
1670 * on registers -- so it is safe to match s to R first.
1671 *
1672 */
1673
1674 solns->right = rx_make_solutions (solns->regs,
1675 solns->cset_size,
1676 solns->exp->left,
1677 solns->subexps,
1678 solns->nsub,
1679 solns->split_guess,
1680 solns->end,
1681 0,
1682 solns->vmfn,
1683 solns->contextfn,
1684 solns->closure,
1685 solns->small_p,
1686 0,
1687 0);
1688 if (!solns->right)
1689 longjmp (*err_escape, 1);
1690
1691 star_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1692 if (star_stat < 0)
1693 longjmp (*err_escape, 1);
1694
1695 if (star_stat != 1)
1696 {
1697 /* No match for s2 against R, so guess again:
1698 */
1699 rx_free_solutions (solns->right);
1700 solns->right = 0;
1701 if (solns->exp->left->len > 0)
1702 {
1703 solns->step = -1;
1704 return 0;
1705 }
1706 ++solns->split_guess;
1707 if (solns->split_guess > solns->end)
1708 {
1709 solns->step = -1;
1710 return 0;
1711 }
1712 goto star_3;
1713 }
1714
1715 /* Aha -- s2 matches R. Does s1 match R*?
1716 *
1717 * Testing s1 against R* can modify some registers
1718 * that were set while matching s2 to R. These
1719 * registers must be saved and then restored before
1720 * returning the overall match.
1721 */
1722 if (save_nested_registers (solns))
1723 {
1724 longjmp (*err_escape, 1);
1725 }
1726
1727 if (solns->split_guess == solns->end)
1728 solns->left = rx_make_solutions (solns->regs,
1729 solns->cset_size,
1730 solns->exp->left,
1731 solns->subexps,
1732 solns->nsub,
1733 solns->start,
1734 solns->split_guess,
1735 0,
1736 solns->vmfn,
1737 solns->contextfn,
1738 solns->closure,
1739 solns->small_p,
1740 0,
1741 0);
1742 else
1743 solns->left = rx_make_solutions (solns->regs,
1744 solns->cset_size,
1745 solns->exp,
1746 solns->subexps,
1747 solns->nsub,
1748 solns->start,
1749 solns->split_guess,
1750 0,
1751 solns->vmfn,
1752 solns->contextfn,
1753 solns->closure,
1754 solns->small_p,
1755 0,
1756 0);
1757 if (!solns->left)
1758 longjmp (*err_escape, 1);
1759
1760 star_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1761 if (star_stat < 0)
1762 longjmp (*err_escape, 1);
1763
1764 if (star_stat != 1)
1765 {
1766 /* no matches for the left part of the split string
1767 * at this split_guess.
1768 */
1769 rx_free_solutions (solns->left);
1770 rx_free_solutions (solns->right);
1771 solns->left = 0;
1772 solns->right = 0;
1773 if (solns->exp->left->len > 0)
1774 {
1775 solns->step = -1;
1776 return 0;
1777 }
1778 ++solns->split_guess;
1779 if (solns->split_guess > solns->end)
1780 {
1781 solns->step = -1;
1782 return 0;
1783 }
1784 goto star_3;
1785 }
1786
1787 /* Found at least one solution for both s1 against R* and
1788 * s2 against R.
1789 *
1790 * Return that solution. First, restore the registers from
1791 * the s2/R match:
1792 *
1793 */
1794
1795 restore_nested_registers (solns);
1796 solns->final_tag = solns->right->final_tag;
1797 solns->step = 4;
1798 return 1;
1799
1800 case 4:
1801 /* This is reached when we've found one solution for
1802 * s1 against R* and s2 against R, but that solution
1803 * was rejected.
1804 *
1805 * Try to find another solution for s2/R, using the same
1806 * match for s1/R*.
1807 */
1808
1809 star_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1810 if (star_stat < 0)
1811 longjmp (*err_escape, 1);
1812
1813 if (star_stat == 1)
1814 {
1815 /* found a match */
1816 solns->final_tag = solns->right->final_tag;
1817 solns->step = 4;
1818 }
1819
1820 /* No more matches for s2/R for the current match of s1/R*
1821 *
1822 * We could look for another s1/R* match -- but why bother?
1823 * Any changes to registers made by such a match are
1824 * not observable after any s/R match. So, try again with
1825 * a different split guess:
1826 */
1827
1828 rx_free_solutions (solns->left);
1829 rx_free_solutions (solns->right);
1830 solns->left = 0;
1831 solns->right = 0;
1832 if (solns->exp->left->len > 0)
1833 {
1834 solns->step = -1;
1835 return 0;
1836 }
1837 ++solns->split_guess;
1838 if (solns->split_guess > solns->end)
1839 {
1840 solns->step = -1;
1841 return 0;
1842 }
1843 goto star_3;
1844 }
1845 }
1846
1847 case r_interval:
1848 {
1849 /* On entry, interval_x is the number of iterations
1850 * already charged to this interval. So, instead
1851 * of {intval, intval2}, the interval is actually:
1852 *
1853 * { max(0, intval - interval_x),
1854 * max(0, intval2 - interval_x) }
1855 *
1856 */
1857 switch (solns->step)
1858 {
1859 int interval_stat;
1860
1861 case 1:
1862 solns->interval_from = max0(solns->exp->intval - solns->interval_x);
1863 solns->interval_to = max0(solns->exp->intval2 - solns->interval_x);
1864
1865 /* Should we try single iteration solutions?
1866 *
1867 * Yes if either:
1868 *
1869 * (interval_from <= 1) && (interval_to >= 1)
1870 *
1871 * or
1872 * ((solns->start == solns->end) && (interval_to >= 1))
1873 *
1874 * If the string is the empty string (start == end), and any
1875 * iterations are permitted (interval_to >= 1) then all possible
1876 * solutions (except the 0-iteration solution) can be found with
1877 * a single iteration. Repeated iterations for a null match do not
1878 * add new solutions compared to a single iteration because the
1879 * single iteration will always have the same side effects as multiple
1880 * iterations ending with that single iteration.
1881 */
1882
1883 if ( ( (solns->start == solns->end)
1884 && (solns->interval_to >= 1)
1885 && ( !solns->exp->left
1886 || (solns->exp->left->len <= 0)))
1887 || ( (solns->interval_from <= 1)
1888 && (solns->interval_to >= 1)
1889 && (solns->start != solns->end)
1890 && solns->exp->left
1891 && ( (solns->exp->left->len < 0)
1892 || (solns->exp->left->len == (solns->end - solns->start)))))
1893
1894 {
1895 solns->left = rx_make_solutions (solns->regs,
1896 solns->cset_size,
1897 solns->exp->left,
1898 solns->subexps,
1899 solns->nsub,
1900 solns->start,
1901 solns->end,
1902 0,
1903 solns->vmfn,
1904 solns->contextfn,
1905 solns->closure,
1906 solns->small_p,
1907 0,
1908 0);
1909 if (!solns->left)
1910 longjmp (*err_escape, 1);
1911 case 2:
1912 interval_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1913 if (interval_stat < 0)
1914 longjmp (*err_escape, 1);
1915
1916 if (interval_stat != 1)
1917 {
1918 rx_free_solutions (solns->left);
1919 solns->left = 0;
1920 goto try_0_iteration_null_match;
1921 }
1922 else
1923 {
1924 solns->step = 2;
1925 return 1;
1926 }
1927 }
1928
1929 try_0_iteration_null_match:
1930
1931 /* If we're trying to match the empty string, and 0 iterations are
1932 * permitted, that is the next best solution. If we're trying to
1933 * match the empty string, the 0 iteration solution is the only
1934 * remaining possibility.
1935 */
1936 if (solns->start == solns->end)
1937 {
1938 solns->step = -1;
1939 return (solns->interval_from == 0);
1940 }
1941
1942 /* The string is not empty. If only 0 iterations are permitted,
1943 * then no solutions are possible.
1944 */
1945 if (solns->interval_to == 0)
1946 {
1947 solns->step = -1;
1948 return 0;
1949 }
1950
1951 /* The string is not empty and no single iteration solution works.
1952 * Search through solutions with at least two iterations. We split
1953 * the string into two parts, and try to match the left part with N
1954 * iterations (N >= 1) and the right part with 1 iteration.
1955 *
1956 * To satisfy the left-to-right longest-subexpression rule, the right
1957 * part (matched by the last iteration) should be as long as possible.
1958 */
1959
1960 if (!solns->exp->left || (solns->exp->left->len == 0))
1961 {
1962 /* never reached unless certain other optimizations are removed
1963 */
1964 solns->step = -1;
1965 return 0;
1966 }
1967 else if (solns->exp->left->len > 0)
1968 {
1969 if ( (solns->exp->left->len > (solns->end - solns->start))
1970 || (0 != ((solns->end - solns->start) % solns->exp->left->len)))
1971 {
1972 solns->step = -1;
1973 return 0;
1974 }
1975 solns->split_guess = solns->end - solns->exp->left->len;
1976 }
1977 else
1978 solns->split_guess = solns->start;
1979
1980 interval_split_start_at_new_split_guess:
1981
1982 solns->right = rx_make_solutions (solns->regs,
1983 solns->cset_size,
1984 solns->exp->left,
1985 solns->subexps,
1986 solns->nsub,
1987 solns->split_guess,
1988 solns->end,
1989 0,
1990 solns->vmfn,
1991 solns->contextfn,
1992 solns->closure,
1993 solns->small_p,
1994 0,
1995 0);
1996 if (!solns->right)
1997 longjmp (*err_escape, 1);
1998
1999 next_interval_right_solution:
2000 interval_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
2001 if (interval_stat < 0)
2002 longjmp (*err_escape, 1);
2003
2004 if (interval_stat != 1)
2005 {
2006 rx_free_solutions (solns->right);
2007 solns->right = 0;
2008
2009
2010 if (solns->exp->left->len < 0)
2011 {
2012 ++solns->split_guess;
2013 if (solns->split_guess <= solns->end)
2014 goto interval_split_start_at_new_split_guess;
2015 }
2016 /* No more solutions at all. */
2017 solns->step = -1;
2018 return 0;
2019 }
2020
2021 /* Found a solution for the right half of the split.
2022 */
2023
2024 if (save_nested_registers (solns))
2025 {
2026 longjmp (*err_escape, 1);
2027 }
2028 solns->left = rx_make_solutions (solns->regs,
2029 solns->cset_size,
2030 solns->exp,
2031 solns->subexps,
2032 solns->nsub,
2033 solns->start,
2034 solns->split_guess,
2035 solns->interval_x + 1,
2036 solns->vmfn,
2037 solns->contextfn,
2038 solns->closure,
2039 solns->small_p,
2040 0,
2041 0);
2042 if (!solns->left)
2043 longjmp (*err_escape, 1);
2044
2045 interval_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
2046 if (interval_stat < 0)
2047 longjmp (*err_escape, 1);
2048
2049 if (interval_stat != 1)
2050 {
2051 /* No more solutions for the left part of the string.
2052 */
2053 rx_free_solutions (solns->left);
2054 solns->left = 0;
2055 goto next_interval_right_solution;
2056 }
2057
2058 /* We found an overall solution for the interval.
2059 */
2060 solns->step = 3;
2061 restore_nested_registers (solns);
2062 solns->final_tag = solns->right->final_tag;
2063 return 1;
2064
2065 case 3:
2066 rx_free_solutions (solns->left);
2067 solns->left = 0;
2068 goto next_interval_right_solution;
2069
2070 }
2071 }
2072
2073 case r_context:
2074 {
2075 solns->step = -1;
2076 solns->final_tag = 1;
2077 return solns->contextfn (solns->closure,
2078 solns->exp,
2079 solns->start, solns->end,
2080 solns->regs);
2081 }
2082
2083
2084 }
2085 }
2086 while (1)
2087 panic ("unreached in rx_next_solution_internal");
2088 }
2089 }
2090
2091 /*(c rx_solutions_final_tag)
2092 * int rx_solutions_final_tag (struct rx_solutions * solns);
2093 *
2094 * Return the state label of the last DFA state reached during
2095 * the solution most recently returned by `rx_next_solution'.
2096 */
2097 int
rx_solutions_final_tag(struct rx_solutions * solns)2098 rx_solutions_final_tag (struct rx_solutions * solns)
2099 {
2100 return solns->final_tag;
2101 }
2102
2103
2104
2105 /************************************************************************
2106 *(h1 "Building and Freeing a Stream of Solutions -- The General Case")
2107 *
2108 * The functions in this section are quite general. They can be used
2109 * to compare a regexp to a string which is not entirely in memory at
2110 * any one time.
2111 */
2112
2113 /*(c rx_make_solutions)
2114 * struct rx_solutions *
2115 * rx_make_solutions (struct rx_registers * regs,
2116 * int cset_size,
2117 * struct rx_exp_node * expression,
2118 * struct rx_exp_node ** subexps,
2119 * int nsub,
2120 * rx_off_t start,
2121 * rx_off_t end,
2122 * int interval_x,
2123 * rx_vmfn vmfn,
2124 * rx_contextfn contextfn,
2125 * void * closure,
2126 * int small_p,
2127 * int certainly_fits,
2128 * int certain_final_tag);
2129 *
2130 *
2131 * Construct a stream-of-solutions to a regexp matching problem. For
2132 * arguments which are not documented here, see
2133 * xref:"rx_basic_make_solutions".
2134 *
2135 * If `expression' is of type `r_interval', then `interval_x'
2136 * is the number of matches already accumulated for the subexpression.
2137 * In other words, if expression is `RE{M,N}', it will match as:
2138 *
2139 * RE{max (0,M - interval_x), max (0, N - interval_x)
2140 *
2141 * This function does not directly accept an input string. Instead,
2142 * it accepts the parameters `vmfn', `contextfn', and `closure'.
2143 *
2144 * `vmfn' is a function which can return the address of a substring
2145 * containing a chosen character position within the entire input
2146 * string. It is used to page-in parts of the input string during a
2147 * match. The details of this parameter are given below.
2148 *
2149 * `contextfn' is a function that knows how to evaluate regexp
2150 * expressions which are anchors or backreferences. These kinds of
2151 * subexpressions may be defined in terms of parts of the input string
2152 * that are not in memory or that are not contiguous in memory. The
2153 * `contextfn' has the job of sorting that out in an efficient way.
2154 * The details of this parameter are also given below.
2155 *
2156 * `closure' is an opaque parameter that is passed along to `vmfn' and
2157 * `contextfn'.
2158 *
2159 * `small_p', if not 0, means that the regexp is so trivial that
2160 * heavy-weight optimizations are not likely to pay off. It is always
2161 * safe to pass 0 for this parameter. Internally, Rx passes a non-0
2162 * value if `expression' is 0 or if `expression->small_advised_p' is
2163 * non-0. `expression->small_advised_p' is filled in by
2164 * `rx_analyze_rexp'.
2165 *
2166 * `certainly_fits', if not 0, means that if `expression' is converted
2167 * to a true regular expression (such as by `rx_simplify_rexp') and
2168 * compared to the target string, the target string will certainly match.
2169 * Setting this field can speed up some matches. If this field is
2170 * non-0, `certain_final_tag' must also be set. It is always safe to
2171 * pass 0 for `certainly_fits', though doing so may slow down some matches
2172 * slightly.
2173 *
2174 * `certain_final_tag' is the final DFA state label returned for a
2175 * match of simplified form of `expression' against the target string.
2176 * This field is only used if `certainly_fits' is not 0.
2177 */
2178 struct rx_solutions *
rx_make_solutions(struct rx_registers * regs,int cset_size,struct rx_exp_node * expression,struct rx_exp_node ** subexps,int nsub,rx_off_t start,rx_off_t end,int interval_x,rx_vmfn vmfn,rx_contextfn contextfn,void * closure,int small_p,int certainly_fits,int certain_final_tag)2179 rx_make_solutions (struct rx_registers * regs,
2180 int cset_size,
2181 struct rx_exp_node * expression,
2182 struct rx_exp_node ** subexps,
2183 int nsub,
2184 rx_off_t start,
2185 rx_off_t end,
2186 int interval_x,
2187 rx_vmfn vmfn,
2188 rx_contextfn contextfn,
2189 void * closure,
2190 int small_p,
2191 int certainly_fits,
2192 int certain_final_tag)
2193 {
2194 struct rx_solutions * solns;
2195
2196 if (!expression && (start != end))
2197 return &no_solutions;
2198
2199 if ( expression
2200 && (expression->len >= 0)
2201 && ((expression->type != r_interval)
2202 ? (expression->len != (end - start))
2203 : (/* Is an interval expression of fixed length */
2204 (!expression->len
2205 ? (end != start)
2206 : ((expression->len - (interval_x * expression->left->len)) != (end - start))))))
2207 return &no_solutions;
2208
2209 solns = (struct rx_solutions *)rx_nfa_cache_malloc (sizeof (*solns));
2210 if (!solns)
2211 return 0;
2212
2213 ++solns_allocated;
2214
2215 solns->small_p = small_p;
2216 solns->step = 0;
2217
2218 solns->cset_size = cset_size;
2219 solns->exp = expression;
2220 rx_save_exp (expression);
2221 solns->subexps = subexps;
2222 solns->nsub = nsub;
2223 solns->regs = regs;
2224
2225 solns->start = start;
2226 solns->end = end;
2227
2228 solns->vmfn = vmfn;
2229 solns->contextfn = contextfn;
2230 solns->closure = closure;
2231
2232 solns->dfa = 0;
2233 solns->match_engine.rx = 0;
2234 solns->match_engine.state = 0;
2235
2236 solns->certainly_fits = certainly_fits;
2237 solns->certain_final_tag = certain_final_tag;
2238
2239 solns->no_fancy_guessing = 0;
2240 solns->left_dfa = 0;
2241 solns->left_match_engine.rx = 0;
2242 solns->left_match_engine.state = 0;
2243
2244 solns->split_guess = 0;
2245 solns->left = 0;
2246 solns->right = 0;
2247
2248 solns->interval_x = interval_x;
2249 solns->interval_from = 0;
2250 solns->interval_to = 0;
2251
2252 solns->saved_rm_so = 0;
2253 solns->saved_rm_eo = 0;
2254 solns->final_tag = 0;
2255
2256 if (!small_p)
2257 {
2258 if (!solns->exp || !solns->exp->observed)
2259 {
2260 solns->dfa = rx_unfa (expression, cset_size);
2261 if (!solns->dfa)
2262 {
2263 rx_free_solutions (solns);
2264 return 0;
2265 }
2266 rx_init_dfa_from_nfa (&solns->match_engine, solns->dfa->nfa);
2267 if (rx_dfa_goto_start_superstate (&solns->match_engine, 1))
2268 {
2269 rx_free_solutions (solns);
2270 return 0;
2271 }
2272 }
2273 else
2274 {
2275 struct rx_exp_node * simplified;
2276 if (rx_simplify_rexp (&simplified, cset_size, solns->exp, subexps))
2277 {
2278 rx_free_solutions (solns);
2279 return 0;
2280 }
2281 solns->dfa = rx_unfa (simplified, cset_size);
2282 if (!solns->dfa)
2283 {
2284 rx_free_exp (simplified);
2285 rx_free_solutions (solns);
2286 return 0;
2287 }
2288 rx_init_dfa_from_nfa (&solns->match_engine, solns->dfa->nfa);
2289 if (rx_dfa_goto_start_superstate (&solns->match_engine, 1))
2290 {
2291 rx_free_exp (simplified);
2292 rx_free_solutions (solns);
2293 return 0;
2294 }
2295 rx_free_exp (simplified);
2296 }
2297 }
2298
2299
2300 if (solns->exp && (solns->exp->observed || solns->small_p) && (solns->exp->type == r_concat))
2301 {
2302 struct rx_exp_node * left_simplified;
2303
2304 if (rx_simplify_rexp (&left_simplified, cset_size, solns->exp->left, solns->subexps))
2305 {
2306 rx_free_solutions (solns);
2307 return 0;
2308 }
2309
2310 solns->left_dfa = rx_unfa (left_simplified, cset_size);
2311 if (!solns->left_dfa)
2312 {
2313 rx_free_exp (left_simplified);
2314 rx_free_solutions (solns);
2315 return 0;
2316 }
2317 rx_init_dfa_from_nfa (&solns->left_match_engine, solns->left_dfa->nfa);
2318 rx_free_exp (left_simplified);
2319 }
2320
2321 return solns;
2322 }
2323
2324
2325 /*(include-documentation "match-regexp.h")
2326 */
2327
2328
2329 /*(c rx_solutions_closure)
2330 * void * rx_solutions_closure (struct rx_solutions * solns);
2331 *
2332 * Given a stream of regexp solutions, return the `closure'
2333 * parameter that was passed to `rx_make_solutions'.
2334 */
2335 void *
rx_solutions_closure(struct rx_solutions * solns)2336 rx_solutions_closure (struct rx_solutions * solns)
2337 {
2338 return solns->closure;
2339 }
2340
2341
2342 /*(c rx_free_solutions)
2343 * void rx_free_solutions (struct rx_solutions * solns);
2344 *
2345 * Free all storage associated with a stream of regexp solutions
2346 * allocated by `rx_make_solutions'. See also
2347 * xref:"rx_basic_free_solutions".
2348 */
2349 void
rx_free_solutions(struct rx_solutions * solns)2350 rx_free_solutions (struct rx_solutions * solns)
2351 {
2352 if (!solns)
2353 return;
2354
2355 if (solns == &no_solutions)
2356 return;
2357
2358 if (solns->left)
2359 {
2360 rx_free_solutions (solns->left);
2361 solns->left = 0;
2362 }
2363
2364 if (solns->right)
2365 {
2366 rx_free_solutions (solns->right);
2367 solns->right = 0;
2368 }
2369
2370 if (solns->dfa)
2371 {
2372 rx_free_unfa (solns->dfa);
2373 solns->dfa = 0;
2374 }
2375
2376 rx_clear_dfa_state (&solns->match_engine);
2377
2378 if (solns->left_dfa)
2379 {
2380 rx_free_unfa (solns->left_dfa);
2381 solns->left_dfa = 0;
2382 }
2383
2384 rx_clear_dfa_state (&solns->left_match_engine);
2385
2386 if (solns->exp)
2387 {
2388 rx_free_exp (solns->exp);
2389 solns->exp = 0;
2390 }
2391
2392 if (solns->saved_rm_so)
2393 rx_nfa_cache_free (solns->saved_rm_so);
2394
2395 if (solns->saved_rm_eo)
2396 rx_nfa_cache_free (solns->saved_rm_eo);
2397
2398 rx_nfa_cache_free (solns);
2399 ++solns_freed;
2400 }
2401
2402
2403
2404
2405
2406 /****************************************************************
2407 *(h1 "Regexp Tree To Regular Expression Conversion")
2408 *
2409 */
2410
2411 /*(c rx_simplify_rexp)
2412 * int rx_simplify_rexp (struct rx_exp_node ** answer,
2413 * int cset_size,
2414 * struct rx_exp_node *node,
2415 * struct rx_exp_node ** subexps);
2416 *
2417 * Convert an expression, which might not be a regular expression, into
2418 * a regular expression matching a superset of the original pattern.
2419 *
2420 * This is useful for a matching heuristic: the regular superset
2421 * language is used to test a candidate string for a match, the
2422 * original irregular regexp is used to verify the match. The test
2423 * using the regular superset is very fast and many non-matching
2424 * strings can be quickly rejected; the verification using the
2425 * irregular regexp may be slow, so we avoid it when we can.
2426 *
2427 * If the input expression is a regular expression, this is an
2428 * identity function which increments the reference count of `node'.
2429 *
2430 * `answer' is a return parameter.
2431 *
2432 * `subexps' is an array of pointers into the expression `node'.
2433 * Element `N' of the array is the `N'th parenthesized subexpression
2434 * of the tree rooted at `node'. This array is usually computed by
2435 * `rx_analyze_rexp'.
2436 */
2437 int
rx_simplify_rexp(struct rx_exp_node ** answer,int cset_size,struct rx_exp_node * node,struct rx_exp_node ** subexps)2438 rx_simplify_rexp (struct rx_exp_node ** answer,
2439 int cset_size,
2440 struct rx_exp_node * node,
2441 struct rx_exp_node ** subexps)
2442 {
2443 int err;
2444
2445 if (!node)
2446 {
2447 *answer = 0;
2448 return 0;
2449 }
2450
2451 if (!node->observed)
2452 {
2453 rx_save_exp (node);
2454 *answer = node;
2455 return 0;
2456 }
2457
2458 if (node->simplified)
2459 {
2460 rx_save_exp (node->simplified);
2461 *answer = node->simplified;
2462 return 0;
2463 }
2464
2465 switch (node->type)
2466 {
2467 default:
2468 case r_cset:
2469 case r_string:
2470 case r_cut:
2471 panic ("bogus regexp in rx_simplify_rexp");
2472 return -1;
2473
2474 case r_parens:
2475 err = rx_simplify_rexp (answer, cset_size, node->left, subexps);
2476 if (err)
2477 return err;
2478 break;
2479
2480 case r_context:
2481 if (char_is_digit (node->intval))
2482 {
2483 err = rx_simplify_rexp (answer, cset_size, subexps [node->intval - '0'], subexps);
2484 if (err)
2485 return err;
2486 }
2487 else
2488 *answer = 0;
2489 break;
2490
2491 case r_concat:
2492 case r_right_concat:
2493 case r_alternate:
2494 case r_star:
2495 case r_interval:
2496 {
2497 struct rx_exp_node *n;
2498
2499 n = rx_exp_node (node->type);
2500 if (!n)
2501 return REG_ESPACE;
2502
2503 if (node->cset)
2504 {
2505 panic ("found a cset bitset in an unexpected place");
2506 }
2507
2508 n->intval = node->intval;
2509 n->intval2 = node->intval2;
2510 err = rx_simplify_rexp (&n->left, cset_size, node->left, subexps);
2511 if (err)
2512 {
2513 rx_free_exp (n);
2514 return err;
2515 }
2516 err = rx_simplify_rexp (&n->right, cset_size, node->right, subexps);
2517 if (err)
2518 {
2519 rx_free_exp (n);
2520 return err;
2521 }
2522 *answer = n;
2523 }
2524 break;
2525 }
2526
2527 node->simplified = *answer;
2528 rx_save_exp (node->simplified);
2529 return 0;
2530 }
2531
2532
2533
2534 /****************************************************************
2535 *(h1 "Regexp Tree Analysis")
2536 *
2537 */
2538
2539 /*(c rx_analyze_rexp)
2540 * void rx_analyze_rexp (struct rx_exp_node *** subexps,
2541 * size_t * re_nsub,
2542 * struct rx_exp_node * node);
2543 *
2544 * Recursively analyze the expression tree rooted at `node'. For each
2545 * node, fill in the fields `observed', `observation_contingent',
2546 * `small_advised_p', and `len'. Build an array of parenthesized
2547 * sub-expressions in the tree and return that array in `subexps' and
2548 * the number of elements it contains in `re_nsub'.
2549 *
2550 * See `struct rx_exp_node' for information about the meaning of
2551 * structure fields filled in by this function.
2552 */
2553 int
rx_analyze_rexp(struct rx_exp_node *** subexps,size_t * re_nsub,struct rx_exp_node * node)2554 rx_analyze_rexp (struct rx_exp_node *** subexps,
2555 size_t * re_nsub,
2556 struct rx_exp_node * node)
2557 {
2558 #define rx_max(a, b) ((a) > (b) ? (a) : (b))
2559 #define rx_min(a, b) ((a) < (b) ? (a) : (b))
2560 if (node)
2561 {
2562 size_t this_subexp;
2563
2564 this_subexp = 0;
2565
2566 if (node->type == r_parens)
2567 {
2568 if (node->intval > 0)
2569 {
2570 struct rx_exp_node ** array;
2571
2572 this_subexp = *re_nsub;
2573 ++*re_nsub;
2574 if (!*subexps)
2575 array = ((struct rx_exp_node **)
2576 rx_nfa_cache_malloc (sizeof (struct rx_exp_node *) * *re_nsub));
2577 else
2578 array = ((struct rx_exp_node **)
2579 rx_nfa_cache_realloc (*subexps,
2580 sizeof (struct rx_exp_node *) * *re_nsub));
2581 if (!array)
2582 return -1;
2583 else
2584 *subexps = array;
2585 }
2586 }
2587
2588 if (node->left && rx_analyze_rexp (subexps, re_nsub, node->left))
2589 return -1;
2590
2591 if (node->right && rx_analyze_rexp (subexps, re_nsub, node->right))
2592 return -1;
2593
2594 switch (node->type)
2595 {
2596 case r_cset:
2597 node->len = 1;
2598 node->observed = 0;
2599 node->observation_contingent = 1;
2600 node->small_advised_p = 1;
2601 node->max_enclosed_paren = 0;
2602 node->min_enclosed_paren = 0;
2603 break;
2604
2605 case r_string:
2606 node->len = node->str_len;
2607 node->observed = 0;
2608 node->observation_contingent = 1;
2609 node->small_advised_p = 1;
2610 node->max_enclosed_paren = 0;
2611 node->min_enclosed_paren = 0;
2612 break;
2613
2614 case r_cut:
2615 node->len = -1;
2616 node->observed = 0;
2617 node->observation_contingent = 1;
2618 node->small_advised_p = 1;
2619 node->max_enclosed_paren = 0;
2620 node->min_enclosed_paren = 0;
2621 break;
2622
2623 case r_concat:
2624 case r_right_concat:
2625 case r_alternate:
2626 {
2627 int lob, rob;
2628 int lobc, robc;
2629 long llen, rlen;
2630
2631 lob = (!node->left ? 0 : node->left->observed);
2632 lobc = (!node->left ? 1 : node->left->observation_contingent);
2633 rob = (!node->right ? 0 : node->right->observed);
2634 robc = (!node->right ? 1 : node->right->observation_contingent);
2635 llen = (!node->left ? 0 : node->left->len);
2636 rlen = (!node->right ? 0 : node->right->len);
2637
2638 if ((llen < 0) || (rlen < 0))
2639 node->len = -1;
2640 else if ((node->type == r_concat) || (node->type == r_right_concat))
2641 {
2642 if (((size_t)llen + (size_t)rlen) > SSIZE_MAX)
2643 panic ("absurdly large regexp in rx_analyze_rexp");
2644 node->len = (long)(llen + rlen);
2645 }
2646 else /* node->type == r_alternate */
2647 {
2648 if (llen == rlen)
2649 node->len = llen;
2650 else
2651 node->len = -1;
2652 }
2653
2654 node->observed = lob || rob;
2655 node->observation_contingent = lobc && robc;
2656
2657 if ((node->type == r_concat) || (node->type == r_right_concat))
2658 {
2659 node->small_advised_p = ( (!node->left || node->left->small_advised_p)
2660 && (!node->right || node->right->small_advised_p));
2661 }
2662 else /* node->type == r_alternate */
2663 {
2664 node->small_advised_p = 0;
2665 }
2666 node->max_enclosed_paren = rx_max ((node->left ? node->left->max_enclosed_paren : 0),
2667 (node->right ? node->right->max_enclosed_paren : 0));
2668 {
2669 int lmin;
2670 int rmin;
2671 lmin = (node->left ? node->left->min_enclosed_paren : 0);
2672 rmin = (node->right ? node->right->min_enclosed_paren : 0);
2673 node->min_enclosed_paren = (!lmin
2674 ? rmin
2675 : (!rmin
2676 ? lmin
2677 : rx_min (lmin, rmin)));
2678 }
2679 break;
2680 }
2681
2682 case r_star:
2683 node->len = ((node->left && node->left->len)
2684 ? -1
2685 : 0);
2686 node->observed = (node->left
2687 ? node->left->observed
2688 : 0);
2689 node->observation_contingent = (node->left
2690 ? node->left->observation_contingent
2691 : 1);
2692 node->small_advised_p = 0;
2693 node->max_enclosed_paren = (node->left ? node->left->max_enclosed_paren : 0);
2694 node->min_enclosed_paren = (node->left ? node->left->min_enclosed_paren : 0);
2695 break;
2696
2697 case r_interval:
2698 if (!node->left || (node->left->len == 0) || (node->intval2 == 0))
2699 node->len = 0;
2700 else if (node->left->len < 0)
2701 node->len = -1;
2702 else if (node->intval == node->intval2)
2703 {
2704 if (node->left->len > (SSIZE_MAX / node->intval))
2705 node->len = -1;
2706 else
2707 node->len = node->left->len * node->intval;
2708 }
2709 else
2710 node->len = -1;
2711 node->observed = 1;
2712 node->observation_contingent = 0;
2713 node->small_advised_p = 0;
2714 node->max_enclosed_paren = (node->left ? node->left->max_enclosed_paren : 0);
2715 node->min_enclosed_paren = (node->left ? node->left->min_enclosed_paren : 0);
2716 break;
2717
2718 case r_parens:
2719 if (node->intval > 0)
2720 {
2721 node->observed = 1;
2722 (*subexps)[this_subexp] = node;
2723 }
2724 else
2725 node->observed = (node->left
2726 ? node->left->observed
2727 : 0);
2728 node->observation_contingent = ( node->observed
2729 && (node->left
2730 ? node->left->observation_contingent
2731 : 1));
2732 node->len = (node->left
2733 ? node->left->len
2734 : 0);
2735 node->small_advised_p = (!node->left || node->left->small_advised_p);
2736 node->max_enclosed_paren = rx_max (node->intval, (node->left ? node->left->max_enclosed_paren : 0));
2737
2738 if (node->left && node->left->min_enclosed_paren)
2739 {
2740 if (!node->intval)
2741 node->min_enclosed_paren = node->left->min_enclosed_paren;
2742 else
2743 node->min_enclosed_paren = rx_min (node->intval, node->left->min_enclosed_paren);
2744 }
2745 else
2746 node->min_enclosed_paren = node->intval;
2747 break;
2748
2749 case r_context:
2750 switch (node->intval)
2751 {
2752 default:
2753 node->small_advised_p = 1;
2754 node->observed = 1;
2755 node->observation_contingent = 0;
2756 node->len = -1;
2757 break;
2758 case '^':
2759 case '$':
2760 node->small_advised_p = 1;
2761 node->observed = 1;
2762 node->observation_contingent = 0;
2763 node->len = 0;
2764 break;
2765 }
2766 node->max_enclosed_paren = 0;
2767 node->min_enclosed_paren = 0;
2768 break;
2769 }
2770 }
2771 return 0;
2772 }
2773
2774