1 /* match-regexp.c - functions for comparing a string to a regexp
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 /* #define TRACE */
12 
13 #include "hackerlab/bugs/panic.h"
14 #include "hackerlab/char/char-class.h"
15 #include "hackerlab/char/str.h"
16 #include "hackerlab/bitsets/bits.h"
17 #include "hackerlab/rx/nfa-cache.h"
18 #include "hackerlab/rx/dfa.h"
19 #include "hackerlab/rx/escape.h"
20 #ifdef TRACE
21 #include "hackerlab/vu/safe.h"
22 #include "hackerlab/rx/dbug.h"
23 
24 int rx_trace = 0;
25 int rx_depth_limit = 0;
26 
27 #endif
28 #include "hackerlab/rx-posix/errnorx.h"
29 #include "hackerlab/rx-posix/match-regexp.h"
30 
31 
32 
33 static unsigned long solns_allocated;
34 static unsigned long solns_freed;
35 
36 
37 #define max0(N)		(((N) >= 0) ? (N) : 0)
38 
39 
40 /* __STDC__ prototypes for static functions */
41 static int rx_str_vmfn (void * closure,
42 			const t_uchar ** burstp,
43 			rx_off_t * lenp,
44 			rx_off_t * offsetp,
45 			rx_off_t start,
46 			rx_off_t end,
47 			rx_off_t need);
48 static int rx_str_contextfn (void * closure,
49 			     struct rx_exp_node * node,
50 			     rx_off_t start,
51 			     rx_off_t end,
52 			     struct rx_registers * regs);
53 static int rx_solution_fit_cset_p (struct rx_solutions * solns);
54 static int rx_solution_fit_str_p (struct rx_solutions * solns);
55 static int rx_solution_fits (struct rx_solutions * solns);
56 static int rx_dfa_longest_counting (int * label,
57 				    size_t * match_len,
58 				    size_t * matches_count,
59 				    struct rx_dfa * dfa,
60 				    const t_uchar * str,
61 				    size_t len);
62 static int rx_longest_split_guess (int * label, struct rx_solutions * solns);
63 static int save_nested_registers (struct rx_solutions * solns);
64 static void restore_nested_registers (struct rx_solutions * solns);
65 static int rx_next_solution_internal (struct rx_solutions * solns, jmp_buf * err_escape, int depth);
66 
67 
68 
69 /************************************************************************
70  *(h0 "Regexp Matching"
71  *    :includes ("rx-posix/match-regexp.h"))
72  *
73  *
74  * In xref:"Regexp Expression Trees", we saw how to build expression
75  * trees for regexps and elsewhere how to convert those regexps which
76  * are regular expressions to DFA.  In xref:"DFA String Comparisons"
77  * we saw DFA-based functions that can compare a string to a regular
78  * expression.
79  *
80  * But what about those regexps which are not regular expressions?  So
81  * far, except for the quasi-standard functions `regexec' and
82  * `regnexec', we have no way to match non-regular regexps.
83  *
84  * The functions in this chapter implement regexp matching.  The
85  * interface is very detailed, but there are only a few functions and
86  * they are conceptually simple.
87  *
88  */
89 /*(menu)
90  */
91 
92 /************************************************************************
93  *(h1 "An Overview of the Regexp Matching Interface")
94  *
95  * Given a regexp, and an input string, a regexp comparison function
96  * should return a value that indicates whether or not the string
97  * matches, and an array of values filled in with the substring
98  * positions of elements of the input string that matched the
99  * parenthesized subexpressions in the match.  (For example, see
100  * xref:"regexec".)  We start out intending to build a regexp
101  * comparison function that fits the preceeding description and make
102  * two observations.
103  *
104  * First, the function needs to be recursive because we only know how
105  * to check whether or not some kinds of regexp match by checking
106  * whether their subexpressions match.
107  *
108  * Second, the function needs to return more than one set of return
109  * values because a regexp can often match a string in more than one
110  * way -- returning a different set of values for each way of
111  * matching.  To find the best overall match for a regexp, we may have
112  * to consider more than one way of matching the subexpressions of
113  * that regexp.  In fact, the Posix standard for regexps essentially
114  * mandates that we perform a potentially exhaustive search through
115  * the possible ways of matching subexpressions.  So, our recursive
116  * comparison function has to be able to find all of the different
117  * ways of matching a particular regexp and string to facilitate that
118  * search.
119  *
120  * Fortunately, we are able to order the search through the set of
121  * subexpressions in such a way that we will often find the best
122  * answer (and know it is the best answer) early.  So while our
123  * recursive comparison function has to be able to return all possible
124  * solutions, it should be free to return them in the order we will
125  * search them, and to only actually construct a particular solution
126  * if it is actually required during a search.  We can usually save
127  * time by not building solutions that we don't need to search.
128  *
129  * The functions in this chapter implement such a recursive comparison
130  * function.  Given a regexp and a string, the functions return a
131  * lazilly-constructed stream of all of the possible ways of matching
132  * the regexp to the string.  The stream is sorted; in the solution
133  * stream of a regexp subexpression, the best match, according to the
134  * rules mandated by Posix, is always the earliest match in the stream
135  * that also satisfies the constraints of the overall match of the
136  * containing expressions.
137  *
138  * You build one of these streams of subexpression solutions by calling
139  * `rx_make_solutions', passing the regexp and input string.  You
140  * read off elements of the stream by calling `rx_next_solution' which
141  * actually does the work of computing the next solution in the list.
142  *
143  * Implementing a function like `regexec' or `regnexec' is easy.
144  * `regexec' iterates over each possible starting point for a match:
145  *
146  *     for (start_pos = 0; start_pos < string_length; ++start_pos)
147  *
148  * At each starting position, `regexec' iterates through each possible
149  * ending position:
150  *
151  *	 for (end_pos = string_length; end_pos >= start_pos; --end_pos)
152  *
153  * For each possible starting and ending position, `regexec'
154  * constructs a stream of regexp solutions for that substring (by
155  * calling `rx_make_solutions').  It tries to read the first solution
156  * on that stream by calling `rx_next_solution'.  If the stream is not
157  * empty, the first solution in the stream is the best possible
158  * solution, and it is returned from `regexec'.
159  *
160  * When `regexec' calls `rx_next_solution' to find out if the stream
161  * of solutions is empty, more streams of solutions may be created for
162  * the subexpressions of the overall regexp.  Those recursively
163  * created streams may have to be searched to find an overall
164  * solution.  Even though `regexec' only reads one solution from the
165  * stream it creates, the recursive calls to `rx_next_solution' may
166  * read many solutions from the streams they create.
167  *
168  * As just described, you might think `regexec' and `regnexec' do a
169  * lot of work (running so many separate regexp comparisons).  You
170  * would be right.  Consequently, the real-world implementations of
171  * `regexec' and `regnexec' employ several tricks to reduce the number
172  * of regexp comparisons actually performed -- they try to avoid
173  * calling the functions in this chapter.  In addition, the functions
174  * in this chapter employ several tricks to avoid searching
175  * recursively created streams of solutions.  Still, in the general
176  * case, functions like `regexec' use the interface described here in
177  * the computationally expensive way described above.  (The exact
178  * complexity of a search depends on the particular regexp but in the
179  * worst cases, it is a polynomial function (of high degree) of the
180  * length of the string being searched.)
181  *
182  */
183 
184 /************************************************************************
185  *(h1 "Regexp Match Function Data Structures")
186  *
187  */
188 
189 /*(c #s"struct rx_solutions" :category type)
190  * struct rx_solutions;
191  *
192  * This opaque structure type represents the stream of solutions for
193  * matching a particular regexp against a particular string.
194  */
195 struct rx_solutions
196 {
197   int small_p;
198   int step;
199 
200   int cset_size;
201   struct rx_exp_node * exp;
202   struct rx_exp_node ** subexps;
203   int nsub;
204   struct rx_registers * regs;
205 
206   rx_off_t start;
207   rx_off_t end;
208 
209   rx_vmfn vmfn;
210   rx_contextfn contextfn;
211   void * closure;
212 
213   struct rx_unfa * dfa;
214   struct rx_dfa match_engine;
215 
216   int certainly_fits;		/* we know in advance that rx_solution_fits returns 1 */
217   int certain_final_tag;
218 
219   int no_fancy_guessing;
220   struct rx_unfa * left_dfa;
221   struct rx_dfa left_match_engine;
222 
223   rx_off_t split_guess;
224   struct rx_solutions * left;
225   struct rx_solutions * right;
226 
227   int interval_x;
228   int interval_from;
229   int interval_to;
230   rx_off_t * saved_rm_so;
231   rx_off_t * saved_rm_eo;
232   int final_tag;
233 };
234 
235 static struct rx_solutions no_solutions;
236 
237 #if 0
238 /*(c #s"struct rx_registers" :category type)
239  * struct rx_registers;
240  *
241  * This structure type holds information about the match position
242  * of one regexp subexpression in a (potentially) larger string.
243  * It includes at least these fields:
244  *
245  insert*/
246 struct rx_registers
247 {
248   rx_off_t rm_so;		/* The start of the match. */
249   rx_off_t rm_eo;		/* The end of the match. */
250 };
251 /*end-insert
252  */
253 #endif
254 
255 
256 /************************************************************************
257  * Low Level Support for Basic String Matching
258  *
259  */
260 
261 struct rx_str_closure
262 {
263   struct rx_context_rules rules;
264   const t_uchar * str;
265   rx_off_t len;
266 };
267 
268 
269 static int
rx_str_vmfn(void * closure,const t_uchar ** burstp,rx_off_t * lenp,rx_off_t * offsetp,rx_off_t start,rx_off_t end,rx_off_t need)270 rx_str_vmfn (void * closure,
271 	     const t_uchar ** burstp,
272 	     rx_off_t * lenp,
273 	     rx_off_t * offsetp,
274 	     rx_off_t start,
275 	     rx_off_t end,
276 	     rx_off_t need)
277 {
278   struct rx_str_closure * strc;
279 
280   strc = (struct rx_str_closure *)closure;
281 
282   if (   (need < 0)
283       || (need > strc->len))
284     return 0;
285 
286   *burstp = strc->str;
287   *lenp = strc->len;
288   *offsetp = 0;
289   return 0;
290 }
291 
292 
293 static int
rx_str_contextfn(void * closure,struct rx_exp_node * node,rx_off_t start,rx_off_t end,struct rx_registers * regs)294 rx_str_contextfn (void * closure,
295 		  struct rx_exp_node * node,
296 		  rx_off_t start,
297 		  rx_off_t end,
298 		  struct rx_registers * regs)
299 {
300   struct rx_str_closure * strc;
301 
302   strc = (struct rx_str_closure *)closure;
303 
304   switch (node->intval)
305     {
306     case '1': case '2': case '3': case '4': case '5':
307     case '6': case '7': case '8': case '9':
308       {
309 	int cmp;
310 	int regn;
311 
312 	regn = node->intval - '0';
313 	if (   (regs[regn].rm_so == -1)
314 	    || ((end - start) != (regs[regn].rm_eo - regs[regn].rm_so)))
315 	  return 0;
316 	else
317 	  {
318 	    if (strc->rules.case_indep)
319 	      cmp = str_casecmp_n ((t_uchar *)strc->str + start,
320 				   end - start,
321 				   (t_uchar *)strc->str + regs[regn].rm_so,
322 				   end - start);
323 	    else
324 	      cmp = str_cmp_n ((t_uchar *)strc->str + start,
325 			       end - start,
326 			       (t_uchar *)strc->str + regs[regn].rm_so,
327 			       end - start);
328 
329 	    return (!cmp
330 		    ? 1
331 		    : 0);
332 	  }
333       }
334 
335     case '^':
336       {
337 	return ((   (start == end)
338 		 && (   ((start == 0) && !strc->rules.not_bol)
339 		     || (   (start > 0)
340 			 && strc->rules.newline_anchor
341 			 && (strc->str[start - 1] == '\n'))))
342 		? 1
343 		: 0);
344       }
345 
346     case '$':
347       {
348 	return ((   (start == end)
349 		 && (   ((start == strc->len) && !strc->rules.not_eol)
350 		     || (   (start < strc->len)
351 			 && strc->rules.newline_anchor
352 			 && (strc->str[start] == '\n'))))
353 		? 1
354 		: 0);
355       }
356 
357     default:
358       while (1)
359 	panic ("unrecognized context function in rx_str_contextfn");
360     }
361 }
362 
363 
364 /************************************************************************
365  *(h1 "Building and Freeing a Stream of Solutions -- The Simplified Common Case")
366  *
367  */
368 
369 #if 0
370 /*(c rx_basic_make_solutions)
371  * struct rx_solutions *
372  * rx_basic_make_solutions (struct rx_registers * regs,
373  *			    struct rx_exp_node * expression,
374  *			    struct rx_exp_node ** subexps,
375  *			    int nsub,
376  *			    rx_off_t start,
377  *			    rx_off_t end,
378  *			    struct rx_context_rules * rules,
379  *			    const t_uchar * str,
380  *			    int small_p);
381  *
382  * Construct a stream of solutions for matching the regexp
383  * `expression' against the substring of `str' beginning at `start'
384  * and ending at `end'.
385  *
386  * Part of the output of stream, the positions of sub-expressions in a
387  * solution, will be stored in `regs' each time the next solution is
388  * read using `rx_next_solution'.
389  *
390  * The parameter `context_rules' contains flags which effect the
391  * meaning of some regexp operators.
392  *
393  insert*/
394 struct rx_context_rules
395 {
396   t_uchar newline_anchor;
397   /*   	If set, an anchor at a newline matches.*/
398 
399   t_uchar not_bol;
400   t_uchar not_eol;
401   /*	If set, the anchors do not match the beginning and end
402 	of the complete input string.*/
403 
404 
405   t_uchar case_indep;
406   /*	If set, all string comparisons should be case independent.*/
407 };
408 /*end-insert
409  *
410  * You must apply `rx_analyze_rexp' to the regexp `expression' before
411  * calling his function.  `rx_analyze_rexp' computes recursively
412  * defined properties of the subexpressions of the expression and
413  * records those properties in the expression tree. During a match,
414  * `rx_next_solution' examines those properties to decide how each
415  * subexpression should be treated.
416  *
417  * `rx_analyze_rexp' will return to you an array of the parenthesized
418  * subexpressions of `expression' and tell you the total number of
419  * subexpressions.  You must pass this information along to
420  * `rx_basic_make_solutions' as the `subexps' and `nsub' parameters.
421  *
422  * `rx_analyze_rexp' will also fill in the `small_advised_p' field of
423  * `expression'.  You can pass this along to `rx_basic_make_solutions'
424  * as the parameter `small_p'.  If not 0, `small_p' advises
425  * `rx_basic_make_solutions' that the regexp is small and simple and
426  * therefore not worthy of several high-latency optimizations that
427  * `rx_next_solution' is capable of.  Setting this parameter correctly
428  * can improve performance, setting it incorrectly can ruin
429  * performance, and `rx_analyze_rexp' can only approximately compute
430  * whether or not to set this parameter (it makes a safe guess).  The
431  * safest value to pass is 0, which activates the high-latency
432  * optimizations, sometimes (slightly) slowing down matches that
433  * aren't particularly expensive while (substantially) speeding up
434  * some matches that are particularly expensive.
435  *
436  * `rx_basic_make_solutions' requires that the entire input string be
437  * in contiguous memory at one time.  Another function,
438  * `rx_make_solutions', can be used if the string is not contiguous in
439  * memory or is not all in memory at one time.
440  */
441 #endif
442 struct rx_solutions *
rx_basic_make_solutions(struct rx_registers * regs,struct rx_exp_node * expression,struct rx_exp_node ** subexps,int nsub,rx_off_t start,rx_off_t end,struct rx_context_rules * rules,const t_uchar * str,int small_p)443 rx_basic_make_solutions (struct rx_registers * regs,
444 			 struct rx_exp_node * expression,
445 			 struct rx_exp_node ** subexps,
446 			 int nsub,
447 			 rx_off_t start,
448 			 rx_off_t end,
449 			 struct rx_context_rules * rules,
450 			 const t_uchar * str,
451 			 int small_p)
452 {
453   struct rx_solutions * solns;
454   struct rx_str_closure * closure;
455 
456   if (   expression
457       && (expression->len >= 0)
458       && (expression->len != (end - start)))
459     return &no_solutions;
460 
461   closure = (struct rx_str_closure *)rx_nfa_cache_malloc (sizeof (*closure));
462   if (!closure)
463     return 0;
464 
465   closure->str = str;
466   closure->len = end;
467   closure->rules = *rules;
468   solns =  rx_make_solutions (regs,
469 			      256, expression, subexps, nsub,
470 			      start, end, 0,
471 			      rx_str_vmfn, rx_str_contextfn, (void *)closure, small_p, 0, 0);
472   if (!solns)
473     {
474       rx_nfa_cache_free ((void *)closure);
475       return 0;
476     }
477   return solns;
478 }
479 
480 
481 /*(c rx_basic_free_solutions)
482  * void rx_basic_free_solutions (struct rx_solutions * solns);
483  *
484  * Release all storage held by a stream of regexp solutions allocated
485  * by `rx_basic_make_solutions'.
486  */
487 void
rx_basic_free_solutions(struct rx_solutions * solns)488 rx_basic_free_solutions (struct rx_solutions * solns)
489 {
490   if (solns == 0)
491     return;
492 
493   if (solns == &no_solutions)
494     return;
495 
496   rx_nfa_cache_free (rx_solutions_closure (solns));
497   rx_free_solutions (solns);
498 }
499 
500 
501 
502 /************************************************************************
503  * Low Level Support for rx_next_solution
504  *
505  *
506  *
507  */
508 
509 /* static int rx_solution_fit_cset_p (struct rx_solutions * solns);
510  *
511  * The expression is a character set node and the current substring is
512  * one character long.  Compare the two and return 1 if they match, 0
513  * otherwise.
514  */
515 static int
rx_solution_fit_cset_p(struct rx_solutions * solns)516 rx_solution_fit_cset_p (struct rx_solutions * solns)
517 {
518   int current_pos;
519   const t_uchar * burst;
520   rx_off_t burst_addr;
521   rx_off_t burst_len;
522   rx_off_t rel_pos_in_burst;
523   int vmstat;
524   const t_uchar * pos;
525 
526   current_pos = solns->start;
527   vmstat = solns->vmfn (solns->closure,
528 			&burst, &burst_len, &burst_addr,
529 			current_pos, current_pos + 1, current_pos);
530 
531   if (vmstat)
532     return vmstat;
533 
534   rel_pos_in_burst = current_pos - burst_addr;
535   pos = burst + rel_pos_in_burst;
536   return !!bits_is_member (solns->exp->cset, *pos);
537 }
538 
539 
540 /* static int rx_solution_fit_str_p (struct rx_solutions * solns);
541  *
542  * The expression is a string node and the current substring is the
543  * same length.  Compare the two and return 1 if they match, 0
544  * otherwise.
545  */
546 static int
rx_solution_fit_str_p(struct rx_solutions * solns)547 rx_solution_fit_str_p (struct rx_solutions * solns)
548 {
549   rx_off_t current_pos;
550   const t_uchar * burst;
551   rx_off_t burst_addr;
552   rx_off_t burst_len;
553   rx_off_t burst_end_addr;
554   rx_off_t rel_pos_in_burst;
555   int vmstat;
556   rx_off_t count;
557   t_uchar * key;
558 
559 
560   current_pos = solns->start;
561   count = solns->exp->str_len;
562   key = (t_uchar *)solns->exp->str;
563 
564  next_burst:
565   vmstat = solns->vmfn (solns->closure,
566 			&burst, &burst_len, &burst_addr,
567 			current_pos, solns->end,
568 			current_pos);
569 
570   if (vmstat)
571     return vmstat;
572 
573   rel_pos_in_burst = current_pos - burst_addr;
574   burst_end_addr = burst_addr + burst_len;
575 
576   {
577     const t_uchar * pos;
578 
579     pos = burst + rel_pos_in_burst;
580 
581     if (burst_end_addr >= solns->end)
582       {
583 	while (count)
584 	  {
585 	    if (rx_poll)
586 	      (*rx_poll)();
587 	    if (*pos != *key)
588 	      return 0;
589 	    ++pos;
590 	    ++key;
591 	    --count;
592 	  }
593 	return 1;
594       }
595     else
596       {
597 	rx_off_t part_count;
598 	rx_off_t part_count_init;
599 
600 	part_count_init = burst_len - rel_pos_in_burst;
601 	part_count = part_count_init;
602 	while (part_count)
603 	  {
604 	    if (rx_poll)
605 	      (*rx_poll)();
606 	    if (*pos != *key)
607 	      return 0;
608 	    ++pos;
609 	    ++key;
610 	    --part_count;
611 	  }
612 	count -= part_count_init;
613 	current_pos += burst_len - rel_pos_in_burst;
614 	goto next_burst;
615       }
616   }
617 }
618 
619 
620 /* static int rx_solution_fits (struct rx_solutions * solns);
621  *
622  * Compare a DFA which describes a superset of the regexp expression
623  * to the current substring and return 1 if they match, 0 otherwise.
624  * That the DFA matches is a necessary but not sufficient condition of
625  * the regexp expression itself matching.  This test can be used to
626  * avoid a more expensive regexp comparison.  Return -1 for ESPACE.
627  */
628 static int
rx_solution_fits(struct rx_solutions * solns)629 rx_solution_fits (struct rx_solutions * solns)
630 {
631   const t_uchar * burst;
632   rx_off_t burst_addr;
633   rx_off_t burst_len;
634   rx_off_t burst_end_addr;
635   rx_off_t rel_pos_in_burst;
636   int vmstat;
637   rx_off_t current_pos;
638 
639   if (solns->certainly_fits)
640     {
641       solns->match_engine.final_tag = solns->certain_final_tag;
642       return 1;
643     }
644 
645   current_pos = solns->start;
646 
647  next_burst:
648   vmstat = solns->vmfn (solns->closure,
649 			&burst, &burst_len, &burst_addr,
650 			current_pos, solns->end,
651 			current_pos);
652 
653   if (vmstat)
654     return vmstat;
655 
656   rel_pos_in_burst = current_pos - burst_addr;
657   burst_end_addr = burst_addr + burst_len;
658 
659   if (burst_end_addr >= solns->end)
660     {
661       int fit_label;
662 
663       if (rx_dfa_fits (&fit_label,
664 		       &solns->match_engine,
665 		       burst + rel_pos_in_burst,
666 		       solns->end - current_pos))
667 	return -1;		/* espace */
668       return fit_label;
669     }
670   else
671     {
672       int fit_status;
673       fit_status = rx_dfa_advance (&solns->match_engine,
674 				   burst + rel_pos_in_burst,
675 				   burst_len - rel_pos_in_burst);
676       if (fit_status < 0)
677 	{
678 	  return -1;
679 	}
680       else if (fit_status)
681 	{
682 	  current_pos += burst_len - rel_pos_in_burst;
683 	  goto next_burst;
684 	}
685       else
686 	return 0;
687     }
688 }
689 
690 
691 static int
rx_dfa_longest_counting(int * label,size_t * match_len,size_t * matches_count,struct rx_dfa * dfa,const t_uchar * str,size_t len)692 rx_dfa_longest_counting (int * label,
693 			 size_t * match_len,
694 			 size_t * matches_count,
695 			 struct rx_dfa * dfa,
696 			 const t_uchar * str,
697 			 size_t len)
698 {
699   size_t pos;
700   size_t n_matches;
701   size_t best_len;
702   int best_label;
703 
704   pos = 0;
705   n_matches = 0;
706   best_len = 0;
707   best_label = rx_dfa_tag (dfa);
708 
709   if (best_label)
710     {
711       n_matches = 1;
712     }
713 
714   while (pos < len)
715     {
716       int adv_stat;
717       size_t adv_amt;
718 
719       adv_stat = rx_dfa_advance_to_final (&adv_amt, dfa, str + pos, len - pos);
720 
721       if (adv_stat < 0)
722 	return -1;
723 
724       if (!adv_stat)
725 	break;
726 
727       pos += adv_amt;
728       if (rx_dfa_tag (dfa))
729 	{
730 	  best_len = pos;
731 	  ++n_matches;
732 	  best_label = rx_dfa_tag (dfa);
733 	}
734     }
735 
736   *label = best_label;
737   *match_len = best_len;
738   *matches_count = n_matches;
739   return 0;
740 }
741 
742 
743 /* adjust split_guess downward to the nearest plausible
744  * position.  Return 0 if none exists, 1 otherwise.
745  */
746 static int
rx_longest_split_guess(int * label,struct rx_solutions * solns)747 rx_longest_split_guess (int * label, struct rx_solutions * solns)
748 {
749   const t_uchar * burst;
750   rx_off_t burst_addr;
751   rx_off_t burst_len;
752   rx_off_t burst_end_addr;
753   rx_off_t rel_pos_in_burst;
754   int vmstat;
755   rx_off_t current_pos;
756   size_t n_matches;
757   rx_off_t best_guess;
758   int best_label;
759 
760   if (solns->no_fancy_guessing)
761     {
762       *label = 0;
763       return 1;
764     }
765 
766   current_pos = solns->start;
767   n_matches = 0;
768   best_guess = -1;
769   best_label = 0;
770 
771   if (rx_dfa_goto_start_superstate (&solns->left_match_engine, 1))
772     return -1;
773 
774   while (1)
775     {
776       int fit_label;
777       size_t matches_here;
778       size_t len_here;
779 
780       vmstat = solns->vmfn (solns->closure,
781 			    &burst, &burst_len, &burst_addr,
782 			    current_pos, solns->split_guess,
783 			    current_pos);
784 
785       if (vmstat)
786 	{
787 	  rx_clear_dfa_state (&solns->left_match_engine);
788 	  return vmstat;
789 	}
790 
791       rel_pos_in_burst = current_pos - burst_addr;
792       burst_end_addr = burst_addr + burst_len;
793 
794       if (rx_dfa_longest_counting (&fit_label,
795 				   &len_here,
796 				   &matches_here,
797 				   &solns->left_match_engine,
798 				   burst + rel_pos_in_burst,
799 				   ((burst_end_addr >= solns->split_guess)
800 				    ? solns->split_guess - current_pos
801 				    : burst_end_addr - current_pos)))
802 	{
803 	  rx_clear_dfa_state (&solns->left_match_engine);
804 	  return -1;
805 	}
806 
807       if (fit_label)
808 	{
809 	  n_matches += matches_here;
810 	  best_guess = len_here + rel_pos_in_burst;
811 	  best_label = fit_label;
812 	}
813 
814       if (!fit_label || (burst_end_addr >= solns->split_guess))
815 	{
816 	  if (!n_matches)
817 	    {
818 	      rx_clear_dfa_state (&solns->left_match_engine);
819 	      return 0;
820 	    }
821 
822 	  solns->split_guess = best_guess;
823 	  *label = best_label;
824 	  if (n_matches > (best_guess * 2))
825 	    solns->no_fancy_guessing = 1;
826 
827 	  rx_clear_dfa_state (&solns->left_match_engine);
828 	  return 1;
829 	}
830 
831       current_pos += burst_len - rel_pos_in_burst;
832     }
833 }
834 
835 
836 
837 /************************************************************************
838  *(h1 "Reading Elements From a Stream of Solutions")
839  *
840  */
841 
842 /* Save the values of registers for reporting parentheses
843  * enclosed in the expression solns->exp.  Initialize
844  * those registers to -1.
845  *
846  * These are stored in the `solns' structure.  Only one set of register
847  * values can be saved, per `solns', at a time.
848  */
849 static int
save_nested_registers(struct rx_solutions * solns)850 save_nested_registers (struct rx_solutions * solns)
851 {
852   int first_subexp;
853   int last_subexp;
854   int n_subexps;
855   int x;
856 
857   /* Save the value of the set of numbered
858    * parentheses in the tree rooted at this
859    * expression and initialized them to -1.  If
860    * this expression does not match, they will
861    * be restored.  If it does match, they get values
862    * from that match and not from previous matches.
863    */
864 
865   first_subexp = solns->exp->min_enclosed_paren;
866   last_subexp = solns->exp->max_enclosed_paren;
867   n_subexps = 1 + (last_subexp - first_subexp);
868 
869   if (!solns->saved_rm_so)
870     {
871       solns->saved_rm_so = (rx_off_t *)rx_nfa_cache_malloc (sizeof (rx_off_t) * n_subexps);
872       solns->saved_rm_eo = (rx_off_t *)rx_nfa_cache_malloc (sizeof (rx_off_t) * n_subexps);
873       if (!solns->saved_rm_so || !solns->saved_rm_eo)
874 	{
875 	  rx_nfa_cache_free ((void *)solns->saved_rm_so);
876 	  rx_nfa_cache_free ((void *)solns->saved_rm_eo);
877 	  return -1;
878 	}
879     }
880 
881   for (x = 0; x < n_subexps; ++x)
882     {
883       solns->saved_rm_so[x] = solns->regs[x + first_subexp].rm_so;
884       solns->saved_rm_eo[x] = solns->regs[x + first_subexp].rm_eo;
885       solns->regs[x + first_subexp].rm_so = -1;
886       solns->regs[x + first_subexp].rm_eo = -1;
887     }
888   return 0;
889 }
890 
891 /* Restore register values saved by `save_nested_registers'.
892  *
893  * This doesn't erase the stored values.
894  */
895 static void
restore_nested_registers(struct rx_solutions * solns)896 restore_nested_registers (struct rx_solutions * solns)
897 {
898   int first_subexp;
899   int last_subexp;
900   int n_subexps;
901   int x;
902 
903   first_subexp = solns->exp->min_enclosed_paren;
904   last_subexp = solns->exp->max_enclosed_paren;
905   n_subexps = 1 + (last_subexp - first_subexp);
906 
907   for (x = 0; x < n_subexps; ++x)
908     {
909       solns->regs[x + first_subexp].rm_so = solns->saved_rm_so[x];
910       solns->regs[x + first_subexp].rm_eo = solns->saved_rm_eo[x];
911     }
912 }
913 
914 /*(c rx_next_solution)
915  * int rx_next_solution (struct rx_solutions * solns);
916  *
917  * If there are no more solutions in the stream of regexp solutions
918  * `solns', return 0.  Otherwise, advance the stream to the next
919  * solution and return 1.  Return -1 for ESPACE.
920  *
921  * Advancing the stream means filling in the `regs' structure that was
922  * passed to `rx_basic_make_solutions' or `rx_make_solutions' and
923  * establishing the value that will be returned by subsequent calls to
924  * `rx_solutions_final_tag'.
925  *
926  * Solutions are returned in order from best to worst according to the
927  * leftmost-longest rule (see xref:"Describing Regexps Formally").
928  *
929  * It is possible to asynchronously abort a call to this function.
930  * See xref:"Exiting Long-Running Matches".
931  */
932 int
rx_next_solution(struct rx_solutions * solns)933 rx_next_solution (struct rx_solutions * solns)
934 {
935   jmp_buf err_escape;
936 
937   if (setjmp (err_escape))
938     {
939       return -1;
940     }
941   return rx_next_solution_internal (solns, &err_escape, 0);
942 }
943 
944 
945 static int
rx_next_solution_internal(struct rx_solutions * solns,jmp_buf * err_escape,int depth)946 rx_next_solution_internal (struct rx_solutions * solns, jmp_buf * err_escape, int depth)
947 {
948 #ifdef TRACE
949   if (rx_trace && (!rx_depth_limit || (depth < rx_depth_limit)))
950     {
951       safe_printfmt (2, "%*s================\n", depth, "");
952       safe_printfmt (2, "%*sstep: %d\n", depth, "", solns->step);
953       safe_printfmt (2, "%*s----------------\n", depth, "");
954       safe_printfmt (2, "%*sexpression:\n", depth, "");
955       safe_printfmt (2, "%*s", depth, "");
956       rx_unparse_print_rexp (2, 256, solns->exp);
957       safe_printfmt (2, "\n");
958       safe_printfmt (2, "%*s----------------\n", depth, "");
959       safe_printfmt (2, "%*sstring:\n", depth, "");
960       {
961 	const t_uchar * burst;
962 	rx_off_t burst_addr;
963 	rx_off_t burst_len;
964 	rx_off_t rel_pos_in_burst;
965 	int vmstat;
966 
967 	vmstat = solns->vmfn (solns->closure,
968 			      &burst, &burst_len, &burst_addr,
969 			      solns->start, solns->end, solns->start);
970 
971 	if (vmstat)
972 	  panic ("vmstat failure in trace");
973 
974 	rel_pos_in_burst = solns->start - burst_addr;
975 	safe_printfmt (2, "%*s%.*s\n", depth, "",
976 		       ((solns->end - solns->start) <= (burst_len - rel_pos_in_burst)
977 			? (solns->end - solns->start)
978 			: (burst_len - rel_pos_in_burst)),
979 		       burst + rel_pos_in_burst);
980       }
981       safe_printfmt (2, "%*s----------------\n", depth, "");
982     }
983 #endif
984 
985 #define KNOWN_LENGTH(EXP)	(!(EXP) \
986 				 ? 0 \
987 				 : ((EXP)->len >= 0 \
988 				    ? (EXP)->len \
989 				    : ((((EXP)->type == r_context) && ((EXP)->intval >= '0') && ((EXP)->intval <= '9')) \
990 				       ? ((solns->regs[(EXP)->intval - '0'].rm_so >= 0) \
991 					  ? (solns->regs[(EXP)->intval - '0'].rm_eo - solns->regs[(EXP)->intval - '0'].rm_so) \
992 					  : -1) \
993 				       : -1)))
994 
995 
996   if (solns == &no_solutions)
997     {
998       return 0;
999     }
1000 
1001   if (!solns->exp)
1002     {
1003       if (solns->step != 0)
1004 	{
1005 	  return 0;
1006 	}
1007       else
1008 	{
1009 	  solns->step = 1;
1010 	  solns->final_tag = 1;
1011 	  return (solns->start == solns->end
1012 		  ? 1
1013 		  : 0);
1014 	}
1015     }
1016   else if (!(solns->small_p || solns->exp->observed))
1017     {
1018       if (solns->step != 0)
1019 	{
1020 	  return 0;
1021 	}
1022       else if (solns->exp->type == r_string)
1023 	{
1024 	  int ans;
1025 	  ans = rx_solution_fit_str_p (solns);
1026 	  solns->final_tag = 1;
1027 	  solns->step = -1;
1028 	  return ans;
1029 	}
1030       else
1031 	{
1032 	  int ans;
1033 	  ans = rx_solution_fits (solns);
1034 	  if (ans < 0)
1035 	    longjmp (*err_escape, 1);
1036 	  solns->final_tag = solns->match_engine.final_tag;
1037 	  solns->step = -1;
1038 	  return !!ans;
1039 	}
1040     }
1041   else				/* if (solns->small_p || solns->exp->observed) */
1042     {
1043       int fits;
1044       switch (solns->step)
1045 	{
1046 	case -2:
1047 	  if (solns->exp->min_enclosed_paren)
1048 	    {
1049 	      int first_subexp;
1050 	      int last_subexp;
1051 	      int x;
1052 
1053 	      first_subexp = solns->exp->min_enclosed_paren;
1054 	      last_subexp = solns->exp->max_enclosed_paren;
1055 
1056 	      for (x = first_subexp; x <= last_subexp; ++x)
1057 		{
1058 		  solns->regs[x].rm_so = solns->saved_rm_so[x - first_subexp];
1059 		  solns->regs[x].rm_eo = solns->saved_rm_eo[x - first_subexp];
1060 		}
1061 	    }
1062 	  return 0;
1063 
1064 	case -1:
1065 	  return 0;
1066 
1067 	case 0:
1068 	  if (solns->small_p)
1069 	    {
1070 	      solns->step = 1;
1071 	      goto resolve_fit;
1072 	    }
1073 	  else
1074 	    {
1075 	      fits = rx_solution_fits (solns);
1076 	      if (fits < 0)
1077 		longjmp (*err_escape, 1);
1078 	      /* Set final_tag here because this rough fit test
1079 	       * may be all the matching that gets done.
1080 	       * For example, consider a paren node containing
1081 	       * a true regular expression ending with a cut
1082 	       * operator.
1083 	       */
1084 	      solns->final_tag = solns->match_engine.final_tag;
1085 	      if (fits == 0)
1086 		{
1087 		  solns->step = -1;
1088 		  return fits;
1089 		}
1090 	      else
1091 		{
1092 		  solns->step = 1;
1093 		  goto resolve_fit;
1094 		}
1095 	    }
1096 
1097 	default:
1098 	resolve_fit:
1099 	  switch (solns->exp->type)
1100 	    {
1101 	    case r_string:
1102 	      if (!solns->small_p)
1103 		{
1104 		  while (1)
1105 		    panic ("bogus regexp in rx_next_solution_internal");
1106 		}
1107 	      else
1108 		{
1109 		  int answer;
1110 		  answer = rx_solution_fit_str_p (solns);
1111 		  solns->final_tag = 1;
1112 		  solns->step = -1;
1113 		  return answer;
1114 		}
1115 
1116 	    case r_cset:
1117 	      if (!solns->small_p)
1118 		{
1119 		  while (1)
1120 		    panic ("bogus regexp in rx_next_solution_internal");
1121 		}
1122 	      else
1123 		{
1124 		  int answer;
1125 		  answer = rx_solution_fit_cset_p (solns);
1126 		  solns->final_tag = 1;
1127 		  solns->step = -1;
1128 		  return answer;
1129 		}
1130 
1131 	    case r_cut:
1132 	      if (!solns->small_p)
1133 		{
1134 		  while (1)
1135 		    panic ("bogus regexp in rx_next_solution_internal");
1136 		}
1137 	      else
1138 		{
1139 		  solns->final_tag = solns->exp->intval;
1140 		  solns->step = -1;
1141 		  return !!solns->exp->intval;
1142 		}
1143 
1144 	    case r_parens:
1145 	      {
1146 		int paren_stat;
1147 		switch (solns->step)
1148 		  {
1149 		  case 1:
1150 		    if (solns->exp->min_enclosed_paren)
1151 		      {
1152 			if (save_nested_registers (solns))
1153 			  longjmp (*err_escape, 1);
1154 		      }
1155 
1156 		    if (   !solns->small_p
1157 			&& (   !solns->exp->left
1158 			    || !solns->exp->left->observed))
1159 		      {
1160 			/* We already proved that this (simple)
1161 			 * subexpression matches.  Optionally, record
1162 			 * its position.  Return success once, and
1163 			 * failure for all subsequent attempts to find
1164 			 * other matches.
1165 			 */
1166 			if (solns->exp->intval)
1167 			  {
1168 			    solns->regs[solns->exp->intval].rm_so = solns->start;
1169 			    solns->regs[solns->exp->intval].rm_eo = solns->end;
1170 			  }
1171 			solns->step = -2;
1172 			/* Keep the final_tag from the fits test. */
1173 			return 1;
1174 		      }
1175 
1176 		    /* We must search through possible matches of the subexpression.
1177 		     */
1178 		    solns->left = rx_make_solutions (solns->regs,
1179 						     solns->cset_size,
1180 						     solns->exp->left,
1181 						     solns->subexps,
1182 						     solns->nsub,
1183 						     solns->start,
1184 						     solns->end,
1185 						     0,
1186 						     solns->vmfn,
1187 						     solns->contextfn,
1188 						     solns->closure,
1189 						     solns->small_p,
1190 						     0,
1191 						     0);
1192 		    if (!solns->left)
1193 		      longjmp (*err_escape, 1);
1194 
1195 		  case 2:
1196 		    if (solns->exp->min_enclosed_paren)
1197 		      {
1198 			int first_subexp;
1199 			int last_subexp;
1200 			int x;
1201 
1202 			/* Initialize the parentheses in the tree
1203 			 * rooted at this expression to -1.
1204 			 */
1205 
1206 			first_subexp = solns->exp->min_enclosed_paren;
1207 			last_subexp = solns->exp->max_enclosed_paren;
1208 
1209 			for (x = first_subexp; x <= last_subexp; ++x)
1210 			  {
1211 			    solns->regs[x].rm_so = -1;
1212 			    solns->regs[x].rm_eo = -1;
1213 			  }
1214 		      }
1215 
1216 		    paren_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1217 		    if (paren_stat < 0)
1218 		      longjmp (*err_escape, 1);
1219 
1220 		    if (paren_stat == 1)
1221 		      {
1222 			if (solns->exp->intval)
1223 			  {
1224 			    solns->regs[solns->exp->intval].rm_so = solns->start;
1225 			    solns->regs[solns->exp->intval].rm_eo = solns->end;
1226 			  }
1227 			solns->final_tag = solns->left->final_tag;
1228 			solns->step = 2;
1229 			return 1;
1230 		      }
1231 		    else
1232 		      {
1233 			rx_free_solutions (solns->left);
1234 			solns->left = 0;
1235 			if (solns->exp->min_enclosed_paren)
1236 			  {
1237 			    /* This expression didn't match.  Restore the old subexpression
1238 			     * positions.
1239 			     */
1240 			    restore_nested_registers (solns);
1241 			  }
1242 			solns->step = -1;
1243 			return paren_stat;
1244 		      }
1245 		  }
1246 	      }
1247 
1248 	    case r_alternate:
1249 	      {
1250 		int alt_stat;
1251 		switch (solns->step)
1252 		  {
1253 		  case 1:
1254 		    solns->left = rx_make_solutions (solns->regs,
1255 						     solns->cset_size,
1256 						     solns->exp->left,
1257 						     solns->subexps,
1258 						     solns->nsub,
1259 						     solns->start,
1260 						     solns->end,
1261 						     0,
1262 						     solns->vmfn,
1263 						     solns->contextfn,
1264 						     solns->closure,
1265 						     solns->small_p,
1266 						     0,
1267 						     0);
1268 		    if (!solns->left)
1269 		      longjmp (*err_escape, 1);
1270 
1271 		  case 2:
1272 		    alt_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1273 		    if (alt_stat < 0)
1274 		      longjmp (*err_escape, 1);
1275 
1276 		    if (alt_stat == 1)
1277 		      {
1278 			solns->final_tag = solns->left->final_tag;
1279 			solns->step = 2;
1280 			return alt_stat;
1281 		      }
1282 		    else
1283 		      {
1284 			rx_free_solutions (solns->left);
1285 			solns->left = 0;
1286 		      }
1287 
1288 		    solns->right = rx_make_solutions (solns->regs,
1289 						      solns->cset_size,
1290 						      solns->exp->right,
1291 						      solns->subexps,
1292 						      solns->nsub,
1293 						      solns->start,
1294 						      solns->end,
1295 						      0,
1296 						      solns->vmfn,
1297 						      solns->contextfn,
1298 						      solns->closure,
1299 						      solns->small_p,
1300 						      0,
1301 						      0);
1302 		    if (!solns->right)
1303 		      longjmp (*err_escape, 1);
1304 
1305 		  case 4:
1306 		    alt_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1307 		    if (alt_stat < 0)
1308 		      longjmp (*err_escape, 1);
1309 
1310 		    if (alt_stat == 1)
1311 		      {
1312 			solns->final_tag = solns->right->final_tag;
1313 			solns->step = 4;
1314 			return alt_stat;
1315 		      }
1316 		    else
1317 		      {
1318 			solns->step = -1;
1319 			rx_free_solutions (solns->right);
1320 			solns->right = 0;
1321 			return alt_stat;
1322 		      }
1323 		  }
1324 	      }
1325 
1326 	    case r_concat:
1327 	      {
1328 		switch (solns->step)
1329 		  {
1330 		    int concat_stat;
1331 		    int split_guess_stat;
1332 		    int certain_label;
1333 
1334 		  case 1:
1335 
1336 		    if (!solns->exp->left)
1337 		      solns->split_guess = solns->start;
1338 		    else
1339 		      {
1340 			rx_off_t known_left_length;
1341 
1342 			known_left_length = KNOWN_LENGTH (solns->exp->left);
1343 			if (known_left_length >= 0)
1344 			  {
1345 			    if (known_left_length > (solns->end - solns->start))
1346 			      {
1347 				solns->step = -1;
1348 				return 0;
1349 			      }
1350 			    solns->split_guess = solns->start + known_left_length;
1351 			  }
1352 			else
1353 			  {
1354 			    rx_off_t known_right_length;
1355 
1356 			    known_right_length = KNOWN_LENGTH (solns->exp->right);
1357 
1358 			    if (known_right_length >= 0)
1359 			      {
1360 				if (known_right_length > (solns->end - solns->start))
1361 				  {
1362 				    solns->step = -1;
1363 				    return 0;
1364 				  }
1365 				solns->split_guess = solns->end - known_right_length;
1366 			      }
1367 			    else
1368 			      solns->split_guess = solns->end;
1369 			  }
1370 		      }
1371 
1372 		  concat_split_guess_loop:
1373 
1374 		    split_guess_stat = rx_longest_split_guess (&certain_label, solns);
1375 
1376 		    if (split_guess_stat < 0)
1377 		      longjmp (*err_escape, 1);
1378 
1379 		    if (!split_guess_stat)
1380 		      {
1381 			solns->step = -1;
1382 			return 0;
1383 		      }
1384 
1385 		    solns->left = rx_make_solutions (solns->regs,
1386 						     solns->cset_size,
1387 						     solns->exp->left,
1388 						     solns->subexps,
1389 						     solns->nsub,
1390 						     solns->start,
1391 						     solns->split_guess,
1392 						     0,
1393 						     solns->vmfn,
1394 						     solns->contextfn,
1395 						     solns->closure,
1396 						     solns->small_p,
1397 						     !!certain_label,
1398 						     certain_label);
1399 		    if (!solns->left)
1400 		      longjmp (*err_escape, 1);
1401 
1402 		  concat_try_next_left_match:
1403 
1404 		    concat_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1405 		    if (concat_stat < 0)
1406 		      longjmp (*err_escape, 1);
1407 
1408 		    if (concat_stat != 1)
1409 		      {
1410 			rx_free_solutions (solns->left);
1411 			solns->left = 0;
1412 			if (   (KNOWN_LENGTH (solns->exp->left) < 0)
1413 			    && (KNOWN_LENGTH (solns->exp->right) < 0))
1414 			  {
1415 			    --solns->split_guess;
1416 			    if (solns->split_guess >= solns->start)
1417 			      goto concat_split_guess_loop;
1418 			  }
1419 
1420 			solns->step = -1;
1421 			return 0;
1422 		      }
1423 
1424 		    solns->right = rx_make_solutions (solns->regs,
1425 						      solns->cset_size,
1426 						      solns->exp->right,
1427 						      solns->subexps,
1428 						      solns->nsub,
1429 						      solns->split_guess,
1430 						      solns->end,
1431 						      0,
1432 						      solns->vmfn,
1433 						      solns->contextfn,
1434 						      solns->closure,
1435 						      solns->small_p,
1436 						      0,
1437 						      0);
1438 		    if (!solns->right)
1439 		      longjmp (*err_escape, 1);
1440 
1441 		  case 2:
1442 		    concat_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1443 		    if (concat_stat < 0)
1444 		      longjmp (*err_escape, 1);
1445 
1446 		    if (concat_stat == 1)
1447 		      {
1448 			solns->final_tag = solns->right->final_tag;
1449 			solns->step = 2;
1450 			return concat_stat;
1451 		      }
1452 		    else
1453 		      {
1454 			rx_free_solutions (solns->right);
1455 			solns->right = 0;
1456 			goto concat_try_next_left_match;
1457 		      }
1458 		  }
1459 	      }
1460 
1461 	    case r_right_concat:
1462 	      {
1463 		switch (solns->step)
1464 		  {
1465 		    int concat_stat;
1466 		    rx_off_t known_left_length;
1467 		    rx_off_t known_right_length;
1468 
1469 		  case 1:
1470 		    known_left_length = KNOWN_LENGTH (solns->exp->left);
1471 		    known_right_length = KNOWN_LENGTH (solns->exp->right);
1472 		    if (known_left_length >= 0)
1473 		      {
1474 			solns->split_guess = solns->start + known_left_length;
1475 		      }
1476 		    else if (known_right_length >= 0)
1477 		      {
1478 			solns->split_guess = solns->end - known_right_length;
1479 		      }
1480 		    else
1481 		      solns->split_guess = solns->start;
1482 
1483 
1484 		  right_concat_split_guess_loop:
1485 		    solns->left = rx_make_solutions (solns->regs,
1486 						     solns->cset_size,
1487 						     solns->exp->left,
1488 						     solns->subexps,
1489 						     solns->nsub,
1490 						     solns->start,
1491 						     solns->split_guess,
1492 						     0,
1493 						     solns->vmfn,
1494 						     solns->contextfn,
1495 						     solns->closure,
1496 						     solns->small_p,
1497 						     0,
1498 						     0);
1499 		    if (!solns->left)
1500 		      longjmp (*err_escape, 1);
1501 
1502 		  right_concat_try_next_left_match:
1503 
1504 		    concat_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1505 		    if (concat_stat < 0)
1506 		      longjmp (*err_escape, 1);
1507 
1508 		    if (concat_stat != 1)
1509 		      {
1510 			rx_free_solutions (solns->left);
1511 			solns->left = 0;
1512 			if (   (KNOWN_LENGTH (solns->exp->left) < 0)
1513 			    && (KNOWN_LENGTH (solns->exp->right) < 0))
1514 			  {
1515 			    ++solns->split_guess;
1516 			    if (solns->split_guess <= solns->end)
1517 			      goto right_concat_split_guess_loop;
1518 			  }
1519 
1520 			solns->step = -1;
1521 			return 0;
1522 		      }
1523 
1524 		    solns->right = rx_make_solutions (solns->regs,
1525 						      solns->cset_size,
1526 						      solns->exp->right,
1527 						      solns->subexps,
1528 						      solns->nsub,
1529 						      solns->split_guess,
1530 						      solns->end,
1531 						      0,
1532 						      solns->vmfn,
1533 						      solns->contextfn,
1534 						      solns->closure,
1535 						      solns->small_p,
1536 						      0,
1537 						      0);
1538 		    if (!solns->right)
1539 		      longjmp (*err_escape, 1);
1540 
1541 		  case 2:
1542 		    concat_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1543 		    if (concat_stat < 0)
1544 		      longjmp (*err_escape, 1);
1545 
1546 		    if (concat_stat == 1)
1547 		      {
1548 			solns->final_tag = solns->right->final_tag;
1549 			solns->step = 2;
1550 			return concat_stat;
1551 		      }
1552 		    else
1553 		      {
1554 			rx_free_solutions (solns->right);
1555 			solns->right = 0;
1556 			goto right_concat_try_next_left_match;
1557 		      }
1558 		  }
1559 	      }
1560 
1561 	    case r_star:
1562 	      {
1563 		int star_stat;
1564 
1565 		switch (solns->step)
1566 		  {
1567 		  case 1:
1568 		    /* Begin by trying to match the entire string
1569 		     * with a single iteration.
1570 		     */
1571 		    solns->left = rx_make_solutions (solns->regs,
1572 						     solns->cset_size,
1573 						     solns->exp->left,
1574 						     solns->subexps,
1575 						     solns->nsub,
1576 						     solns->start,
1577 						     solns->end,
1578 						     0,
1579 						     solns->vmfn,
1580 						     solns->contextfn,
1581 						     solns->closure,
1582 						     solns->small_p,
1583 						     0,
1584 						     0);
1585 		    if (!solns->left)
1586 		      longjmp (*err_escape, 1);
1587 
1588 
1589 		  case 2:
1590 		    star_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1591 		    if (star_stat < 0)
1592 		      longjmp (*err_escape, 1);
1593 
1594 		    if (star_stat == 1)
1595 		      {
1596 			/* found a single-iteration whole-string match */
1597 			solns->final_tag = solns->left->final_tag;
1598 			solns->step = 2;
1599 			return 1;
1600 		      }
1601 
1602 		    /* no more single-iteration whole-string matches */
1603 		    rx_free_solutions (solns->left);
1604 		    solns->left = 0;
1605 
1606 		    if (solns->start == solns->end)
1607 		      {
1608 			/* when comaring to the empty string and none of the single-iteration
1609 			 * matches work, try matching with zero iterations, then give up
1610 			 */
1611 			solns->step = -1;
1612 			return 1;
1613 		      }
1614 
1615 		    /* when comparing to a non-empty string and none of the single-iteration
1616 		     * matches work, try all matches with two or more non-empty iterations.
1617 		     */
1618 		    if (!solns->exp->left || (solns->exp->left->len == 0))
1619 		      {
1620 			solns->step = -1;
1621 			return 0;
1622 		      }
1623 		    else if (solns->exp->left->len > 0)
1624 		      {
1625 			if (   (solns->exp->left->len >= (solns->end - solns->start))
1626 			    || (0 != ((solns->end - solns->start) % solns->exp->left->len)))
1627 			  {
1628 			    solns->step = -1;
1629 			    return 0;
1630 			  }
1631 			solns->split_guess = solns->end - solns->exp->left->len;
1632 		      }
1633 		    else
1634 		      solns->split_guess = solns->start + 1;
1635 
1636 		  case 3:
1637 		  star_3:
1638 
1639 		    /* Suppose the string has been split into two parts: s == s1 s2
1640 		     * and we are trying to match R*.
1641 		     *
1642 		     * Recurse by matching s1 against R*, and s2 against R.
1643 		     *
1644 		     * However, if s == s1 and s2 is the empty string, comparing
1645 		     * s1 to R* would cause an infinite recursion.  So instead,
1646 		     * compare s1 to R and s2 to R.
1647 		     *
1648 		     * Comparing s2 to R is often much less expensive than comparing
1649 		     * s1 to R*.  If s2 doesn't match R, time spent comparing s1 to R*
1650 		     * is wasted.  So, before trying s1 against R*, make sure that
1651 		     * s2 can match R.
1652 		     *
1653 		     * Registers are subtle.  There are three possibilities:
1654 		     *
1655 		     *		[A] R contains no reporting parens
1656 		     *		[B] R contains reporting parens
1657 		     *
1658 		     * In case A, matching s1 against R* won't change any registers,
1659 		     * so it is safe to match s to R first.
1660 		     *
1661 		     * In the second case, either
1662 		     *
1663 		     * 		[B1] R is a reporting parenthesized expression
1664 		     * 		[B2] R is a non-reporting parenthesized expression
1665 		     *
1666 		     * In either of those two cases, the first step of
1667 		     * comparing s2 to R is to set all of the
1668 		     * registers for nested reporting parentheses to
1669 		     * -1.  As a result, again, s1 against R* has no (observable) effect
1670 		     * on registers -- so it is safe to match s to R first.
1671 		     *
1672 		     */
1673 
1674 		    solns->right = rx_make_solutions (solns->regs,
1675 						      solns->cset_size,
1676 						      solns->exp->left,
1677 						      solns->subexps,
1678 						      solns->nsub,
1679 						      solns->split_guess,
1680 						      solns->end,
1681 						      0,
1682 						      solns->vmfn,
1683 						      solns->contextfn,
1684 						      solns->closure,
1685 						      solns->small_p,
1686 						      0,
1687 						      0);
1688 		    if (!solns->right)
1689 		      longjmp (*err_escape, 1);
1690 
1691 		    star_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1692 		    if (star_stat < 0)
1693 		      longjmp (*err_escape, 1);
1694 
1695 		    if (star_stat != 1)
1696 		      {
1697 			/* No match for s2 against R, so guess again:
1698 			 */
1699 			rx_free_solutions (solns->right);
1700 			solns->right = 0;
1701 			if (solns->exp->left->len > 0)
1702 			  {
1703 			    solns->step = -1;
1704 			    return 0;
1705 			  }
1706 			++solns->split_guess;
1707 			if (solns->split_guess > solns->end)
1708 			  {
1709 			    solns->step = -1;
1710 			    return 0;
1711 			  }
1712 			goto star_3;
1713 		      }
1714 
1715 		    /* Aha -- s2 matches R.  Does s1 match R*?
1716 		     *
1717 		     * Testing s1 against R* can modify some registers
1718 		     * that were set while matching s2 to R.  These
1719 		     * registers must be saved and then restored before
1720 		     * returning the overall match.
1721 		     */
1722 		    if (save_nested_registers (solns))
1723 		      {
1724 			longjmp (*err_escape, 1);
1725 		      }
1726 
1727 		    if (solns->split_guess == solns->end)
1728 		      solns->left = rx_make_solutions (solns->regs,
1729 						       solns->cset_size,
1730 						       solns->exp->left,
1731 						       solns->subexps,
1732 						       solns->nsub,
1733 						       solns->start,
1734 						       solns->split_guess,
1735 						       0,
1736 						       solns->vmfn,
1737 						       solns->contextfn,
1738 						       solns->closure,
1739 						       solns->small_p,
1740 						       0,
1741 						       0);
1742 		    else
1743 		      solns->left = rx_make_solutions (solns->regs,
1744 						       solns->cset_size,
1745 						       solns->exp,
1746 						       solns->subexps,
1747 						       solns->nsub,
1748 						       solns->start,
1749 						       solns->split_guess,
1750 						       0,
1751 						       solns->vmfn,
1752 						       solns->contextfn,
1753 						       solns->closure,
1754 						       solns->small_p,
1755 						       0,
1756 						       0);
1757 		    if (!solns->left)
1758 		      longjmp (*err_escape, 1);
1759 
1760 		    star_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1761 		    if (star_stat < 0)
1762 		      longjmp (*err_escape, 1);
1763 
1764 		    if (star_stat != 1)
1765 		      {
1766 			/* no matches for the left part of the split string
1767 			 * at this split_guess.
1768 			 */
1769 			rx_free_solutions (solns->left);
1770 			rx_free_solutions (solns->right);
1771 			solns->left = 0;
1772 			solns->right = 0;
1773 			if (solns->exp->left->len > 0)
1774 			  {
1775 			    solns->step = -1;
1776 			    return 0;
1777 			  }
1778 			++solns->split_guess;
1779 			if (solns->split_guess > solns->end)
1780 			  {
1781 			    solns->step = -1;
1782 			    return 0;
1783 			  }
1784 			goto star_3;
1785 		      }
1786 
1787 		    /* Found at least one solution for both s1 against R* and
1788 		     * s2 against R.
1789 		     *
1790 		     * Return that solution.  First, restore the registers from
1791 		     * the s2/R match:
1792 		     *
1793 		     */
1794 
1795 		    restore_nested_registers (solns);
1796 		    solns->final_tag = solns->right->final_tag;
1797 		    solns->step = 4;
1798 		    return 1;
1799 
1800 		  case 4:
1801 		    /* This is reached when we've found one solution for
1802 		     * s1 against R* and s2 against R, but that solution
1803 		     * was rejected.
1804 		     *
1805 		     * Try to find another solution for s2/R, using the same
1806 		     * match for s1/R*.
1807 		     */
1808 
1809 		    star_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
1810 		    if (star_stat < 0)
1811 		      longjmp (*err_escape, 1);
1812 
1813 		    if (star_stat == 1)
1814 		      {
1815 			/* found a match */
1816 			solns->final_tag = solns->right->final_tag;
1817 			solns->step = 4;
1818 		      }
1819 
1820 		    /* No more matches for s2/R for the current match of s1/R*
1821 		     *
1822 		     * We could look for another s1/R* match -- but why bother?
1823 		     * Any changes to registers made by such a match are
1824 		     * not observable after any s/R match.  So, try again with
1825 		     * a different split guess:
1826 		     */
1827 
1828 		    rx_free_solutions (solns->left);
1829 		    rx_free_solutions (solns->right);
1830 		    solns->left = 0;
1831 		    solns->right = 0;
1832 		    if (solns->exp->left->len > 0)
1833 		      {
1834 			solns->step = -1;
1835 			return 0;
1836 		      }
1837 		    ++solns->split_guess;
1838 		    if (solns->split_guess > solns->end)
1839 		      {
1840 			solns->step = -1;
1841 			return 0;
1842 		      }
1843 		    goto star_3;
1844 		  }
1845 	      }
1846 
1847 	    case r_interval:
1848 	      {
1849 		/* On entry, interval_x is the number of iterations
1850 		 * already charged to this interval.  So, instead
1851 		 * of {intval, intval2}, the interval is actually:
1852 		 *
1853 		 * 	{ max(0, intval - interval_x),
1854 		 *	  max(0, intval2 - interval_x) }
1855 		 *
1856 		 */
1857 		switch (solns->step)
1858 		  {
1859 		    int interval_stat;
1860 
1861 		  case 1:
1862 		    solns->interval_from = max0(solns->exp->intval - solns->interval_x);
1863 		    solns->interval_to = max0(solns->exp->intval2 - solns->interval_x);
1864 
1865 		    /* Should we try single iteration solutions?
1866 		     *
1867 		     * Yes if either:
1868 		     *
1869 		     *		(interval_from <= 1) && (interval_to >= 1)
1870 		     *
1871 		     * or
1872 		     * 		((solns->start == solns->end) && (interval_to >= 1))
1873 		     *
1874 		     * If the string is the empty string (start == end), and any
1875 		     * iterations are permitted (interval_to >= 1) then all possible
1876 		     * solutions (except the 0-iteration solution) can be found with
1877 		     * a single iteration.  Repeated iterations for a null match do not
1878 		     * add new solutions compared to a single iteration because the
1879 		     * single iteration will always have the same side effects as multiple
1880 		     * iterations ending with that single iteration.
1881 		     */
1882 
1883 		    if (   (   (solns->start == solns->end)
1884 			    && (solns->interval_to >= 1)
1885 			    && (   !solns->exp->left
1886 				|| (solns->exp->left->len <= 0)))
1887 			|| (   (solns->interval_from <= 1)
1888 			    && (solns->interval_to >= 1)
1889 			    && (solns->start != solns->end)
1890 			    && solns->exp->left
1891 			    && (   (solns->exp->left->len < 0)
1892 				|| (solns->exp->left->len == (solns->end - solns->start)))))
1893 
1894 		      {
1895 			solns->left = rx_make_solutions (solns->regs,
1896 							 solns->cset_size,
1897 							 solns->exp->left,
1898 							 solns->subexps,
1899 							 solns->nsub,
1900 							 solns->start,
1901 							 solns->end,
1902 							 0,
1903 							 solns->vmfn,
1904 							 solns->contextfn,
1905 							 solns->closure,
1906 							 solns->small_p,
1907 							 0,
1908 							 0);
1909 			if (!solns->left)
1910 			  longjmp (*err_escape, 1);
1911 		      case 2:
1912 			interval_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
1913 			if (interval_stat < 0)
1914 			  longjmp (*err_escape, 1);
1915 
1916 			if (interval_stat != 1)
1917 			  {
1918 			    rx_free_solutions (solns->left);
1919 			    solns->left = 0;
1920 			    goto try_0_iteration_null_match;
1921 			  }
1922 			else
1923 			  {
1924 			    solns->step = 2;
1925 			    return 1;
1926 			  }
1927 		      }
1928 
1929 		  try_0_iteration_null_match:
1930 
1931 		    /* If we're trying to match the empty string, and 0 iterations are
1932 		     * permitted, that is the next best solution.  If we're trying to
1933 		     * match the empty string, the 0 iteration solution is the only
1934 		     * remaining possibility.
1935 		     */
1936 		    if (solns->start == solns->end)
1937 		      {
1938 			solns->step = -1;
1939 			return (solns->interval_from == 0);
1940 		      }
1941 
1942 		    /* The string is not empty.  If only 0 iterations are permitted,
1943 		     * then no solutions are possible.
1944 		     */
1945 		    if (solns->interval_to == 0)
1946 		      {
1947 			solns->step = -1;
1948 			return 0;
1949 		      }
1950 
1951 		    /* The string is not empty and no single iteration solution works.
1952 		     * Search through solutions with at least two iterations.  We split
1953 		     * the string into two parts, and try to match the left part with N
1954 		     * iterations (N >= 1) and the right part with 1 iteration.
1955 		     *
1956 		     * To satisfy the left-to-right longest-subexpression rule, the right
1957 		     * part (matched by the last iteration) should be as long as possible.
1958 		     */
1959 
1960 		    if (!solns->exp->left || (solns->exp->left->len == 0))
1961 		      {
1962 			/* never reached unless certain other optimizations are removed
1963 			 */
1964 			solns->step = -1;
1965 			return 0;
1966 		      }
1967 		    else if (solns->exp->left->len > 0)
1968 		      {
1969 			if (   (solns->exp->left->len > (solns->end - solns->start))
1970 			    || (0 != ((solns->end - solns->start) % solns->exp->left->len)))
1971 			  {
1972 			    solns->step = -1;
1973 			    return 0;
1974 			  }
1975 			solns->split_guess = solns->end - solns->exp->left->len;
1976 		      }
1977 		    else
1978 		      solns->split_guess = solns->start;
1979 
1980 		  interval_split_start_at_new_split_guess:
1981 
1982 		    solns->right = rx_make_solutions (solns->regs,
1983 						      solns->cset_size,
1984 						      solns->exp->left,
1985 						      solns->subexps,
1986 						      solns->nsub,
1987 						      solns->split_guess,
1988 						      solns->end,
1989 						      0,
1990 						      solns->vmfn,
1991 						      solns->contextfn,
1992 						      solns->closure,
1993 						      solns->small_p,
1994 						      0,
1995 						      0);
1996 		    if (!solns->right)
1997 		      longjmp (*err_escape, 1);
1998 
1999 		  next_interval_right_solution:
2000 		    interval_stat = rx_next_solution_internal (solns->right, err_escape, depth + 1);
2001 		    if (interval_stat < 0)
2002 		      longjmp (*err_escape, 1);
2003 
2004 		    if (interval_stat != 1)
2005 		      {
2006 			rx_free_solutions (solns->right);
2007 			solns->right = 0;
2008 
2009 
2010 			if (solns->exp->left->len < 0)
2011 			  {
2012 			    ++solns->split_guess;
2013 			    if (solns->split_guess <= solns->end)
2014 			      goto interval_split_start_at_new_split_guess;
2015 			  }
2016 			/* No more solutions at all. */
2017 			solns->step = -1;
2018 			return 0;
2019 		      }
2020 
2021 		    /* Found a solution for the right half of the split.
2022 		     */
2023 
2024 		    if (save_nested_registers (solns))
2025 		      {
2026 			longjmp (*err_escape, 1);
2027 		      }
2028 		    solns->left = rx_make_solutions (solns->regs,
2029 						     solns->cset_size,
2030 						     solns->exp,
2031 						     solns->subexps,
2032 						     solns->nsub,
2033 						     solns->start,
2034 						     solns->split_guess,
2035 						     solns->interval_x + 1,
2036 						     solns->vmfn,
2037 						     solns->contextfn,
2038 						     solns->closure,
2039 						     solns->small_p,
2040 						     0,
2041 						     0);
2042 		    if (!solns->left)
2043 		      longjmp (*err_escape, 1);
2044 
2045 		    interval_stat = rx_next_solution_internal (solns->left, err_escape, depth + 1);
2046 		    if (interval_stat < 0)
2047 		      longjmp (*err_escape, 1);
2048 
2049 		    if (interval_stat != 1)
2050 		      {
2051 			/* No more solutions for the left part of the string.
2052 			 */
2053 			rx_free_solutions (solns->left);
2054 			solns->left = 0;
2055 			goto next_interval_right_solution;
2056 		      }
2057 
2058 		    /* We found an overall solution for the interval.
2059 		     */
2060 		    solns->step = 3;
2061 		    restore_nested_registers (solns);
2062 		    solns->final_tag = solns->right->final_tag;
2063 		    return 1;
2064 
2065 		  case 3:
2066 		    rx_free_solutions (solns->left);
2067 		    solns->left = 0;
2068 		    goto next_interval_right_solution;
2069 
2070 		  }
2071 	      }
2072 
2073 	    case r_context:
2074 	      {
2075 		solns->step = -1;
2076 		solns->final_tag = 1;
2077 		return solns->contextfn (solns->closure,
2078 					 solns->exp,
2079 					 solns->start, solns->end,
2080 					 solns->regs);
2081 	      }
2082 
2083 
2084 	    }
2085 	}
2086       while (1)
2087 	panic ("unreached in rx_next_solution_internal");
2088     }
2089 }
2090 
2091 /*(c rx_solutions_final_tag)
2092  * int rx_solutions_final_tag (struct rx_solutions * solns);
2093  *
2094  * Return the state label of the last DFA state reached during
2095  * the solution most recently returned by `rx_next_solution'.
2096  */
2097 int
rx_solutions_final_tag(struct rx_solutions * solns)2098 rx_solutions_final_tag (struct rx_solutions * solns)
2099 {
2100   return solns->final_tag;
2101 }
2102 
2103 
2104 
2105 /************************************************************************
2106  *(h1 "Building and Freeing a Stream of Solutions -- The General Case")
2107  *
2108  * The functions in this section are quite general.  They can be used
2109  * to compare a regexp to a string which is not entirely in memory at
2110  * any one time.
2111  */
2112 
2113 /*(c rx_make_solutions)
2114  * struct rx_solutions *
2115  * rx_make_solutions (struct rx_registers * regs,
2116  *		      int cset_size,
2117  *		      struct rx_exp_node * expression,
2118  *		      struct rx_exp_node ** subexps,
2119  *		      int nsub,
2120  *		      rx_off_t start,
2121  *		      rx_off_t end,
2122  * 		      int interval_x,
2123  *		      rx_vmfn vmfn,
2124  *		      rx_contextfn contextfn,
2125  *		      void * closure,
2126  *		      int small_p,
2127  *		      int certainly_fits,
2128  *		      int certain_final_tag);
2129  *
2130  *
2131  * Construct a stream-of-solutions to a regexp matching problem.  For
2132  * arguments which are not documented here, see
2133  * xref:"rx_basic_make_solutions".
2134  *
2135  * If `expression' is of type `r_interval', then `interval_x'
2136  * is the number of matches already accumulated for the subexpression.
2137  * In other words, if expression is `RE{M,N}', it will match as:
2138  *
2139  * 	RE{max (0,M - interval_x), max (0, N - interval_x)
2140  *
2141  * This function does not directly accept an input string. Instead,
2142  * it accepts the parameters `vmfn', `contextfn', and `closure'.
2143  *
2144  * `vmfn' is a function which can return the address of a substring
2145  * containing a chosen character position within the entire input
2146  * string.  It is used to page-in parts of the input string during a
2147  * match.  The details of this parameter are given below.
2148  *
2149  * `contextfn' is a function that knows how to evaluate regexp
2150  * expressions which are anchors or backreferences.  These kinds of
2151  * subexpressions may be defined in terms of parts of the input string
2152  * that are not in memory or that are not contiguous in memory.  The
2153  * `contextfn' has the job of sorting that out in an efficient way.
2154  * The details of this parameter are also given below.
2155  *
2156  * `closure' is an opaque parameter that is passed along to `vmfn' and
2157  * `contextfn'.
2158  *
2159  * `small_p', if not 0, means that the regexp is so trivial that
2160  * heavy-weight optimizations are not likely to pay off.  It is always
2161  * safe to pass 0 for this parameter.  Internally, Rx passes a non-0
2162  * value if `expression' is 0 or if `expression->small_advised_p' is
2163  * non-0.  `expression->small_advised_p' is filled in by
2164  * `rx_analyze_rexp'.
2165  *
2166  * `certainly_fits', if not 0, means that if `expression' is converted
2167  * to a true regular expression (such as by `rx_simplify_rexp') and
2168  * compared to the target string, the target string will certainly match.
2169  * Setting this field can speed up some matches.  If this field is
2170  * non-0, `certain_final_tag' must also be set.  It is always safe to
2171  * pass 0 for `certainly_fits', though doing so may slow down some matches
2172  * slightly.
2173  *
2174  * `certain_final_tag' is the final DFA state label returned for a
2175  * match of simplified form of `expression' against the target string.
2176  * This field is only used if `certainly_fits' is not 0.
2177  */
2178 struct rx_solutions *
rx_make_solutions(struct rx_registers * regs,int cset_size,struct rx_exp_node * expression,struct rx_exp_node ** subexps,int nsub,rx_off_t start,rx_off_t end,int interval_x,rx_vmfn vmfn,rx_contextfn contextfn,void * closure,int small_p,int certainly_fits,int certain_final_tag)2179 rx_make_solutions (struct rx_registers * regs,
2180 		   int cset_size,
2181 		   struct rx_exp_node * expression,
2182 		   struct rx_exp_node ** subexps,
2183 		   int nsub,
2184 		   rx_off_t start,
2185 		   rx_off_t end,
2186 		   int interval_x,
2187 		   rx_vmfn vmfn,
2188 		   rx_contextfn contextfn,
2189 		   void * closure,
2190 		   int small_p,
2191 		   int certainly_fits,
2192 		   int certain_final_tag)
2193 {
2194   struct rx_solutions * solns;
2195 
2196   if (!expression && (start != end))
2197     return &no_solutions;
2198 
2199   if (   expression
2200       && (expression->len >= 0)
2201       && ((expression->type != r_interval)
2202 	  ? (expression->len != (end - start))
2203 	  : (/* Is an interval expression of fixed length */
2204 	     (!expression->len
2205 	      ? (end != start)
2206 	      : ((expression->len - (interval_x * expression->left->len)) != (end - start))))))
2207     return &no_solutions;
2208 
2209   solns = (struct rx_solutions *)rx_nfa_cache_malloc (sizeof (*solns));
2210   if (!solns)
2211     return 0;
2212 
2213   ++solns_allocated;
2214 
2215   solns->small_p = small_p;
2216   solns->step = 0;
2217 
2218   solns->cset_size = cset_size;
2219   solns->exp = expression;
2220   rx_save_exp (expression);
2221   solns->subexps = subexps;
2222   solns->nsub = nsub;
2223   solns->regs = regs;
2224 
2225   solns->start = start;
2226   solns->end = end;
2227 
2228   solns->vmfn = vmfn;
2229   solns->contextfn = contextfn;
2230   solns->closure = closure;
2231 
2232   solns->dfa = 0;
2233   solns->match_engine.rx = 0;
2234   solns->match_engine.state = 0;
2235 
2236   solns->certainly_fits = certainly_fits;
2237   solns->certain_final_tag = certain_final_tag;
2238 
2239   solns->no_fancy_guessing = 0;
2240   solns->left_dfa = 0;
2241   solns->left_match_engine.rx = 0;
2242   solns->left_match_engine.state = 0;
2243 
2244   solns->split_guess = 0;
2245   solns->left = 0;
2246   solns->right = 0;
2247 
2248   solns->interval_x = interval_x;
2249   solns->interval_from = 0;
2250   solns->interval_to = 0;
2251 
2252   solns->saved_rm_so = 0;
2253   solns->saved_rm_eo = 0;
2254   solns->final_tag = 0;
2255 
2256   if (!small_p)
2257     {
2258       if (!solns->exp || !solns->exp->observed)
2259 	{
2260 	  solns->dfa = rx_unfa (expression, cset_size);
2261 	  if (!solns->dfa)
2262 	    {
2263 	      rx_free_solutions (solns);
2264 	      return 0;
2265 	    }
2266 	  rx_init_dfa_from_nfa (&solns->match_engine, solns->dfa->nfa);
2267 	  if (rx_dfa_goto_start_superstate (&solns->match_engine, 1))
2268 	    {
2269 	      rx_free_solutions (solns);
2270 	      return 0;
2271 	    }
2272 	}
2273       else
2274 	{
2275 	  struct rx_exp_node * simplified;
2276 	  if (rx_simplify_rexp (&simplified, cset_size, solns->exp, subexps))
2277 	    {
2278 	      rx_free_solutions (solns);
2279 	      return 0;
2280 	    }
2281 	  solns->dfa = rx_unfa (simplified, cset_size);
2282 	  if (!solns->dfa)
2283 	    {
2284 	      rx_free_exp (simplified);
2285 	      rx_free_solutions (solns);
2286 	      return 0;
2287 	    }
2288 	  rx_init_dfa_from_nfa (&solns->match_engine, solns->dfa->nfa);
2289 	  if (rx_dfa_goto_start_superstate (&solns->match_engine, 1))
2290 	    {
2291 	      rx_free_exp (simplified);
2292 	      rx_free_solutions (solns);
2293 	      return 0;
2294 	    }
2295 	  rx_free_exp (simplified);
2296 	}
2297     }
2298 
2299 
2300   if (solns->exp && (solns->exp->observed || solns->small_p) && (solns->exp->type == r_concat))
2301     {
2302       struct rx_exp_node * left_simplified;
2303 
2304       if (rx_simplify_rexp (&left_simplified, cset_size, solns->exp->left, solns->subexps))
2305 	{
2306 	  rx_free_solutions (solns);
2307 	  return 0;
2308 	}
2309 
2310       solns->left_dfa = rx_unfa (left_simplified, cset_size);
2311       if (!solns->left_dfa)
2312 	{
2313 	  rx_free_exp (left_simplified);
2314 	  rx_free_solutions (solns);
2315 	  return 0;
2316 	}
2317       rx_init_dfa_from_nfa (&solns->left_match_engine, solns->left_dfa->nfa);
2318       rx_free_exp (left_simplified);
2319     }
2320 
2321   return solns;
2322 }
2323 
2324 
2325 /*(include-documentation "match-regexp.h")
2326  */
2327 
2328 
2329 /*(c rx_solutions_closure)
2330  * void * rx_solutions_closure (struct rx_solutions * solns);
2331  *
2332  * Given a stream of regexp solutions, return the `closure'
2333  * parameter that was passed to `rx_make_solutions'.
2334  */
2335 void *
rx_solutions_closure(struct rx_solutions * solns)2336 rx_solutions_closure (struct rx_solutions * solns)
2337 {
2338   return solns->closure;
2339 }
2340 
2341 
2342 /*(c rx_free_solutions)
2343  * void rx_free_solutions (struct rx_solutions * solns);
2344  *
2345  * Free all storage associated with a stream of regexp solutions
2346  * allocated by `rx_make_solutions'.  See also
2347  * xref:"rx_basic_free_solutions".
2348  */
2349 void
rx_free_solutions(struct rx_solutions * solns)2350 rx_free_solutions (struct rx_solutions * solns)
2351 {
2352   if (!solns)
2353     return;
2354 
2355   if (solns == &no_solutions)
2356     return;
2357 
2358   if (solns->left)
2359     {
2360       rx_free_solutions (solns->left);
2361       solns->left = 0;
2362     }
2363 
2364   if (solns->right)
2365     {
2366       rx_free_solutions (solns->right);
2367       solns->right = 0;
2368     }
2369 
2370   if (solns->dfa)
2371     {
2372       rx_free_unfa (solns->dfa);
2373       solns->dfa = 0;
2374     }
2375 
2376   rx_clear_dfa_state (&solns->match_engine);
2377 
2378   if (solns->left_dfa)
2379     {
2380       rx_free_unfa (solns->left_dfa);
2381       solns->left_dfa = 0;
2382     }
2383 
2384   rx_clear_dfa_state (&solns->left_match_engine);
2385 
2386   if (solns->exp)
2387     {
2388       rx_free_exp (solns->exp);
2389       solns->exp = 0;
2390     }
2391 
2392   if (solns->saved_rm_so)
2393     rx_nfa_cache_free (solns->saved_rm_so);
2394 
2395   if (solns->saved_rm_eo)
2396     rx_nfa_cache_free (solns->saved_rm_eo);
2397 
2398   rx_nfa_cache_free (solns);
2399   ++solns_freed;
2400 }
2401 
2402 
2403 
2404 
2405 
2406 /****************************************************************
2407  *(h1 "Regexp Tree To Regular Expression Conversion")
2408  *
2409  */
2410 
2411 /*(c rx_simplify_rexp)
2412  * int rx_simplify_rexp (struct rx_exp_node ** answer,
2413  *                       int cset_size,
2414  *                       struct rx_exp_node *node,
2415  *                       struct rx_exp_node ** subexps);
2416  *
2417  * Convert an expression, which might not be a regular expression, into
2418  * a regular expression matching a superset of the original pattern.
2419  *
2420  * This is useful for a matching heuristic: the regular superset
2421  * language is used to test a candidate string for a match, the
2422  * original irregular regexp is used to verify the match.  The test
2423  * using the regular superset is very fast and many non-matching
2424  * strings can be quickly rejected; the verification using the
2425  * irregular regexp may be slow, so we avoid it when we can.
2426  *
2427  * If the input expression is a regular expression, this is an
2428  * identity function which increments the reference count of `node'.
2429  *
2430  * `answer' is a return parameter.
2431  *
2432  * `subexps' is an array of pointers into the expression `node'.
2433  * Element `N' of the array is the `N'th parenthesized subexpression
2434  * of the tree rooted at `node'.  This array is usually computed by
2435  * `rx_analyze_rexp'.
2436  */
2437 int
rx_simplify_rexp(struct rx_exp_node ** answer,int cset_size,struct rx_exp_node * node,struct rx_exp_node ** subexps)2438 rx_simplify_rexp (struct rx_exp_node ** answer,
2439 		  int cset_size,
2440 		  struct rx_exp_node * node,
2441 		  struct rx_exp_node ** subexps)
2442 {
2443   int err;
2444 
2445   if (!node)
2446     {
2447       *answer = 0;
2448       return 0;
2449     }
2450 
2451   if (!node->observed)
2452     {
2453       rx_save_exp (node);
2454       *answer = node;
2455       return 0;
2456     }
2457 
2458   if (node->simplified)
2459     {
2460       rx_save_exp (node->simplified);
2461       *answer = node->simplified;
2462       return 0;
2463     }
2464 
2465   switch (node->type)
2466     {
2467     default:
2468     case r_cset:
2469     case r_string:
2470     case r_cut:
2471       panic ("bogus regexp in rx_simplify_rexp");
2472       return -1;
2473 
2474     case r_parens:
2475       err = rx_simplify_rexp (answer, cset_size, node->left, subexps);
2476       if (err)
2477 	return err;
2478       break;
2479 
2480     case r_context:
2481       if (char_is_digit (node->intval))
2482 	{
2483 	  err = rx_simplify_rexp (answer, cset_size, subexps [node->intval - '0'], subexps);
2484 	  if (err)
2485 	    return err;
2486 	}
2487       else
2488 	*answer = 0;
2489       break;
2490 
2491     case r_concat:
2492     case r_right_concat:
2493     case r_alternate:
2494     case r_star:
2495     case r_interval:
2496       {
2497 	struct rx_exp_node *n;
2498 
2499 	n = rx_exp_node (node->type);
2500 	if (!n)
2501 	  return REG_ESPACE;
2502 
2503 	if (node->cset)
2504 	  {
2505 	    panic ("found a cset bitset in an unexpected place");
2506 	  }
2507 
2508 	n->intval = node->intval;
2509 	n->intval2 = node->intval2;
2510 	err = rx_simplify_rexp (&n->left, cset_size, node->left, subexps);
2511 	if (err)
2512 	  {
2513 	    rx_free_exp (n);
2514 	    return err;
2515 	  }
2516 	err = rx_simplify_rexp (&n->right, cset_size, node->right, subexps);
2517 	if (err)
2518 	  {
2519 	    rx_free_exp (n);
2520 	    return err;
2521 	  }
2522 	*answer = n;
2523       }
2524       break;
2525     }
2526 
2527   node->simplified = *answer;
2528   rx_save_exp (node->simplified);
2529   return 0;
2530 }
2531 
2532 
2533 
2534 /****************************************************************
2535  *(h1 "Regexp Tree Analysis")
2536  *
2537  */
2538 
2539 /*(c rx_analyze_rexp)
2540  * void rx_analyze_rexp (struct rx_exp_node *** subexps,
2541  *                       size_t * re_nsub,
2542  *                       struct rx_exp_node * node);
2543  *
2544  * Recursively analyze the expression tree rooted at `node'.  For each
2545  * node, fill in the fields `observed', `observation_contingent',
2546  * `small_advised_p', and `len'.  Build an array of parenthesized
2547  * sub-expressions in the tree and return that array in `subexps' and
2548  * the number of elements it contains in `re_nsub'.
2549  *
2550  * See `struct rx_exp_node' for information about the meaning of
2551  * structure fields filled in by this function.
2552  */
2553 int
rx_analyze_rexp(struct rx_exp_node *** subexps,size_t * re_nsub,struct rx_exp_node * node)2554 rx_analyze_rexp (struct rx_exp_node *** subexps,
2555 		 size_t * re_nsub,
2556 		 struct rx_exp_node * node)
2557 {
2558 #define rx_max(a, b) ((a) > (b) ? (a) : (b))
2559 #define rx_min(a, b) ((a) < (b) ? (a) : (b))
2560   if (node)
2561     {
2562       size_t this_subexp;
2563 
2564       this_subexp = 0;
2565 
2566       if (node->type == r_parens)
2567 	{
2568 	  if (node->intval > 0)
2569 	    {
2570 	      struct rx_exp_node ** array;
2571 
2572 	      this_subexp = *re_nsub;
2573 	      ++*re_nsub;
2574 	      if (!*subexps)
2575 		array = ((struct rx_exp_node **)
2576 			 rx_nfa_cache_malloc (sizeof (struct rx_exp_node *) * *re_nsub));
2577 	      else
2578 		array = ((struct rx_exp_node **)
2579 			 rx_nfa_cache_realloc (*subexps,
2580 					       sizeof (struct rx_exp_node *) * *re_nsub));
2581 	      if (!array)
2582 		return -1;
2583 	      else
2584 		*subexps = array;
2585 	    }
2586 	}
2587 
2588       if (node->left && rx_analyze_rexp (subexps, re_nsub, node->left))
2589 	return -1;
2590 
2591       if (node->right && rx_analyze_rexp (subexps, re_nsub, node->right))
2592 	return -1;
2593 
2594       switch (node->type)
2595 	{
2596 	case r_cset:
2597 	  node->len = 1;
2598 	  node->observed = 0;
2599 	  node->observation_contingent = 1;
2600 	  node->small_advised_p = 1;
2601 	  node->max_enclosed_paren = 0;
2602 	  node->min_enclosed_paren = 0;
2603 	  break;
2604 
2605  	case r_string:
2606  	  node->len = node->str_len;
2607  	  node->observed = 0;
2608 	  node->observation_contingent = 1;
2609 	  node->small_advised_p = 1;
2610 	  node->max_enclosed_paren = 0;
2611 	  node->min_enclosed_paren = 0;
2612  	  break;
2613 
2614 	case r_cut:
2615 	  node->len = -1;
2616 	  node->observed = 0;
2617 	  node->observation_contingent = 1;
2618 	  node->small_advised_p = 1;
2619 	  node->max_enclosed_paren = 0;
2620 	  node->min_enclosed_paren = 0;
2621 	  break;
2622 
2623 	case r_concat:
2624 	case r_right_concat:
2625 	case r_alternate:
2626 	  {
2627 	    int lob, rob;
2628 	    int lobc, robc;
2629 	    long llen, rlen;
2630 
2631 	    lob = (!node->left ? 0 : node->left->observed);
2632 	    lobc = (!node->left ? 1 : node->left->observation_contingent);
2633 	    rob = (!node->right ? 0 : node->right->observed);
2634 	    robc = (!node->right ? 1 : node->right->observation_contingent);
2635 	    llen = (!node->left ? 0 : node->left->len);
2636 	    rlen = (!node->right ? 0 : node->right->len);
2637 
2638 	    if ((llen < 0) || (rlen < 0))
2639 	      node->len = -1;
2640 	    else if ((node->type == r_concat) || (node->type == r_right_concat))
2641 	      {
2642 		if (((size_t)llen + (size_t)rlen) > SSIZE_MAX)
2643 		  panic ("absurdly large regexp in rx_analyze_rexp");
2644 		node->len = (long)(llen + rlen);
2645 	      }
2646 	    else /* node->type == r_alternate */
2647 	      {
2648 		if (llen == rlen)
2649 		  node->len = llen;
2650 		else
2651 		  node->len = -1;
2652 	      }
2653 
2654 	    node->observed = lob || rob;
2655 	    node->observation_contingent = lobc && robc;
2656 
2657 	    if ((node->type == r_concat) || (node->type == r_right_concat))
2658 	      {
2659 		node->small_advised_p = (   (!node->left || node->left->small_advised_p)
2660 					 && (!node->right || node->right->small_advised_p));
2661 	      }
2662 	    else /* node->type == r_alternate */
2663 	      {
2664 		node->small_advised_p = 0;
2665 	      }
2666 	    node->max_enclosed_paren = rx_max ((node->left ? node->left->max_enclosed_paren : 0),
2667 					       (node->right ? node->right->max_enclosed_paren : 0));
2668 	    {
2669 	      int lmin;
2670 	      int rmin;
2671 	      lmin = (node->left ? node->left->min_enclosed_paren : 0);
2672 	      rmin = (node->right ? node->right->min_enclosed_paren : 0);
2673 	      node->min_enclosed_paren = (!lmin
2674 					  ? rmin
2675 					  : (!rmin
2676 					     ? lmin
2677 					     : rx_min (lmin, rmin)));
2678 	    }
2679 	    break;
2680 	  }
2681 
2682 	case r_star:
2683 	  node->len = ((node->left && node->left->len)
2684 		       ? -1
2685 		       : 0);
2686 	  node->observed = (node->left
2687 			    ? node->left->observed
2688 			    : 0);
2689 	  node->observation_contingent = (node->left
2690 					  ? node->left->observation_contingent
2691 					  : 1);
2692 	  node->small_advised_p = 0;
2693 	  node->max_enclosed_paren = (node->left ? node->left->max_enclosed_paren : 0);
2694 	  node->min_enclosed_paren = (node->left ? node->left->min_enclosed_paren : 0);
2695 	  break;
2696 
2697 	case  r_interval:
2698 	  if (!node->left || (node->left->len == 0) || (node->intval2 == 0))
2699 	    node->len = 0;
2700 	  else if (node->left->len < 0)
2701 	    node->len = -1;
2702 	  else if (node->intval == node->intval2)
2703 	    {
2704 	      if (node->left->len > (SSIZE_MAX / node->intval))
2705 		node->len = -1;
2706 	      else
2707 		node->len = node->left->len * node->intval;
2708 	    }
2709 	  else
2710 	    node->len = -1;
2711 	  node->observed = 1;
2712 	  node->observation_contingent = 0;
2713 	  node->small_advised_p = 0;
2714 	  node->max_enclosed_paren = (node->left ? node->left->max_enclosed_paren : 0);
2715 	  node->min_enclosed_paren = (node->left ? node->left->min_enclosed_paren : 0);
2716 	  break;
2717 
2718 	case r_parens:
2719 	  if (node->intval > 0)
2720 	    {
2721 	      node->observed = 1;
2722 	      (*subexps)[this_subexp] = node;
2723 	    }
2724 	  else
2725 	    node->observed = (node->left
2726 			      ? node->left->observed
2727 			      : 0);
2728 	  node->observation_contingent = (   node->observed
2729 					  && (node->left
2730 					      ? node->left->observation_contingent
2731 					      : 1));
2732 	  node->len = (node->left
2733 		       ? node->left->len
2734 		       : 0);
2735 	  node->small_advised_p = (!node->left || node->left->small_advised_p);
2736 	  node->max_enclosed_paren = rx_max (node->intval, (node->left ? node->left->max_enclosed_paren : 0));
2737 
2738 	  if (node->left && node->left->min_enclosed_paren)
2739 	    {
2740 	      if (!node->intval)
2741 		node->min_enclosed_paren = node->left->min_enclosed_paren;
2742 	      else
2743 		node->min_enclosed_paren = rx_min (node->intval, node->left->min_enclosed_paren);
2744 	    }
2745 	  else
2746 	    node->min_enclosed_paren = node->intval;
2747 	  break;
2748 
2749 	case r_context:
2750 	  switch (node->intval)
2751 	    {
2752 	    default:
2753 	      node->small_advised_p = 1;
2754 	      node->observed = 1;
2755 	      node->observation_contingent = 0;
2756 	      node->len = -1;
2757 	      break;
2758 	    case '^':
2759 	    case '$':
2760 	      node->small_advised_p = 1;
2761 	      node->observed = 1;
2762 	      node->observation_contingent = 0;
2763 	      node->len = 0;
2764 	      break;
2765 	    }
2766 	  node->max_enclosed_paren = 0;
2767 	  node->min_enclosed_paren = 0;
2768 	  break;
2769 	}
2770     }
2771   return 0;
2772 }
2773 
2774