1 /* posix.c - posix regexp compatibility functions
2 *
3 ****************************************************************
4 * Copyright (C) 1998, 2000 Thomas Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11
12 #include "hackerlab/os/malloc.h"
13 #include "hackerlab/bugs/panic.h"
14 #include "hackerlab/char/char-class.h"
15 #include "hackerlab/char/str.h"
16 #include "hackerlab/mem/mem.h"
17 #include "hackerlab/rx/tree.h"
18 #include "hackerlab/rx/escape.h"
19 #include "hackerlab/rx/nfa-cache.h"
20 #include "hackerlab/rx/dfa.h"
21 #include "hackerlab/rx-posix/re8-parse.h"
22 #include "hackerlab/rx-posix/posix.h"
23
24
25 #define RX_MANY_CASES 30
26
27
28 /* __STDC__ prototypes for static functions */
29 static int rx_regexec (regmatch_t pmatch[],
30 int no_subexp_reporting,
31 int no_pos_reporting,
32 const regex_t *preg,
33 struct rx_context_rules * rules,
34 rx_off_t start,
35 rx_off_t end,
36 const char *string);
37 static int is_simple_anchored_regexp (int * anchored_start, int * anchored_end, struct rx_exp_node * pattern, int no_subexp_reporting);
38 static int rx_regmatch (regmatch_t pmatch[],
39 int no_subexp_reporting,
40 int no_pos_reporting,
41 const regex_t *preg,
42 struct rx_context_rules * rules,
43 rx_off_t start,
44 rx_off_t end,
45 const char *string,
46 int is_bos,
47 int is_eos);
48 static void rx_is_anchored_p (int * left,
49 int * right,
50 struct rx_exp_node * exp);
51 static int rx_fill_in_fastmap (int cset_size, t_uchar * map, struct rx_exp_node * exp);
52
53
54 /************************************************************************
55 *(h1 "Posix Regexp Functions"
56 * :includes ("sys/types.h"
57 * "hackerlab/rx-posix/regex.h"))
58 *
59 * |Posix regexp functions|
60 * |Posix| |Posix.2| |1003.2| |ANSI/IEEE 1003.2| |ISO/IEDC 994502|
61 *
62 * The standard Posix regexp functions provided by Rx are:
63 *
64 * regcomp
65 * regexec
66 * regfree
67 * regerror
68 *
69 * Two closely related but nonstandard functions are also provided:
70 *
71 * regncomp
72 * regnexec
73 *
74 */
75
76
77
78 #if 0
79 /*(c regcomp)
80 * int regcomp (regex_t * preg, const char * pattern, int cflags);
81 *
82 * Compile the 0-terminated regexp specification `pattern'.
83 *
84 * The compiled pattern is stored in `*preg', which has the field
85 * (required by Posix):
86 *
87 * size_t re_nsub; The number of parenthesized
88 * subexpressions in the compiled
89 * pattern.
90 *
91 *
92 * `cflags' is a combination of bits which effect compilation:
93 *
94 insert*/
95
96 enum rx_cflags
97 {
98 REG_EXTENDED = 1,
99 /* If REG_EXTENDED is set, then use extended regular expression
100 syntax. If not set, then use basic regular expression
101 syntax. In extended syntax, none of the regexp operators are
102 written with a backslash. */
103
104
105 REG_ICASE = (REG_EXTENDED << 1),
106 /* If REG_ICASE is set, then ignore case when matching. If not
107 set, then case is significant. */
108
109
110 REG_NOSUB = (REG_ICASE << 1),
111 /* Report only success/failure in `regexec'.
112 Using this flag can improve performance for
113 some regexps. */
114
115 REG_NEWLINE = (REG_NOSUB << 1),
116 /* If REG_NEWLINE is set, then "." and complemented character
117 sets do not match at newline characters in the string. Also,
118 "^" and "$" do match at newlines.
119
120 If not set, then anchors do not match at newlines and the
121 character sets contain newline.*/
122
123 REG_DFA_ONLY = (REG_NEWLINE << 1),
124 /* If this bit is set, then restrict the pattern
125 language to patterns that compile to efficient
126 state machines. In particular, `regexec' will
127 not report positions for parenthesized subexpressions;
128 "^", "$", backreferences ("\n"), and duplication
129 ("{n,m}") are interpreted as normal characters.
130
131 REG_DFA_ONLY is a non-standard flag. */
132 };
133 /*end-insert
134 *
135 * `regcomp' returns 0 on success and an error code on failure (see
136 * xref:"regerror").
137 */
138 #endif
139 int
regcomp(regex_t * preg,const char * pattern,int cflags)140 regcomp (regex_t * preg, const char * pattern, int cflags)
141 {
142 return regncomp (preg, pattern, str_length (pattern), cflags);
143 }
144
145
146 /*(c regncomp)
147 * int regncomp (regex_t * preg,
148 * const char * pattern,
149 * size_t len,
150 * int cflags);
151 *
152 * Compile the `len'-byte regexp specification `pattern'.
153 *
154 *
155 * The compiled pattern is stored in `*preg', which has the field
156 * (required by Posix):
157 *
158 * size_t re_nsub; The number of parenthesized
159 * subexpressions in the compiled
160 * pattern.
161 *
162 * `cflags' is a combination of bits which effect compilation. See
163 * xref:"regcomp".
164 *
165 * `regncomp' returns 0 on success and an error code on failure (see
166 * xref:"regerror").
167 *
168 * \Note:/ `regncomp' is not part of the Posix.2 interface for
169 * regexp matching. It is an Rx extension.
170 */
171 int
regncomp(regex_t * preg,const char * pattern,size_t len,int cflags)172 regncomp (regex_t * preg,
173 const char * pattern,
174 size_t len,
175 int cflags)
176 {
177 int ret;
178 struct rx_exp_node * exp;
179 int nsub;
180
181 mem_set0 ((char *)preg, sizeof (*preg));
182
183 if (!(cflags & REG_ICASE))
184 {
185 preg->icase = 0;
186 preg->translate = 0;
187 }
188 else
189 {
190 unsigned i;
191
192 preg->icase = 1;
193
194 preg->translate = (t_uchar *) rx_nfa_cache_malloc (256);
195 if (!preg->translate)
196 return (int) REG_ESPACE;
197
198 /* Map uppercase characters to corresponding lowercase ones. */
199 for (i = 0; i < 256; i++)
200 preg->translate[i] = char_is_upper (i) ? char_to_lower (i) : i;
201 }
202
203 ret = rx_parse (&exp,
204 &nsub,
205 pattern, len,
206 (cflags & REG_EXTENDED),
207 (cflags & REG_NEWLINE),
208 (cflags & REG_DFA_ONLY),
209 256,
210 preg->translate);
211
212 /* POSIX doesn't distinguish between an unmatched open-group and an
213 * unmatched close-group: both are REG_EPAREN.
214 */
215 if ((ret == REG_ELPAREN) || (ret == REG_ERPAREN))
216 ret = REG_EPAREN;
217
218 if (ret)
219 return (int)ret;
220
221 if (!(cflags & REG_NEWLINE))
222 preg->newline_anchor = 0;
223 else
224 preg->newline_anchor = 1;
225
226 preg->pattern = exp;
227 preg->re_nsub = 1;
228 preg->subexps = 0;
229 if (rx_analyze_rexp (&preg->subexps, &preg->re_nsub, preg->pattern))
230 {
231 rx_free_exp (preg->pattern);
232 rx_nfa_cache_free ((void *)preg->subexps);
233 mem_set0 ((char *)preg, sizeof (*preg));
234 return REG_ESPACE;
235 }
236 preg->is_nullable = rx_fill_in_fastmap (256,
237 preg->fastmap,
238 preg->pattern);
239 rx_is_anchored_p (&preg->is_left_anchored, &preg->is_right_anchored, preg->pattern);
240 preg->no_sub = !!(cflags & REG_NOSUB);
241 preg->small_p = (!preg->pattern || preg->pattern->small_advised_p);
242 return 0;
243 }
244
245
246
247
248 #if 0
249 /*(c regexec)
250 * int regexec (const regex_t *preg,
251 * const char *string,
252 * size_t nmatch,
253 * regmatch_t pmatch[],
254 * int eflags);
255 *
256 * Search for a match of compiled regexp `preg' in `string'.
257 * Return the positions of the match and the first `nmatch-1'
258 * parenthesized subexpressions in `pmatch'.
259 *
260 * Return 0 if a match is found, an error code otherwise. See
261 * xref:"regerror".
262 *
263 * It is possible to asynchronously abort a call to `regexec'. See
264 * xref:"Escaping Long-Running Matches".
265 *
266 * `preg' must have been filled in by `regcomp' or `regncomp'.
267 *
268 * `string' must be 0 terminated. See xref:"regnexec".
269 *
270 * `nmatch' may be 0 and must not be negative (Posix specifies that
271 * the parameter be declared signed). It is the number of elements
272 * in the array pointed to by `pmatch'.
273 *
274 * `pmatch' may be 0 if `nmatch' is 0. The details of `regmatch_t' are:
275 *
276 insert*/
277 struct rx_registers
278 {
279 regoff_t rm_so; /* Byte offset to substring start. */
280 regoff_t rm_eo; /* Byte offset to substring end. */
281
282 int final_tag; /* In pmatch[0] this field is set to
283 * the state label of the last DFA state
284 * encountered during a match.
285 *
286 * This field is implementation specific.
287 * Applications which intend to be portable
288 * between implementations of Posix should
289 * not use this field.
290 */
291 };
292 /*end-insert
293 *
294 * The state label |state label (in Posix regexps)| of the final DFA state for most regexps is 1. If a
295 * pattern contains the cut operator |cut (in Posix regexps)| `[[:cut <n>:]]' |[[:cut n:]]| its DFAs will
296 * contain a final state with label `n' at that point in the regexp.
297 * This is useful for detecting which of several possible alternatives
298 * actually occured in a match, as in this example:
299 *
300 * pattern: if[[:cut 1:]]\\|while[[:cut 2:]]
301 *
302 * pmatch[0].final_tag is 1 after matching "if"
303 * pmatch[0].final_tag is 2 after matching "while"
304 *
305 * `eflags' is a bit-wise or (`|') of any of these values:
306 *
307 insert*/
308 enum rx_eflags
309 {
310 REG_NOTBOL = 1,
311 /* If REG_NOTBOL is set, then the beginning-of-line operator `^'
312 * doesn't match the beginning of the input string (presumably
313 * because it's not the beginning of a line). If not set, then the
314 * beginning-of-line operator does match the beginning of the
315 * string.
316 *
317 * (Standardized in Posix.2)
318 */
319
320 REG_NOTEOL = (REG_NOTBOL << 1),
321 /* REG_NOTEOL is similar to REG_NOTBOL, except that it applies to
322 * the end-of-line operator `$' and the end of the input string.
323 *
324 * (Standardized in Posix.2)
325 */
326
327 REG_NO_SUBEXP_REPORTING = (REG_NOTEOL << 1),
328 /* REG_NO_SUBEXP_REPORTING causes `regexec' to fill in only
329 * `pmatch[0]' and to ignore other elements of `pmatch'. For some
330 * patterns (those which do not contain back-references or anchors)
331 * this can speed up matching considerably.
332 *
333 * (non-standard)
334 */
335
336 REG_ALLOC_REGS = (REG_NO_SUBEXP_REPORTING << 1),
337 /* REG_ALLOC_REGS is only used by `regnexec'. It causes `regnexec'
338 * to allocate storage for `regmatch_t' values.
339 *
340 * (non-standard)
341 */
342 };
343 /*end-insert
344 *
345 * The match returned satisfies the left-most longest rule |left-most longest rule (in Posix regexps)| which states
346 * a left-most match of the overall regexp will be returned. Of those
347 * matches, one of the longest will be returned.
348 *
349 * There may be more than one longest match because two matches of
350 * equal length may differ in how they fill in the array `pmatch'.
351 * For example:
352 *
353 * "aaaabbbb" can match \(a*\)\(a*b*\)
354 * with pmatch[1] == "aaaa" [*]
355 * and pmatch[2] == "bbbb"
356 * or
357 * with pmatch[1] == "aaa"
358 * and pmatch[2] == "abbbb"
359 * or
360 * with pmatch[1] == "aa"
361 * and pmatch[2] == "aabbbb"
362 * or
363 * with pmatch[1] == "a"
364 * and pmatch[2] == "aaabbbb"
365 * or
366 * with pmatch[1] == ""
367 * and pmatch[2] == "aaaabbbb"
368 *
369 *
370 *
371 * Of the possible values of `pmatch', Rx implements the standard
372 * behavior of returning that match which recursively maximizes the
373 * lengths of the substrings matched by each subpattern, from left to
374 * right. In the preceeding example, the correct answer is marked
375 * with `[*]'.
376 *
377 */
378 #endif
379 int
regexec(const regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)380 regexec (const regex_t *preg,
381 const char *string,
382 size_t nmatch,
383 regmatch_t pmatch[],
384 int eflags)
385 {
386 return regnexec (preg,
387 string,
388 str_length (string),
389 nmatch,
390 &pmatch,
391 (eflags & ~REG_ALLOC_REGS));
392 }
393
394
395 /*(c regnexec)
396 * int regnexec (const regex_t *preg,
397 * const char *string,
398 * regoff_t length,
399 * size_t nmatch,
400 * regmatch_t ** pmatch,
401 * int eflags);
402 *
403 * Search for a match of compiled regexp `preg' in `string'.
404 * Return the positions of the match and the first `nmatch-1'
405 * parenthesized subexpressions in `*pmatch'.
406 *
407 * Return 0 if a match is found, an error code otherwise. See
408 * xref:"regerror".
409 *
410 * `preg' must have been filled in by `regcomp' or `regncomp'.
411 *
412 * `string' must be `length' bytes long.
413 *
414 * See xref:"regnexec" for details about other parameters but
415 * note that `regnexec' and `regexec' use different types for
416 * the parameter `pmatch'.
417 *
418 * In `regexec', `pmatch' is only used to pass a pointer. In
419 * `regnexec', `pmatch' is used both to pass a pointer, and to return
420 * a pointer to the caller.
421 *
422 * Callers are permitted to pass 0 for `nmatch' and `pmatch'. Callers
423 * are also permitted to pass the address of a pointer whose value is
424 * 0 for parameter `pmatch'. If they do so, and also set the bit
425 * `REG_ALLOC_REGS' in `eflags', then `pmatch' will be a return
426 * parameter, returning a malloced array of `preg->re_nsub' elements
427 * containing the sub-expression positions of a successful match.
428 *
429 * It is possible to asynchronously abort a call to `regnexec'. See
430 * xref:"Escaping Long-Running Matches".
431 *
432 * \Note:/ `regnexec' is not part of the Posix.2 interface for
433 * regexp matching. It is an Rx extension.
434 */
435 int
regnexec(const regex_t * preg,const char * string,regoff_t len,size_t nmatch,regmatch_t ** pmatch,int eflags)436 regnexec (const regex_t *preg,
437 const char *string,
438 regoff_t len,
439 size_t nmatch,
440 regmatch_t **pmatch,
441 int eflags)
442 {
443 struct rx_context_rules rules;
444 regmatch_t * regs;
445 size_t nregs;
446 int stat;
447
448 rules.newline_anchor = preg->newline_anchor;
449 rules.not_bol = !!(eflags & REG_NOTBOL);
450 rules.not_eol = !!(eflags & REG_NOTEOL);
451 rules.case_indep = preg->icase;
452
453 if (!preg->no_sub && (nmatch >= preg->re_nsub))
454 {
455 regs = *pmatch;
456 nregs = nmatch;
457 }
458 else
459 {
460 regs = (regmatch_t *)malloc (preg->re_nsub * sizeof (*regs));
461 if (!regs)
462 return REG_ESPACE;
463 nregs = preg->re_nsub;
464 }
465
466 {
467 size_t x;
468 for (x = 0; x < nregs; ++x)
469 regs[x].rm_so = regs[x].rm_eo = -1;
470 }
471
472
473 stat = rx_regexec (regs,
474 (preg->no_sub || (!(eflags & REG_ALLOC_REGS) && (nmatch <= 1)) || (eflags & REG_NO_SUBEXP_REPORTING)),
475 preg->no_sub,
476 preg,
477 &rules,
478 0, len,
479 string);
480
481 if (!stat && pmatch && !preg->no_sub && (regs != *pmatch))
482 {
483 size_t x;
484 for (x = 0; x < nmatch; ++x)
485 (*pmatch)[x] = regs[x];
486 }
487
488 if (!stat && (eflags & REG_ALLOC_REGS))
489 *pmatch = regs;
490 else if (regs && (!pmatch || (regs != *pmatch)))
491 free (regs);
492
493 return stat;
494 }
495
496
497 /*(c regfree)
498 * void regfree (regex_t *preg);
499 *
500 * Release all storage allocated for the compiled regexp `preg'.
501 * This does not free `preg' itself.
502 */
503 void
regfree(regex_t * preg)504 regfree (regex_t *preg)
505 {
506 if (preg->pattern)
507 {
508 rx_free_exp (preg->pattern);
509 preg->pattern = 0;
510 }
511 if (preg->subexps)
512 {
513 rx_nfa_cache_free (preg->subexps);
514 preg->subexps = 0;
515 }
516 if (preg->translate != 0)
517 {
518 rx_nfa_cache_free (preg->translate);
519 preg->translate = 0;
520 }
521 }
522
523
524
525
526 /*(c regerror)
527 * size_t regerror (int errcode,
528 * const regex_t *preg,
529 * char *errbuf,
530 * size_t errbuf_size);
531 *
532 * Returns a message corresponding to an error code, `errcode',
533 * returned from either `regcomp' or `regexec'. The size of the
534 * message is returned. At most, `errbuf_size - 1' characters of the
535 * message are copied to `errbuf'. Whatever is stored in `errbuf' is
536 * 0-terminated.
537 *
538 * |error codes (for Posix regexps)|
539 * The POSIX error codes for regexp pattern matchers are:
540 *
541 * REG_NOMATCH "no match"
542 * REG_BADPAT "invalid regular expression"
543 * REG_ECOLLATE "invalid collation character"
544 * REG_ECTYPE "invalid character class name"
545 * REG_EESCAPE "trailing backslash"
546 * REG_ESUBREG "invalid back reference"
547 * REG_EBRACK "unmatched [ or [^"
548 * REG_EPAREN "unmatched (, \\(, ) or \\)"
549 * REG_EBRACE "unmatched \\{"
550 * REG_BADBR "invalid content of \\{\\}"
551 * REG_ERANGE "invalid range end"
552 * REG_ESPACE "memory exhausted"
553 * REG_BADRPT "invalid preceding regular expression"
554 *
555 * Rx also provides a non-standard error code that is used if
556 * `regexec' or `regnexec' is interrupted (see xref:"Escaping
557 * Long-Running Matches").
558 *
559 * REG_MATCH_INTERRUPTED "match interrupted"
560 *
561 */
562 size_t
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)563 regerror (int errcode,
564 const regex_t *preg,
565 char *errbuf,
566 size_t errbuf_size)
567 {
568 const char *msg;
569 size_t msg_size;
570
571 msg = (rx_error_msg[errcode] == 0) ? "Success" : (char *)rx_error_msg[errcode];
572 msg_size = str_length (msg) + 1; /* Includes the 0. */
573 if (errbuf_size != 0)
574 {
575 if (msg_size > errbuf_size)
576 {
577 str_cpy_n (errbuf, (t_uchar *)msg, errbuf_size - 1);
578 errbuf[errbuf_size - 1] = 0;
579 }
580 else
581 str_cpy (errbuf, (t_uchar *)msg);
582 }
583 return msg_size;
584 }
585
586
587 static int
rx_regexec(regmatch_t pmatch[],int no_subexp_reporting,int no_pos_reporting,const regex_t * preg,struct rx_context_rules * rules,rx_off_t start,rx_off_t end,const char * string)588 rx_regexec (regmatch_t pmatch[],
589 int no_subexp_reporting,
590 int no_pos_reporting,
591 const regex_t *preg,
592 struct rx_context_rules * rules,
593 rx_off_t start,
594 rx_off_t end,
595 const char *string)
596 {
597 int x;
598 int stat;
599 int left_anchored;
600 struct rx_exp_node * simplified;
601 struct rx_unfa * unfa;
602 struct rx_dfa machine;
603 int have_machine;
604
605 left_anchored = preg->is_left_anchored;
606
607 unfa = 0;
608 have_machine = 0;
609 if (!preg->small_p && ((end - start) > RX_MANY_CASES))
610 {
611 int err;
612 err = rx_simplify_rexp (&simplified, 256, preg->pattern, preg->subexps);
613 if (err)
614 return REG_ESPACE;
615
616 unfa = rx_unfa (simplified, 256);
617 if (!unfa)
618 {
619 rx_free_exp (simplified);
620 return REG_ESPACE;
621 }
622 rx_init_dfa_from_nfa ((struct rx_dfa *)&machine, unfa->nfa);
623 have_machine = 1;
624 rx_free_exp (simplified);
625 }
626
627 for (x = start; x <= end; ++x)
628 {
629 if (preg->is_nullable
630 || ((x < end)
631 && (preg->fastmap[((t_uchar *)string)[x]])))
632 {
633 if (!preg->is_nullable && !preg->small_p && ((end - start) > RX_MANY_CASES))
634 {
635 size_t amt;
636 int adv;
637
638 if (rx_dfa_goto_start_superstate ((struct rx_dfa *)&machine, 1))
639 {
640 espace_error:
641 rx_clear_dfa_state ((struct rx_dfa *)&machine);
642 rx_free_unfa ((struct rx_unfa *)unfa);
643 return REG_ESPACE;
644 }
645
646 if (setjmp (rx_escape_jmp_buf))
647 {
648 rx_clear_dfa_state ((struct rx_dfa *)&machine);
649 rx_free_unfa ((struct rx_unfa *)unfa);
650 return REG_MATCH_INTERRUPTED;
651 }
652 adv = rx_dfa_advance_to_final (&amt, (struct rx_dfa *)&machine, string + x, end - start - x);
653 if (0 > adv)
654 goto espace_error;
655 if (!adv || (!machine.final_tag && (amt < (end - start - x))))
656 goto nomatch;
657 if (no_pos_reporting && (!preg->pattern || !preg->pattern->observed))
658 {
659 rx_clear_dfa_state ((struct rx_dfa *)&machine);
660 rx_free_unfa ((struct rx_unfa *)unfa);
661 return 0;
662 }
663 }
664 stat = rx_regmatch (pmatch, no_subexp_reporting, no_pos_reporting, preg, rules, x, end, string, (x == start), 1);
665 if (!stat || (stat != REG_NOMATCH))
666 {
667 if (have_machine)
668 rx_clear_dfa_state ((struct rx_dfa *)&machine);
669 rx_free_unfa ((struct rx_unfa *)unfa);
670 return stat;
671 }
672 }
673 nomatch:
674 if (left_anchored)
675 {
676 if (!preg->newline_anchor)
677 {
678 if (have_machine)
679 rx_clear_dfa_state ((struct rx_dfa *)&machine);
680 rx_free_unfa ((struct rx_unfa *)unfa);
681 return REG_NOMATCH;
682 }
683 else
684 while (x < end)
685 {
686 if (string[x] == '\n')
687 break;
688 else
689 ++x;
690 }
691 }
692 }
693 if (have_machine)
694 rx_clear_dfa_state ((struct rx_dfa *)&machine);
695 rx_free_unfa ((struct rx_unfa *)unfa);
696 return REG_NOMATCH;
697 }
698
699
700 static int
is_simple_anchored_regexp(int * anchored_start,int * anchored_end,struct rx_exp_node * pattern,int no_subexp_reporting)701 is_simple_anchored_regexp (int * anchored_start, int * anchored_end, struct rx_exp_node * pattern, int no_subexp_reporting)
702 {
703 struct rx_exp_node * left;
704 struct rx_exp_node * right;
705 int left_start;
706 int left_end;
707 int left_is;
708 int right_start;
709 int right_end;
710 int right_is;
711
712
713 *anchored_start = 0;
714 *anchored_end = 0;
715
716 if (!pattern)
717 return 0;
718
719 if ( (pattern->type == r_context)
720 && ((pattern->intval == '^')
721 || (pattern->intval == '$')))
722 {
723 *anchored_start = (pattern->intval == '^');
724 *anchored_end = (pattern->intval == '$');
725 return 1;
726 }
727
728 if ( (pattern->type != r_concat)
729 && (pattern->type != r_right_concat)
730 && (pattern->type != r_alternate)
731 && ( (pattern->type != r_parens)
732 || (!no_subexp_reporting && pattern->intval)))
733 return 0;
734
735 if (pattern->type == r_parens)
736 return is_simple_anchored_regexp (anchored_start, anchored_end, pattern->left, no_subexp_reporting);
737
738 left = pattern->left;
739 right = pattern->right;
740
741 left_is = is_simple_anchored_regexp (&left_start, &left_end, left, no_subexp_reporting);
742 right_is = is_simple_anchored_regexp (&right_start, &right_end, right, no_subexp_reporting);
743
744 if (!left_is && !right_is)
745 return 0;
746
747 if (!left_is && left)
748 {
749 if (left->observed && (!no_subexp_reporting || !left->observation_contingent))
750 return 0;
751 }
752
753 if (!right_is && right)
754 {
755 if (right->observed && (!no_subexp_reporting || !right->observation_contingent))
756 return 0;
757 }
758
759 if ((pattern->type == r_concat) || (pattern->type == r_right_concat))
760 {
761 if (left_is && left_end && right)
762 return 0;
763 if (right_is && right_start && left)
764 return 0;
765 *anchored_start = left_start || right_start;
766 *anchored_end = left_end || right_end;
767 return 1;
768 }
769 else
770 {
771 /* r_alternate */
772 if ((left_start != right_start) || (left_end != right_end))
773 return 0;
774 *anchored_start = left_start;
775 *anchored_end = left_end;
776 return 1;
777 }
778 }
779
780
781 static int
rx_regmatch(regmatch_t pmatch[],int no_subexp_reporting,int no_pos_reporting,const regex_t * preg,struct rx_context_rules * rules,rx_off_t start,rx_off_t end,const char * string,int is_bos,int is_eos)782 rx_regmatch (regmatch_t pmatch[],
783 int no_subexp_reporting,
784 int no_pos_reporting,
785 const regex_t *preg,
786 struct rx_context_rules * rules,
787 rx_off_t start,
788 rx_off_t end,
789 const char *string,
790 int is_bos,
791 int is_eos)
792 {
793 struct rx_solutions * solutions;
794 int answer;
795 struct rx_context_rules local_rules;
796 rx_off_t orig_end;
797 rx_off_t end_lower_bound;
798 rx_off_t end_upper_bound;
799 int simple_anchored;
800 int anchored_start;
801 int anchored_end;
802
803 local_rules = *rules;
804 orig_end = end;
805
806 if (!preg->pattern)
807 {
808 end_lower_bound = start;
809 end_upper_bound = start;
810 }
811 else if (preg->pattern->len >= 0)
812 {
813 if ((end - start) < preg->pattern->len)
814 return REG_NOMATCH;
815 end_lower_bound = start + preg->pattern->len;
816 end_upper_bound = start + preg->pattern->len;
817 }
818 else
819 {
820 end_lower_bound = start;
821 end_upper_bound = end;
822 }
823
824 simple_anchored = is_simple_anchored_regexp (&anchored_start, &anchored_end, preg->pattern, no_subexp_reporting);
825 if ( simple_anchored
826 || !preg->pattern
827 || !preg->pattern->observed
828 || (preg->pattern->observation_contingent && no_subexp_reporting))
829 {
830 int free_pattern;
831 struct rx_exp_node * pattern;
832 struct rx_unfa * unfa;
833 struct rx_dfa machine;
834 rx_off_t best;
835 int best_label;
836 rx_off_t pos;
837 size_t amt;
838
839 if (simple_anchored && anchored_start && rules->not_bol && is_bos)
840 return REG_NOMATCH;
841
842 free_pattern = 0;
843
844 if (!simple_anchored || !preg->pattern || !preg->pattern->observed)
845 pattern = preg->pattern;
846 else
847 {
848 int err;
849 err = rx_simplify_rexp (&pattern, 256, preg->pattern, preg->subexps);
850 if (err)
851 return REG_ESPACE;
852 free_pattern = 1;
853 }
854
855 unfa = rx_unfa (pattern, 256);
856
857 if (free_pattern)
858 rx_free_exp (pattern);
859 if (!unfa)
860 return REG_ESPACE;
861
862 rx_init_dfa_from_nfa ((struct rx_dfa *)&machine, unfa->nfa);
863 if (rx_dfa_goto_start_superstate ((struct rx_dfa *)&machine, 1))
864 {
865 espace_error:
866 rx_clear_dfa_state ((struct rx_dfa *)&machine);
867 rx_free_unfa (unfa);
868 return REG_ESPACE;
869 }
870
871 if (setjmp (rx_escape_jmp_buf))
872 {
873 rx_clear_dfa_state ((struct rx_dfa *)&machine);
874 rx_free_unfa (unfa);
875 return REG_MATCH_INTERRUPTED;
876 }
877
878 best = -1;
879 best_label = 0;
880 pos = start;
881 if (machine.final_tag)
882 {
883 if ( !simple_anchored
884 || !anchored_end
885 || (rules->newline_anchor && (pos < orig_end) && (string[pos] == '\n'))
886 || (!rules->not_eol && is_eos && (orig_end == start)))
887 {
888 if (no_pos_reporting)
889 {
890 rx_clear_dfa_state ((struct rx_dfa *)&machine);
891 rx_free_unfa (unfa);
892 return 0;
893 }
894 best = start;
895 best_label = (int)machine.final_tag;
896 }
897 }
898 while (pos < end_upper_bound)
899 {
900 int adv;
901
902 adv = rx_dfa_advance_to_final (&amt, (struct rx_dfa *)&machine, string + pos, end_upper_bound - pos);
903 if (0 > adv)
904 goto espace_error;
905 if (!adv || !machine.final_tag)
906 break;
907 pos += amt;
908 if ( !simple_anchored
909 || !anchored_end
910 || (rules->newline_anchor && (pos < orig_end) && (string[pos] == '\n'))
911 || (!rules->not_eol && is_eos && (pos == orig_end)))
912 {
913 if (no_pos_reporting)
914 {
915 rx_clear_dfa_state ((struct rx_dfa *)&machine);
916 rx_free_unfa (unfa);
917 return 0;
918 }
919 best = pos;
920 best_label = (int)machine.final_tag;
921 }
922 }
923 if ((best >= 0) && pmatch)
924 {
925 pmatch[0].rm_so = start;
926 pmatch[0].rm_eo = best;
927 pmatch[0].final_tag = best_label;
928 }
929 rx_clear_dfa_state ((struct rx_dfa *)&machine);
930 rx_free_unfa (unfa);
931 if (best < 0)
932 return REG_NOMATCH;
933 else
934 return 0;
935 }
936
937 answer = 0;
938
939 {
940 #define n_end_guess 256
941 regoff_t end_guesses[n_end_guess];
942 int valid_end_guesses;
943 int total_end_guesses = 0;
944 int end_guess_position;
945 int no_end_guess_optimization;
946 int next_no_end_guess_optimization;
947 struct rx_exp_node * pattern;
948 struct rx_unfa * unfa;
949 struct rx_dfa machine;
950 int end_search_direction;
951
952 next_no_end_guess_optimization = (end_upper_bound == end_lower_bound);
953 #if 0
954 /* to disable DFA optimization: */
955 next_no_end_guess_optimization = 1;
956 #endif
957
958 anchored_start = preg->is_left_anchored;
959 anchored_end = preg->is_right_anchored;
960
961 {
962 int err;
963 err = rx_simplify_rexp (&pattern, 256, preg->pattern, preg->subexps);
964 if (err)
965 return REG_ESPACE;
966 }
967
968 unfa = rx_unfa (pattern, 256);
969 rx_free_exp (pattern);
970 if (!unfa)
971 return REG_ESPACE;
972
973 rx_init_dfa_from_nfa ((struct rx_dfa *)&machine, unfa->nfa);
974
975 /* precondition: end is the last end-point to check for a match.
976 * That's end_lower_bound no_pos_reporting,
977 * and end_upper_bound otherwise.
978 *
979 * if no_pos_reporting is not 0,
980 * The DFA is in the right state for
981 * processing the character at position `end'.
982 *
983 * end_search_direction = (no_pos_reporting ? 1 : -1)
984 */
985
986 if (!no_pos_reporting)
987 {
988 end = end_upper_bound;
989 end_search_direction = -1;
990 }
991 else
992 {
993 end = end_lower_bound;
994 end_search_direction = 1;
995
996 if (rx_dfa_goto_start_superstate ((struct rx_dfa *)&machine, 1))
997 {
998 espace_error0:
999 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1000 rx_free_unfa (unfa);
1001 return REG_ESPACE;
1002 }
1003
1004 if (setjmp (rx_escape_jmp_buf))
1005 {
1006 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1007 rx_free_unfa (unfa);
1008 return REG_MATCH_INTERRUPTED;
1009 }
1010
1011 if (start < end)
1012 {
1013 if (!rx_dfa_advance (&machine, string + start, end - start))
1014 goto espace_error0;
1015 }
1016 }
1017
1018 while (no_pos_reporting ? (end <= end_upper_bound) : (end >= end_lower_bound))
1019 {
1020 rx_off_t pos;
1021 #define POSITIONS_REMAIN (no_pos_reporting ? (pos <= end_upper_bound) : (pos >= end_lower_bound))
1022
1023 #define VALID_FINAL_POSITION \
1024 ( !anchored_end \
1025 || ((pos == orig_end) && is_eos && !rules->not_eol) \
1026 || ((pos < orig_end) && rules->newline_anchor && (string[pos] == '\n')))
1027
1028
1029 /* invariant:
1030 *
1031 * end is the next plausible end-position of a match.
1032 *
1033 * next_no_end_guess_optimization says whether or not DFA checks are
1034 * likely to pay off by eliminating some possible
1035 * values of `end' from consideration.
1036 *
1037 * If no_pos_reporting, the DFA is set to process the character
1038 * at `end'.
1039 */
1040
1041 no_end_guess_optimization = next_no_end_guess_optimization;
1042
1043 if (no_end_guess_optimization)
1044 {
1045 /* Arrange to check all possible end points without doing a DFA check.
1046 */
1047 valid_end_guesses = 0;
1048 end_guess_position = n_end_guess - 1;
1049 pos = end;
1050
1051 while (POSITIONS_REMAIN && (valid_end_guesses < n_end_guess))
1052 {
1053 if (VALID_FINAL_POSITION)
1054 {
1055 end_guesses[end_guess_position] = pos;
1056 --end_guess_position;
1057 ++valid_end_guesses;
1058 }
1059 pos += end_search_direction;
1060 }
1061 end_guess_position = 0;
1062 /* Post-condition:
1063 *
1064 * Treating end_guesses as a circular stack, their are valid_end_guesses
1065 * items on the stack which are possible end positions in the order
1066 * they should be checked.
1067 */
1068 }
1069 else
1070 {
1071 size_t amt;
1072
1073 /* Arrange to check end points that pass a DFA check.
1074 */
1075 if (no_pos_reporting)
1076 {
1077 /* We'll find possible end positions in the order they should
1078 * be checked, so treat end_guesses as a queue.
1079 */
1080 end_guess_position = n_end_guess - 1;
1081
1082 /* end is the next position to check for possible
1083 * end-of-match. The DFA is already set to process this character.
1084 */
1085 pos = end;
1086 }
1087 else
1088 {
1089 /* We'll find possible end positions in the opposite of the order they should
1090 * be checked, so treat end_guesses as a stack.
1091 */
1092 end_guess_position = 0;
1093
1094 /* end is the last position to check for possible
1095 * end-of-match. Checking begins at `start'.
1096 */
1097 pos = start;
1098 if (rx_dfa_goto_start_superstate ((struct rx_dfa *)&machine, 1))
1099 {
1100 espace_error2:
1101 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1102 rx_free_unfa (unfa);
1103 return REG_ESPACE;
1104 }
1105 }
1106
1107 valid_end_guesses = 0;
1108 total_end_guesses = 0;
1109
1110 if (setjmp (rx_escape_jmp_buf))
1111 {
1112 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1113 rx_free_unfa (unfa);
1114 return REG_MATCH_INTERRUPTED;
1115 }
1116
1117 if (machine.final_tag && VALID_FINAL_POSITION)
1118 {
1119 end_guesses[end_guess_position] = pos;
1120 ++valid_end_guesses;
1121 ++total_end_guesses;
1122
1123 end_guess_position -= end_search_direction;
1124 }
1125
1126 {
1127 while (no_pos_reporting ? (pos < end_upper_bound) : (pos < end))
1128 {
1129 int adv;
1130
1131 if (no_pos_reporting)
1132 adv = rx_dfa_advance_to_final (&amt, (struct rx_dfa *)&machine, string + pos, end_upper_bound - pos);
1133 else
1134 adv = rx_dfa_advance_to_final (&amt, (struct rx_dfa *)&machine, string + pos, end - pos);
1135 if (0 > adv)
1136 goto espace_error2;
1137 if (!adv || !machine.final_tag)
1138 break;
1139 pos += amt;
1140 if (!VALID_FINAL_POSITION)
1141 continue;
1142 end_guesses[end_guess_position] = pos;
1143 if (no_pos_reporting)
1144 {
1145 ++valid_end_guesses;
1146 if (valid_end_guesses == n_end_guess)
1147 {
1148 end_guess_position = 0;
1149 break;
1150 }
1151 else
1152 --end_guess_position;
1153 }
1154 else
1155 {
1156 if (valid_end_guesses < n_end_guess)
1157 ++valid_end_guesses;
1158 end_guess_position = ((end_guess_position + 1) % n_end_guess);
1159 ++total_end_guesses;
1160 }
1161 }
1162
1163 if (no_pos_reporting)
1164 end_guess_position = 0;
1165 }
1166
1167 /* end_guesses is now a stack of values which when popped,
1168 * are plausible end positions int the correct order to
1169 * check.
1170 */
1171 }
1172
1173 if (!valid_end_guesses)
1174 {
1175 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1176 rx_free_unfa (unfa);
1177 return REG_NOMATCH;
1178 }
1179
1180 if (no_end_guess_optimization || (total_end_guesses == ((end_upper_bound - end_lower_bound) + 1)))
1181 next_no_end_guess_optimization = 1;
1182
1183 while (valid_end_guesses)
1184 {
1185 end_guess_position = (((end_guess_position + n_end_guess) - 1) % n_end_guess);
1186 --valid_end_guesses;
1187 end = end_guesses[end_guess_position];
1188
1189 local_rules.not_eol = (rules->not_eol
1190
1191 ? ( ((end == orig_end) && is_eos)
1192 || !local_rules.newline_anchor
1193 || (string[end] != '\n')) /* string[end] is valid because either (end < orig_end) || !is_eos */
1194
1195 : ( (end != orig_end)
1196 && (!local_rules.newline_anchor
1197 || (string[end] != '\n'))));
1198 solutions = rx_basic_make_solutions (pmatch, preg->pattern, preg->subexps, preg->re_nsub,
1199 start, end, &local_rules, string, preg->small_p);
1200 if (!solutions)
1201 {
1202 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1203 rx_free_unfa (unfa);
1204 return REG_ESPACE;
1205 }
1206
1207 if (setjmp (rx_escape_jmp_buf))
1208 {
1209 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1210 rx_free_unfa (unfa);
1211 rx_basic_free_solutions (solutions);
1212 return REG_MATCH_INTERRUPTED;
1213 }
1214 answer = rx_next_solution (solutions);
1215 if (answer < 0)
1216 {
1217 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1218 rx_free_unfa (unfa);
1219 rx_basic_free_solutions (solutions);
1220 return REG_ESPACE;
1221 }
1222
1223 if (answer == 1)
1224 {
1225 if (!no_pos_reporting && pmatch)
1226 {
1227 pmatch[0].rm_so = start;
1228 pmatch[0].rm_eo = end;
1229 pmatch[0].final_tag = rx_solutions_final_tag (solutions);
1230 }
1231 rx_basic_free_solutions (solutions);
1232 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1233 rx_free_unfa (unfa);
1234 return 0;
1235 }
1236 else
1237 rx_basic_free_solutions (solutions);
1238 }
1239 /* post condition: end is the last end position tried.
1240 */
1241 end += end_search_direction;
1242 }
1243 rx_clear_dfa_state ((struct rx_dfa *)&machine);
1244 rx_free_unfa (unfa);
1245 }
1246
1247 return REG_NOMATCH;
1248 }
1249
1250
1251
1252
1253 /*c rx_is_anchored_p
1254 * void rx_is_anchored_p (int * left,
1255 * int * right,
1256 * struct rx_exp_node * exp);
1257 *
1258 * Is an expression "anchored", meaning, must it match at string
1259 * or line boundary on either the left or right (`^' or `$')?
1260 *
1261 * Knowing whether or not an expression is anchored is useful for
1262 * optimizing some common kinds of regexp search, so this function
1263 * computes that property.
1264 *
1265 insert*/
1266 static void
rx_is_anchored_p(int * left,int * right,struct rx_exp_node * exp)1267 rx_is_anchored_p (int * left,
1268 int * right,
1269 struct rx_exp_node * exp)
1270 {
1271 int ign;
1272
1273 if (!left)
1274 left = &ign;
1275
1276 if (!right)
1277 right = &ign;
1278
1279 if (!exp)
1280 {
1281 *left = 0;
1282 *right = 0;
1283 return;
1284 }
1285
1286 switch (exp->type)
1287 {
1288 default:
1289 case r_star:
1290 case r_cset:
1291 case r_string:
1292 case r_cut:
1293 *left = 0;
1294 *right = 0;
1295 break;
1296
1297 case r_parens:
1298 rx_is_anchored_p (left, right, exp->left);
1299 break;
1300
1301 case r_concat:
1302 case r_right_concat:
1303 if (!left)
1304 rx_is_anchored_p (left, right, exp->right);
1305 else if (!right)
1306 rx_is_anchored_p (left, right, exp->left);
1307 else
1308 {
1309 rx_is_anchored_p (left, &ign, exp->left);
1310 rx_is_anchored_p (&ign, right, exp->right);
1311 }
1312 break;
1313
1314
1315 case r_alternate:
1316 {
1317 int l1;
1318 int r1;
1319 int l2;
1320 int r2;
1321
1322 rx_is_anchored_p (&l1, &r1, exp->left);
1323 rx_is_anchored_p (&l2, &r2, exp->right);
1324 *left = l1 && l2;
1325 *right = r1 && r2;
1326 break;
1327 }
1328
1329 case r_interval:
1330 if (exp->intval == 0)
1331 {
1332 *left = 0;
1333 *right = 0;
1334 }
1335 else
1336 rx_is_anchored_p (left, right, exp->left);
1337 break;
1338
1339 case r_context:
1340 if (exp->intval == '^')
1341 {
1342 *left = 1;
1343 *right = 0;
1344 }
1345 else if (exp->intval == '$')
1346 {
1347 *left = 0;
1348 *right = 1;
1349 }
1350 else
1351 {
1352 *left = 0;
1353 *right = 0;
1354 }
1355 break;
1356 }
1357 }
1358 /*end-insert
1359 */
1360
1361 /*c rx_fill_in_fastmap
1362 * int rx_fill_in_fastmap (int cset_size,
1363 * t_uchar * map,
1364 * struct rx_exp_node * exp);
1365 *
1366 * If a pattern can not match the empty string, then there is
1367 * a set of characters (the "fastmap") from which the first character
1368 * of a matching string must come. For some patterns, the fastmap is
1369 * smaller than the complete character set and is easy to compute.
1370 * Knowing the fastmap is useful for optimizing some kinds of
1371 * regexp search.
1372 *
1373 * This function returns (in `map') a set represented as an array of
1374 * 256 bytes, with entries for members of the fastmap set equal to 1,
1375 * and other entries equal to 0.
1376 */
1377 static int
rx_fill_in_fastmap(int cset_size,t_uchar * map,struct rx_exp_node * exp)1378 rx_fill_in_fastmap (int cset_size, t_uchar * map, struct rx_exp_node * exp)
1379 {
1380 if (!exp)
1381 {
1382 can_match_empty:
1383 {
1384 int x;
1385 for (x = 0; x < cset_size; ++x)
1386 map[x] = 1;
1387 }
1388 return 1;
1389 }
1390
1391 switch (exp->type)
1392 {
1393 case r_cset:
1394 {
1395 int x;
1396 int most;
1397
1398 most = exp->cset_size;
1399 for (x = 0; x < most; ++x)
1400 if (bits_is_member (exp->cset, x))
1401 map[x] = 1;
1402 }
1403 return 0;
1404
1405 case r_string:
1406 if (exp->str_len)
1407 {
1408 map[exp->str[0]] = 1;
1409 return 0;
1410 }
1411 else
1412 return 1;
1413
1414 case r_concat:
1415 case r_right_concat:
1416 return ( rx_fill_in_fastmap (cset_size, map, exp->left)
1417 && rx_fill_in_fastmap (cset_size, map, exp->right));
1418
1419 case r_alternate:
1420 return ( rx_fill_in_fastmap (cset_size, map, exp->left)
1421 || rx_fill_in_fastmap (cset_size, map, exp->right));
1422
1423 case r_parens:
1424 return rx_fill_in_fastmap (cset_size, map, exp->left);
1425
1426 case r_star:
1427 goto can_match_empty;
1428
1429 case r_interval:
1430 if (exp->intval == 0)
1431 goto can_match_empty;
1432 else
1433 return rx_fill_in_fastmap (cset_size, map, exp->left);
1434
1435 case r_cut:
1436 goto can_match_empty;
1437
1438 case r_context:
1439 goto can_match_empty;
1440
1441 default:
1442 while (1)
1443 panic ("bogus regexp in rx_fill_in_fastmap");
1444 }
1445 }
1446