1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
17
18
19 // This is a translation into C++ of regex.c, the GNU regexp package.
20
21 /* To test, compile with -Dtest. This Dtestable feature turns this into
22 a self-contained program which reads a pattern, describes how it
23 compiles, then reads a string and searches for it.
24
25 On the other hand, if you compile with both -Dtest and -Dcanned you
26 can run some tests we've already thought of. */
27
28 /* AIX requires the alloca decl to be the first thing in the file. */
29 #ifdef __GNUC__
30 #define alloca alloca
31 #else
32 #ifdef sparc
33 #include <alloca.h>
34 #else
35 #ifdef _AIX
36 #pragma alloca
37 #else
38 char *alloca ();
39 #endif
40 #endif
41 #endif
42
43 #ifdef emacs
44
45 /* The `emacs' switch turns on certain special matching commands
46 that make sense only in emacs. */
47
48 #include "config.h"
49 #include "lisp.h"
50 #include "buffer.h"
51 #include "syntax.h"
52
53 #else /* not emacs */
54
55 #include <_G_config.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 /* Define the syntax stuff, so we can do the \<, \>, etc. */
60
61 /* This must be nonzero for the wordchar and notwordchar pattern
62 commands in re_match_2. */
63 #ifndef Sword
64 #define Sword 1
65 #endif
66
67 #define SYNTAX(c) re_syntax_table[c]
68
69
70 #ifdef SYNTAX_TABLE
71
72 char *re_syntax_table;
73
74 #else /* not SYNTAX_TABLE */
75
76 static char re_syntax_table[256];
77
78
79 static void
init_syntax_once()80 init_syntax_once ()
81 {
82 register int c;
83 static int done = 0;
84
85 if (done)
86 return;
87
88 memset (re_syntax_table, 0, sizeof re_syntax_table);
89
90 for (c = 'a'; c <= 'z'; c++)
91 re_syntax_table[c] = Sword;
92
93 for (c = 'A'; c <= 'Z'; c++)
94 re_syntax_table[c] = Sword;
95
96 for (c = '0'; c <= '9'; c++)
97 re_syntax_table[c] = Sword;
98
99 done = 1;
100 }
101
102 #endif /* SYNTAX_TABLE */
103 #endif /* emacs */
104
105 /* We write fatal error messages on standard error. */
106 #include <stdio.h>
107
108 /* isalpha(3) etc. are used for the character classes. */
109 #include <ctype.h>
110 /* Sequents are missing isgraph. */
111 #ifndef isgraph
112 #define isgraph(c) (isprint((c)) && !isspace((c)))
113 #endif
114
115 /* Get the interface, including the syntax bits. */
116 #include "regex.h"
117
118
119 /* These are the command codes that appear in compiled regular
120 expressions, one per byte. Some command codes are followed by
121 argument bytes. A command code can specify any interpretation
122 whatsoever for its arguments. Zero-bytes may appear in the compiled
123 regular expression.
124
125 The value of `exactn' is needed in search.c (search_buffer) in emacs.
126 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
127 `exactn' we use here must also be 1. */
128
129 enum regexpcode
130 {
131 unused=0,
132 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
133 begline, /* Fail unless at beginning of line. */
134 endline, /* Fail unless at end of line. */
135 jump, /* Followed by two bytes giving relative address to jump to. */
136 on_failure_jump, /* Followed by two bytes giving relative address of
137 place to resume at in case of failure. */
138 finalize_jump, /* Throw away latest failure point and then jump to
139 address. */
140 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
141 This is used to jump back to the beginning
142 of a repeat. If the command that follows
143 this jump is clearly incompatible with the
144 one at the beginning of the repeat, such that
145 we can be sure that there is no use backtracking
146 out of repetitions already completed,
147 then we finalize. */
148 dummy_failure_jump, /* Jump, and push a dummy failure point. This
149 failure point will be thrown away if an attempt
150 is made to use it for a failure. A + construct
151 makes this before the first repeat. Also
152 use it as an intermediary kind of jump when
153 compiling an or construct. */
154 succeed_n, /* Used like on_failure_jump except has to succeed n times;
155 then gets turned into an on_failure_jump. The relative
156 address following it is useless until then. The
157 address is followed by two bytes containing n. */
158 jump_n, /* Similar to jump, but jump n times only; also the relative
159 address following is in turn followed by yet two more bytes
160 containing n. */
161 set_number_at, /* Set the following relative location to the
162 subsequent number. */
163 anychar, /* Matches any (more or less) one character. */
164 charset, /* Matches any one char belonging to specified set.
165 First following byte is number of bitmap bytes.
166 Then come bytes for a bitmap saying which chars are in.
167 Bits in each byte are ordered low-bit-first.
168 A character is in the set if its bit is 1.
169 A character too large to have a bit in the map
170 is automatically not in the set. */
171 charset_not, /* Same parameters as charset, but match any character
172 that is not one of those specified. */
173 start_memory, /* Start remembering the text that is matched, for
174 storing in a memory register. Followed by one
175 byte containing the register number. Register numbers
176 must be in the range 0 through RE_NREGS. */
177 stop_memory, /* Stop remembering the text that is matched
178 and store it in a memory register. Followed by
179 one byte containing the register number. Register
180 numbers must be in the range 0 through RE_NREGS. */
181 duplicate, /* Match a duplicate of something remembered.
182 Followed by one byte containing the index of the memory
183 register. */
184 #ifdef emacs
185 before_dot, /* Succeeds if before point. */
186 at_dot, /* Succeeds if at point. */
187 after_dot, /* Succeeds if after point. */
188 #endif
189 begbuf, /* Succeeds if at beginning of buffer. */
190 endbuf, /* Succeeds if at end of buffer. */
191 wordchar, /* Matches any word-constituent character. */
192 notwordchar, /* Matches any char that is not a word-constituent. */
193 wordbeg, /* Succeeds if at word beginning. */
194 wordend, /* Succeeds if at word end. */
195 wordbound, /* Succeeds if at a word boundary. */
196 notwordbound,/* Succeeds if not at a word boundary. */
197 #ifdef emacs
198 syntaxspec, /* Matches any character whose syntax is specified.
199 followed by a byte which contains a syntax code,
200 e.g., Sword. */
201 notsyntaxspec /* Matches any character whose syntax differs from
202 that specified. */
203 #endif
204 };
205
206
207 /* Number of failure points to allocate space for initially,
208 when matching. If this number is exceeded, more space is allocated,
209 so it is not a hard limit. */
210
211 #ifndef NFAILURES
212 #define NFAILURES 80
213 #endif
214
215
216 #ifndef SIGN_EXTEND_CHAR
217 #ifdef __STDC__
218 #define SIGN_EXTEND_CHAR(c) ((signed char)(c))
219 #else
220 #define SIGN_EXTEND_CHAR(c) (((c)^128) - 128) /* As in Harbison and Steele. */
221 #endif
222 #endif /* not SIGN_EXTEND_CHAR */
223
224 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
225 #define STORE_NUMBER(destination, number) \
226 { (destination)[0] = (number) & 0377; \
227 (destination)[1] = (number) >> 8; }
228
229 /* Same as STORE_NUMBER, except increment the destination pointer to
230 the byte after where the number is stored. Watch out that values for
231 DESTINATION such as p + 1 won't work, whereas p will. */
232 #define STORE_NUMBER_AND_INCR(destination, number) \
233 { STORE_NUMBER(destination, number); \
234 (destination) += 2; }
235
236
237 /* Put into DESTINATION a number stored in two contingous bytes starting
238 at SOURCE. */
239 #define EXTRACT_NUMBER(destination, source) \
240 { (destination) = *(source) & 0377; \
241 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
242
243 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
244 point to second byte of SOURCE. Note that SOURCE has to be a value
245 such as p, not, e.g., p + 1. */
246 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
247 { EXTRACT_NUMBER (destination, source); \
248 (source) += 2; }
249
250
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
254
255 The argument SYNTAX is a bit-mask comprised of the various bits
256 defined in regex.h. */
257
258 int
re_set_syntax(int syntax)259 re_set_syntax (int syntax)
260 {
261 int ret;
262
263 ret = obscure_syntax;
264 obscure_syntax = syntax;
265 return ret;
266 }
267
268 /* Set by re_set_syntax to the current regexp syntax to recognize. */
269 int obscure_syntax = 0;
270
271
272
273 /* Macros for re_compile_pattern, which is found below these definitions. */
274
275 #define CHAR_CLASS_MAX_LENGTH 6
276
277 /* Fetch the next character in the uncompiled pattern, translating it if
278 necessary. */
279 #define PATFETCH(c) \
280 {if (p == pend) goto end_of_pattern; \
281 c = * (const unsigned char *) p++; \
282 if (translate) c = translate[c]; }
283
284 /* Fetch the next character in the uncompiled pattern, with no
285 translation. */
286 #define PATFETCH_RAW(c) \
287 {if (p == pend) goto end_of_pattern; \
288 c = * (const unsigned char *) p++; }
289
290 #define PATUNFETCH p--
291
292
293 /* If the buffer isn't allocated when it comes in, use this. */
294 #define INIT_BUF_SIZE 28
295
296 /* Make sure we have at least N more bytes of space in buffer. */
297 #define GET_BUFFER_SPACE(n) \
298 { \
299 while (b - bufp->buffer + (n) >= bufp->allocated) \
300 EXTEND_BUFFER; \
301 }
302
303 /* Make sure we have one more byte of buffer space and then add CH to it. */
304 #define BUFPUSH(ch) \
305 { \
306 GET_BUFFER_SPACE (1); \
307 *b++ = (char) (ch); \
308 }
309
310 /* Extend the buffer by twice its current size via reallociation and
311 reset the pointers that pointed into the old allocation to point to
312 the correct places in the new allocation. If extending the buffer
313 results in it being larger than 1 << 16, then flag memory exhausted. */
314 #define EXTEND_BUFFER \
315 { char *old_buffer = bufp->buffer; \
316 if (bufp->allocated == (1L<<16)) goto too_big; \
317 bufp->allocated *= 2; \
318 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
319 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
320 if (bufp->buffer == 0) \
321 goto memory_exhausted; \
322 b = (b - old_buffer) + bufp->buffer; \
323 if (fixup_jump) \
324 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
325 if (laststart) \
326 laststart = (laststart - old_buffer) + bufp->buffer; \
327 begalt = (begalt - old_buffer) + bufp->buffer; \
328 if (pending_exact) \
329 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
330 }
331
332 /* Set the bit for character C in a character set list. */
333 #define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
334
335 /* Get the next unsigned number in the uncompiled pattern. */
336 #define GET_UNSIGNED_NUMBER(num) \
337 { if (p != pend) \
338 { \
339 PATFETCH (c); \
340 while (isdigit (c)) \
341 { \
342 if (num < 0) \
343 num = 0; \
344 num = num * 10 + c - '0'; \
345 if (p == pend) \
346 break; \
347 PATFETCH (c); \
348 } \
349 } \
350 }
351
352 /* Subroutines for re_compile_pattern. */
353 static void store_jump (char *from, char opcode, char *to);
354 static void insert_jump (char op, char *from, char *to, char *current_end);
355 static void store_jump_n (char *from, char opcode, char *to, unsigned n);
356 static void insert_jump_n (char, char *, char *, char *, unsigned);
357 static void insert_op_2 (char, char *, char *_end, int, int);
358
359
360 /* re_compile_pattern takes a regular-expression string
361 and converts it into a buffer full of byte commands for matching.
362
363 PATTERN is the address of the pattern string
364 SIZE is the length of it.
365 BUFP is a struct re_pattern_buffer * which points to the info
366 on where to store the byte commands.
367 This structure contains a char * which points to the
368 actual space, which should have been obtained with malloc.
369 re_compile_pattern may use realloc to grow the buffer space.
370
371 The number of bytes of commands can be found out by looking in
372 the `struct re_pattern_buffer' that bufp pointed to, after
373 re_compile_pattern returns. */
374
375 char *
re_compile_pattern(const char * pattern,int size,struct re_pattern_buffer * bufp)376 re_compile_pattern (const char *pattern, int size, struct re_pattern_buffer *bufp)
377 {
378 register char *b = bufp->buffer;
379 register const char *p = pattern;
380 const char *pend = pattern + size;
381 register unsigned c, c1;
382 const char *p1;
383 unsigned char *translate = (unsigned char *) bufp->translate;
384
385 /* Address of the count-byte of the most recently inserted `exactn'
386 command. This makes it possible to tell whether a new exact-match
387 character can be added to that command or requires a new `exactn'
388 command. */
389
390 char *pending_exact = 0;
391
392 /* Address of the place where a forward-jump should go to the end of
393 the containing expression. Each alternative of an `or', except the
394 last, ends with a forward-jump of this sort. */
395
396 char *fixup_jump = 0;
397
398 /* Address of start of the most recently finished expression.
399 This tells postfix * where to find the start of its operand. */
400
401 char *laststart = 0;
402
403 /* In processing a repeat, 1 means zero matches is allowed. */
404
405 char zero_times_ok;
406
407 /* In processing a repeat, 1 means many matches is allowed. */
408
409 char many_times_ok;
410
411 /* Address of beginning of regexp, or inside of last \(. */
412
413 char *begalt = b;
414
415 /* In processing an interval, at least this many matches must be made. */
416 int lower_bound;
417
418 /* In processing an interval, at most this many matches can be made. */
419 int upper_bound;
420
421 /* Place in pattern (i.e., the {) to which to go back if the interval
422 is invalid. */
423 const char *beg_interval = 0;
424
425 /* Stack of information saved by \( and restored by \).
426 Four stack elements are pushed by each \(:
427 First, the value of b.
428 Second, the value of fixup_jump.
429 Third, the value of regnum.
430 Fourth, the value of begalt. */
431
432 int stackb[40];
433 int *stackp = stackb;
434 int *stacke = stackb + 40;
435 int *stackt;
436
437 /* Counts \('s as they are encountered. Remembered for the matching \),
438 where it becomes the register number to put in the stop_memory
439 command. */
440
441 unsigned regnum = 1;
442
443 bufp->fastmap_accurate = 0;
444
445 #ifndef emacs
446 #ifndef SYNTAX_TABLE
447 /* Initialize the syntax table. */
448 init_syntax_once();
449 #endif
450 #endif
451
452 if (bufp->allocated == 0)
453 {
454 bufp->allocated = INIT_BUF_SIZE;
455 if (bufp->buffer)
456 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
457 bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
458 else
459 /* Caller did not allocate a buffer. Do it for them. */
460 bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
461 if (!bufp->buffer) goto memory_exhausted;
462 begalt = b = bufp->buffer;
463 }
464
465 while (p != pend)
466 {
467 PATFETCH (c);
468
469 switch (c)
470 {
471 case '$':
472 {
473 const char *p1 = p;
474 /* When testing what follows the $,
475 look past the \-constructs that don't consume anything. */
476 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
477 while (p1 != pend)
478 {
479 if (*p1 == '\\' && p1 + 1 != pend
480 && (p1[1] == '<' || p1[1] == '>'
481 || p1[1] == '`' || p1[1] == '\''
482 #ifdef emacs
483 || p1[1] == '='
484 #endif
485 || p1[1] == 'b' || p1[1] == 'B'))
486 p1 += 2;
487 else
488 break;
489 }
490 if (obscure_syntax & RE_TIGHT_VBAR)
491 {
492 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
493 goto normal_char;
494 /* Make operand of last vbar end before this `$'. */
495 if (fixup_jump)
496 store_jump (fixup_jump, jump, b);
497 fixup_jump = 0;
498 BUFPUSH (endline);
499 break;
500 }
501 /* $ means succeed if at end of line, but only in special contexts.
502 If validly in the middle of a pattern, it is a normal character. */
503
504 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
505 goto invalid_pattern;
506 if (p1 == pend || *p1 == '\n'
507 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
508 || (obscure_syntax & RE_NO_BK_PARENS
509 ? *p1 == ')'
510 : *p1 == '\\' && p1[1] == ')')
511 || (obscure_syntax & RE_NO_BK_VBAR
512 ? *p1 == '|'
513 : *p1 == '\\' && p1[1] == '|'))
514 {
515 BUFPUSH (endline);
516 break;
517 }
518 goto normal_char;
519 }
520 case '^':
521 /* ^ means succeed if at beg of line, but only if no preceding
522 pattern. */
523
524 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
525 goto invalid_pattern;
526 if (laststart && p - 2 >= pattern && p[-2] != '\n'
527 && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
528 goto normal_char;
529 if (obscure_syntax & RE_TIGHT_VBAR)
530 {
531 if (p != pattern + 1
532 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
533 goto normal_char;
534 BUFPUSH (begline);
535 begalt = b;
536 }
537 else
538 BUFPUSH (begline);
539 break;
540
541 case '+':
542 case '?':
543 if ((obscure_syntax & RE_BK_PLUS_QM)
544 || (obscure_syntax & RE_LIMITED_OPS))
545 goto normal_char;
546 handle_plus:
547 case '*':
548 /* If there is no previous pattern, char not special. */
549 if (!laststart)
550 {
551 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
552 goto invalid_pattern;
553 else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
554 goto normal_char;
555 }
556 /* If there is a sequence of repetition chars,
557 collapse it down to just one. */
558 zero_times_ok = 0;
559 many_times_ok = 0;
560 while (1)
561 {
562 zero_times_ok |= c != '+';
563 many_times_ok |= c != '?';
564 if (p == pend)
565 break;
566 PATFETCH (c);
567 if (c == '*')
568 ;
569 else if (!(obscure_syntax & RE_BK_PLUS_QM)
570 && (c == '+' || c == '?'))
571 ;
572 else if ((obscure_syntax & RE_BK_PLUS_QM)
573 && c == '\\')
574 {
575 int c1;
576 PATFETCH (c1);
577 if (!(c1 == '+' || c1 == '?'))
578 {
579 PATUNFETCH;
580 PATUNFETCH;
581 break;
582 }
583 c = c1;
584 }
585 else
586 {
587 PATUNFETCH;
588 break;
589 }
590 }
591
592 /* Star, etc. applied to an empty pattern is equivalent
593 to an empty pattern. */
594 if (!laststart)
595 break;
596
597 /* Now we know whether or not zero matches is allowed
598 and also whether or not two or more matches is allowed. */
599 if (many_times_ok)
600 {
601 /* If more than one repetition is allowed, put in at the
602 end a backward relative jump from b to before the next
603 jump we're going to put in below (which jumps from
604 laststart to after this jump). */
605 GET_BUFFER_SPACE (3);
606 store_jump (b, maybe_finalize_jump, laststart - 3);
607 b += 3; /* Because store_jump put stuff here. */
608 }
609 /* On failure, jump from laststart to b + 3, which will be the
610 end of the buffer after this jump is inserted. */
611 GET_BUFFER_SPACE (3);
612 insert_jump (on_failure_jump, laststart, b + 3, b);
613 pending_exact = 0;
614 b += 3;
615 if (!zero_times_ok)
616 {
617 /* At least one repetition is required, so insert a
618 dummy-failure before the initial on-failure-jump
619 instruction of the loop. This effects a skip over that
620 instruction the first time we hit that loop. */
621 GET_BUFFER_SPACE (6);
622 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
623 b += 3;
624 }
625 break;
626
627 case '.':
628 laststart = b;
629 BUFPUSH (anychar);
630 break;
631
632 case '[':
633 if (p == pend)
634 goto invalid_pattern;
635 while (b - bufp->buffer
636 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
637 EXTEND_BUFFER;
638
639 laststart = b;
640 if (*p == '^')
641 {
642 BUFPUSH (charset_not);
643 p++;
644 }
645 else
646 BUFPUSH (charset);
647 p1 = p;
648
649 BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
650 /* Clear the whole map */
651 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
652
653 if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
654 SET_LIST_BIT ('\n');
655
656
657 /* Read in characters and ranges, setting map bits. */
658 while (1)
659 {
660 /* Don't translate while fetching, in case it's a range bound.
661 When we set the bit for the character, we translate it. */
662 PATFETCH_RAW (c);
663
664 /* If set, \ escapes characters when inside [...]. */
665 if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
666 {
667 PATFETCH(c1);
668 SET_LIST_BIT (c1);
669 continue;
670 }
671 if (c == ']')
672 {
673 if (p == p1 + 1)
674 {
675 /* If this is an empty bracket expression. */
676 if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
677 && p == pend)
678 goto invalid_pattern;
679 }
680 else
681 /* Stop if this isn't merely a ] inside a bracket
682 expression, but rather the end of a bracket
683 expression. */
684 break;
685 }
686 /* Get a range. */
687 if (p[0] == '-' && p[1] != ']')
688 {
689 PATFETCH (c1);
690 /* Don't translate the range bounds while fetching them. */
691 PATFETCH_RAW (c1);
692
693 if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
694 goto invalid_pattern;
695
696 if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
697 && c1 == '-' && *p != ']')
698 goto invalid_pattern;
699
700 while (c <= c1)
701 {
702 /* Translate each char that's in the range. */
703 if (translate)
704 SET_LIST_BIT (translate[c]);
705 else
706 SET_LIST_BIT (c);
707 c++;
708 }
709 }
710 else if ((obscure_syntax & RE_CHAR_CLASSES)
711 && c == '[' && p[0] == ':')
712 {
713 /* Longest valid character class word has six characters. */
714 char str[CHAR_CLASS_MAX_LENGTH];
715 PATFETCH (c);
716 c1 = 0;
717 /* If no ] at end. */
718 if (p == pend)
719 goto invalid_pattern;
720 while (1)
721 {
722 /* Don't translate the ``character class'' characters. */
723 PATFETCH_RAW (c);
724 if (c == ':' || c == ']' || p == pend
725 || c1 == CHAR_CLASS_MAX_LENGTH)
726 break;
727 str[c1++] = c;
728 }
729 str[c1] = '\0';
730 if (p == pend
731 || c == ']' /* End of the bracket expression. */
732 || p[0] != ']'
733 || p + 1 == pend
734 || (strcmp (str, "alpha") != 0
735 && strcmp (str, "upper") != 0
736 && strcmp (str, "lower") != 0
737 && strcmp (str, "digit") != 0
738 && strcmp (str, "alnum") != 0
739 && strcmp (str, "xdigit") != 0
740 && strcmp (str, "space") != 0
741 && strcmp (str, "print") != 0
742 && strcmp (str, "punct") != 0
743 && strcmp (str, "graph") != 0
744 && strcmp (str, "cntrl") != 0))
745 {
746 /* Undo the ending character, the letters, and leave
747 the leading : and [ (but set bits for them). */
748 c1++;
749 while (c1--)
750 PATUNFETCH;
751 SET_LIST_BIT ('[');
752 SET_LIST_BIT (':');
753 }
754 else
755 {
756 /* The ] at the end of the character class. */
757 PATFETCH (c);
758 if (c != ']')
759 goto invalid_pattern;
760 for (c = 0; c < (1 << BYTEWIDTH); c++)
761 {
762 if ((strcmp (str, "alpha") == 0 && isalpha (c))
763 || (strcmp (str, "upper") == 0 && isupper (c))
764 || (strcmp (str, "lower") == 0 && islower (c))
765 || (strcmp (str, "digit") == 0 && isdigit (c))
766 || (strcmp (str, "alnum") == 0 && isalnum (c))
767 || (strcmp (str, "xdigit") == 0 && isxdigit (c))
768 || (strcmp (str, "space") == 0 && isspace (c))
769 || (strcmp (str, "print") == 0 && isprint (c))
770 || (strcmp (str, "punct") == 0 && ispunct (c))
771 || (strcmp (str, "graph") == 0 && isgraph (c))
772 || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
773 SET_LIST_BIT (c);
774 }
775 }
776 }
777 else if (translate)
778 SET_LIST_BIT (translate[c]);
779 else
780 SET_LIST_BIT (c);
781 }
782
783 /* Discard any character set/class bitmap bytes that are all
784 0 at the end of the map. Decrement the map-length byte too. */
785 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
786 b[-1]--;
787 b += b[-1];
788 break;
789
790 case '(':
791 if (! (obscure_syntax & RE_NO_BK_PARENS))
792 goto normal_char;
793 else
794 goto handle_open;
795
796 case ')':
797 if (! (obscure_syntax & RE_NO_BK_PARENS))
798 goto normal_char;
799 else
800 goto handle_close;
801
802 case '\n':
803 if (! (obscure_syntax & RE_NEWLINE_OR))
804 goto normal_char;
805 else
806 goto handle_bar;
807
808 case '|':
809 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
810 && (! laststart || p == pend))
811 goto invalid_pattern;
812 else if (! (obscure_syntax & RE_NO_BK_VBAR))
813 goto normal_char;
814 else
815 goto handle_bar;
816
817 case '{':
818 if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
819 && (obscure_syntax & RE_INTERVALS)))
820 goto normal_char;
821 else
822 goto handle_interval;
823
824 case '\\':
825 if (p == pend) goto invalid_pattern;
826 PATFETCH_RAW (c);
827 switch (c)
828 {
829 case '(':
830 if (obscure_syntax & RE_NO_BK_PARENS)
831 goto normal_backsl;
832 handle_open:
833 if (stackp == stacke) goto nesting_too_deep;
834
835 /* Laststart should point to the start_memory that we are about
836 to push (unless the pattern has RE_NREGS or more ('s). */
837 *stackp++ = b - bufp->buffer;
838 if (regnum < RE_NREGS)
839 {
840 BUFPUSH (start_memory);
841 BUFPUSH (regnum);
842 }
843 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
844 *stackp++ = regnum++;
845 *stackp++ = begalt - bufp->buffer;
846 fixup_jump = 0;
847 laststart = 0;
848 begalt = b;
849 break;
850
851 case ')':
852 if (obscure_syntax & RE_NO_BK_PARENS)
853 goto normal_backsl;
854 handle_close:
855 if (stackp == stackb) goto unmatched_close;
856 begalt = *--stackp + bufp->buffer;
857 if (fixup_jump)
858 store_jump (fixup_jump, jump, b);
859 if (stackp[-1] < RE_NREGS)
860 {
861 BUFPUSH (stop_memory);
862 BUFPUSH (stackp[-1]);
863 }
864 stackp -= 2;
865 fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
866 laststart = *--stackp + bufp->buffer;
867 break;
868
869 case '|':
870 if ((obscure_syntax & RE_LIMITED_OPS)
871 || (obscure_syntax & RE_NO_BK_VBAR))
872 goto normal_backsl;
873 handle_bar:
874 if (obscure_syntax & RE_LIMITED_OPS)
875 goto normal_char;
876 /* Insert before the previous alternative a jump which
877 jumps to this alternative if the former fails. */
878 GET_BUFFER_SPACE (6);
879 insert_jump (on_failure_jump, begalt, b + 6, b);
880 pending_exact = 0;
881 b += 3;
882 /* The alternative before the previous alternative has a
883 jump after it which gets executed if it gets matched.
884 Adjust that jump so it will jump to the previous
885 alternative's analogous jump (put in below, which in
886 turn will jump to the next (if any) alternative's such
887 jump, etc.). The last such jump jumps to the correct
888 final destination. */
889 if (fixup_jump)
890 store_jump (fixup_jump, jump, b);
891
892 /* Leave space for a jump after previous alternative---to be
893 filled in later. */
894 fixup_jump = b;
895 b += 3;
896
897 laststart = 0;
898 begalt = b;
899 break;
900
901 case '{':
902 if (! (obscure_syntax & RE_INTERVALS)
903 /* Let \{ be a literal. */
904 || ((obscure_syntax & RE_INTERVALS)
905 && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
906 /* If it's the string "\{". */
907 || (p - 2 == pattern && p == pend))
908 goto normal_backsl;
909 handle_interval:
910 beg_interval = p - 1; /* The {. */
911 /* If there is no previous pattern, this isn't an interval. */
912 if (!laststart)
913 {
914 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
915 goto invalid_pattern;
916 else
917 goto normal_backsl;
918 }
919 /* It also isn't an interval if not preceded by an re
920 matching a single character or subexpression, or if
921 the current type of intervals can't handle back
922 references and the previous thing is a back reference. */
923 if (! (*laststart == anychar
924 || *laststart == charset
925 || *laststart == charset_not
926 || *laststart == start_memory
927 || (*laststart == exactn && laststart[1] == 1)
928 || (! (obscure_syntax & RE_NO_BK_REFS)
929 && *laststart == duplicate)))
930 {
931 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
932 goto normal_char;
933
934 /* Posix extended syntax is handled in previous
935 statement; this is for Posix basic syntax. */
936 if (obscure_syntax & RE_INTERVALS)
937 goto invalid_pattern;
938
939 goto normal_backsl;
940 }
941 lower_bound = -1; /* So can see if are set. */
942 upper_bound = -1;
943 GET_UNSIGNED_NUMBER (lower_bound);
944 if (c == ',')
945 {
946 GET_UNSIGNED_NUMBER (upper_bound);
947 if (upper_bound < 0)
948 upper_bound = RE_DUP_MAX;
949 }
950 if (upper_bound < 0)
951 upper_bound = lower_bound;
952 if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
953 {
954 if (c != '\\')
955 goto invalid_pattern;
956 PATFETCH (c);
957 }
958 if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
959 || lower_bound > upper_bound
960 || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
961 && p != pend && *p == '{'))
962 {
963 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
964 goto unfetch_interval;
965 else
966 goto invalid_pattern;
967 }
968
969 /* If upper_bound is zero, don't want to succeed at all;
970 jump from laststart to b + 3, which will be the end of
971 the buffer after this jump is inserted. */
972
973 if (upper_bound == 0)
974 {
975 GET_BUFFER_SPACE (3);
976 insert_jump (jump, laststart, b + 3, b);
977 b += 3;
978 }
979
980 /* Otherwise, after lower_bound number of succeeds, jump
981 to after the jump_n which will be inserted at the end
982 of the buffer, and insert that jump_n. */
983 else
984 { /* Set to 5 if only one repetition is allowed and
985 hence no jump_n is inserted at the current end of
986 the buffer; then only space for the succeed_n is
987 needed. Otherwise, need space for both the
988 succeed_n and the jump_n. */
989
990 unsigned slots_needed = upper_bound == 1 ? 5 : 10;
991
992 GET_BUFFER_SPACE ((int) slots_needed);
993 /* Initialize the succeed_n to n, even though it will
994 be set by its attendant set_number_at, because
995 re_compile_fastmap will need to know it. Jump to
996 what the end of buffer will be after inserting
997 this succeed_n and possibly appending a jump_n. */
998 insert_jump_n (succeed_n, laststart, b + slots_needed,
999 b, lower_bound);
1000 b += 5; /* Just increment for the succeed_n here. */
1001
1002 /* More than one repetition is allowed, so put in at
1003 the end of the buffer a backward jump from b to the
1004 succeed_n we put in above. By the time we've gotten
1005 to this jump when matching, we'll have matched once
1006 already, so jump back only upper_bound - 1 times. */
1007
1008 if (upper_bound > 1)
1009 {
1010 store_jump_n (b, jump_n, laststart, upper_bound - 1);
1011 b += 5;
1012 /* When hit this when matching, reset the
1013 preceding jump_n's n to upper_bound - 1. */
1014 BUFPUSH (set_number_at);
1015 GET_BUFFER_SPACE (2);
1016 STORE_NUMBER_AND_INCR (b, -5);
1017 STORE_NUMBER_AND_INCR (b, upper_bound - 1);
1018 }
1019 /* When hit this when matching, set the succeed_n's n. */
1020 GET_BUFFER_SPACE (5);
1021 insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
1022 b += 5;
1023 }
1024 pending_exact = 0;
1025 beg_interval = 0;
1026 break;
1027
1028
1029 unfetch_interval:
1030 /* If an invalid interval, match the characters as literals. */
1031 if (beg_interval)
1032 p = beg_interval;
1033 else
1034 {
1035 fprintf (stderr,
1036 "regex: no interval beginning to which to backtrack.\n");
1037 exit (1);
1038 }
1039
1040 beg_interval = 0;
1041 PATFETCH (c); /* normal_char expects char in `c'. */
1042 goto normal_char;
1043 break;
1044
1045 #ifdef emacs
1046 case '=':
1047 BUFPUSH (at_dot);
1048 break;
1049
1050 case 's':
1051 laststart = b;
1052 BUFPUSH (syntaxspec);
1053 PATFETCH (c);
1054 BUFPUSH (syntax_spec_code[c]);
1055 break;
1056
1057 case 'S':
1058 laststart = b;
1059 BUFPUSH (notsyntaxspec);
1060 PATFETCH (c);
1061 BUFPUSH (syntax_spec_code[c]);
1062 break;
1063 #endif /* emacs */
1064
1065 case 'w':
1066 laststart = b;
1067 BUFPUSH (wordchar);
1068 break;
1069
1070 case 'W':
1071 laststart = b;
1072 BUFPUSH (notwordchar);
1073 break;
1074
1075 case '<':
1076 BUFPUSH (wordbeg);
1077 break;
1078
1079 case '>':
1080 BUFPUSH (wordend);
1081 break;
1082
1083 case 'b':
1084 BUFPUSH (wordbound);
1085 break;
1086
1087 case 'B':
1088 BUFPUSH (notwordbound);
1089 break;
1090
1091 case '`':
1092 BUFPUSH (begbuf);
1093 break;
1094
1095 case '\'':
1096 BUFPUSH (endbuf);
1097 break;
1098
1099 case '1':
1100 case '2':
1101 case '3':
1102 case '4':
1103 case '5':
1104 case '6':
1105 case '7':
1106 case '8':
1107 case '9':
1108 if (obscure_syntax & RE_NO_BK_REFS)
1109 goto normal_char;
1110 c1 = c - '0';
1111 if (c1 >= regnum)
1112 {
1113 if (obscure_syntax & RE_NO_EMPTY_BK_REF)
1114 goto invalid_pattern;
1115 else
1116 goto normal_char;
1117 }
1118 /* Can't back reference to a subexpression if inside of it. */
1119 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
1120 if (*stackt == c1)
1121 goto normal_char;
1122 laststart = b;
1123 BUFPUSH (duplicate);
1124 BUFPUSH (c1);
1125 break;
1126
1127 case '+':
1128 case '?':
1129 if (obscure_syntax & RE_BK_PLUS_QM)
1130 goto handle_plus;
1131 else
1132 goto normal_backsl;
1133 break;
1134
1135 default:
1136 normal_backsl:
1137 /* You might think it would be useful for \ to mean
1138 not to translate; but if we don't translate it
1139 it will never match anything. */
1140 if (translate) c = translate[c];
1141 goto normal_char;
1142 }
1143 break;
1144
1145 default:
1146 normal_char: /* Expects the character in `c'. */
1147 if (!pending_exact || pending_exact + *pending_exact + 1 != b
1148 || *pending_exact == 0177 || *p == '*' || *p == '^'
1149 || ((obscure_syntax & RE_BK_PLUS_QM)
1150 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
1151 : (*p == '+' || *p == '?'))
1152 || ((obscure_syntax & RE_INTERVALS)
1153 && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
1154 ? *p == '{'
1155 : (p[0] == '\\' && p[1] == '{'))))
1156 {
1157 laststart = b;
1158 BUFPUSH (exactn);
1159 pending_exact = b;
1160 BUFPUSH (0);
1161 }
1162 BUFPUSH (c);
1163 (*pending_exact)++;
1164 }
1165 }
1166
1167 if (fixup_jump)
1168 store_jump (fixup_jump, jump, b);
1169
1170 if (stackp != stackb) goto unmatched_open;
1171
1172 bufp->used = b - bufp->buffer;
1173 return 0;
1174
1175 invalid_pattern:
1176 return "Invalid regular expression";
1177
1178 unmatched_open:
1179 return "Unmatched \\(";
1180
1181 unmatched_close:
1182 return "Unmatched \\)";
1183
1184 end_of_pattern:
1185 return "Premature end of regular expression";
1186
1187 nesting_too_deep:
1188 return "Nesting too deep";
1189
1190 too_big:
1191 return "Regular expression too big";
1192
1193 memory_exhausted:
1194 return "Memory exhausted";
1195 }
1196
1197
1198 /* Store a jump of the form <OPCODE> <relative address>.
1199 Store in the location FROM a jump operation to jump to relative
1200 address FROM - TO. OPCODE is the opcode to store. */
1201
1202 static void
store_jump(char * from,char opcode,char * to)1203 store_jump (char *from, char opcode, char *to)
1204 {
1205 from[0] = opcode;
1206 STORE_NUMBER(from + 1, to - (from + 3));
1207 }
1208
1209
1210 /* Open up space before char FROM, and insert there a jump to TO.
1211 CURRENT_END gives the end of the storage not in use, so we know
1212 how much data to copy up. OP is the opcode of the jump to insert.
1213
1214 If you call this function, you must zero out pending_exact. */
1215
1216 static void
insert_jump(char op,char * from,char * to,char * current_end)1217 insert_jump (char op, char *from, char *to, char *current_end)
1218 {
1219 register char *pfrom = current_end; /* Copy from here... */
1220 register char *pto = current_end + 3; /* ...to here. */
1221
1222 while (pfrom != from)
1223 *--pto = *--pfrom;
1224 store_jump (from, op, to);
1225 }
1226
1227
1228 /* Store a jump of the form <opcode> <relative address> <n> .
1229
1230 Store in the location FROM a jump operation to jump to relative
1231 address FROM - TO. OPCODE is the opcode to store, N is a number the
1232 jump uses, say, to decide how many times to jump.
1233
1234 If you call this function, you must zero out pending_exact. */
1235
1236 static void
store_jump_n(char * from,char opcode,char * to,unsigned n)1237 store_jump_n (char *from, char opcode, char *to, unsigned n)
1238 {
1239 from[0] = opcode;
1240 STORE_NUMBER (from + 1, to - (from + 3));
1241 STORE_NUMBER (from + 3, n);
1242 }
1243
1244
1245 /* Similar to insert_jump, but handles a jump which needs an extra
1246 number to handle minimum and maximum cases. Open up space at
1247 location FROM, and insert there a jump to TO. CURRENT_END gives the
1248 end of the storage in use, so we know how much data to copy up. OP is
1249 the opcode of the jump to insert.
1250
1251 If you call this function, you must zero out pending_exact. */
1252
1253 static void
insert_jump_n(char op,char * from,char * to,char * current_end,unsigned n)1254 insert_jump_n (char op, char *from, char *to, char *current_end, unsigned n)
1255 {
1256 register char *pfrom = current_end; /* Copy from here... */
1257 register char *pto = current_end + 5; /* ...to here. */
1258
1259 while (pfrom != from)
1260 *--pto = *--pfrom;
1261 store_jump_n (from, op, to, n);
1262 }
1263
1264
1265 /* Open up space at location THERE, and insert operation OP followed by
1266 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
1267 we know how much data to copy up.
1268
1269 If you call this function, you must zero out pending_exact. */
1270
1271 static void
insert_op_2(char op,char * there,char * current_end,int num_1,int num_2)1272 insert_op_2 (char op, char *there, char *current_end, int num_1, int num_2)
1273 {
1274 register char *pfrom = current_end; /* Copy from here... */
1275 register char *pto = current_end + 5; /* ...to here. */
1276
1277 while (pfrom != there)
1278 *--pto = *--pfrom;
1279
1280 there[0] = op;
1281 STORE_NUMBER (there + 1, num_1);
1282 STORE_NUMBER (there + 3, num_2);
1283 }
1284
1285
1286
1287 /* Given a pattern, compute a fastmap from it. The fastmap records
1288 which of the (1 << BYTEWIDTH) possible characters can start a string
1289 that matches the pattern. This fastmap is used by re_search to skip
1290 quickly over totally implausible text.
1291
1292 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
1293 area as bufp->fastmap.
1294 The other components of bufp describe the pattern to be used. */
1295
1296 void
re_compile_fastmap(struct re_pattern_buffer * bufp)1297 re_compile_fastmap (struct re_pattern_buffer *bufp)
1298 {
1299 unsigned char *pattern = (unsigned char *) bufp->buffer;
1300 int size = bufp->used;
1301 register char *fastmap = bufp->fastmap;
1302 register unsigned char *p = pattern;
1303 register unsigned char *pend = pattern + size;
1304 register int j, k;
1305 unsigned char *translate = (unsigned char *) bufp->translate;
1306
1307 unsigned char *stackb[NFAILURES];
1308 unsigned char **stackp = stackb;
1309
1310 unsigned is_a_succeed_n;
1311
1312 memset (fastmap, 0, (1 << BYTEWIDTH));
1313 bufp->fastmap_accurate = 1;
1314 bufp->can_be_null = 0;
1315
1316 while (p)
1317 {
1318 is_a_succeed_n = 0;
1319 if (p == pend)
1320 {
1321 bufp->can_be_null = 1;
1322 break;
1323 }
1324 #ifdef SWITCH_ENUM_BUG
1325 switch ((int) ((enum regexpcode) *p++))
1326 #else
1327 switch ((enum regexpcode) *p++)
1328 #endif
1329 {
1330 case exactn:
1331 if (translate)
1332 fastmap[translate[p[1]]] = 1;
1333 else
1334 fastmap[p[1]] = 1;
1335 break;
1336
1337 case unused:
1338 case begline:
1339 #ifdef emacs
1340 case before_dot:
1341 case at_dot:
1342 case after_dot:
1343 #endif
1344 case begbuf:
1345 case endbuf:
1346 case wordbound:
1347 case notwordbound:
1348 case wordbeg:
1349 case wordend:
1350 continue;
1351
1352 case endline:
1353 if (translate)
1354 fastmap[translate['\n']] = 1;
1355 else
1356 fastmap['\n'] = 1;
1357
1358 if (bufp->can_be_null != 1)
1359 bufp->can_be_null = 2;
1360 break;
1361
1362 case jump_n:
1363 case finalize_jump:
1364 case maybe_finalize_jump:
1365 case jump:
1366 case dummy_failure_jump:
1367 EXTRACT_NUMBER_AND_INCR (j, p);
1368 p += j;
1369 if (j > 0)
1370 continue;
1371 /* Jump backward reached implies we just went through
1372 the body of a loop and matched nothing.
1373 Opcode jumped to should be an on_failure_jump.
1374 Just treat it like an ordinary jump.
1375 For a * loop, it has pushed its failure point already;
1376 If so, discard that as redundant. */
1377
1378 if ((enum regexpcode) *p != on_failure_jump
1379 && (enum regexpcode) *p != succeed_n)
1380 continue;
1381 p++;
1382 EXTRACT_NUMBER_AND_INCR (j, p);
1383 p += j;
1384 if (stackp != stackb && *stackp == p)
1385 stackp--;
1386 continue;
1387
1388 case on_failure_jump:
1389 handle_on_failure_jump:
1390 EXTRACT_NUMBER_AND_INCR (j, p);
1391 *++stackp = p + j;
1392 if (is_a_succeed_n)
1393 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
1394 continue;
1395
1396 case succeed_n:
1397 is_a_succeed_n = 1;
1398 /* Get to the number of times to succeed. */
1399 p += 2;
1400 /* Increment p past the n for when k != 0. */
1401 EXTRACT_NUMBER_AND_INCR (k, p);
1402 if (k == 0)
1403 {
1404 p -= 4;
1405 goto handle_on_failure_jump;
1406 }
1407 continue;
1408
1409 case set_number_at:
1410 p += 4;
1411 continue;
1412
1413 case start_memory:
1414 case stop_memory:
1415 p++;
1416 continue;
1417
1418 case duplicate:
1419 bufp->can_be_null = 1;
1420 fastmap['\n'] = 1;
1421 case anychar:
1422 for (j = 0; j < (1 << BYTEWIDTH); j++)
1423 if (j != '\n')
1424 fastmap[j] = 1;
1425 if (bufp->can_be_null)
1426 return;
1427 /* Don't return; check the alternative paths
1428 so we can set can_be_null if appropriate. */
1429 break;
1430
1431 case wordchar:
1432 for (j = 0; j < (1 << BYTEWIDTH); j++)
1433 if (SYNTAX (j) == Sword)
1434 fastmap[j] = 1;
1435 break;
1436
1437 case notwordchar:
1438 for (j = 0; j < (1 << BYTEWIDTH); j++)
1439 if (SYNTAX (j) != Sword)
1440 fastmap[j] = 1;
1441 break;
1442
1443 #ifdef emacs
1444 case syntaxspec:
1445 k = *p++;
1446 for (j = 0; j < (1 << BYTEWIDTH); j++)
1447 if (SYNTAX (j) == (enum syntaxcode) k)
1448 fastmap[j] = 1;
1449 break;
1450
1451 case notsyntaxspec:
1452 k = *p++;
1453 for (j = 0; j < (1 << BYTEWIDTH); j++)
1454 if (SYNTAX (j) != (enum syntaxcode) k)
1455 fastmap[j] = 1;
1456 break;
1457 #endif /* not emacs */
1458
1459 case charset:
1460 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1461 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
1462 {
1463 if (translate)
1464 fastmap[translate[j]] = 1;
1465 else
1466 fastmap[j] = 1;
1467 }
1468 break;
1469
1470 case charset_not:
1471 /* Chars beyond end of map must be allowed */
1472 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
1473 if (translate)
1474 fastmap[translate[j]] = 1;
1475 else
1476 fastmap[j] = 1;
1477
1478 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1479 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
1480 {
1481 if (translate)
1482 fastmap[translate[j]] = 1;
1483 else
1484 fastmap[j] = 1;
1485 }
1486 break;
1487 }
1488
1489 /* Get here means we have successfully found the possible starting
1490 characters of one path of the pattern. We need not follow this
1491 path any farther. Instead, look at the next alternative
1492 remembered in the stack. */
1493 if (stackp != stackb)
1494 p = *stackp--;
1495 else
1496 break;
1497 }
1498 }
1499
1500
1501
1502 /* Like re_search_2, below, but only one string is specified, and
1503 doesn't let you say where to stop matching. */
1504
1505 int
re_search(struct re_pattern_buffer * pbufp,char * string,int size,int startpos,int range,struct re_registers * regs)1506 re_search (struct re_pattern_buffer *pbufp,
1507 char *string,
1508 int size,
1509 int startpos,
1510 int range,
1511 struct re_registers *regs)
1512 {
1513 return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
1514 regs, size);
1515 }
1516
1517
1518 /* Using the compiled pattern in PBUFP->buffer, first tries to match the
1519 virtual concatenation of STRING1 and STRING2, starting first at index
1520 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
1521 places to try before giving up. If RANGE is negative, it searches
1522 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
1523 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
1524 In REGS, return the indices of the virtual concatenation of STRING1
1525 and STRING2 that matched the entire PBUFP->buffer and its contained
1526 subexpressions. Do not consider matching one past the index MSTOP in
1527 the virtual concatenation of STRING1 and STRING2.
1528
1529 The value returned is the position in the strings at which the match
1530 was found, or -1 if no match was found, or -2 if error (such as
1531 failure stack overflow). */
1532
1533 int
re_search_2(struct re_pattern_buffer * pbufp,char * string1,int size1,char * string2,int size2,int startpos,register int range,struct re_registers * regs,int mstop)1534 re_search_2 (struct re_pattern_buffer *pbufp,
1535 char *string1, int size1,
1536 char *string2, int size2,
1537 int startpos,
1538 register int range,
1539 struct re_registers *regs,
1540 int mstop)
1541 {
1542 register char *fastmap = pbufp->fastmap;
1543 register unsigned char *translate = (unsigned char *) pbufp->translate;
1544 int total_size = size1 + size2;
1545 int endpos = startpos + range;
1546 int val;
1547
1548 /* Check for out-of-range starting position. */
1549 if (startpos < 0 || startpos > total_size)
1550 return -1;
1551
1552 /* Fix up range if it would eventually take startpos outside of the
1553 virtual concatenation of string1 and string2. */
1554 if (endpos < -1)
1555 range = -1 - startpos;
1556 else if (endpos > total_size)
1557 range = total_size - startpos;
1558
1559 /* Update the fastmap now if not correct already. */
1560 if (fastmap && !pbufp->fastmap_accurate)
1561 re_compile_fastmap (pbufp);
1562
1563 /* If the search isn't to be a backwards one, don't waste time in a
1564 long search for a pattern that says it is anchored. */
1565 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
1566 && range > 0)
1567 {
1568 if (startpos > 0)
1569 return -1;
1570 else
1571 range = 1;
1572 }
1573
1574 while (1)
1575 {
1576 /* If a fastmap is supplied, skip quickly over characters that
1577 cannot possibly be the start of a match. Note, however, that
1578 if the pattern can possibly match the null string, we must
1579 test it at each starting point so that we take the first null
1580 string we get. */
1581
1582 if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
1583 {
1584 if (range > 0) /* Searching forwards. */
1585 {
1586 register int lim = 0;
1587 register unsigned char *p;
1588 int irange = range;
1589 if (startpos < size1 && startpos + range >= size1)
1590 lim = range - (size1 - startpos);
1591
1592 p = ((unsigned char *)
1593 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
1594
1595 while (range > lim && !fastmap[translate
1596 ? translate[*p++]
1597 : *p++])
1598 range--;
1599 startpos += irange - range;
1600 }
1601 else /* Searching backwards. */
1602 {
1603 register unsigned char c;
1604
1605 if (string1 == 0 || startpos >= size1)
1606 c = string2[startpos - size1];
1607 else
1608 c = string1[startpos];
1609
1610 c &= 0xff;
1611 if (translate ? !fastmap[translate[c]] : !fastmap[c])
1612 goto advance;
1613 }
1614 }
1615
1616 if (range >= 0 && startpos == total_size
1617 && fastmap && pbufp->can_be_null == 0)
1618 return -1;
1619
1620 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
1621 regs, mstop);
1622 if (val >= 0)
1623 return startpos;
1624 if (val == -2)
1625 return -2;
1626
1627 #ifdef C_ALLOCA
1628 alloca (0);
1629 #endif /* C_ALLOCA */
1630
1631 advance:
1632 if (!range)
1633 break;
1634 else if (range > 0)
1635 {
1636 range--;
1637 startpos++;
1638 }
1639 else
1640 {
1641 range++;
1642 startpos--;
1643 }
1644 }
1645 return -1;
1646 }
1647
1648
1649
1650 #ifndef emacs /* emacs never uses this. */
1651 int
re_match(struct re_pattern_buffer * pbufp,char * string,int size,int pos,struct re_registers * regs)1652 re_match (struct re_pattern_buffer *pbufp,
1653 char *string,
1654 int size,
1655 int pos,
1656 struct re_registers *regs)
1657 {
1658 return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
1659 }
1660 #endif /* not emacs */
1661
1662
1663 /* The following are used for re_match_2, defined below: */
1664
1665 /* Roughly the maximum number of failure points on the stack. Would be
1666 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
1667
1668 int re_max_failures = 2000;
1669
1670 /* Routine used by re_match_2. */
1671 static int bcmp_translate (char *, char *, int, unsigned char *);
1672
1673
1674 /* Structure and accessing macros used in re_match_2: */
1675
1676 struct register_info
1677 {
1678 unsigned is_active : 1;
1679 unsigned matched_something : 1;
1680 };
1681
1682 #define IS_ACTIVE(R) ((R).is_active)
1683 #define MATCHED_SOMETHING(R) ((R).matched_something)
1684
1685
1686 /* Macros used by re_match_2: */
1687
1688
1689 /* I.e., regstart, regend, and reg_info. */
1690
1691 #define NUM_REG_ITEMS 3
1692
1693 /* We push at most this many things on the stack whenever we
1694 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
1695 arguments to the PUSH_FAILURE_POINT macro. */
1696
1697 #define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
1698
1699
1700 /* We push this many things on the stack whenever we fail. */
1701
1702 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
1703
1704
1705 /* This pushes most of the information about the current state we will want
1706 if we ever fail back to it. */
1707
1708 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
1709 { \
1710 short last_used_reg, this_reg; \
1711 \
1712 /* Find out how many registers are active or have been matched. \
1713 (Aside from register zero, which is only set at the end.) */ \
1714 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
1715 if (regstart[last_used_reg] != (unsigned char *) -1) \
1716 break; \
1717 \
1718 if (stacke - stackp < NUM_FAILURE_ITEMS) \
1719 { \
1720 unsigned char **stackx; \
1721 int len = stacke - stackb; \
1722 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
1723 return -2; \
1724 \
1725 /* Roughly double the size of the stack. */ \
1726 stackx = (unsigned char **) alloca (2 * len \
1727 * sizeof (unsigned char *));\
1728 /* Only copy what is in use. */ \
1729 memcpy (stackx, stackb, len * sizeof (char *)); \
1730 stackp = stackx + (stackp - stackb); \
1731 stackb = stackx; \
1732 stacke = stackb + 2 * len; \
1733 } \
1734 \
1735 /* Now push the info for each of those registers. */ \
1736 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
1737 { \
1738 *stackp++ = regstart[this_reg]; \
1739 *stackp++ = regend[this_reg]; \
1740 *stackp++ = (unsigned char *) ®_info[this_reg]; \
1741 } \
1742 \
1743 /* Push how many registers we saved. */ \
1744 *stackp++ = (unsigned char *) last_used_reg; \
1745 \
1746 *stackp++ = pattern_place; \
1747 *stackp++ = string_place; \
1748 }
1749
1750
1751 /* This pops what PUSH_FAILURE_POINT pushes. */
1752
1753 #define POP_FAILURE_POINT() \
1754 { \
1755 int temp; \
1756 stackp -= 2; /* Remove failure points. */ \
1757 temp = (int) *--stackp; /* How many regs pushed. */ \
1758 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
1759 stackp -= temp; /* Remove the register info. */ \
1760 }
1761
1762
1763 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
1764
1765 /* Is true if there is a first string and if PTR is pointing anywhere
1766 inside it or just past the end. */
1767
1768 #define IS_IN_FIRST_STRING(ptr) \
1769 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
1770
1771 /* Call before fetching a character with *d. This switches over to
1772 string2 if necessary. */
1773
1774 #define PREFETCH \
1775 while (d == dend) \
1776 { \
1777 /* end of string2 => fail. */ \
1778 if (dend == end_match_2) \
1779 goto fail; \
1780 /* end of string1 => advance to string2. */ \
1781 d = string2; \
1782 dend = end_match_2; \
1783 }
1784
1785
1786 /* Call this when have matched something; it sets `matched' flags for the
1787 registers corresponding to the subexpressions of which we currently
1788 are inside. */
1789 #define SET_REGS_MATCHED \
1790 { unsigned this_reg; \
1791 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
1792 { \
1793 if (IS_ACTIVE(reg_info[this_reg])) \
1794 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
1795 else \
1796 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
1797 } \
1798 }
1799
1800 /* Test if at very beginning or at very end of the virtual concatenation
1801 of string1 and string2. If there is only one string, we've put it in
1802 string2. */
1803
1804 #define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
1805 #define AT_STRINGS_END (d == end2)
1806
1807 #define AT_WORD_BOUNDARY \
1808 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
1809
1810 /* We have two special cases to check for:
1811 1) if we're past the end of string1, we have to look at the first
1812 character in string2;
1813 2) if we're before the beginning of string2, we have to look at the
1814 last character in string1; we assume there is a string1, so use
1815 this in conjunction with AT_STRINGS_BEG. */
1816 #define IS_A_LETTER(d) \
1817 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
1818 == Sword)
1819
1820
1821 /* Match the pattern described by PBUFP against the virtual
1822 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
1823 respectively. Start the match at index POS in the virtual
1824 concatenation of STRING1 and STRING2. In REGS, return the indices of
1825 the virtual concatenation of STRING1 and STRING2 that matched the
1826 entire PBUFP->buffer and its contained subexpressions. Do not
1827 consider matching one past the index MSTOP in the virtual
1828 concatenation of STRING1 and STRING2.
1829
1830 If pbufp->fastmap is nonzero, then it had better be up to date.
1831
1832 The reason that the data to match are specified as two components
1833 which are to be regarded as concatenated is so this function can be
1834 used directly on the contents of an Emacs buffer.
1835
1836 -1 is returned if there is no match. -2 is returned if there is an
1837 error (such as match stack overflow). Otherwise the value is the
1838 length of the substring which was matched. */
1839
1840 int
re_match_2(struct re_pattern_buffer * pbufp,char * string1_arg,int size1,char * string2_arg,int size2,int pos,struct re_registers * regs,int mstop)1841 re_match_2 (struct re_pattern_buffer *pbufp,
1842 char *string1_arg, int size1,
1843 char *string2_arg, int size2,
1844 int pos,
1845 struct re_registers *regs,
1846 int mstop)
1847 {
1848 register unsigned char *p = (unsigned char *) pbufp->buffer;
1849
1850 /* Pointer to beyond end of buffer. */
1851 register unsigned char *pend = p + pbufp->used;
1852
1853 unsigned char *string1 = (unsigned char *) string1_arg;
1854 unsigned char *string2 = (unsigned char *) string2_arg;
1855 unsigned char *end1; /* Just past end of first string. */
1856 unsigned char *end2; /* Just past end of second string. */
1857
1858 /* Pointers into string1 and string2, just past the last characters in
1859 each to consider matching. */
1860 unsigned char *end_match_1, *end_match_2;
1861
1862 register unsigned char *d, *dend;
1863 register int mcnt; /* Multipurpose. */
1864 unsigned char *translate = (unsigned char *) pbufp->translate;
1865 unsigned is_a_jump_n = 0;
1866
1867 /* Failure point stack. Each place that can handle a failure further
1868 down the line pushes a failure point on this stack. It consists of
1869 restart, regend, and reg_info for all registers corresponding to the
1870 subexpressions we're currently inside, plus the number of such
1871 registers, and, finally, two char *'s. The first char * is where to
1872 resume scanning the pattern; the second one is where to resume
1873 scanning the strings. If the latter is zero, the failure point is a
1874 ``dummy''; if a failure happens and the failure point is a dummy, it
1875 gets discarded and the next next one is tried. */
1876
1877 unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1878 unsigned char **stackb = initial_stack;
1879 unsigned char **stackp = stackb;
1880 unsigned char **stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1881
1882
1883 /* Information on the contents of registers. These are pointers into
1884 the input strings; they record just what was matched (on this
1885 attempt) by a subexpression part of the pattern, that is, the
1886 regnum-th regstart pointer points to where in the pattern we began
1887 matching and the regnum-th regend points to right after where we
1888 stopped matching the regnum-th subexpression. (The zeroth register
1889 keeps track of what the whole pattern matches.) */
1890
1891 unsigned char *regstart[RE_NREGS];
1892 unsigned char *regend[RE_NREGS];
1893
1894 /* The is_active field of reg_info helps us keep track of which (possibly
1895 nested) subexpressions we are currently in. The matched_something
1896 field of reg_info[reg_num] helps us tell whether or not we have
1897 matched any of the pattern so far this time through the reg_num-th
1898 subexpression. These two fields get reset each time through any
1899 loop their register is in. */
1900
1901 struct register_info reg_info[RE_NREGS];
1902
1903
1904 /* The following record the register info as found in the above
1905 variables when we find a match better than any we've seen before.
1906 This happens as we backtrack through the failure points, which in
1907 turn happens only if we have not yet matched the entire string. */
1908
1909 unsigned best_regs_set = 0;
1910 unsigned char *best_regstart[RE_NREGS];
1911 unsigned char *best_regend[RE_NREGS];
1912
1913 /* Initialize subexpression text positions to -1 to mark ones that no
1914 \( or ( and \) or ) has been seen for. Also set all registers to
1915 inactive and mark them as not having matched anything or ever
1916 failed. */
1917 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1918 {
1919 regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
1920 IS_ACTIVE (reg_info[mcnt]) = 0;
1921 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
1922 }
1923
1924 if (regs)
1925 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1926 regs->start[mcnt] = regs->end[mcnt] = -1;
1927
1928 /* Set up pointers to ends of strings.
1929 Don't allow the second string to be empty unless both are empty. */
1930 if (size2 == 0)
1931 {
1932 string2 = string1;
1933 size2 = size1;
1934 string1 = 0;
1935 size1 = 0;
1936 }
1937 end1 = string1 + size1;
1938 end2 = string2 + size2;
1939
1940 /* Compute where to stop matching, within the two strings. */
1941 if (mstop <= size1)
1942 {
1943 end_match_1 = string1 + mstop;
1944 end_match_2 = string2;
1945 }
1946 else
1947 {
1948 end_match_1 = end1;
1949 end_match_2 = string2 + mstop - size1;
1950 }
1951
1952 /* `p' scans through the pattern as `d' scans through the data. `dend'
1953 is the end of the input string that `d' points within. `d' is
1954 advanced into the following input string whenever necessary, but
1955 this happens before fetching; therefore, at the beginning of the
1956 loop, `d' can be pointing at the end of a string, but it cannot
1957 equal string2. */
1958
1959 if (size1 != 0 && pos <= size1)
1960 d = string1 + pos, dend = end_match_1;
1961 else
1962 d = string2 + pos - size1, dend = end_match_2;
1963
1964
1965 /* This loops over pattern commands. It exits by returning from the
1966 function if match is complete, or it drops through if match fails
1967 at this starting point in the input data. */
1968
1969 while (1)
1970 {
1971 is_a_jump_n = 0;
1972 /* End of pattern means we might have succeeded. */
1973 if (p == pend)
1974 {
1975 /* If not end of string, try backtracking. Otherwise done. */
1976 if (d != end_match_2)
1977 {
1978 if (stackp != stackb)
1979 {
1980 /* More failure points to try. */
1981
1982 unsigned in_same_string =
1983 IS_IN_FIRST_STRING (best_regend[0])
1984 == MATCHING_IN_FIRST_STRING;
1985
1986 /* If exceeds best match so far, save it. */
1987 if (! best_regs_set
1988 || (in_same_string && d > best_regend[0])
1989 || (! in_same_string && ! MATCHING_IN_FIRST_STRING))
1990 {
1991 best_regs_set = 1;
1992 best_regend[0] = d; /* Never use regstart[0]. */
1993
1994 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1995 {
1996 best_regstart[mcnt] = regstart[mcnt];
1997 best_regend[mcnt] = regend[mcnt];
1998 }
1999 }
2000 goto fail;
2001 }
2002 /* If no failure points, don't restore garbage. */
2003 else if (best_regs_set)
2004 {
2005 restore_best_regs:
2006 /* Restore best match. */
2007 d = best_regend[0];
2008
2009 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
2010 {
2011 regstart[mcnt] = best_regstart[mcnt];
2012 regend[mcnt] = best_regend[mcnt];
2013 }
2014 }
2015 }
2016
2017 /* If caller wants register contents data back, convert it
2018 to indices. */
2019 if (regs)
2020 {
2021 regs->start[0] = pos;
2022 if (MATCHING_IN_FIRST_STRING)
2023 regs->end[0] = d - string1;
2024 else
2025 regs->end[0] = d - string2 + size1;
2026 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2027 {
2028 if (regend[mcnt] == (unsigned char *) -1)
2029 {
2030 regs->start[mcnt] = -1;
2031 regs->end[mcnt] = -1;
2032 continue;
2033 }
2034 if (IS_IN_FIRST_STRING (regstart[mcnt]))
2035 regs->start[mcnt] = regstart[mcnt] - string1;
2036 else
2037 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
2038
2039 if (IS_IN_FIRST_STRING (regend[mcnt]))
2040 regs->end[mcnt] = regend[mcnt] - string1;
2041 else
2042 regs->end[mcnt] = regend[mcnt] - string2 + size1;
2043 }
2044 }
2045 return d - pos - (MATCHING_IN_FIRST_STRING
2046 ? string1
2047 : string2 - size1);
2048 }
2049
2050 /* Otherwise match next pattern command. */
2051 #ifdef SWITCH_ENUM_BUG
2052 switch ((int) ((enum regexpcode) *p++))
2053 #else
2054 switch ((enum regexpcode) *p++)
2055 #endif
2056 {
2057
2058 /* \( [or `(', as appropriate] is represented by start_memory,
2059 \) by stop_memory. Both of those commands are followed by
2060 a register number in the next byte. The text matched
2061 within the \( and \) is recorded under that number. */
2062 case start_memory:
2063 regstart[*p] = d;
2064 IS_ACTIVE (reg_info[*p]) = 1;
2065 MATCHED_SOMETHING (reg_info[*p]) = 0;
2066 p++;
2067 break;
2068
2069 case stop_memory:
2070 regend[*p] = d;
2071 IS_ACTIVE (reg_info[*p]) = 0;
2072
2073 /* If just failed to match something this time around with a sub-
2074 expression that's in a loop, try to force exit from the loop. */
2075 if ((! MATCHED_SOMETHING (reg_info[*p])
2076 || (enum regexpcode) p[-3] == start_memory)
2077 && (p + 1) != pend)
2078 {
2079 register unsigned char *p2 = p + 1;
2080 mcnt = 0;
2081 switch (*p2++)
2082 {
2083 case jump_n:
2084 is_a_jump_n = 1;
2085 case finalize_jump:
2086 case maybe_finalize_jump:
2087 case jump:
2088 case dummy_failure_jump:
2089 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2090 if (is_a_jump_n)
2091 p2 += 2;
2092 break;
2093 }
2094 p2 += mcnt;
2095
2096 /* If the next operation is a jump backwards in the pattern
2097 to an on_failure_jump, exit from the loop by forcing a
2098 failure after pushing on the stack the on_failure_jump's
2099 jump in the pattern, and d. */
2100 if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
2101 {
2102 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2103 PUSH_FAILURE_POINT (p2 + mcnt, d);
2104 goto fail;
2105 }
2106 }
2107 p++;
2108 break;
2109
2110 /* \<digit> has been turned into a `duplicate' command which is
2111 followed by the numeric value of <digit> as the register number. */
2112 case duplicate:
2113 {
2114 int regno = *p++; /* Get which register to match against */
2115 register unsigned char *d2, *dend2;
2116
2117 /* Where in input to try to start matching. */
2118 d2 = regstart[regno];
2119
2120 /* Where to stop matching; if both the place to start and
2121 the place to stop matching are in the same string, then
2122 set to the place to stop, otherwise, for now have to use
2123 the end of the first string. */
2124
2125 dend2 = ((IS_IN_FIRST_STRING (regstart[regno])
2126 == IS_IN_FIRST_STRING (regend[regno]))
2127 ? regend[regno] : end_match_1);
2128 while (1)
2129 {
2130 /* If necessary, advance to next segment in register
2131 contents. */
2132 while (d2 == dend2)
2133 {
2134 if (dend2 == end_match_2) break;
2135 if (dend2 == regend[regno]) break;
2136 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
2137 }
2138 /* At end of register contents => success */
2139 if (d2 == dend2) break;
2140
2141 /* If necessary, advance to next segment in data. */
2142 PREFETCH;
2143
2144 /* How many characters left in this segment to match. */
2145 mcnt = dend - d;
2146
2147 /* Want how many consecutive characters we can match in
2148 one shot, so, if necessary, adjust the count. */
2149 if (mcnt > dend2 - d2)
2150 mcnt = dend2 - d2;
2151
2152 /* Compare that many; failure if mismatch, else move
2153 past them. */
2154 if (translate
2155 ? bcmp_translate ((char*)d, (char*)d2, mcnt, translate)
2156 : memcmp (d, d2, mcnt))
2157 goto fail;
2158 d += mcnt, d2 += mcnt;
2159 }
2160 }
2161 break;
2162
2163 case anychar:
2164 PREFETCH; /* Fetch a data character. */
2165 /* Match anything but a newline, maybe even a null. */
2166 if ((translate ? translate[*d] : *d) == '\n'
2167 || ((obscure_syntax & RE_DOT_NOT_NULL)
2168 && (translate ? translate[*d] : *d) == '\000'))
2169 goto fail;
2170 SET_REGS_MATCHED;
2171 d++;
2172 break;
2173
2174 case charset:
2175 case charset_not:
2176 {
2177 int not = 0; /* Nonzero for charset_not. */
2178 register int c;
2179 if (*(p - 1) == (unsigned char) charset_not)
2180 not = 1;
2181
2182 PREFETCH; /* Fetch a data character. */
2183
2184 if (translate)
2185 c = translate[*d];
2186 else
2187 c = *d;
2188
2189 if (c < *p * BYTEWIDTH
2190 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2191 not = !not;
2192
2193 p += 1 + *p;
2194
2195 if (!not) goto fail;
2196 SET_REGS_MATCHED;
2197 d++;
2198 break;
2199 }
2200
2201 case begline:
2202 if ((size1 != 0 && d == string1)
2203 || (size1 == 0 && size2 != 0 && d == string2)
2204 || (d && d[-1] == '\n')
2205 || (size1 == 0 && size2 == 0))
2206 break;
2207 else
2208 goto fail;
2209
2210 case endline:
2211 if (d == end2
2212 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
2213 break;
2214 goto fail;
2215
2216 /* `or' constructs are handled by starting each alternative with
2217 an on_failure_jump that points to the start of the next
2218 alternative. Each alternative except the last ends with a
2219 jump to the joining point. (Actually, each jump except for
2220 the last one really jumps to the following jump, because
2221 tensioning the jumps is a hassle.) */
2222
2223 /* The start of a stupid repeat has an on_failure_jump that points
2224 past the end of the repeat text. This makes a failure point so
2225 that on failure to match a repetition, matching restarts past
2226 as many repetitions have been found with no way to fail and
2227 look for another one. */
2228
2229 /* A smart repeat is similar but loops back to the on_failure_jump
2230 so that each repetition makes another failure point. */
2231
2232 case on_failure_jump:
2233 on_failure:
2234 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2235 PUSH_FAILURE_POINT (p + mcnt, d);
2236 break;
2237
2238 /* The end of a smart repeat has a maybe_finalize_jump back.
2239 Change it either to a finalize_jump or an ordinary jump. */
2240 case maybe_finalize_jump:
2241 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2242 {
2243 register unsigned char *p2 = p;
2244 /* Compare what follows with the beginning of the repeat.
2245 If we can establish that there is nothing that they would
2246 both match, we can change to finalize_jump. */
2247 while (p2 + 1 != pend
2248 && (*p2 == (unsigned char) stop_memory
2249 || *p2 == (unsigned char) start_memory))
2250 p2 += 2; /* Skip over reg number. */
2251 if (p2 == pend)
2252 p[-3] = (unsigned char) finalize_jump;
2253 else if (*p2 == (unsigned char) exactn
2254 || *p2 == (unsigned char) endline)
2255 {
2256 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
2257 register unsigned char *p1 = p + mcnt;
2258 /* p1[0] ... p1[2] are an on_failure_jump.
2259 Examine what follows that. */
2260 if (p1[3] == (unsigned char) exactn && p1[5] != c)
2261 p[-3] = (unsigned char) finalize_jump;
2262 else if (p1[3] == (unsigned char) charset
2263 || p1[3] == (unsigned char) charset_not)
2264 {
2265 int not = p1[3] == (unsigned char) charset_not;
2266 if (c < p1[4] * BYTEWIDTH
2267 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2268 not = !not;
2269 /* `not' is 1 if c would match. */
2270 /* That means it is not safe to finalize. */
2271 if (!not)
2272 p[-3] = (unsigned char) finalize_jump;
2273 }
2274 }
2275 }
2276 p -= 2; /* Point at relative address again. */
2277 if (p[-1] != (unsigned char) finalize_jump)
2278 {
2279 p[-1] = (unsigned char) jump;
2280 goto nofinalize;
2281 }
2282 /* Note fall through. */
2283
2284 /* The end of a stupid repeat has a finalize_jump back to the
2285 start, where another failure point will be made which will
2286 point to after all the repetitions found so far. */
2287
2288 /* Take off failure points put on by matching on_failure_jump
2289 because didn't fail. Also remove the register information
2290 put on by the on_failure_jump. */
2291 case finalize_jump:
2292 POP_FAILURE_POINT ();
2293 /* Note fall through. */
2294
2295 /* Jump without taking off any failure points. */
2296 case jump:
2297 nofinalize:
2298 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2299 p += mcnt;
2300 break;
2301
2302 case dummy_failure_jump:
2303 /* Normally, the on_failure_jump pushes a failure point, which
2304 then gets popped at finalize_jump. We will end up at
2305 finalize_jump, also, and with a pattern of, say, `a+', we
2306 are skipping over the on_failure_jump, so we have to push
2307 something meaningless for finalize_jump to pop. */
2308 PUSH_FAILURE_POINT (0, 0);
2309 goto nofinalize;
2310
2311
2312 /* Have to succeed matching what follows at least n times. Then
2313 just handle like an on_failure_jump. */
2314 case succeed_n:
2315 EXTRACT_NUMBER (mcnt, p + 2);
2316 /* Originally, this is how many times we HAVE to succeed. */
2317 if (mcnt)
2318 {
2319 mcnt--;
2320 p += 2;
2321 STORE_NUMBER_AND_INCR (p, mcnt);
2322 }
2323 else if (mcnt == 0)
2324 {
2325 p[2] = unused;
2326 p[3] = unused;
2327 goto on_failure;
2328 }
2329 else
2330 {
2331 fprintf (stderr, "regex: the succeed_n's n is not set.\n");
2332 exit (1);
2333 }
2334 break;
2335
2336 case jump_n:
2337 EXTRACT_NUMBER (mcnt, p + 2);
2338 /* Originally, this is how many times we CAN jump. */
2339 if (mcnt)
2340 {
2341 mcnt--;
2342 STORE_NUMBER(p + 2, mcnt);
2343 goto nofinalize; /* Do the jump without taking off
2344 any failure points. */
2345 }
2346 /* If don't have to jump any more, skip over the rest of command. */
2347 else
2348 p += 4;
2349 break;
2350
2351 case set_number_at:
2352 {
2353 register unsigned char *p1;
2354
2355 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2356 p1 = p + mcnt;
2357 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2358 STORE_NUMBER (p1, mcnt);
2359 break;
2360 }
2361
2362 /* Ignore these. Used to ignore the n of succeed_n's which
2363 currently have n == 0. */
2364 case unused:
2365 break;
2366
2367 case wordbound:
2368 if (AT_WORD_BOUNDARY)
2369 break;
2370 goto fail;
2371
2372 case notwordbound:
2373 if (AT_WORD_BOUNDARY)
2374 goto fail;
2375 break;
2376
2377 case wordbeg:
2378 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2379 if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1)))
2380 break;
2381 goto fail;
2382
2383 case wordend:
2384 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2385 if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
2386 && (!IS_A_LETTER (d) || AT_STRINGS_END))
2387 break;
2388 goto fail;
2389
2390 #ifdef emacs
2391 case before_dot:
2392 if (PTR_CHAR_POS (d) >= point)
2393 goto fail;
2394 break;
2395
2396 case at_dot:
2397 if (PTR_CHAR_POS (d) != point)
2398 goto fail;
2399 break;
2400
2401 case after_dot:
2402 if (PTR_CHAR_POS (d) <= point)
2403 goto fail;
2404 break;
2405
2406 case wordchar:
2407 mcnt = (int) Sword;
2408 goto matchsyntax;
2409
2410 case syntaxspec:
2411 mcnt = *p++;
2412 matchsyntax:
2413 PREFETCH;
2414 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
2415 SET_REGS_MATCHED;
2416 break;
2417
2418 case notwordchar:
2419 mcnt = (int) Sword;
2420 goto matchnotsyntax;
2421
2422 case notsyntaxspec:
2423 mcnt = *p++;
2424 matchnotsyntax:
2425 PREFETCH;
2426 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
2427 SET_REGS_MATCHED;
2428 break;
2429
2430 #else /* not emacs */
2431
2432 case wordchar:
2433 PREFETCH;
2434 if (!IS_A_LETTER (d))
2435 goto fail;
2436 SET_REGS_MATCHED;
2437 break;
2438
2439 case notwordchar:
2440 PREFETCH;
2441 if (IS_A_LETTER (d))
2442 goto fail;
2443 SET_REGS_MATCHED;
2444 break;
2445
2446 #endif /* not emacs */
2447
2448 case begbuf:
2449 if (AT_STRINGS_BEG)
2450 break;
2451 goto fail;
2452
2453 case endbuf:
2454 if (AT_STRINGS_END)
2455 break;
2456 goto fail;
2457
2458 case exactn:
2459 /* Match the next few pattern characters exactly.
2460 mcnt is how many characters to match. */
2461 mcnt = *p++;
2462 /* This is written out as an if-else so we don't waste time
2463 testing `translate' inside the loop. */
2464 if (translate)
2465 {
2466 do
2467 {
2468 PREFETCH;
2469 if (translate[*d++] != *p++) goto fail;
2470 }
2471 while (--mcnt);
2472 }
2473 else
2474 {
2475 do
2476 {
2477 PREFETCH;
2478 if (*d++ != *p++) goto fail;
2479 }
2480 while (--mcnt);
2481 }
2482 SET_REGS_MATCHED;
2483 break;
2484 }
2485 continue; /* Successfully executed one pattern command; keep going. */
2486
2487 /* Jump here if any matching operation fails. */
2488 fail:
2489 if (stackp != stackb)
2490 /* A restart point is known. Restart there and pop it. */
2491 {
2492 short last_used_reg, this_reg;
2493
2494 /* If this failure point is from a dummy_failure_point, just
2495 skip it. */
2496 if (!stackp[-2])
2497 {
2498 POP_FAILURE_POINT ();
2499 goto fail;
2500 }
2501
2502 d = *--stackp;
2503 p = *--stackp;
2504 if (d >= string1 && d <= end1)
2505 dend = end_match_1;
2506 /* Restore register info. */
2507 last_used_reg = (short) *--stackp;
2508
2509 /* Make the ones that weren't saved -1 or 0 again. */
2510 for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
2511 {
2512 regend[this_reg] = (unsigned char *) -1;
2513 regstart[this_reg] = (unsigned char *) -1;
2514 IS_ACTIVE (reg_info[this_reg]) = 0;
2515 MATCHED_SOMETHING (reg_info[this_reg]) = 0;
2516 }
2517
2518 /* And restore the rest from the stack. */
2519 for ( ; this_reg > 0; this_reg--)
2520 {
2521 reg_info[this_reg] = *(struct register_info *) *--stackp;
2522 regend[this_reg] = *--stackp;
2523 regstart[this_reg] = *--stackp;
2524 }
2525 }
2526 else
2527 break; /* Matching at this starting point really fails. */
2528 }
2529
2530 if (best_regs_set)
2531 goto restore_best_regs;
2532 return -1; /* Failure to match. */
2533 }
2534
2535
2536 static int
bcmp_translate(char * s1,char * s2,int len,unsigned char * translate)2537 bcmp_translate (char *s1, char *s2, int len, unsigned char *translate)
2538 {
2539 register unsigned char *p1 = (unsigned char*)s1;
2540 register unsigned char *p2 = (unsigned char*)s2;
2541 while (len)
2542 {
2543 if (translate [*p1++] != translate [*p2++]) return 1;
2544 len--;
2545 }
2546 return 0;
2547 }
2548
2549
2550
2551 /* Entry points compatible with 4.2 BSD regex library. */
2552
2553 #ifndef emacs
2554
2555 static struct re_pattern_buffer re_comp_buf;
2556
2557 char *
re_comp(const char * s)2558 re_comp (const char *s)
2559 {
2560 if (!s)
2561 {
2562 if (!re_comp_buf.buffer)
2563 return "No previous regular expression";
2564 return 0;
2565 }
2566
2567 if (!re_comp_buf.buffer)
2568 {
2569 if (!(re_comp_buf.buffer = (char *) malloc (200)))
2570 return "Memory exhausted";
2571 re_comp_buf.allocated = 200;
2572 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
2573 return "Memory exhausted";
2574 }
2575 return re_compile_pattern (s, strlen (s), &re_comp_buf);
2576 }
2577
2578 int
re_exec(const char * s)2579 re_exec (const char *s)
2580 {
2581 int len = strlen (s);
2582 return 0 <= re_search (&re_comp_buf, s, len, 0, len,
2583 (struct re_registers *) 0);
2584 }
2585 #endif /* not emacs */
2586
2587
2588
2589 #ifdef test
2590
2591 #include <stdio.h>
2592
2593 /* Indexed by a character, gives the upper case equivalent of the
2594 character. */
2595
2596 char upcase[0400] =
2597 { 000, 001, 002, 003, 004, 005, 006, 007,
2598 010, 011, 012, 013, 014, 015, 016, 017,
2599 020, 021, 022, 023, 024, 025, 026, 027,
2600 030, 031, 032, 033, 034, 035, 036, 037,
2601 040, 041, 042, 043, 044, 045, 046, 047,
2602 050, 051, 052, 053, 054, 055, 056, 057,
2603 060, 061, 062, 063, 064, 065, 066, 067,
2604 070, 071, 072, 073, 074, 075, 076, 077,
2605 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2606 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2607 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2608 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
2609 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2610 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2611 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2612 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
2613 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
2614 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
2615 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
2616 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
2617 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
2618 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
2619 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
2620 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
2621 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
2622 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
2623 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
2624 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
2625 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
2626 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
2627 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
2628 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
2629 };
2630
2631 #ifdef canned
2632
2633 #include "tests.h"
2634
2635 typedef enum { extended_test, basic_test } test_type;
2636
2637 /* Use this to run the tests we've thought of. */
2638
2639 void
main()2640 main ()
2641 {
2642 test_type t = extended_test;
2643
2644 if (t == basic_test)
2645 {
2646 printf ("Running basic tests:\n\n");
2647 test_posix_basic ();
2648 }
2649 else if (t == extended_test)
2650 {
2651 printf ("Running extended tests:\n\n");
2652 test_posix_extended ();
2653 }
2654 }
2655
2656 #else /* not canned */
2657
2658 /* Use this to run interactive tests. */
2659
2660 void
main(int argc,char ** argv)2661 main (int argc, char **argv)
2662 {
2663 char pat[80];
2664 struct re_pattern_buffer buf;
2665 int i;
2666 char c;
2667 char fastmap[(1 << BYTEWIDTH)];
2668
2669 /* Allow a command argument to specify the style of syntax. */
2670 if (argc > 1)
2671 obscure_syntax = atoi (argv[1]);
2672
2673 buf.allocated = 40;
2674 buf.buffer = (char *) malloc (buf.allocated);
2675 buf.fastmap = fastmap;
2676 buf.translate = upcase;
2677
2678 while (1)
2679 {
2680 gets (pat);
2681
2682 if (*pat)
2683 {
2684 re_compile_pattern (pat, strlen(pat), &buf);
2685
2686 for (i = 0; i < buf.used; i++)
2687 printchar (buf.buffer[i]);
2688
2689 putchar ('\n');
2690
2691 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
2692
2693 re_compile_fastmap (&buf);
2694 printf ("Allowed by fastmap: ");
2695 for (i = 0; i < (1 << BYTEWIDTH); i++)
2696 if (fastmap[i]) printchar (i);
2697 putchar ('\n');
2698 }
2699
2700 gets (pat); /* Now read the string to match against */
2701
2702 i = re_match (&buf, pat, strlen (pat), 0, 0);
2703 printf ("Match value %d.\n", i);
2704 }
2705 }
2706
2707 #endif
2708
2709
2710 #ifdef NOTDEF
2711 void
print_buf(struct re_pattern_buffer * bufpbufp)2712 print_buf (struct re_pattern_buffer *bufpbufp)
2713 {
2714 int i;
2715
2716 printf ("buf is :\n----------------\n");
2717 for (i = 0; i < bufp->used; i++)
2718 printchar (bufp->buffer[i]);
2719
2720 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
2721
2722 printf ("Allowed by fastmap: ");
2723 for (i = 0; i < (1 << BYTEWIDTH); i++)
2724 if (bufp->fastmap[i])
2725 printchar (i);
2726 printf ("\nAllowed by translate: ");
2727 if (bufp->translate)
2728 for (i = 0; i < (1 << BYTEWIDTH); i++)
2729 if (bufp->translate[i])
2730 printchar (i);
2731 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
2732 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
2733 }
2734 #endif /* NOTDEF */
2735
2736 void
printchar(char c)2737 printchar (char c)
2738 {
2739 if (c < 040 || c >= 0177)
2740 {
2741 putchar ('\\');
2742 putchar (((c >> 6) & 3) + '0');
2743 putchar (((c >> 3) & 7) + '0');
2744 putchar ((c & 7) + '0');
2745 }
2746 else
2747 putchar (c);
2748 }
2749
2750 void
error(char * string)2751 error (char *string)
2752 {
2753 puts (string);
2754 exit (1);
2755 }
2756 #endif /* test */
2757