1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
26
27 #define _GNU_SOURCE
28
29 /* We need this for `regex.h', and perhaps for the Emacs include files. */
30 #include <sys/types.h>
31 #include <stdint.h>
32
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36
37 /* The `emacs' switch turns on certain matching commands
38 that make sense only in Emacs. */
39 #ifdef emacs
40
41 #include "lisp.h"
42 #include "buffer.h"
43 #include "syntax.h"
44
45 /* Emacs uses `NULL' as a predicate. */
46 #undef NULL
47
48 #else /* not emacs */
49
50 /* We used to test for `BSTRING' here, but only GCC and Emacs define
51 `BSTRING', as far as I know, and neither of them use this code. */
52 #if HAVE_STRING_H || STDC_HEADERS
53 #include <string.h>
54 #ifdef HAVE_STRINGS_H
55 #include <strings.h>
56 #endif
57 #ifndef bcmp
58 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
59 #endif
60 #ifndef bcopy
61 #define bcopy(s, d, n) memcpy ((d), (s), (n))
62 #endif
63 #ifndef bzero
64 #define bzero(s, n) memset ((s), 0, (n))
65 #endif
66 #else
67 #include <strings.h>
68 #endif
69
70 #ifdef STDC_HEADERS
71 #include <stdlib.h>
72
73 #if defined(_WIN32) && defined(HAVE_MALLOC_H)
74 #include <malloc.h>
75 #endif
76
77 #else
78 char *malloc ();
79 char *realloc ();
80 #endif
81
82
83 /* Define the syntax stuff for \<, \>, etc. */
84
85 /* This must be nonzero for the wordchar and notwordchar pattern
86 commands in re_match_2. */
87 #ifndef Sword
88 #define Sword 1
89 #endif
90
91 #ifdef SYNTAX_TABLE
92
93 extern char *re_syntax_table;
94
95 #else /* not SYNTAX_TABLE */
96
97 /* How many characters in the character set. */
98 #define CHAR_SET_SIZE 256
99
100 static char re_syntax_table[CHAR_SET_SIZE];
101
102 static void
init_syntax_once(void)103 init_syntax_once (void)
104 {
105 register int c;
106 static int done = 0;
107
108 if (done)
109 return;
110
111 bzero (re_syntax_table, sizeof re_syntax_table);
112
113 for (c = 'a'; c <= 'z'; c++)
114 re_syntax_table[c] = Sword;
115
116 for (c = 'A'; c <= 'Z'; c++)
117 re_syntax_table[c] = Sword;
118
119 for (c = '0'; c <= '9'; c++)
120 re_syntax_table[c] = Sword;
121
122 re_syntax_table['_'] = Sword;
123
124 done = 1;
125 }
126
127 #endif /* not SYNTAX_TABLE */
128
129 #define SYNTAX(c) re_syntax_table[c]
130
131 #endif /* not emacs */
132
133 /* Get the interface, including the syntax bits. */
134 #include "regex.h"
135
136 /* isalpha etc. are used for the character classes. */
137 #include <ctype.h>
138
139 #ifndef isascii
140 #define isascii(c) 1
141 #endif
142
143 #ifdef isblank
144 #define ISBLANK(c) (isascii (c) && isblank (c))
145 #else
146 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
147 #endif
148 #ifdef isgraph
149 #define ISGRAPH(c) (isascii (c) && isgraph (c))
150 #else
151 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
152 #endif
153
154 #define ISPRINT(c) (isascii (c) && isprint (c))
155 #define ISDIGIT(c) (isascii (c) && isdigit (c))
156 #define ISALNUM(c) (isascii (c) && isalnum (c))
157 #define ISALPHA(c) (isascii (c) && isalpha (c))
158 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
159 #define ISLOWER(c) (isascii (c) && islower (c))
160 #define ISPUNCT(c) (isascii (c) && ispunct (c))
161 #define ISSPACE(c) (isascii (c) && isspace (c))
162 #define ISUPPER(c) (isascii (c) && isupper (c))
163 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
164
165 #ifndef NULL
166 #define NULL 0
167 #endif
168
169 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
170 since ours (we hope) works properly with all combinations of
171 machines, compilers, `char' and `unsigned char' argument types.
172 (Per Bothner suggested the basic approach.) */
173 #undef SIGN_EXTEND_CHAR
174 #if __STDC__
175 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
176 #else /* not __STDC__ */
177 /* As in Harbison and Steele. */
178 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
179 #endif
180
181 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
182 use `alloca' instead of `malloc'. This is because using malloc in
183 re_search* or re_match* could cause memory leaks when C-g is used in
184 Emacs; also, malloc is slower and causes storage fragmentation. On
185 the other hand, malloc is more portable, and easier to debug.
186
187 Because we sometimes use alloca, some routines have to be macros,
188 not functions -- `alloca'-allocated space disappears at the end of the
189 function it is called in. */
190
191 #ifdef REGEX_MALLOC
192
193 #define REGEX_ALLOCATE malloc
194 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
195
196 #else /* not REGEX_MALLOC */
197
198 /* Emacs already defines alloca, sometimes. */
199 #ifndef alloca
200
201 /* Make alloca work the best possible way. */
202 #ifdef __GNUC__
203 #define alloca __builtin_alloca
204 #else /* not __GNUC__ */
205 #if HAVE_ALLOCA_H
206 #include <alloca.h>
207 #else /* not __GNUC__ or HAVE_ALLOCA_H */
208 #ifndef _AIX /* Already did AIX, up at the top. */
209 #ifndef _WIN32
210 char *alloca ();
211 #endif
212 #endif /* not _AIX */
213 #endif /* not HAVE_ALLOCA_H */
214 #endif /* not __GNUC__ */
215
216 #endif /* not alloca */
217
218 #define REGEX_ALLOCATE alloca
219
220 /* Assumes a `char *destination' variable. */
221 #define REGEX_REALLOCATE(source, osize, nsize) \
222 (destination = (char *) alloca (nsize), \
223 bcopy (source, destination, osize), \
224 destination)
225
226 #endif /* not REGEX_MALLOC */
227
228
229 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
230 `string1' or just past its end. This works if PTR is NULL, which is
231 a good thing. */
232 #define FIRST_STRING_P(ptr) \
233 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
234
235 /* (Re)Allocate N items of type T using malloc, or fail. */
236 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
237 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
238 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
239
240 #define BYTEWIDTH 8 /* In bits. */
241
242 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
243
244 #define MAX(a, b) ((a) > (b) ? (a) : (b))
245 #define MIN(a, b) ((a) < (b) ? (a) : (b))
246
247 typedef char boolean;
248 #define false 0
249 #define true 1
250
251 /* These are the command codes that appear in compiled regular
252 expressions. Some opcodes are followed by argument bytes. A
253 command code can specify any interpretation whatsoever for its
254 arguments. Zero bytes may appear in the compiled regular expression.
255
256 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
257 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
258 `exactn' we use here must also be 1. */
259
260 typedef enum
261 {
262 no_op = 0,
263
264 /* Followed by one byte giving n, then by n literal bytes. */
265 exactn = 1,
266
267 /* Matches any (more or less) character. */
268 anychar,
269
270 /* Matches any one char belonging to specified set. First
271 following byte is number of bitmap bytes. Then come bytes
272 for a bitmap saying which chars are in. Bits in each byte
273 are ordered low-bit-first. A character is in the set if its
274 bit is 1. A character too large to have a bit in the map is
275 automatically not in the set. */
276 charset,
277
278 /* Same parameters as charset, but match any character that is
279 not one of those specified. */
280 charset_not,
281
282 /* Start remembering the text that is matched, for storing in a
283 register. Followed by one byte with the register number, in
284 the range 0 to one less than the pattern buffer's re_nsub
285 field. Then followed by one byte with the number of groups
286 inner to this one. (This last has to be part of the
287 start_memory only because we need it in the on_failure_jump
288 of re_match_2.) */
289 start_memory,
290
291 /* Stop remembering the text that is matched and store it in a
292 memory register. Followed by one byte with the register
293 number, in the range 0 to one less than `re_nsub' in the
294 pattern buffer, and one byte with the number of inner groups,
295 just like `start_memory'. (We need the number of inner
296 groups here because we don't have any easy way of finding the
297 corresponding start_memory when we're at a stop_memory.) */
298 stop_memory,
299
300 /* Match a duplicate of something remembered. Followed by one
301 byte containing the register number. */
302 duplicate,
303
304 /* Fail unless at beginning of line. */
305 begline,
306
307 /* Fail unless at end of line. */
308 endline,
309
310 /* Succeeds if at beginning of buffer (if emacs) or at beginning
311 of string to be matched (if not). */
312 begbuf,
313
314 /* Analogously, for end of buffer/string. */
315 endbuf,
316
317 /* Followed by two byte relative address to which to jump. */
318 jump,
319
320 /* Same as jump, but marks the end of an alternative. */
321 jump_past_alt,
322
323 /* Followed by two-byte relative address of place to resume at
324 in case of failure. */
325 on_failure_jump,
326
327 /* Like on_failure_jump, but pushes a placeholder instead of the
328 current string position when executed. */
329 on_failure_keep_string_jump,
330
331 /* Throw away latest failure point and then jump to following
332 two-byte relative address. */
333 pop_failure_jump,
334
335 /* Change to pop_failure_jump if know won't have to backtrack to
336 match; otherwise change to jump. This is used to jump
337 back to the beginning of a repeat. If what follows this jump
338 clearly won't match what the repeat does, such that we can be
339 sure that there is no use backtracking out of repetitions
340 already matched, then we change it to a pop_failure_jump.
341 Followed by two-byte address. */
342 maybe_pop_jump,
343
344 /* Jump to following two-byte address, and push a dummy failure
345 point. This failure point will be thrown away if an attempt
346 is made to use it for a failure. A `+' construct makes this
347 before the first repeat. Also used as an intermediary kind
348 of jump when compiling an alternative. */
349 dummy_failure_jump,
350
351 /* Push a dummy failure point and continue. Used at the end of
352 alternatives. */
353 push_dummy_failure,
354
355 /* Followed by two-byte relative address and two-byte number n.
356 After matching N times, jump to the address upon failure. */
357 succeed_n,
358
359 /* Followed by two-byte relative address, and two-byte number n.
360 Jump to the address N times, then fail. */
361 jump_n,
362
363 /* Set the following two-byte relative address to the
364 subsequent two-byte number. The address *includes* the two
365 bytes of number. */
366 set_number_at,
367
368 wordchar, /* Matches any word-constituent character. */
369 notwordchar, /* Matches any char that is not a word-constituent. */
370
371 wordbeg, /* Succeeds if at word beginning. */
372 wordend, /* Succeeds if at word end. */
373
374 wordbound, /* Succeeds if at a word boundary. */
375 notwordbound /* Succeeds if not at a word boundary. */
376
377 #ifdef emacs
378 ,before_dot, /* Succeeds if before point. */
379 at_dot, /* Succeeds if at point. */
380 after_dot, /* Succeeds if after point. */
381
382 /* Matches any character whose syntax is specified. Followed by
383 a byte which contains a syntax code, e.g., Sword. */
384 syntaxspec,
385
386 /* Matches any character whose syntax is not that specified. */
387 notsyntaxspec
388 #endif /* emacs */
389 } re_opcode_t;
390
391 /* Common operations on the compiled pattern. */
392
393 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
394
395 #define STORE_NUMBER(destination, number) \
396 do { \
397 (destination)[0] = (number) & 0377; \
398 (destination)[1] = (number) >> 8; \
399 } while (0)
400
401 /* Same as STORE_NUMBER, except increment DESTINATION to
402 the byte after where the number is stored. Therefore, DESTINATION
403 must be an lvalue. */
404
405 #define STORE_NUMBER_AND_INCR(destination, number) \
406 do { \
407 STORE_NUMBER (destination, number); \
408 (destination) += 2; \
409 } while (0)
410
411 /* Put into DESTINATION a number stored in two contiguous bytes starting
412 at SOURCE. */
413
414 #define EXTRACT_NUMBER(destination, source) \
415 do { \
416 (destination) = *(source) & 0377; \
417 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
418 } while (0)
419
420 #ifdef DEBUG
421 static void
extract_number(dest,source)422 extract_number (dest, source)
423 int *dest;
424 unsigned char *source;
425 {
426 int temp = SIGN_EXTEND_CHAR (*(source + 1));
427 *dest = *source & 0377;
428 *dest += temp << 8;
429 }
430
431 #ifndef EXTRACT_MACROS /* To debug the macros. */
432 #undef EXTRACT_NUMBER
433 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
434 #endif /* not EXTRACT_MACROS */
435
436 #endif /* DEBUG */
437
438 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
439 SOURCE must be an lvalue. */
440
441 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
442 do { \
443 EXTRACT_NUMBER (destination, source); \
444 (source) += 2; \
445 } while (0)
446
447 #ifdef DEBUG
448 static void
extract_number_and_incr(destination,source)449 extract_number_and_incr (destination, source)
450 int *destination;
451 unsigned char **source;
452 {
453 extract_number (destination, *source);
454 *source += 2;
455 }
456
457 #ifndef EXTRACT_MACROS
458 #undef EXTRACT_NUMBER_AND_INCR
459 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
460 extract_number_and_incr (&dest, &src)
461 #endif /* not EXTRACT_MACROS */
462
463 #endif /* DEBUG */
464
465 /* If DEBUG is defined, Regex prints many voluminous messages about what
466 it is doing (if the variable `debug' is nonzero). If linked with the
467 main program in `iregex.c', you can enter patterns and strings
468 interactively. And if linked with the main program in `main.c' and
469 the other test files, you can run the already-written tests. */
470
471 #ifdef DEBUG
472
473 /* We use standard I/O for debugging. */
474 #include <stdio.h>
475
476 /* It is useful to test things that ``must'' be true when debugging. */
477 #include <assert.h>
478
479 static int debug = 0;
480
481 #define DEBUG_STATEMENT(e) e
482 #define DEBUG_PRINT1(x) if (debug) printf (x)
483 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
484 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
485 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
486 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
487 if (debug) print_partial_compiled_pattern (s, e)
488 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
489 if (debug) print_double_string (w, s1, sz1, s2, sz2)
490
491
492 extern void printchar ();
493
494 /* Print the fastmap in human-readable form. */
495
496 void
print_fastmap(fastmap)497 print_fastmap (fastmap)
498 char *fastmap;
499 {
500 unsigned was_a_range = 0;
501 unsigned i = 0;
502
503 while (i < (1 << BYTEWIDTH))
504 {
505 if (fastmap[i++])
506 {
507 was_a_range = 0;
508 printchar (i - 1);
509 while (i < (1 << BYTEWIDTH) && fastmap[i])
510 {
511 was_a_range = 1;
512 i++;
513 }
514 if (was_a_range)
515 {
516 printf ("-");
517 printchar (i - 1);
518 }
519 }
520 }
521 putchar ('\n');
522 }
523
524
525 /* Print a compiled pattern string in human-readable form, starting at
526 the START pointer into it and ending just before the pointer END. */
527
528 void
print_partial_compiled_pattern(start,end)529 print_partial_compiled_pattern (start, end)
530 unsigned char *start;
531 unsigned char *end;
532 {
533 int mcnt, mcnt2;
534 unsigned char *p = start;
535 unsigned char *pend = end;
536
537 if (start == NULL)
538 {
539 printf ("(null)\n");
540 return;
541 }
542
543 /* Loop over pattern commands. */
544 while (p < pend)
545 {
546 switch ((re_opcode_t) *p++)
547 {
548 case no_op:
549 printf ("/no_op");
550 break;
551
552 case exactn:
553 mcnt = *p++;
554 printf ("/exactn/%d", mcnt);
555 do
556 {
557 putchar ('/');
558 printchar (*p++);
559 }
560 while (--mcnt);
561 break;
562
563 case start_memory:
564 mcnt = *p++;
565 printf ("/start_memory/%d/%d", mcnt, *p++);
566 break;
567
568 case stop_memory:
569 mcnt = *p++;
570 printf ("/stop_memory/%d/%d", mcnt, *p++);
571 break;
572
573 case duplicate:
574 printf ("/duplicate/%d", *p++);
575 break;
576
577 case anychar:
578 printf ("/anychar");
579 break;
580
581 case charset:
582 case charset_not:
583 {
584 register int c;
585
586 printf ("/charset%s",
587 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
588
589 assert (p + *p < pend);
590
591 for (c = 0; c < *p; c++)
592 {
593 unsigned bit;
594 unsigned char map_byte = p[1 + c];
595
596 putchar ('/');
597
598 for (bit = 0; bit < BYTEWIDTH; bit++)
599 if (map_byte & (1 << bit))
600 printchar (c * BYTEWIDTH + bit);
601 }
602 p += 1 + *p;
603 break;
604 }
605
606 case begline:
607 printf ("/begline");
608 break;
609
610 case endline:
611 printf ("/endline");
612 break;
613
614 case on_failure_jump:
615 extract_number_and_incr (&mcnt, &p);
616 printf ("/on_failure_jump/0/%d", mcnt);
617 break;
618
619 case on_failure_keep_string_jump:
620 extract_number_and_incr (&mcnt, &p);
621 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
622 break;
623
624 case dummy_failure_jump:
625 extract_number_and_incr (&mcnt, &p);
626 printf ("/dummy_failure_jump/0/%d", mcnt);
627 break;
628
629 case push_dummy_failure:
630 printf ("/push_dummy_failure");
631 break;
632
633 case maybe_pop_jump:
634 extract_number_and_incr (&mcnt, &p);
635 printf ("/maybe_pop_jump/0/%d", mcnt);
636 break;
637
638 case pop_failure_jump:
639 extract_number_and_incr (&mcnt, &p);
640 printf ("/pop_failure_jump/0/%d", mcnt);
641 break;
642
643 case jump_past_alt:
644 extract_number_and_incr (&mcnt, &p);
645 printf ("/jump_past_alt/0/%d", mcnt);
646 break;
647
648 case jump:
649 extract_number_and_incr (&mcnt, &p);
650 printf ("/jump/0/%d", mcnt);
651 break;
652
653 case succeed_n:
654 extract_number_and_incr (&mcnt, &p);
655 extract_number_and_incr (&mcnt2, &p);
656 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
657 break;
658
659 case jump_n:
660 extract_number_and_incr (&mcnt, &p);
661 extract_number_and_incr (&mcnt2, &p);
662 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
663 break;
664
665 case set_number_at:
666 extract_number_and_incr (&mcnt, &p);
667 extract_number_and_incr (&mcnt2, &p);
668 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
669 break;
670
671 case wordbound:
672 printf ("/wordbound");
673 break;
674
675 case notwordbound:
676 printf ("/notwordbound");
677 break;
678
679 case wordbeg:
680 printf ("/wordbeg");
681 break;
682
683 case wordend:
684 printf ("/wordend");
685
686 #ifdef emacs
687 case before_dot:
688 printf ("/before_dot");
689 break;
690
691 case at_dot:
692 printf ("/at_dot");
693 break;
694
695 case after_dot:
696 printf ("/after_dot");
697 break;
698
699 case syntaxspec:
700 printf ("/syntaxspec");
701 mcnt = *p++;
702 printf ("/%d", mcnt);
703 break;
704
705 case notsyntaxspec:
706 printf ("/notsyntaxspec");
707 mcnt = *p++;
708 printf ("/%d", mcnt);
709 break;
710 #endif /* emacs */
711
712 case wordchar:
713 printf ("/wordchar");
714 break;
715
716 case notwordchar:
717 printf ("/notwordchar");
718 break;
719
720 case begbuf:
721 printf ("/begbuf");
722 break;
723
724 case endbuf:
725 printf ("/endbuf");
726 break;
727
728 default:
729 printf ("?%d", *(p-1));
730 }
731 }
732 printf ("/\n");
733 }
734
735
736 void
print_compiled_pattern(bufp)737 print_compiled_pattern (bufp)
738 struct re_pattern_buffer *bufp;
739 {
740 unsigned char *buffer = bufp->buffer;
741
742 print_partial_compiled_pattern (buffer, buffer + bufp->used);
743 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
744
745 if (bufp->fastmap_accurate && bufp->fastmap)
746 {
747 printf ("fastmap: ");
748 print_fastmap (bufp->fastmap);
749 }
750
751 printf ("re_nsub: %d\t", bufp->re_nsub);
752 printf ("regs_alloc: %d\t", bufp->regs_allocated);
753 printf ("can_be_null: %d\t", bufp->can_be_null);
754 printf ("newline_anchor: %d\n", bufp->newline_anchor);
755 printf ("no_sub: %d\t", bufp->no_sub);
756 printf ("not_bol: %d\t", bufp->not_bol);
757 printf ("not_eol: %d\t", bufp->not_eol);
758 printf ("syntax: %d\n", bufp->syntax);
759 /* Perhaps we should print the translate table? */
760 }
761
762
763 void
print_double_string(where,string1,size1,string2,size2)764 print_double_string (where, string1, size1, string2, size2)
765 const char *where;
766 const char *string1;
767 const char *string2;
768 int size1;
769 int size2;
770 {
771 unsigned this_char;
772
773 if (where == NULL)
774 printf ("(null)");
775 else
776 {
777 if (FIRST_STRING_P (where))
778 {
779 for (this_char = where - string1; this_char < size1; this_char++)
780 printchar (string1[this_char]);
781
782 where = string2;
783 }
784
785 for (this_char = where - string2; this_char < size2; this_char++)
786 printchar (string2[this_char]);
787 }
788 }
789
790 #else /* not DEBUG */
791
792 #undef assert
793 #define assert(e)
794
795 #define DEBUG_STATEMENT(e)
796 #define DEBUG_PRINT1(x)
797 #define DEBUG_PRINT2(x1, x2)
798 #define DEBUG_PRINT3(x1, x2, x3)
799 #define DEBUG_PRINT4(x1, x2, x3, x4)
800 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
801 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
802
803 #endif /* not DEBUG */
804
805 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
806 also be assigned to arbitrarily: each pattern buffer stores its own
807 syntax, so it can be changed between regex compilations. */
808 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
809
810
811 /* Specify the precise syntax of regexps for compilation. This provides
812 for compatibility for various utilities which historically have
813 different, incompatible syntaxes.
814
815 The argument SYNTAX is a bit mask comprised of the various bits
816 defined in regex.h. We return the old syntax. */
817
818 reg_syntax_t
re_set_syntax(syntax)819 re_set_syntax (syntax)
820 reg_syntax_t syntax;
821 {
822 reg_syntax_t ret = re_syntax_options;
823
824 re_syntax_options = syntax;
825 return ret;
826 }
827
828 /* This table gives an error message for each of the error codes listed
829 in regex.h. Obviously the order here has to be same as there. */
830
831 static const char *re_error_msg[] =
832 { NULL, /* REG_NOERROR */
833 "No match", /* REG_NOMATCH */
834 "Invalid regular expression", /* REG_BADPAT */
835 "Invalid collation character", /* REG_ECOLLATE */
836 "Invalid character class name", /* REG_ECTYPE */
837 "Trailing backslash", /* REG_EESCAPE */
838 "Invalid back reference", /* REG_ESUBREG */
839 "Unmatched [ or [^", /* REG_EBRACK */
840 "Unmatched ( or \\(", /* REG_EPAREN */
841 "Unmatched \\{", /* REG_EBRACE */
842 "Invalid content of \\{\\}", /* REG_BADBR */
843 "Invalid range end", /* REG_ERANGE */
844 "Memory exhausted", /* REG_ESPACE */
845 "Invalid preceding regular expression", /* REG_BADRPT */
846 "Premature end of regular expression", /* REG_EEND */
847 "Regular expression too big", /* REG_ESIZE */
848 "Unmatched ) or \\)", /* REG_ERPAREN */
849 };
850
851 /* Subroutine declarations and macros for regex_compile. */
852
853
854 typedef struct
855 {
856 struct compile_stack_elt_t *stack;
857 unsigned size;
858 unsigned avail; /* Offset of next open position. */
859 } compile_stack_type;
860
861 /* But patterns can have more than `MAX_REGNUM' registers. We just
862 ignore the excess. */
863 typedef unsigned regnum_t;
864
865 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg), store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
866 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end), insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end);
867 static boolean at_begline_loc_p (const char *pattern, const char *p, reg_syntax_t syntax), at_endline_loc_p (const char *pattern, const char *p, int syntax);
868 static boolean group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum);
869 static reg_errcode_t compile_range (const char **p_ptr, const char *pend, char *translate, reg_syntax_t syntax, unsigned char *b);
870
871 /* Fetch the next character in the uncompiled pattern---translating it
872 if necessary. Also cast from a signed character in the constant
873 string passed to us by the user to an unsigned char that we can use
874 as an array index (in, e.g., `translate'). */
875 #define PATFETCH(c) \
876 do {if (p == pend) return REG_EEND; \
877 c = (unsigned char) *p++; \
878 if (translate) c = translate[c]; \
879 } while (0)
880
881 /* Fetch the next character in the uncompiled pattern, with no
882 translation. */
883 #define PATFETCH_RAW(c) \
884 do {if (p == pend) return REG_EEND; \
885 c = (unsigned char) *p++; \
886 } while (0)
887
888 /* Go backwards one character in the pattern. */
889 #define PATUNFETCH p--
890
891
892 /* If `translate' is non-null, return translate[D], else just D. We
893 cast the subscript to translate because some data is declared as
894 `char *', to avoid warnings when a string constant is passed. But
895 when we use a character as a subscript we must make it unsigned. */
896 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
897
898
899 /* Macros for outputting the compiled pattern into `buffer'. */
900
901 /* If the buffer isn't allocated when it comes in, use this. */
902 #define INIT_BUF_SIZE 32
903
904 /* Make sure we have at least N more bytes of space in buffer. */
905 #define GET_BUFFER_SPACE(n) \
906 while ((unsigned long)(b - bufp->buffer + (n)) > bufp->allocated) \
907 EXTEND_BUFFER ()
908
909 /* Make sure we have one more byte of buffer space and then add C to it. */
910 #define BUF_PUSH(c) \
911 do { \
912 GET_BUFFER_SPACE (1); \
913 *b++ = (unsigned char) (c); \
914 } while (0)
915
916
917 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
918 #define BUF_PUSH_2(c1, c2) \
919 do { \
920 GET_BUFFER_SPACE (2); \
921 *b++ = (unsigned char) (c1); \
922 *b++ = (unsigned char) (c2); \
923 } while (0)
924
925
926 /* As with BUF_PUSH_2, except for three bytes. */
927 #define BUF_PUSH_3(c1, c2, c3) \
928 do { \
929 GET_BUFFER_SPACE (3); \
930 *b++ = (unsigned char) (c1); \
931 *b++ = (unsigned char) (c2); \
932 *b++ = (unsigned char) (c3); \
933 } while (0)
934
935
936 /* Store a jump with opcode OP at LOC to location TO. We store a
937 relative address offset by the three bytes the jump itself occupies. */
938 #define STORE_JUMP(op, loc, to) \
939 store_op1 (op, loc, (to) - (loc) - 3)
940
941 /* Likewise, for a two-argument jump. */
942 #define STORE_JUMP2(op, loc, to, arg) \
943 store_op2 (op, loc, (to) - (loc) - 3, arg)
944
945 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
946 #define INSERT_JUMP(op, loc, to) \
947 insert_op1 (op, loc, (to) - (loc) - 3, b)
948
949 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
950 #define INSERT_JUMP2(op, loc, to, arg) \
951 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
952
953
954 /* This is not an arbitrary limit: the arguments which represent offsets
955 into the pattern are two bytes long. So if 2^16 bytes turns out to
956 be too small, many things would have to change. */
957 #define MAX_BUF_SIZE (1L << 16)
958
959
960 /* Extend the buffer by twice its current size via realloc and
961 reset the pointers that pointed into the old block to point to the
962 correct places in the new one. If extending the buffer results in it
963 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
964 #define EXTEND_BUFFER() \
965 do { \
966 unsigned char *old_buffer = bufp->buffer; \
967 if (bufp->allocated == MAX_BUF_SIZE) \
968 return REG_ESIZE; \
969 bufp->allocated <<= 1; \
970 if (bufp->allocated > MAX_BUF_SIZE) \
971 bufp->allocated = MAX_BUF_SIZE; \
972 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
973 if (bufp->buffer == NULL) \
974 return REG_ESPACE; \
975 /* If the buffer moved, move all the pointers into it. */ \
976 if (old_buffer != bufp->buffer) \
977 { \
978 b = (b - old_buffer) + bufp->buffer; \
979 begalt = (begalt - old_buffer) + bufp->buffer; \
980 if (fixup_alt_jump) \
981 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
982 if (laststart) \
983 laststart = (laststart - old_buffer) + bufp->buffer; \
984 if (pending_exact) \
985 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
986 } \
987 } while (0)
988
989
990 /* Since we have one byte reserved for the register number argument to
991 {start,stop}_memory, the maximum number of groups we can report
992 things about is what fits in that byte. */
993 #define MAX_REGNUM 255
994
995
996 /* Macros for the compile stack. */
997
998 /* Since offsets can go either forwards or backwards, this type needs to
999 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1000 typedef int pattern_offset_t;
1001
1002 typedef struct compile_stack_elt_t
1003 {
1004 pattern_offset_t begalt_offset;
1005 pattern_offset_t fixup_alt_jump;
1006 pattern_offset_t inner_group_offset;
1007 pattern_offset_t laststart_offset;
1008 regnum_t regnum;
1009 } compile_stack_elt_t;
1010
1011
1012 #define INIT_COMPILE_STACK_SIZE 32
1013
1014 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1015 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1016
1017 /* The next available element. */
1018 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1019
1020
1021 /* Set the bit for character C in a list. */
1022 #define SET_LIST_BIT(c) \
1023 (b[((unsigned char) (c)) / BYTEWIDTH] \
1024 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1025
1026
1027 /* Get the next unsigned number in the uncompiled pattern. */
1028 #define GET_UNSIGNED_NUMBER(num) \
1029 { if (p != pend) \
1030 { \
1031 PATFETCH (c); \
1032 while (ISDIGIT (c)) \
1033 { \
1034 if (num < 0) \
1035 num = 0; \
1036 num = num * 10 + c - '0'; \
1037 if (p == pend) \
1038 break; \
1039 PATFETCH (c); \
1040 } \
1041 } \
1042 }
1043
1044 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1045
1046 #define IS_CHAR_CLASS(string) \
1047 (STREQ (string, "alpha") || STREQ (string, "upper") \
1048 || STREQ (string, "lower") || STREQ (string, "digit") \
1049 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1050 || STREQ (string, "space") || STREQ (string, "print") \
1051 || STREQ (string, "punct") || STREQ (string, "graph") \
1052 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1053
1054 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1055 Returns one of error codes defined in `regex.h', or zero for success.
1056
1057 Assumes the `allocated' (and perhaps `buffer') and `translate'
1058 fields are set in BUFP on entry.
1059
1060 If it succeeds, results are put in BUFP (if it returns an error, the
1061 contents of BUFP are undefined):
1062 `buffer' is the compiled pattern;
1063 `syntax' is set to SYNTAX;
1064 `used' is set to the length of the compiled pattern;
1065 `fastmap_accurate' is zero;
1066 `re_nsub' is the number of subexpressions in PATTERN;
1067 `not_bol' and `not_eol' are zero;
1068
1069 The `fastmap' and `newline_anchor' fields are neither
1070 examined nor set. */
1071
1072 static reg_errcode_t regex_compile (const char *pattern, int size, reg_syntax_t syntax, struct re_pattern_buffer *bufp);
1073 static reg_errcode_t
regex_compile(pattern,size,syntax,bufp)1074 regex_compile (pattern, size, syntax, bufp)
1075 const char *pattern;
1076 int size;
1077 reg_syntax_t syntax;
1078 struct re_pattern_buffer *bufp;
1079 {
1080 /* We fetch characters from PATTERN here. Even though PATTERN is
1081 `char *' (i.e., signed), we declare these variables as unsigned, so
1082 they can be reliably used as array indices. */
1083 register unsigned char c, c1;
1084
1085 /* A random tempory spot in PATTERN. */
1086 const char *p1;
1087
1088 /* Points to the end of the buffer, where we should append. */
1089 register unsigned char *b;
1090
1091 /* Keeps track of unclosed groups. */
1092 compile_stack_type compile_stack;
1093
1094 /* Points to the current (ending) position in the pattern. */
1095 const char *p = pattern;
1096 const char *pend = pattern + size;
1097
1098 /* How to translate the characters in the pattern. */
1099 char *translate = bufp->translate;
1100
1101 /* Address of the count-byte of the most recently inserted `exactn'
1102 command. This makes it possible to tell if a new exact-match
1103 character can be added to that command or if the character requires
1104 a new `exactn' command. */
1105 unsigned char *pending_exact = 0;
1106
1107 /* Address of start of the most recently finished expression.
1108 This tells, e.g., postfix * where to find the start of its
1109 operand. Reset at the beginning of groups and alternatives. */
1110 unsigned char *laststart = 0;
1111
1112 /* Address of beginning of regexp, or inside of last group. */
1113 unsigned char *begalt;
1114
1115 /* Place in the uncompiled pattern (i.e., the {) to
1116 which to go back if the interval is invalid. */
1117 const char *beg_interval;
1118
1119 /* Address of the place where a forward jump should go to the end of
1120 the containing expression. Each alternative of an `or' -- except the
1121 last -- ends with a forward jump of this sort. */
1122 unsigned char *fixup_alt_jump = 0;
1123
1124 /* Counts open-groups as they are encountered. Remembered for the
1125 matching close-group on the compile stack, so the same register
1126 number is put in the stop_memory as the start_memory. */
1127 regnum_t regnum = 0;
1128
1129 #ifdef DEBUG
1130 DEBUG_PRINT1 ("\nCompiling pattern: ");
1131 if (debug)
1132 {
1133 unsigned debug_count;
1134
1135 for (debug_count = 0; debug_count < size; debug_count++)
1136 printchar (pattern[debug_count]);
1137 putchar ('\n');
1138 }
1139 #endif /* DEBUG */
1140
1141 /* Initialize the compile stack. */
1142 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1143 if (compile_stack.stack == NULL)
1144 return REG_ESPACE;
1145
1146 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1147 compile_stack.avail = 0;
1148
1149 /* Initialize the pattern buffer. */
1150 bufp->syntax = syntax;
1151 bufp->fastmap_accurate = 0;
1152 bufp->not_bol = bufp->not_eol = 0;
1153
1154 /* Set `used' to zero, so that if we return an error, the pattern
1155 printer (for debugging) will think there's no pattern. We reset it
1156 at the end. */
1157 bufp->used = 0;
1158
1159 /* Always count groups, whether or not bufp->no_sub is set. */
1160 bufp->re_nsub = 0;
1161
1162 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1163 /* Initialize the syntax table. */
1164 init_syntax_once ();
1165 #endif
1166
1167 if (bufp->allocated == 0)
1168 {
1169 if (bufp->buffer)
1170 { /* If zero allocated, but buffer is non-null, try to realloc
1171 enough space. This loses if buffer's address is bogus, but
1172 that is the user's responsibility. */
1173 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1174 }
1175 else
1176 { /* Caller did not allocate a buffer. Do it for them. */
1177 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1178 }
1179 if (!bufp->buffer) return REG_ESPACE;
1180
1181 bufp->allocated = INIT_BUF_SIZE;
1182 }
1183
1184 begalt = b = bufp->buffer;
1185
1186 /* Loop through the uncompiled pattern until we're at the end. */
1187 while (p != pend)
1188 {
1189 PATFETCH (c);
1190
1191 switch (c)
1192 {
1193 case '^':
1194 {
1195 if ( /* If at start of pattern, it's an operator. */
1196 p == pattern + 1
1197 /* If context independent, it's an operator. */
1198 || syntax & RE_CONTEXT_INDEP_ANCHORS
1199 /* Otherwise, depends on what's come before. */
1200 || at_begline_loc_p (pattern, p, syntax))
1201 BUF_PUSH (begline);
1202 else
1203 goto normal_char;
1204 }
1205 break;
1206
1207
1208 case '$':
1209 {
1210 if ( /* If at end of pattern, it's an operator. */
1211 p == pend
1212 /* If context independent, it's an operator. */
1213 || syntax & RE_CONTEXT_INDEP_ANCHORS
1214 /* Otherwise, depends on what's next. */
1215 || at_endline_loc_p (p, pend, syntax))
1216 BUF_PUSH (endline);
1217 else
1218 goto normal_char;
1219 }
1220 break;
1221
1222
1223 case '+':
1224 case '?':
1225 if ((syntax & RE_BK_PLUS_QM)
1226 || (syntax & RE_LIMITED_OPS))
1227 goto normal_char;
1228 handle_plus:
1229 case '*':
1230 /* If there is no previous pattern... */
1231 if (!laststart)
1232 {
1233 if (syntax & RE_CONTEXT_INVALID_OPS)
1234 return REG_BADRPT;
1235 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1236 goto normal_char;
1237 }
1238
1239 {
1240 /* Are we optimizing this jump? */
1241 boolean keep_string_p = false;
1242
1243 /* 1 means zero (many) matches is allowed. */
1244 char zero_times_ok = 0, many_times_ok = 0;
1245
1246 /* If there is a sequence of repetition chars, collapse it
1247 down to just one (the right one). We can't combine
1248 interval operators with these because of, e.g., `a{2}*',
1249 which should only match an even number of `a's. */
1250
1251 for (;;)
1252 {
1253 zero_times_ok |= c != '+';
1254 many_times_ok |= c != '?';
1255
1256 if (p == pend)
1257 break;
1258
1259 PATFETCH (c);
1260
1261 if (c == '*'
1262 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1263 ;
1264
1265 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1266 {
1267 if (p == pend) return REG_EESCAPE;
1268
1269 PATFETCH (c1);
1270 if (!(c1 == '+' || c1 == '?'))
1271 {
1272 PATUNFETCH;
1273 PATUNFETCH;
1274 break;
1275 }
1276
1277 c = c1;
1278 }
1279 else
1280 {
1281 PATUNFETCH;
1282 break;
1283 }
1284
1285 /* If we get here, we found another repeat character. */
1286 }
1287
1288 /* Star, etc. applied to an empty pattern is equivalent
1289 to an empty pattern. */
1290 if (!laststart)
1291 break;
1292
1293 /* Now we know whether or not zero matches is allowed
1294 and also whether or not two or more matches is allowed. */
1295 if (many_times_ok)
1296 { /* More than one repetition is allowed, so put in at the
1297 end a backward relative jump from `b' to before the next
1298 jump we're going to put in below (which jumps from
1299 laststart to after this jump).
1300
1301 But if we are at the `*' in the exact sequence `.*\n',
1302 insert an unconditional jump backwards to the .,
1303 instead of the beginning of the loop. This way we only
1304 push a failure point once, instead of every time
1305 through the loop. */
1306 assert (p - 1 > pattern);
1307
1308 /* Allocate the space for the jump. */
1309 GET_BUFFER_SPACE (3);
1310
1311 /* We know we are not at the first character of the pattern,
1312 because laststart was nonzero. And we've already
1313 incremented `p', by the way, to be the character after
1314 the `*'. Do we have to do something analogous here
1315 for null bytes, because of RE_DOT_NOT_NULL? */
1316 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1317 && zero_times_ok
1318 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1319 && !(syntax & RE_DOT_NEWLINE))
1320 { /* We have .*\n. */
1321 STORE_JUMP (jump, b, laststart);
1322 keep_string_p = true;
1323 }
1324 else
1325 /* Anything else. */
1326 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1327
1328 /* We've added more stuff to the buffer. */
1329 b += 3;
1330 }
1331
1332 /* On failure, jump from laststart to b + 3, which will be the
1333 end of the buffer after this jump is inserted. */
1334 GET_BUFFER_SPACE (3);
1335 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1336 : on_failure_jump,
1337 laststart, b + 3);
1338 pending_exact = 0;
1339 b += 3;
1340
1341 if (!zero_times_ok)
1342 {
1343 /* At least one repetition is required, so insert a
1344 `dummy_failure_jump' before the initial
1345 `on_failure_jump' instruction of the loop. This
1346 effects a skip over that instruction the first time
1347 we hit that loop. */
1348 GET_BUFFER_SPACE (3);
1349 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1350 b += 3;
1351 }
1352 }
1353 break;
1354
1355
1356 case '.':
1357 laststart = b;
1358 BUF_PUSH (anychar);
1359 break;
1360
1361
1362 case '[':
1363 {
1364 boolean had_char_class = false;
1365
1366 if (p == pend) return REG_EBRACK;
1367
1368 /* Ensure that we have enough space to push a charset: the
1369 opcode, the length count, and the bitset; 34 bytes in all. */
1370 GET_BUFFER_SPACE (34);
1371
1372 laststart = b;
1373
1374 /* We test `*p == '^' twice, instead of using an if
1375 statement, so we only need one BUF_PUSH. */
1376 BUF_PUSH (*p == '^' ? charset_not : charset);
1377 if (*p == '^')
1378 p++;
1379
1380 /* Remember the first position in the bracket expression. */
1381 p1 = p;
1382
1383 /* Push the number of bytes in the bitmap. */
1384 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1385
1386 /* Clear the whole map. */
1387 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1388
1389 /* charset_not matches newline according to a syntax bit. */
1390 if ((re_opcode_t) b[-2] == charset_not
1391 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1392 SET_LIST_BIT ('\n');
1393
1394 /* Read in characters and ranges, setting map bits. */
1395 for (;;)
1396 {
1397 if (p == pend) return REG_EBRACK;
1398
1399 PATFETCH (c);
1400
1401 /* \ might escape characters inside [...] and [^...]. */
1402 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1403 {
1404 if (p == pend) return REG_EESCAPE;
1405
1406 PATFETCH (c1);
1407 SET_LIST_BIT (c1);
1408 continue;
1409 }
1410
1411 /* Could be the end of the bracket expression. If it's
1412 not (i.e., when the bracket expression is `[]' so
1413 far), the ']' character bit gets set way below. */
1414 if (c == ']' && p != p1 + 1)
1415 break;
1416
1417 /* Look ahead to see if it's a range when the last thing
1418 was a character class. */
1419 if (had_char_class && c == '-' && *p != ']')
1420 return REG_ERANGE;
1421
1422 /* Look ahead to see if it's a range when the last thing
1423 was a character: if this is a hyphen not at the
1424 beginning or the end of a list, then it's the range
1425 operator. */
1426 if (c == '-'
1427 && !(p - 2 >= pattern && p[-2] == '[')
1428 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1429 && *p != ']')
1430 {
1431 reg_errcode_t ret
1432 = compile_range (&p, pend, translate, syntax, b);
1433 if (ret != REG_NOERROR) return ret;
1434 }
1435
1436 else if (p[0] == '-' && p[1] != ']')
1437 { /* This handles ranges made up of characters only. */
1438 reg_errcode_t ret;
1439
1440 /* Move past the `-'. */
1441 PATFETCH (c1);
1442
1443 ret = compile_range (&p, pend, translate, syntax, b);
1444 if (ret != REG_NOERROR) return ret;
1445 }
1446
1447 /* See if we're at the beginning of a possible character
1448 class. */
1449
1450 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1451 { /* Leave room for the null. */
1452 char str[CHAR_CLASS_MAX_LENGTH + 1];
1453
1454 PATFETCH (c);
1455 c1 = 0;
1456
1457 /* If pattern is `[[:'. */
1458 if (p == pend) return REG_EBRACK;
1459
1460 for (;;)
1461 {
1462 PATFETCH (c);
1463 if (c == ':' || c == ']' || p == pend
1464 || c1 == CHAR_CLASS_MAX_LENGTH)
1465 break;
1466 str[c1++] = c;
1467 }
1468 str[c1] = '\0';
1469
1470 /* If isn't a word bracketed by `[:' and:`]':
1471 undo the ending character, the letters, and leave
1472 the leading `:' and `[' (but set bits for them). */
1473 if (c == ':' && *p == ']')
1474 {
1475 int ch;
1476 boolean is_alnum = STREQ (str, "alnum");
1477 boolean is_alpha = STREQ (str, "alpha");
1478 boolean is_blank = STREQ (str, "blank");
1479 boolean is_cntrl = STREQ (str, "cntrl");
1480 boolean is_digit = STREQ (str, "digit");
1481 boolean is_graph = STREQ (str, "graph");
1482 boolean is_lower = STREQ (str, "lower");
1483 boolean is_print = STREQ (str, "print");
1484 boolean is_punct = STREQ (str, "punct");
1485 boolean is_space = STREQ (str, "space");
1486 boolean is_upper = STREQ (str, "upper");
1487 boolean is_xdigit = STREQ (str, "xdigit");
1488
1489 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1490
1491 /* Throw away the ] at the end of the character
1492 class. */
1493 PATFETCH (c);
1494
1495 if (p == pend) return REG_EBRACK;
1496
1497 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1498 {
1499 if ( (is_alnum && ISALNUM (ch))
1500 || (is_alpha && ISALPHA (ch))
1501 || (is_blank && ISBLANK (ch))
1502 || (is_cntrl && ISCNTRL (ch))
1503 || (is_digit && ISDIGIT (ch))
1504 || (is_graph && ISGRAPH (ch))
1505 || (is_lower && ISLOWER (ch))
1506 || (is_print && ISPRINT (ch))
1507 || (is_punct && ISPUNCT (ch))
1508 || (is_space && ISSPACE (ch))
1509 || (is_upper && ISUPPER (ch))
1510 || (is_xdigit && ISXDIGIT (ch)))
1511 SET_LIST_BIT (ch);
1512 }
1513 had_char_class = true;
1514 }
1515 else
1516 {
1517 c1++;
1518 while (c1--)
1519 PATUNFETCH;
1520 SET_LIST_BIT ('[');
1521 SET_LIST_BIT (':');
1522 had_char_class = false;
1523 }
1524 }
1525 else
1526 {
1527 had_char_class = false;
1528 SET_LIST_BIT (c);
1529 }
1530 }
1531
1532 /* Discard any (non)matching list bytes that are all 0 at the
1533 end of the map. Decrease the map-length byte too. */
1534 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1535 b[-1]--;
1536 b += b[-1];
1537 }
1538 break;
1539
1540
1541 case '(':
1542 if (syntax & RE_NO_BK_PARENS)
1543 goto handle_open;
1544 else
1545 goto normal_char;
1546
1547
1548 case ')':
1549 if (syntax & RE_NO_BK_PARENS)
1550 goto handle_close;
1551 else
1552 goto normal_char;
1553
1554
1555 case '\n':
1556 if (syntax & RE_NEWLINE_ALT)
1557 goto handle_alt;
1558 else
1559 goto normal_char;
1560
1561
1562 case '|':
1563 if (syntax & RE_NO_BK_VBAR)
1564 goto handle_alt;
1565 else
1566 goto normal_char;
1567
1568
1569 case '{':
1570 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1571 goto handle_interval;
1572 else
1573 goto normal_char;
1574
1575
1576 case '\\':
1577 if (p == pend) return REG_EESCAPE;
1578
1579 /* Do not translate the character after the \, so that we can
1580 distinguish, e.g., \B from \b, even if we normally would
1581 translate, e.g., B to b. */
1582 PATFETCH_RAW (c);
1583
1584 switch (c)
1585 {
1586 case '(':
1587 if (syntax & RE_NO_BK_PARENS)
1588 goto normal_backslash;
1589
1590 handle_open:
1591 bufp->re_nsub++;
1592 regnum++;
1593
1594 if (COMPILE_STACK_FULL)
1595 {
1596 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1597 compile_stack_elt_t);
1598 if (compile_stack.stack == NULL) return REG_ESPACE;
1599
1600 compile_stack.size <<= 1;
1601 }
1602
1603 /* These are the values to restore when we hit end of this
1604 group. They are all relative offsets, so that if the
1605 whole pattern moves because of realloc, they will still
1606 be valid. */
1607 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1608 COMPILE_STACK_TOP.fixup_alt_jump
1609 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1610 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1611 COMPILE_STACK_TOP.regnum = regnum;
1612
1613 /* We will eventually replace the 0 with the number of
1614 groups inner to this one. But do not push a
1615 start_memory for groups beyond the last one we can
1616 represent in the compiled pattern. */
1617 if (regnum <= MAX_REGNUM)
1618 {
1619 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1620 BUF_PUSH_3 (start_memory, regnum, 0);
1621 }
1622
1623 compile_stack.avail++;
1624
1625 fixup_alt_jump = 0;
1626 laststart = 0;
1627 begalt = b;
1628 /* If we've reached MAX_REGNUM groups, then this open
1629 won't actually generate any code, so we'll have to
1630 clear pending_exact explicitly. */
1631 pending_exact = 0;
1632 break;
1633
1634
1635 case ')':
1636 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1637
1638 /* snax: added braces here to elimnate gcc warning */
1639
1640 if (COMPILE_STACK_EMPTY)
1641 {
1642 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1643 goto normal_backslash;
1644 else
1645 return REG_ERPAREN;
1646 }
1647 handle_close:
1648 if (fixup_alt_jump)
1649 { /* Push a dummy failure point at the end of the
1650 alternative for a possible future
1651 `pop_failure_jump' to pop. See comments at
1652 `push_dummy_failure' in `re_match_2'. */
1653 BUF_PUSH (push_dummy_failure);
1654
1655 /* We allocated space for this jump when we assigned
1656 to `fixup_alt_jump', in the `handle_alt' case below. */
1657 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1658 }
1659
1660 /* snax: added braces here too */
1661 /* See similar code for backslashed left paren above. */
1662 if (COMPILE_STACK_EMPTY)
1663 {
1664 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1665 goto normal_char;
1666 else
1667 return REG_ERPAREN;
1668 }
1669
1670 /* Since we just checked for an empty stack above, this
1671 ``can't happen''. */
1672 assert (compile_stack.avail != 0);
1673 {
1674 /* We don't just want to restore into `regnum', because
1675 later groups should continue to be numbered higher,
1676 as in `(ab)c(de)' -- the second group is #2. */
1677 regnum_t this_group_regnum;
1678
1679 compile_stack.avail--;
1680 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1681 fixup_alt_jump
1682 = COMPILE_STACK_TOP.fixup_alt_jump
1683 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1684 : 0;
1685 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1686 this_group_regnum = COMPILE_STACK_TOP.regnum;
1687 /* If we've reached MAX_REGNUM groups, then this open
1688 won't actually generate any code, so we'll have to
1689 clear pending_exact explicitly. */
1690 pending_exact = 0;
1691
1692 /* We're at the end of the group, so now we know how many
1693 groups were inside this one. */
1694 if (this_group_regnum <= MAX_REGNUM)
1695 {
1696 unsigned char *inner_group_loc
1697 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1698
1699 *inner_group_loc = regnum - this_group_regnum;
1700 BUF_PUSH_3 (stop_memory, this_group_regnum,
1701 regnum - this_group_regnum);
1702 }
1703 }
1704 break;
1705
1706
1707 case '|': /* `\|'. */
1708 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1709 goto normal_backslash;
1710 handle_alt:
1711 if (syntax & RE_LIMITED_OPS)
1712 goto normal_char;
1713
1714 /* Insert before the previous alternative a jump which
1715 jumps to this alternative if the former fails. */
1716 GET_BUFFER_SPACE (3);
1717 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1718 pending_exact = 0;
1719 b += 3;
1720
1721 /* The alternative before this one has a jump after it
1722 which gets executed if it gets matched. Adjust that
1723 jump so it will jump to this alternative's analogous
1724 jump (put in below, which in turn will jump to the next
1725 (if any) alternative's such jump, etc.). The last such
1726 jump jumps to the correct final destination. A picture:
1727 _____ _____
1728 | | | |
1729 | v | v
1730 a | b | c
1731
1732 If we are at `b', then fixup_alt_jump right now points to a
1733 three-byte space after `a'. We'll put in the jump, set
1734 fixup_alt_jump to right after `b', and leave behind three
1735 bytes which we'll fill in when we get to after `c'. */
1736
1737 if (fixup_alt_jump)
1738 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1739
1740 /* Mark and leave space for a jump after this alternative,
1741 to be filled in later either by next alternative or
1742 when know we're at the end of a series of alternatives. */
1743 fixup_alt_jump = b;
1744 GET_BUFFER_SPACE (3);
1745 b += 3;
1746
1747 laststart = 0;
1748 begalt = b;
1749 break;
1750
1751
1752 case '{':
1753 /* If \{ is a literal. */
1754 if (!(syntax & RE_INTERVALS)
1755 /* If we're at `\{' and it's not the open-interval
1756 operator. */
1757 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1758 || (p - 2 == pattern && p == pend))
1759 goto normal_backslash;
1760
1761 handle_interval:
1762 {
1763 /* If got here, then the syntax allows intervals. */
1764
1765 /* At least (most) this many matches must be made. */
1766 int lower_bound = -1, upper_bound = -1;
1767
1768 beg_interval = p - 1;
1769
1770 if (p == pend)
1771 {
1772 if (syntax & RE_NO_BK_BRACES)
1773 goto unfetch_interval;
1774 else
1775 return REG_EBRACE;
1776 }
1777
1778 GET_UNSIGNED_NUMBER (lower_bound);
1779
1780 if (c == ',')
1781 {
1782 GET_UNSIGNED_NUMBER (upper_bound);
1783 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1784 }
1785 else
1786 /* Interval such as `{1}' => match exactly once. */
1787 upper_bound = lower_bound;
1788
1789 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1790 || lower_bound > upper_bound)
1791 {
1792 if (syntax & RE_NO_BK_BRACES)
1793 goto unfetch_interval;
1794 else
1795 return REG_BADBR;
1796 }
1797
1798 if (!(syntax & RE_NO_BK_BRACES))
1799 {
1800 if (c != '\\') return REG_EBRACE;
1801
1802 PATFETCH (c);
1803 }
1804
1805 if (c != '}')
1806 {
1807 if (syntax & RE_NO_BK_BRACES)
1808 goto unfetch_interval;
1809 else
1810 return REG_BADBR;
1811 }
1812
1813 /* We just parsed a valid interval. */
1814
1815 /* If it's invalid to have no preceding re. */
1816 if (!laststart)
1817 {
1818 if (syntax & RE_CONTEXT_INVALID_OPS)
1819 return REG_BADRPT;
1820 else if (syntax & RE_CONTEXT_INDEP_OPS)
1821 laststart = b;
1822 else
1823 goto unfetch_interval;
1824 }
1825
1826 /* If the upper bound is zero, don't want to succeed at
1827 all; jump from `laststart' to `b + 3', which will be
1828 the end of the buffer after we insert the jump. */
1829 if (upper_bound == 0)
1830 {
1831 GET_BUFFER_SPACE (3);
1832 INSERT_JUMP (jump, laststart, b + 3);
1833 b += 3;
1834 }
1835
1836 /* Otherwise, we have a nontrivial interval. When
1837 we're all done, the pattern will look like:
1838 set_number_at <jump count> <upper bound>
1839 set_number_at <succeed_n count> <lower bound>
1840 succeed_n <after jump addr> <succed_n count>
1841 <body of loop>
1842 jump_n <succeed_n addr> <jump count>
1843 (The upper bound and `jump_n' are omitted if
1844 `upper_bound' is 1, though.) */
1845 else
1846 { /* If the upper bound is > 1, we need to insert
1847 more at the end of the loop. */
1848 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1849
1850 GET_BUFFER_SPACE (nbytes);
1851
1852 /* Initialize lower bound of the `succeed_n', even
1853 though it will be set during matching by its
1854 attendant `set_number_at' (inserted next),
1855 because `re_compile_fastmap' needs to know.
1856 Jump to the `jump_n' we might insert below. */
1857 INSERT_JUMP2 (succeed_n, laststart,
1858 b + 5 + (upper_bound > 1) * 5,
1859 lower_bound);
1860 b += 5;
1861
1862 /* Code to initialize the lower bound. Insert
1863 before the `succeed_n'. The `5' is the last two
1864 bytes of this `set_number_at', plus 3 bytes of
1865 the following `succeed_n'. */
1866 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1867 b += 5;
1868
1869 if (upper_bound > 1)
1870 { /* More than one repetition is allowed, so
1871 append a backward jump to the `succeed_n'
1872 that starts this interval.
1873
1874 When we've reached this during matching,
1875 we'll have matched the interval once, so
1876 jump back only `upper_bound - 1' times. */
1877 STORE_JUMP2 (jump_n, b, laststart + 5,
1878 upper_bound - 1);
1879 b += 5;
1880
1881 /* The location we want to set is the second
1882 parameter of the `jump_n'; that is `b-2' as
1883 an absolute address. `laststart' will be
1884 the `set_number_at' we're about to insert;
1885 `laststart+3' the number to set, the source
1886 for the relative address. But we are
1887 inserting into the middle of the pattern --
1888 so everything is getting moved up by 5.
1889 Conclusion: (b - 2) - (laststart + 3) + 5,
1890 i.e., b - laststart.
1891
1892 We insert this at the beginning of the loop
1893 so that if we fail during matching, we'll
1894 reinitialize the bounds. */
1895 insert_op2 (set_number_at, laststart, b - laststart,
1896 upper_bound - 1, b);
1897 b += 5;
1898 }
1899 }
1900 pending_exact = 0;
1901 beg_interval = NULL;
1902 }
1903 break;
1904
1905 unfetch_interval:
1906 /* If an invalid interval, match the characters as literals. */
1907 assert (beg_interval);
1908 p = beg_interval;
1909 beg_interval = NULL;
1910
1911 /* normal_char and normal_backslash need `c'. */
1912 PATFETCH (c);
1913
1914 if (!(syntax & RE_NO_BK_BRACES))
1915 {
1916 if (p > pattern && p[-1] == '\\')
1917 goto normal_backslash;
1918 }
1919 goto normal_char;
1920
1921 #ifdef emacs
1922 /* There is no way to specify the before_dot and after_dot
1923 operators. rms says this is ok. --karl */
1924 case '=':
1925 BUF_PUSH (at_dot);
1926 break;
1927
1928 case 's':
1929 laststart = b;
1930 PATFETCH (c);
1931 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1932 break;
1933
1934 case 'S':
1935 laststart = b;
1936 PATFETCH (c);
1937 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1938 break;
1939 #endif /* emacs */
1940
1941
1942 case 'w':
1943 laststart = b;
1944 BUF_PUSH (wordchar);
1945 break;
1946
1947
1948 case 'W':
1949 laststart = b;
1950 BUF_PUSH (notwordchar);
1951 break;
1952
1953
1954 case '<':
1955 BUF_PUSH (wordbeg);
1956 break;
1957
1958 case '>':
1959 BUF_PUSH (wordend);
1960 break;
1961
1962 case 'b':
1963 BUF_PUSH (wordbound);
1964 break;
1965
1966 case 'B':
1967 BUF_PUSH (notwordbound);
1968 break;
1969
1970 case '`':
1971 BUF_PUSH (begbuf);
1972 break;
1973
1974 case '\'':
1975 BUF_PUSH (endbuf);
1976 break;
1977
1978 case '1': case '2': case '3': case '4': case '5':
1979 case '6': case '7': case '8': case '9':
1980 if (syntax & RE_NO_BK_REFS)
1981 goto normal_char;
1982
1983 c1 = c - '0';
1984
1985 if (c1 > regnum)
1986 return REG_ESUBREG;
1987
1988 /* Can't back reference to a subexpression if inside of it. */
1989 if (group_in_compile_stack (compile_stack, c1))
1990 goto normal_char;
1991
1992 laststart = b;
1993 BUF_PUSH_2 (duplicate, c1);
1994 break;
1995
1996
1997 case '+':
1998 case '?':
1999 if (syntax & RE_BK_PLUS_QM)
2000 goto handle_plus;
2001 else
2002 goto normal_backslash;
2003
2004 default:
2005 normal_backslash:
2006 /* You might think it would be useful for \ to mean
2007 not to translate; but if we don't translate it
2008 it will never match anything. */
2009 c = TRANSLATE (c);
2010 goto normal_char;
2011 }
2012 break;
2013
2014
2015 default:
2016 /* Expects the character in `c'. */
2017 normal_char:
2018 /* If no exactn currently being built. */
2019 if (!pending_exact
2020
2021 /* If last exactn not at current position. */
2022 || pending_exact + *pending_exact + 1 != b
2023
2024 /* We have only one byte following the exactn for the count. */
2025 || *pending_exact == (1 << BYTEWIDTH) - 1
2026
2027 /* If followed by a repetition operator. */
2028 || *p == '*' || *p == '^'
2029 || ((syntax & RE_BK_PLUS_QM)
2030 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2031 : (*p == '+' || *p == '?'))
2032 || ((syntax & RE_INTERVALS)
2033 && ((syntax & RE_NO_BK_BRACES)
2034 ? *p == '{'
2035 : (p[0] == '\\' && p[1] == '{'))))
2036 {
2037 /* Start building a new exactn. */
2038
2039 laststart = b;
2040
2041 BUF_PUSH_2 (exactn, 0);
2042 pending_exact = b - 1;
2043 }
2044
2045 BUF_PUSH (c);
2046 (*pending_exact)++;
2047 break;
2048 } /* switch (c) */
2049 } /* while p != pend */
2050
2051
2052 /* Through the pattern now. */
2053
2054 if (fixup_alt_jump)
2055 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2056
2057 if (!COMPILE_STACK_EMPTY)
2058 return REG_EPAREN;
2059
2060 free (compile_stack.stack);
2061
2062 /* We have succeeded; set the length of the buffer. */
2063 bufp->used = b - bufp->buffer;
2064
2065 #ifdef DEBUG
2066 if (debug)
2067 {
2068 DEBUG_PRINT1 ("\nCompiled pattern: ");
2069 print_compiled_pattern (bufp);
2070 }
2071 #endif /* DEBUG */
2072
2073 return REG_NOERROR;
2074 } /* regex_compile */
2075
2076 /* Subroutines for `regex_compile'. */
2077
2078 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2079
2080 static void
store_op1(op,loc,arg)2081 store_op1 (op, loc, arg)
2082 re_opcode_t op;
2083 unsigned char *loc;
2084 int arg;
2085 {
2086 *loc = (unsigned char) op;
2087 STORE_NUMBER (loc + 1, arg);
2088 }
2089
2090
2091 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2092
2093 static void
store_op2(op,loc,arg1,arg2)2094 store_op2 (op, loc, arg1, arg2)
2095 re_opcode_t op;
2096 unsigned char *loc;
2097 int arg1, arg2;
2098 {
2099 *loc = (unsigned char) op;
2100 STORE_NUMBER (loc + 1, arg1);
2101 STORE_NUMBER (loc + 3, arg2);
2102 }
2103
2104
2105 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2106 for OP followed by two-byte integer parameter ARG. */
2107
2108 static void
insert_op1(op,loc,arg,end)2109 insert_op1 (op, loc, arg, end)
2110 re_opcode_t op;
2111 unsigned char *loc;
2112 int arg;
2113 unsigned char *end;
2114 {
2115 register unsigned char *pfrom = end;
2116 register unsigned char *pto = end + 3;
2117
2118 while (pfrom != loc)
2119 *--pto = *--pfrom;
2120
2121 store_op1 (op, loc, arg);
2122 }
2123
2124
2125 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2126
2127 static void
insert_op2(op,loc,arg1,arg2,end)2128 insert_op2 (op, loc, arg1, arg2, end)
2129 re_opcode_t op;
2130 unsigned char *loc;
2131 int arg1, arg2;
2132 unsigned char *end;
2133 {
2134 register unsigned char *pfrom = end;
2135 register unsigned char *pto = end + 5;
2136
2137 while (pfrom != loc)
2138 *--pto = *--pfrom;
2139
2140 store_op2 (op, loc, arg1, arg2);
2141 }
2142
2143
2144 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2145 after an alternative or a begin-subexpression. We assume there is at
2146 least one character before the ^. */
2147
2148 static boolean
at_begline_loc_p(pattern,p,syntax)2149 at_begline_loc_p (pattern, p, syntax)
2150 const char *pattern, *p;
2151 reg_syntax_t syntax;
2152 {
2153 const char *prev = p - 2;
2154 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2155
2156 return
2157 /* After a subexpression? */
2158 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2159 /* After an alternative? */
2160 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2161 }
2162
2163
2164 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2165 at least one character after the $, i.e., `P < PEND'. */
2166
2167 static boolean
at_endline_loc_p(p,pend,syntax)2168 at_endline_loc_p (p, pend, syntax)
2169 const char *p, *pend;
2170 int syntax;
2171 {
2172 const char *next = p;
2173 boolean next_backslash = *next == '\\';
2174 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2175
2176 return
2177 /* Before a subexpression? */
2178 (syntax & RE_NO_BK_PARENS ? *next == ')'
2179 : next_backslash && next_next && *next_next == ')')
2180 /* Before an alternative? */
2181 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2182 : next_backslash && next_next && *next_next == '|');
2183 }
2184
2185
2186 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2187 false if it's not. */
2188
2189 static boolean
group_in_compile_stack(compile_stack,regnum)2190 group_in_compile_stack (compile_stack, regnum)
2191 compile_stack_type compile_stack;
2192 regnum_t regnum;
2193 {
2194 int this_element;
2195
2196 for (this_element = compile_stack.avail - 1;
2197 this_element >= 0;
2198 this_element--)
2199 if (compile_stack.stack[this_element].regnum == regnum)
2200 return true;
2201
2202 return false;
2203 }
2204
2205
2206 /* Read the ending character of a range (in a bracket expression) from the
2207 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2208 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2209 Then we set the translation of all bits between the starting and
2210 ending characters (inclusive) in the compiled pattern B.
2211
2212 Return an error code.
2213
2214 We use these short variable names so we can use the same macros as
2215 `regex_compile' itself. */
2216
2217 static reg_errcode_t
compile_range(p_ptr,pend,translate,syntax,b)2218 compile_range (p_ptr, pend, translate, syntax, b)
2219 const char **p_ptr, *pend;
2220 char *translate;
2221 reg_syntax_t syntax;
2222 unsigned char *b;
2223 {
2224 int this_char;
2225
2226 const char *p = *p_ptr;
2227 int range_start, range_end;
2228
2229 if (p == pend)
2230 return REG_ERANGE;
2231
2232 /* Even though the pattern is a signed `char *', we need to fetch
2233 with unsigned char *'s; if the high bit of the pattern character
2234 is set, the range endpoints will be negative if we fetch using a
2235 signed char *.
2236
2237 We also want to fetch the endpoints without translating them; the
2238 appropriate translation is done in the bit-setting loop below. */
2239 range_start = ((unsigned char *) p)[-2];
2240 range_end = ((unsigned char *) p)[0];
2241
2242 /* Have to increment the pointer into the pattern string, so the
2243 caller isn't still at the ending character. */
2244 (*p_ptr)++;
2245
2246 /* If the start is after the end, the range is empty. */
2247 if (range_start > range_end)
2248 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2249
2250 /* Here we see why `this_char' has to be larger than an `unsigned
2251 char' -- the range is inclusive, so if `range_end' == 0xff
2252 (assuming 8-bit characters), we would otherwise go into an infinite
2253 loop, since all characters <= 0xff. */
2254 for (this_char = range_start; this_char <= range_end; this_char++)
2255 {
2256 SET_LIST_BIT (TRANSLATE (this_char));
2257 }
2258
2259 return REG_NOERROR;
2260 }
2261
2262 /* Failure stack declarations and macros; both re_compile_fastmap and
2263 re_match_2 use a failure stack. These have to be macros because of
2264 REGEX_ALLOCATE. */
2265
2266
2267 /* Number of failure points for which to initially allocate space
2268 when matching. If this number is exceeded, we allocate more
2269 space, so it is not a hard limit. */
2270 #ifndef INIT_FAILURE_ALLOC
2271 #define INIT_FAILURE_ALLOC 5
2272 #endif
2273
2274 /* Roughly the maximum number of failure points on the stack. Would be
2275 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2276 This is a variable only so users of regex can assign to it; we never
2277 change it ourselves. */
2278 int re_max_failures = 2000;
2279
2280 typedef const unsigned char *fail_stack_elt_t;
2281
2282 typedef struct
2283 {
2284 fail_stack_elt_t *stack;
2285 unsigned size;
2286 unsigned avail; /* Offset of next open position. */
2287 } fail_stack_type;
2288
2289 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2290 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2291 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2292 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2293
2294
2295 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2296
2297 #define INIT_FAIL_STACK() \
2298 do { \
2299 fail_stack.stack = (fail_stack_elt_t *) \
2300 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2301 \
2302 if (fail_stack.stack == NULL) \
2303 return -2; \
2304 \
2305 fail_stack.size = INIT_FAILURE_ALLOC; \
2306 fail_stack.avail = 0; \
2307 } while (0)
2308
2309
2310 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2311
2312 Return 1 if succeeds, and 0 if either ran out of memory
2313 allocating space for it or it was already too large.
2314
2315 REGEX_REALLOCATE requires `destination' be declared. */
2316
2317 #define DOUBLE_FAIL_STACK(fail_stack) \
2318 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2319 ? 0 \
2320 : ((fail_stack).stack = (fail_stack_elt_t *) \
2321 REGEX_REALLOCATE ((fail_stack).stack, \
2322 (fail_stack).size * sizeof (fail_stack_elt_t), \
2323 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2324 \
2325 (fail_stack).stack == NULL \
2326 ? 0 \
2327 : ((fail_stack).size <<= 1, \
2328 1)))
2329
2330
2331 /* Push PATTERN_OP on FAIL_STACK.
2332
2333 Return 1 if was able to do so and 0 if ran out of memory allocating
2334 space to do so. */
2335 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2336 ((FAIL_STACK_FULL () \
2337 && !DOUBLE_FAIL_STACK (fail_stack)) \
2338 ? 0 \
2339 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2340 1))
2341
2342 /* This pushes an item onto the failure stack. Must be a four-byte
2343 value. Assumes the variable `fail_stack'. Probably should only
2344 be called from within `PUSH_FAILURE_POINT'. */
2345 #define PUSH_FAILURE_ITEM(item) \
2346 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2347
2348 /* The complement operation. Assumes `fail_stack' is nonempty. */
2349 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2350
2351 /* Used to omit pushing failure point id's when we're not debugging. */
2352 #ifdef DEBUG
2353 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2354 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2355 #else
2356 #define DEBUG_PUSH(item)
2357 #define DEBUG_POP(item_addr)
2358 #endif
2359
2360
2361 /* Push the information about the state we will need
2362 if we ever fail back to it.
2363
2364 Requires variables fail_stack, regstart, regend, reg_info, and
2365 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2366 declared.
2367
2368 Does `return FAILURE_CODE' if runs out of memory. */
2369
2370 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2371 do { \
2372 char *destination; \
2373 /* Must be int, so when we don't save any registers, the arithmetic \
2374 of 0 + -1 isn't done as unsigned. */ \
2375 unsigned int this_reg; \
2376 \
2377 DEBUG_STATEMENT (failure_id++); \
2378 DEBUG_STATEMENT (nfailure_points_pushed++); \
2379 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2380 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2381 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2382 \
2383 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2384 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2385 \
2386 /* Ensure we have enough space allocated for what we will push. */ \
2387 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2388 { \
2389 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2390 return failure_code; \
2391 \
2392 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2393 (fail_stack).size); \
2394 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2395 } \
2396 \
2397 /* Push the info, starting with the registers. */ \
2398 DEBUG_PRINT1 ("\n"); \
2399 \
2400 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2401 this_reg++) \
2402 { \
2403 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2404 DEBUG_STATEMENT (num_regs_pushed++); \
2405 \
2406 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2407 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2408 \
2409 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2410 PUSH_FAILURE_ITEM (regend[this_reg]); \
2411 \
2412 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2413 DEBUG_PRINT2 (" match_null=%d", \
2414 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2415 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2416 DEBUG_PRINT2 (" matched_something=%d", \
2417 MATCHED_SOMETHING (reg_info[this_reg])); \
2418 DEBUG_PRINT2 (" ever_matched=%d", \
2419 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2420 DEBUG_PRINT1 ("\n"); \
2421 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2422 } \
2423 \
2424 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2425 PUSH_FAILURE_ITEM (lowest_active_reg); \
2426 \
2427 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2428 PUSH_FAILURE_ITEM (highest_active_reg); \
2429 \
2430 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2431 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2432 PUSH_FAILURE_ITEM (pattern_place); \
2433 \
2434 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2435 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2436 size2); \
2437 DEBUG_PRINT1 ("'\n"); \
2438 PUSH_FAILURE_ITEM (string_place); \
2439 \
2440 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2441 DEBUG_PUSH (failure_id); \
2442 } while (0)
2443
2444 /* This is the number of items that are pushed and popped on the stack
2445 for each register. */
2446 #define NUM_REG_ITEMS 3
2447
2448 /* Individual items aside from the registers. */
2449 #ifdef DEBUG
2450 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2451 #else
2452 #define NUM_NONREG_ITEMS 4
2453 #endif
2454
2455 /* We push at most this many items on the stack. */
2456 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2457
2458 /* We actually push this many items. */
2459 #define NUM_FAILURE_ITEMS \
2460 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2461 + NUM_NONREG_ITEMS)
2462
2463 /* How many items can still be added to the stack without overflowing it. */
2464 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2465
2466
2467 /* Pops what PUSH_FAIL_STACK pushes.
2468
2469 We restore into the parameters, all of which should be lvalues:
2470 STR -- the saved data position.
2471 PAT -- the saved pattern position.
2472 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2473 REGSTART, REGEND -- arrays of string positions.
2474 REG_INFO -- array of information about each subexpression.
2475
2476 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2477 `pend', `string1', `size1', `string2', and `size2'. */
2478
2479 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2480 { \
2481 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2482 int this_reg; \
2483 const unsigned char *string_temp; \
2484 \
2485 assert (!FAIL_STACK_EMPTY ()); \
2486 \
2487 /* Remove failure points and point to how many regs pushed. */ \
2488 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2489 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2490 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2491 \
2492 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2493 \
2494 DEBUG_POP (&failure_id); \
2495 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2496 \
2497 /* If the saved string location is NULL, it came from an \
2498 on_failure_keep_string_jump opcode, and we want to throw away the \
2499 saved NULL, thus retaining our current position in the string. */ \
2500 string_temp = POP_FAILURE_ITEM (); \
2501 if (string_temp != NULL) \
2502 str = (const char *) string_temp; \
2503 \
2504 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2505 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2506 DEBUG_PRINT1 ("'\n"); \
2507 \
2508 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2509 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2510 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2511 \
2512 /* Restore register info. */ \
2513 high_reg = (uintptr_t) POP_FAILURE_ITEM (); \
2514 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2515 \
2516 low_reg = (uintptr_t) POP_FAILURE_ITEM (); \
2517 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2518 \
2519 for (this_reg = high_reg; (unsigned) this_reg >= low_reg; this_reg--) \
2520 { \
2521 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2522 \
2523 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2524 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2525 \
2526 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2527 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2528 \
2529 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2530 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2531 } \
2532 \
2533 DEBUG_STATEMENT (nfailure_points_popped++); \
2534 } /* POP_FAILURE_POINT */
2535
2536 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2537 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2538 characters can start a string that matches the pattern. This fastmap
2539 is used by re_search to skip quickly over impossible starting points.
2540
2541 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2542 area as BUFP->fastmap.
2543
2544 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2545 the pattern buffer.
2546
2547 Returns 0 if we succeed, -2 if an internal error. */
2548
2549 int
re_compile_fastmap(bufp)2550 re_compile_fastmap (bufp)
2551 struct re_pattern_buffer *bufp;
2552 {
2553 int j, k;
2554 fail_stack_type fail_stack;
2555 #ifndef REGEX_MALLOC
2556 char *destination;
2557 #endif
2558 /* We don't push any register information onto the failure stack. */
2559 unsigned num_regs = 0;
2560
2561 register char *fastmap = bufp->fastmap;
2562 unsigned char *pattern = bufp->buffer;
2563 unsigned long size = bufp->used;
2564 const unsigned char *p = pattern;
2565 register unsigned char *pend = pattern + size;
2566
2567 /* Assume that each path through the pattern can be null until
2568 proven otherwise. We set this false at the bottom of switch
2569 statement, to which we get only if a particular path doesn't
2570 match the empty string. */
2571 boolean path_can_be_null = true;
2572
2573 /* We aren't doing a `succeed_n' to begin with. */
2574 boolean succeed_n_p = false;
2575
2576 assert (fastmap != NULL && p != NULL);
2577
2578 INIT_FAIL_STACK ();
2579 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2580 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2581 bufp->can_be_null = 0;
2582
2583 while (p != pend || !FAIL_STACK_EMPTY ())
2584 {
2585 if (p == pend)
2586 {
2587 bufp->can_be_null |= path_can_be_null;
2588
2589 /* Reset for next path. */
2590 path_can_be_null = true;
2591
2592 p = fail_stack.stack[--fail_stack.avail];
2593 }
2594
2595 /* We should never be about to go beyond the end of the pattern. */
2596 assert (p < pend);
2597
2598 #ifdef SWITCH_ENUM_BUG
2599 switch ((int) ((re_opcode_t) *p++))
2600 #else
2601 switch ((re_opcode_t) *p++)
2602 #endif
2603 {
2604
2605 /* I guess the idea here is to simply not bother with a fastmap
2606 if a backreference is used, since it's too hard to figure out
2607 the fastmap for the corresponding group. Setting
2608 `can_be_null' stops `re_search_2' from using the fastmap, so
2609 that is all we do. */
2610 case duplicate:
2611 bufp->can_be_null = 1;
2612 return 0;
2613
2614
2615 /* Following are the cases which match a character. These end
2616 with `break'. */
2617
2618 case exactn:
2619 fastmap[p[1]] = 1;
2620 break;
2621
2622
2623 case charset:
2624 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2625 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2626 fastmap[j] = 1;
2627 break;
2628
2629
2630 case charset_not:
2631 /* Chars beyond end of map must be allowed. */
2632 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2633 fastmap[j] = 1;
2634
2635 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2636 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2637 fastmap[j] = 1;
2638 break;
2639
2640
2641 case wordchar:
2642 for (j = 0; j < (1 << BYTEWIDTH); j++)
2643 if (SYNTAX (j) == Sword)
2644 fastmap[j] = 1;
2645 break;
2646
2647
2648 case notwordchar:
2649 for (j = 0; j < (1 << BYTEWIDTH); j++)
2650 if (SYNTAX (j) != Sword)
2651 fastmap[j] = 1;
2652 break;
2653
2654
2655 case anychar:
2656 /* `.' matches anything ... */
2657 for (j = 0; j < (1 << BYTEWIDTH); j++)
2658 fastmap[j] = 1;
2659
2660 /* ... except perhaps newline. */
2661 if (!(bufp->syntax & RE_DOT_NEWLINE))
2662 fastmap['\n'] = 0;
2663
2664 /* Return if we have already set `can_be_null'; if we have,
2665 then the fastmap is irrelevant. Something's wrong here. */
2666 else if (bufp->can_be_null)
2667 return 0;
2668
2669 /* Otherwise, have to check alternative paths. */
2670 break;
2671
2672
2673 #ifdef emacs
2674 case syntaxspec:
2675 k = *p++;
2676 for (j = 0; j < (1 << BYTEWIDTH); j++)
2677 if (SYNTAX (j) == (enum syntaxcode) k)
2678 fastmap[j] = 1;
2679 break;
2680
2681
2682 case notsyntaxspec:
2683 k = *p++;
2684 for (j = 0; j < (1 << BYTEWIDTH); j++)
2685 if (SYNTAX (j) != (enum syntaxcode) k)
2686 fastmap[j] = 1;
2687 break;
2688
2689
2690 /* All cases after this match the empty string. These end with
2691 `continue'. */
2692
2693
2694 case before_dot:
2695 case at_dot:
2696 case after_dot:
2697 continue;
2698 #endif /* not emacs */
2699
2700
2701 case no_op:
2702 case begline:
2703 case endline:
2704 case begbuf:
2705 case endbuf:
2706 case wordbound:
2707 case notwordbound:
2708 case wordbeg:
2709 case wordend:
2710 case push_dummy_failure:
2711 continue;
2712
2713
2714 case jump_n:
2715 case pop_failure_jump:
2716 case maybe_pop_jump:
2717 case jump:
2718 case jump_past_alt:
2719 case dummy_failure_jump:
2720 EXTRACT_NUMBER_AND_INCR (j, p);
2721 p += j;
2722 if (j > 0)
2723 continue;
2724
2725 /* Jump backward implies we just went through the body of a
2726 loop and matched nothing. Opcode jumped to should be
2727 `on_failure_jump' or `succeed_n'. Just treat it like an
2728 ordinary jump. For a * loop, it has pushed its failure
2729 point already; if so, discard that as redundant. */
2730 if ((re_opcode_t) *p != on_failure_jump
2731 && (re_opcode_t) *p != succeed_n)
2732 continue;
2733
2734 p++;
2735 EXTRACT_NUMBER_AND_INCR (j, p);
2736 p += j;
2737
2738 /* If what's on the stack is where we are now, pop it. */
2739 if (!FAIL_STACK_EMPTY ()
2740 && fail_stack.stack[fail_stack.avail - 1] == p)
2741 fail_stack.avail--;
2742
2743 continue;
2744
2745
2746 case on_failure_jump:
2747 case on_failure_keep_string_jump:
2748 handle_on_failure_jump:
2749 EXTRACT_NUMBER_AND_INCR (j, p);
2750
2751 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2752 end of the pattern. We don't want to push such a point,
2753 since when we restore it above, entering the switch will
2754 increment `p' past the end of the pattern. We don't need
2755 to push such a point since we obviously won't find any more
2756 fastmap entries beyond `pend'. Such a pattern can match
2757 the null string, though. */
2758 if (p + j < pend)
2759 {
2760 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2761 return -2;
2762 }
2763 else
2764 bufp->can_be_null = 1;
2765
2766 if (succeed_n_p)
2767 {
2768 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2769 succeed_n_p = false;
2770 }
2771
2772 continue;
2773
2774
2775 case succeed_n:
2776 /* Get to the number of times to succeed. */
2777 p += 2;
2778
2779 /* Increment p past the n for when k != 0. */
2780 EXTRACT_NUMBER_AND_INCR (k, p);
2781 if (k == 0)
2782 {
2783 p -= 4;
2784 succeed_n_p = true; /* Spaghetti code alert. */
2785 goto handle_on_failure_jump;
2786 }
2787 continue;
2788
2789
2790 case set_number_at:
2791 p += 4;
2792 continue;
2793
2794
2795 case start_memory:
2796 case stop_memory:
2797 p += 2;
2798 continue;
2799
2800
2801 default:
2802 abort (); /* We have listed all the cases. */
2803 } /* switch *p++ */
2804
2805 /* Getting here means we have found the possible starting
2806 characters for one path of the pattern -- and that the empty
2807 string does not match. We need not follow this path further.
2808 Instead, look at the next alternative (remembered on the
2809 stack), or quit if no more. The test at the top of the loop
2810 does these things. */
2811 path_can_be_null = false;
2812 p = pend;
2813 } /* while p */
2814
2815 /* Set `can_be_null' for the last path (also the first path, if the
2816 pattern is empty). */
2817 bufp->can_be_null |= path_can_be_null;
2818 return 0;
2819 } /* re_compile_fastmap */
2820
2821 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2822 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2823 this memory for recording register information. STARTS and ENDS
2824 must be allocated using the malloc library routine, and must each
2825 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2826
2827 If NUM_REGS == 0, then subsequent matches should allocate their own
2828 register data.
2829
2830 Unless this function is called, the first search or match using
2831 PATTERN_BUFFER will allocate its own register data, without
2832 freeing the old data. */
2833
2834 void
re_set_registers(bufp,regs,num_regs,starts,ends)2835 re_set_registers (bufp, regs, num_regs, starts, ends)
2836 struct re_pattern_buffer *bufp;
2837 struct re_registers *regs;
2838 unsigned num_regs;
2839 regoff_t *starts, *ends;
2840 {
2841 if (num_regs)
2842 {
2843 bufp->regs_allocated = REGS_REALLOCATE;
2844 regs->num_regs = num_regs;
2845 regs->start = starts;
2846 regs->end = ends;
2847 }
2848 else
2849 {
2850 bufp->regs_allocated = REGS_UNALLOCATED;
2851 regs->num_regs = 0;
2852 regs->start = regs->end = (regoff_t) 0;
2853 }
2854 }
2855
2856 /* Searching routines. */
2857
2858 /* Like re_search_2, below, but only one string is specified, and
2859 doesn't let you say where to stop matching. */
2860
2861 int
re_search(bufp,string,size,startpos,range,regs)2862 re_search (bufp, string, size, startpos, range, regs)
2863 struct re_pattern_buffer *bufp;
2864 const char *string;
2865 int size, startpos, range;
2866 struct re_registers *regs;
2867 {
2868 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2869 regs, size);
2870 }
2871
2872
2873 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2874 virtual concatenation of STRING1 and STRING2, starting first at index
2875 STARTPOS, then at STARTPOS + 1, and so on.
2876
2877 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2878
2879 RANGE is how far to scan while trying to match. RANGE = 0 means try
2880 only at STARTPOS; in general, the last start tried is STARTPOS +
2881 RANGE.
2882
2883 In REGS, return the indices of the virtual concatenation of STRING1
2884 and STRING2 that matched the entire BUFP->buffer and its contained
2885 subexpressions.
2886
2887 Do not consider matching one past the index STOP in the virtual
2888 concatenation of STRING1 and STRING2.
2889
2890 We return either the position in the strings at which the match was
2891 found, -1 if no match, or -2 if error (such as failure
2892 stack overflow). */
2893
2894 int
re_search_2(bufp,string1,size1,string2,size2,startpos,range,regs,stop)2895 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2896 struct re_pattern_buffer *bufp;
2897 const char *string1, *string2;
2898 int size1, size2;
2899 int startpos;
2900 int range;
2901 struct re_registers *regs;
2902 int stop;
2903 {
2904 int val;
2905 register char *fastmap = bufp->fastmap;
2906 register char *translate = bufp->translate;
2907 int total_size = size1 + size2;
2908 int endpos = startpos + range;
2909
2910 /* Check for out-of-range STARTPOS. */
2911 if (startpos < 0 || startpos > total_size)
2912 return -1;
2913
2914 /* Fix up RANGE if it might eventually take us outside
2915 the virtual concatenation of STRING1 and STRING2. */
2916 if (endpos < -1)
2917 range = -1 - startpos;
2918 else if (endpos > total_size)
2919 range = total_size - startpos;
2920
2921 /* If the search isn't to be a backwards one, don't waste time in a
2922 search for a pattern that must be anchored. */
2923 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2924 {
2925 if (startpos > 0)
2926 return -1;
2927 else
2928 range = 1;
2929 }
2930
2931 /* Update the fastmap now if not correct already. */
2932 if (fastmap && !bufp->fastmap_accurate)
2933 if (re_compile_fastmap (bufp) == -2)
2934 return -2;
2935
2936 /* Loop through the string, looking for a place to start matching. */
2937 for (;;)
2938 {
2939 /* If a fastmap is supplied, skip quickly over characters that
2940 cannot be the start of a match. If the pattern can match the
2941 null string, however, we don't need to skip characters; we want
2942 the first null string. */
2943 if (fastmap && startpos < total_size && !bufp->can_be_null)
2944 {
2945 if (range > 0) /* Searching forwards. */
2946 {
2947 register const char *d;
2948 register int lim = 0;
2949 int irange = range;
2950
2951 if (startpos < size1 && startpos + range >= size1)
2952 lim = range - (size1 - startpos);
2953
2954 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2955
2956 /* Written out as an if-else to avoid testing `translate'
2957 inside the loop. */
2958 if (translate)
2959 while (range > lim
2960 && !fastmap[(unsigned char)
2961 translate[(unsigned char) *d++]])
2962 range--;
2963 else
2964 while (range > lim && !fastmap[(unsigned char) *d++])
2965 range--;
2966
2967 startpos += irange - range;
2968 }
2969 else /* Searching backwards. */
2970 {
2971 register char c = (size1 == 0 || startpos >= size1
2972 ? string2[startpos - size1]
2973 : string1[startpos]);
2974
2975 if (!fastmap[(unsigned char) TRANSLATE (c)])
2976 goto advance;
2977 }
2978 }
2979
2980 /* If can't match the null string, and that's all we have left, fail. */
2981 if (range >= 0 && startpos == total_size && fastmap
2982 && !bufp->can_be_null)
2983 return -1;
2984
2985 val = re_match_2 (bufp, string1, size1, string2, size2,
2986 startpos, regs, stop);
2987 if (val >= 0)
2988 return startpos;
2989
2990 if (val == -2)
2991 return -2;
2992
2993 advance:
2994 if (!range)
2995 break;
2996 else if (range > 0)
2997 {
2998 range--;
2999 startpos++;
3000 }
3001 else
3002 {
3003 range++;
3004 startpos--;
3005 }
3006 }
3007 return -1;
3008 } /* re_search_2 */
3009
3010 /* Declarations and macros for re_match_2. */
3011 union register_info_type;
3012 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2, int len, char *translate);
3013 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end, union register_info_type *reg_info),
3014 common_op_match_null_string_p (unsigned char **p, unsigned char *end, union register_info_type *reg_info),
3015 group_match_null_string_p (unsigned char **p, unsigned char *end, union register_info_type *reg_info);
3016
3017 /* Structure for per-register (a.k.a. per-group) information.
3018 This must not be longer than one word, because we push this value
3019 onto the failure stack. Other register information, such as the
3020 starting and ending positions (which are addresses), and the list of
3021 inner groups (which is a bits list) are maintained in separate
3022 variables.
3023
3024 We are making a (strictly speaking) nonportable assumption here: that
3025 the compiler will pack our bit fields into something that fits into
3026 the type of `word', i.e., is something that fits into one item on the
3027 failure stack. */
3028 typedef union register_info_type
3029 {
3030 fail_stack_elt_t word;
3031 struct
3032 {
3033 /* This field is one if this group can match the empty string,
3034 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3035 #define MATCH_NULL_UNSET_VALUE 3
3036 unsigned match_null_string_p : 2;
3037 unsigned is_active : 1;
3038 unsigned matched_something : 1;
3039 unsigned ever_matched_something : 1;
3040 } bits;
3041 } register_info_type;
3042
3043 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3044 #define IS_ACTIVE(R) ((R).bits.is_active)
3045 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3046 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3047
3048
3049 /* Call this when have matched a real character; it sets `matched' flags
3050 for the subexpressions which we are currently inside. Also records
3051 that those subexprs have matched. */
3052 #define SET_REGS_MATCHED() \
3053 do \
3054 { \
3055 unsigned r; \
3056 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3057 { \
3058 MATCHED_SOMETHING (reg_info[r]) \
3059 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3060 = 1; \
3061 } \
3062 } \
3063 while (0)
3064
3065
3066 /* This converts PTR, a pointer into one of the search strings `string1'
3067 and `string2' into an offset from the beginning of that string. */
3068 #define POINTER_TO_OFFSET(ptr) \
3069 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3070
3071 /* Registers are set to a sentinel when they haven't yet matched. */
3072 #define REG_UNSET_VALUE ((char *) -1)
3073 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3074
3075
3076 /* Macros for dealing with the split strings in re_match_2. */
3077
3078 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3079
3080 /* Call before fetching a character with *d. This switches over to
3081 string2 if necessary. */
3082 #define PREFETCH() \
3083 while (d == dend) \
3084 { \
3085 /* End of string2 => fail. */ \
3086 if (dend == end_match_2) \
3087 goto fail; \
3088 /* End of string1 => advance to string2. */ \
3089 d = string2; \
3090 dend = end_match_2; \
3091 }
3092
3093
3094 /* Test if at very beginning or at very end of the virtual concatenation
3095 of `string1' and `string2'. If only one string, it's `string2'. */
3096 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3097 #define AT_STRINGS_END(d) ((d) == end2)
3098
3099
3100 /* Test if D points to a character which is word-constituent. We have
3101 two special cases to check for: if past the end of string1, look at
3102 the first character in string2; and if before the beginning of
3103 string2, look at the last character in string1. */
3104 #define WORDCHAR_P(d) \
3105 (SYNTAX ((d) == end1 ? *string2 \
3106 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3107 == Sword)
3108
3109 /* Test if the character before D and the one at D differ with respect
3110 to being word-constituent. */
3111 #define AT_WORD_BOUNDARY(d) \
3112 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3113 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3114
3115
3116 /* Free everything we malloc. */
3117 #ifdef REGEX_MALLOC
3118 #define FREE_VAR(var) if (var) free (var); var = NULL
3119 #define FREE_VARIABLES() \
3120 do { \
3121 FREE_VAR (fail_stack.stack); \
3122 FREE_VAR (regstart); \
3123 FREE_VAR (regend); \
3124 FREE_VAR (old_regstart); \
3125 FREE_VAR (old_regend); \
3126 FREE_VAR (best_regstart); \
3127 FREE_VAR (best_regend); \
3128 FREE_VAR (reg_info); \
3129 FREE_VAR (reg_dummy); \
3130 FREE_VAR (reg_info_dummy); \
3131 } while (0)
3132 #else /* not REGEX_MALLOC */
3133 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3134 #define FREE_VARIABLES() alloca (0)
3135 #endif /* not REGEX_MALLOC */
3136
3137
3138 /* These values must meet several constraints. They must not be valid
3139 register values; since we have a limit of 255 registers (because
3140 we use only one byte in the pattern for the register number), we can
3141 use numbers larger than 255. They must differ by 1, because of
3142 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3143 be larger than the value for the highest register, so we do not try
3144 to actually save any registers when none are active. */
3145 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3146 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3147
3148 /* Matching routines. */
3149
3150 #ifndef emacs /* Emacs never uses this. */
3151 /* re_match is like re_match_2 except it takes only a single string. */
3152
3153 int
re_match(bufp,string,size,pos,regs)3154 re_match (bufp, string, size, pos, regs)
3155 struct re_pattern_buffer *bufp;
3156 const char *string;
3157 int size, pos;
3158 struct re_registers *regs;
3159 {
3160 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3161 }
3162 #endif /* not emacs */
3163
3164
3165 /* re_match_2 matches the compiled pattern in BUFP against the
3166 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3167 and SIZE2, respectively). We start matching at POS, and stop
3168 matching at STOP.
3169
3170 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3171 store offsets for the substring each group matched in REGS. See the
3172 documentation for exactly how many groups we fill.
3173
3174 We return -1 if no match, -2 if an internal error (such as the
3175 failure stack overflowing). Otherwise, we return the length of the
3176 matched substring. */
3177
3178 int
re_match_2(bufp,string1,size1,string2,size2,pos,regs,stop)3179 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3180 struct re_pattern_buffer *bufp;
3181 const char *string1, *string2;
3182 int size1, size2;
3183 int pos;
3184 struct re_registers *regs;
3185 int stop;
3186 {
3187 /* General temporaries. */
3188 int mcnt;
3189 unsigned char *p1;
3190
3191 /* Just past the end of the corresponding string. */
3192 const char *end1, *end2;
3193
3194 /* Pointers into string1 and string2, just past the last characters in
3195 each to consider matching. */
3196 const char *end_match_1, *end_match_2;
3197
3198 /* Where we are in the data, and the end of the current string. */
3199 const char *d, *dend;
3200
3201 /* Where we are in the pattern, and the end of the pattern. */
3202 unsigned char *p = bufp->buffer;
3203 register unsigned char *pend = p + bufp->used;
3204
3205 /* We use this to map every character in the string. */
3206 char *translate = bufp->translate;
3207
3208 /* Failure point stack. Each place that can handle a failure further
3209 down the line pushes a failure point on this stack. It consists of
3210 restart, regend, and reg_info for all registers corresponding to
3211 the subexpressions we're currently inside, plus the number of such
3212 registers, and, finally, two char *'s. The first char * is where
3213 to resume scanning the pattern; the second one is where to resume
3214 scanning the strings. If the latter is zero, the failure point is
3215 a ``dummy''; if a failure happens and the failure point is a dummy,
3216 it gets discarded and the next next one is tried. */
3217 fail_stack_type fail_stack;
3218 #ifdef DEBUG
3219 static unsigned failure_id = 0;
3220 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3221 #endif
3222
3223 /* We fill all the registers internally, independent of what we
3224 return, for use in backreferences. The number here includes
3225 an element for register zero. */
3226 unsigned num_regs = bufp->re_nsub + 1;
3227
3228 /* The currently active registers. */
3229 uintptr_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3230 uintptr_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3231
3232 /* Information on the contents of registers. These are pointers into
3233 the input strings; they record just what was matched (on this
3234 attempt) by a subexpression part of the pattern, that is, the
3235 regnum-th regstart pointer points to where in the pattern we began
3236 matching and the regnum-th regend points to right after where we
3237 stopped matching the regnum-th subexpression. (The zeroth register
3238 keeps track of what the whole pattern matches.) */
3239 const char **regstart = 0, **regend = 0;
3240
3241 /* If a group that's operated upon by a repetition operator fails to
3242 match anything, then the register for its start will need to be
3243 restored because it will have been set to wherever in the string we
3244 are when we last see its open-group operator. Similarly for a
3245 register's end. */
3246 const char **old_regstart = 0, **old_regend = 0;
3247
3248 /* The is_active field of reg_info helps us keep track of which (possibly
3249 nested) subexpressions we are currently in. The matched_something
3250 field of reg_info[reg_num] helps us tell whether or not we have
3251 matched any of the pattern so far this time through the reg_num-th
3252 subexpression. These two fields get reset each time through any
3253 loop their register is in. */
3254 register_info_type *reg_info = 0;
3255
3256 /* The following record the register info as found in the above
3257 variables when we find a match better than any we've seen before.
3258 This happens as we backtrack through the failure points, which in
3259 turn happens only if we have not yet matched the entire string. */
3260 unsigned best_regs_set = false;
3261 const char **best_regstart = 0, **best_regend = 0;
3262
3263 /* Logically, this is `best_regend[0]'. But we don't want to have to
3264 allocate space for that if we're not allocating space for anything
3265 else (see below). Also, we never need info about register 0 for
3266 any of the other register vectors, and it seems rather a kludge to
3267 treat `best_regend' differently than the rest. So we keep track of
3268 the end of the best match so far in a separate variable. We
3269 initialize this to NULL so that when we backtrack the first time
3270 and need to test it, it's not garbage. */
3271 const char *match_end = NULL;
3272
3273 /* Used when we pop values we don't care about. */
3274 const char **reg_dummy = 0;
3275 register_info_type *reg_info_dummy = 0;
3276
3277 #ifdef DEBUG
3278 /* Counts the total number of registers pushed. */
3279 unsigned num_regs_pushed = 0;
3280 #endif
3281
3282 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3283
3284 INIT_FAIL_STACK ();
3285
3286 /* Do not bother to initialize all the register variables if there are
3287 no groups in the pattern, as it takes a fair amount of time. If
3288 there are groups, we include space for register 0 (the whole
3289 pattern), even though we never use it, since it simplifies the
3290 array indexing. We should fix this. */
3291 if (bufp->re_nsub)
3292 {
3293 regstart = REGEX_TALLOC (num_regs, const char *);
3294 regend = REGEX_TALLOC (num_regs, const char *);
3295 old_regstart = REGEX_TALLOC (num_regs, const char *);
3296 old_regend = REGEX_TALLOC (num_regs, const char *);
3297 best_regstart = REGEX_TALLOC (num_regs, const char *);
3298 best_regend = REGEX_TALLOC (num_regs, const char *);
3299 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3300 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3301 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3302
3303 if (!(regstart && regend && old_regstart && old_regend && reg_info
3304 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3305 {
3306 FREE_VARIABLES ();
3307 return -2;
3308 }
3309 }
3310 #ifdef REGEX_MALLOC
3311 else
3312 {
3313 /* We must initialize all our variables to NULL, so that
3314 `FREE_VARIABLES' doesn't try to free them. */
3315 regstart = regend = old_regstart = old_regend = best_regstart
3316 = best_regend = reg_dummy = NULL;
3317 reg_info = reg_info_dummy = (register_info_type *) NULL;
3318 }
3319 #endif /* REGEX_MALLOC */
3320
3321 /* The starting position is bogus. */
3322 if (pos < 0 || pos > size1 + size2)
3323 {
3324 FREE_VARIABLES ();
3325 return -1;
3326 }
3327
3328 /* Initialize subexpression text positions to -1 to mark ones that no
3329 start_memory/stop_memory has been seen for. Also initialize the
3330 register information struct. */
3331 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
3332 {
3333 regstart[mcnt] = regend[mcnt]
3334 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3335
3336 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3337 IS_ACTIVE (reg_info[mcnt]) = 0;
3338 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3339 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3340 }
3341
3342 /* We move `string1' into `string2' if the latter's empty -- but not if
3343 `string1' is null. */
3344 if (size2 == 0 && string1 != NULL)
3345 {
3346 string2 = string1;
3347 size2 = size1;
3348 string1 = 0;
3349 size1 = 0;
3350 }
3351 end1 = string1 + size1;
3352 end2 = string2 + size2;
3353
3354 /* Compute where to stop matching, within the two strings. */
3355 if (stop <= size1)
3356 {
3357 end_match_1 = string1 + stop;
3358 end_match_2 = string2;
3359 }
3360 else
3361 {
3362 end_match_1 = end1;
3363 end_match_2 = string2 + stop - size1;
3364 }
3365
3366 /* `p' scans through the pattern as `d' scans through the data.
3367 `dend' is the end of the input string that `d' points within. `d'
3368 is advanced into the following input string whenever necessary, but
3369 this happens before fetching; therefore, at the beginning of the
3370 loop, `d' can be pointing at the end of a string, but it cannot
3371 equal `string2'. */
3372 if (size1 > 0 && pos <= size1)
3373 {
3374 d = string1 + pos;
3375 dend = end_match_1;
3376 }
3377 else
3378 {
3379 d = string2 + pos - size1;
3380 dend = end_match_2;
3381 }
3382
3383 DEBUG_PRINT1 ("The compiled pattern is: ");
3384 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3385 DEBUG_PRINT1 ("The string to match is: `");
3386 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3387 DEBUG_PRINT1 ("'\n");
3388
3389 /* This loops over pattern commands. It exits by returning from the
3390 function if the match is complete, or it drops through if the match
3391 fails at this starting point in the input data. */
3392 for (;;)
3393 {
3394 DEBUG_PRINT2 ("\n0x%x: ", p);
3395
3396 if (p == pend)
3397 { /* End of pattern means we might have succeeded. */
3398 DEBUG_PRINT1 ("end of pattern ... ");
3399
3400 /* If we haven't matched the entire string, and we want the
3401 longest match, try backtracking. */
3402 if (d != end_match_2)
3403 {
3404 DEBUG_PRINT1 ("backtracking.\n");
3405
3406 if (!FAIL_STACK_EMPTY ())
3407 { /* More failure points to try. */
3408 boolean same_str_p = (FIRST_STRING_P (match_end)
3409 == MATCHING_IN_FIRST_STRING);
3410
3411 /* If exceeds best match so far, save it. */
3412 if (!best_regs_set
3413 || (same_str_p && d > match_end)
3414 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3415 {
3416 best_regs_set = true;
3417 match_end = d;
3418
3419 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3420
3421 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
3422 {
3423 best_regstart[mcnt] = regstart[mcnt];
3424 best_regend[mcnt] = regend[mcnt];
3425 }
3426 }
3427 goto fail;
3428 }
3429
3430 /* If no failure points, don't restore garbage. */
3431 else if (best_regs_set)
3432 {
3433 restore_best_regs:
3434 /* Restore best match. It may happen that `dend ==
3435 end_match_1' while the restored d is in string2.
3436 For example, the pattern `x.*y.*z' against the
3437 strings `x-' and `y-z-', if the two strings are
3438 not consecutive in memory. */
3439 DEBUG_PRINT1 ("Restoring best registers.\n");
3440
3441 d = match_end;
3442 dend = ((d >= string1 && d <= end1)
3443 ? end_match_1 : end_match_2);
3444
3445 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
3446 {
3447 regstart[mcnt] = best_regstart[mcnt];
3448 regend[mcnt] = best_regend[mcnt];
3449 }
3450 }
3451 } /* d != end_match_2 */
3452
3453 DEBUG_PRINT1 ("Accepting match.\n");
3454
3455 /* If caller wants register contents data back, do it. */
3456 if (regs && !bufp->no_sub)
3457 {
3458 /* Have the register data arrays been allocated? */
3459 if (bufp->regs_allocated == REGS_UNALLOCATED)
3460 { /* No. So allocate them with malloc. We need one
3461 extra element beyond `num_regs' for the `-1' marker
3462 GNU code uses. */
3463 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3464 regs->start = TALLOC (regs->num_regs, regoff_t);
3465 regs->end = TALLOC (regs->num_regs, regoff_t);
3466 if (regs->start == NULL || regs->end == NULL)
3467 return -2;
3468 bufp->regs_allocated = REGS_REALLOCATE;
3469 }
3470 else if (bufp->regs_allocated == REGS_REALLOCATE)
3471 { /* Yes. If we need more elements than were already
3472 allocated, reallocate them. If we need fewer, just
3473 leave it alone. */
3474 if (regs->num_regs < num_regs + 1)
3475 {
3476 regs->num_regs = num_regs + 1;
3477 RETALLOC (regs->start, regs->num_regs, regoff_t);
3478 RETALLOC (regs->end, regs->num_regs, regoff_t);
3479 if (regs->start == NULL || regs->end == NULL)
3480 return -2;
3481 }
3482 }
3483 else
3484 assert (bufp->regs_allocated == REGS_FIXED);
3485
3486 /* Convert the pointer data in `regstart' and `regend' to
3487 indices. Register zero has to be set differently,
3488 since we haven't kept track of any info for it. */
3489 if (regs->num_regs > 0)
3490 {
3491 regs->start[0] = pos;
3492 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3493 : d - string2 + size1);
3494 }
3495
3496 /* Go through the first `min (num_regs, regs->num_regs)'
3497 registers, since that is all we initialized. */
3498 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
3499 mcnt++)
3500 {
3501 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3502 regs->start[mcnt] = regs->end[mcnt] = -1;
3503 else
3504 {
3505 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3506 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3507 }
3508 }
3509
3510 /* If the regs structure we return has more elements than
3511 were in the pattern, set the extra elements to -1. If
3512 we (re)allocated the registers, this is the case,
3513 because we always allocate enough to have at least one
3514 -1 at the end. */
3515 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
3516 regs->start[mcnt] = regs->end[mcnt] = -1;
3517 } /* regs && !bufp->no_sub */
3518
3519 FREE_VARIABLES ();
3520 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3521 nfailure_points_pushed, nfailure_points_popped,
3522 nfailure_points_pushed - nfailure_points_popped);
3523 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3524
3525 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3526 ? string1
3527 : string2 - size1);
3528
3529 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3530
3531 return mcnt;
3532 }
3533
3534 /* Otherwise match next pattern command. */
3535 #ifdef SWITCH_ENUM_BUG
3536 switch ((int) ((re_opcode_t) *p++))
3537 #else
3538 switch ((re_opcode_t) *p++)
3539 #endif
3540 {
3541 /* Ignore these. Used to ignore the n of succeed_n's which
3542 currently have n == 0. */
3543 case no_op:
3544 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3545 break;
3546
3547
3548 /* Match the next n pattern characters exactly. The following
3549 byte in the pattern defines n, and the n bytes after that
3550 are the characters to match. */
3551 case exactn:
3552 mcnt = *p++;
3553 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3554
3555 /* This is written out as an if-else so we don't waste time
3556 testing `translate' inside the loop. */
3557 if (translate)
3558 {
3559 do
3560 {
3561 PREFETCH ();
3562 if (translate[(unsigned char) *d++] != (char) *p++)
3563 goto fail;
3564 }
3565 while (--mcnt);
3566 }
3567 else
3568 {
3569 do
3570 {
3571 PREFETCH ();
3572 if (*d++ != (char) *p++) goto fail;
3573 }
3574 while (--mcnt);
3575 }
3576 SET_REGS_MATCHED ();
3577 break;
3578
3579
3580 /* Match any character except possibly a newline or a null. */
3581 case anychar:
3582 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3583
3584 PREFETCH ();
3585
3586 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3587 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3588 goto fail;
3589
3590 SET_REGS_MATCHED ();
3591 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3592 d++;
3593 break;
3594
3595
3596 case charset:
3597 case charset_not:
3598 {
3599 register unsigned char c;
3600 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3601
3602 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3603
3604 PREFETCH ();
3605 c = TRANSLATE (*d); /* The character to match. */
3606
3607 /* Cast to `unsigned' instead of `unsigned char' in case the
3608 bit list is a full 32 bytes long. */
3609 if (c < (unsigned) (*p * BYTEWIDTH)
3610 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3611 not = !not;
3612
3613 p += 1 + *p;
3614
3615 if (!not) goto fail;
3616
3617 SET_REGS_MATCHED ();
3618 d++;
3619 break;
3620 }
3621
3622
3623 /* The beginning of a group is represented by start_memory.
3624 The arguments are the register number in the next byte, and the
3625 number of groups inner to this one in the next. The text
3626 matched within the group is recorded (in the internal
3627 registers data structure) under the register number. */
3628 case start_memory:
3629 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3630
3631 /* Find out if this group can match the empty string. */
3632 p1 = p; /* To send to group_match_null_string_p. */
3633
3634 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3635 REG_MATCH_NULL_STRING_P (reg_info[*p])
3636 = group_match_null_string_p (&p1, pend, reg_info);
3637
3638 /* Save the position in the string where we were the last time
3639 we were at this open-group operator in case the group is
3640 operated upon by a repetition operator, e.g., with `(a*)*b'
3641 against `ab'; then we want to ignore where we are now in
3642 the string in case this attempt to match fails. */
3643 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3644 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3645 : regstart[*p];
3646 DEBUG_PRINT2 (" old_regstart: %d\n",
3647 POINTER_TO_OFFSET (old_regstart[*p]));
3648
3649 regstart[*p] = d;
3650 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3651
3652 IS_ACTIVE (reg_info[*p]) = 1;
3653 MATCHED_SOMETHING (reg_info[*p]) = 0;
3654
3655 /* This is the new highest active register. */
3656 highest_active_reg = *p;
3657
3658 /* If nothing was active before, this is the new lowest active
3659 register. */
3660 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3661 lowest_active_reg = *p;
3662
3663 /* Move past the register number and inner group count. */
3664 p += 2;
3665 break;
3666
3667
3668 /* The stop_memory opcode represents the end of a group. Its
3669 arguments are the same as start_memory's: the register
3670 number, and the number of inner groups. */
3671 case stop_memory:
3672 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3673
3674 /* We need to save the string position the last time we were at
3675 this close-group operator in case the group is operated
3676 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3677 against `aba'; then we want to ignore where we are now in
3678 the string in case this attempt to match fails. */
3679 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3680 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3681 : regend[*p];
3682 DEBUG_PRINT2 (" old_regend: %d\n",
3683 POINTER_TO_OFFSET (old_regend[*p]));
3684
3685 regend[*p] = d;
3686 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3687
3688 /* This register isn't active anymore. */
3689 IS_ACTIVE (reg_info[*p]) = 0;
3690
3691 /* If this was the only register active, nothing is active
3692 anymore. */
3693 if (lowest_active_reg == highest_active_reg)
3694 {
3695 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3696 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3697 }
3698 else
3699 { /* We must scan for the new highest active register, since
3700 it isn't necessarily one less than now: consider
3701 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3702 new highest active register is 1. */
3703 unsigned char r = *p - 1;
3704 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3705 r--;
3706
3707 /* If we end up at register zero, that means that we saved
3708 the registers as the result of an `on_failure_jump', not
3709 a `start_memory', and we jumped to past the innermost
3710 `stop_memory'. For example, in ((.)*) we save
3711 registers 1 and 2 as a result of the *, but when we pop
3712 back to the second ), we are at the stop_memory 1.
3713 Thus, nothing is active. */
3714 if (r == 0)
3715 {
3716 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3717 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3718 }
3719 else
3720 highest_active_reg = r;
3721 }
3722
3723 /* If just failed to match something this time around with a
3724 group that's operated on by a repetition operator, try to
3725 force exit from the ``loop'', and restore the register
3726 information for this group that we had before trying this
3727 last match. */
3728 if ((!MATCHED_SOMETHING (reg_info[*p])
3729 || (re_opcode_t) p[-3] == start_memory)
3730 && (p + 2) < pend)
3731 {
3732 boolean is_a_jump_n = false;
3733
3734 p1 = p + 2;
3735 mcnt = 0;
3736 switch ((re_opcode_t) *p1++)
3737 {
3738 case jump_n:
3739 is_a_jump_n = true;
3740 case pop_failure_jump:
3741 case maybe_pop_jump:
3742 case jump:
3743 case dummy_failure_jump:
3744 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3745 if (is_a_jump_n)
3746 p1 += 2;
3747 break;
3748
3749 default:
3750 /* do nothing */ ;
3751 }
3752 p1 += mcnt;
3753
3754 /* If the next operation is a jump backwards in the pattern
3755 to an on_failure_jump right before the start_memory
3756 corresponding to this stop_memory, exit from the loop
3757 by forcing a failure after pushing on the stack the
3758 on_failure_jump's jump in the pattern, and d. */
3759 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3760 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3761 {
3762 /* If this group ever matched anything, then restore
3763 what its registers were before trying this last
3764 failed match, e.g., with `(a*)*b' against `ab' for
3765 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3766 against `aba' for regend[3].
3767
3768 Also restore the registers for inner groups for,
3769 e.g., `((a*)(b*))*' against `aba' (register 3 would
3770 otherwise get trashed). */
3771
3772 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3773 {
3774 unsigned r;
3775
3776 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3777
3778 /* Restore this and inner groups' (if any) registers. */
3779 for (r = *p; r < (unsigned) (*p + *(p + 1)); r++)
3780 {
3781 regstart[r] = old_regstart[r];
3782
3783 /* xx why this test? */
3784 if (old_regend[r] >= regstart[r])
3785 regend[r] = old_regend[r];
3786 }
3787 }
3788 p1++;
3789 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3790 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3791
3792 goto fail;
3793 }
3794 }
3795
3796 /* Move past the register number and the inner group count. */
3797 p += 2;
3798 break;
3799
3800
3801 /* \<digit> has been turned into a `duplicate' command which is
3802 followed by the numeric value of <digit> as the register number. */
3803 case duplicate:
3804 {
3805 register const char *d2, *dend2;
3806 int regno = *p++; /* Get which register to match against. */
3807 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3808
3809 /* Can't back reference a group which we've never matched. */
3810 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3811 goto fail;
3812
3813 /* Where in input to try to start matching. */
3814 d2 = regstart[regno];
3815
3816 /* Where to stop matching; if both the place to start and
3817 the place to stop matching are in the same string, then
3818 set to the place to stop, otherwise, for now have to use
3819 the end of the first string. */
3820
3821 dend2 = ((FIRST_STRING_P (regstart[regno])
3822 == FIRST_STRING_P (regend[regno]))
3823 ? regend[regno] : end_match_1);
3824 for (;;)
3825 {
3826 /* If necessary, advance to next segment in register
3827 contents. */
3828 while (d2 == dend2)
3829 {
3830 if (dend2 == end_match_2) break;
3831 if (dend2 == regend[regno]) break;
3832
3833 /* End of string1 => advance to string2. */
3834 d2 = string2;
3835 dend2 = regend[regno];
3836 }
3837 /* At end of register contents => success */
3838 if (d2 == dend2) break;
3839
3840 /* If necessary, advance to next segment in data. */
3841 PREFETCH ();
3842
3843 /* How many characters left in this segment to match. */
3844 mcnt = dend - d;
3845
3846 /* Want how many consecutive characters we can match in
3847 one shot, so, if necessary, adjust the count. */
3848 if ((unsigned) mcnt > (unsigned) (dend2 - d2))
3849 mcnt = dend2 - d2;
3850
3851 /* Compare that many; failure if mismatch, else move
3852 past them. */
3853 if (translate
3854 ? bcmp_translate ((unsigned char *) d,
3855 (unsigned char *) d2, mcnt, translate)
3856 : bcmp (d, d2, mcnt))
3857 goto fail;
3858 d += mcnt, d2 += mcnt;
3859 }
3860 }
3861 break;
3862
3863
3864 /* begline matches the empty string at the beginning of the string
3865 (unless `not_bol' is set in `bufp'), and, if
3866 `newline_anchor' is set, after newlines. */
3867 case begline:
3868 DEBUG_PRINT1 ("EXECUTING begline.\n");
3869
3870 if (AT_STRINGS_BEG (d))
3871 {
3872 if (!bufp->not_bol) break;
3873 }
3874 else if (d[-1] == '\n' && bufp->newline_anchor)
3875 {
3876 break;
3877 }
3878 /* In all other cases, we fail. */
3879 goto fail;
3880
3881
3882 /* endline is the dual of begline. */
3883 case endline:
3884 DEBUG_PRINT1 ("EXECUTING endline.\n");
3885
3886 if (AT_STRINGS_END (d))
3887 {
3888 if (!bufp->not_eol) break;
3889 }
3890
3891 /* We have to ``prefetch'' the next character. */
3892 else if ((d == end1 ? *string2 : *d) == '\n'
3893 && bufp->newline_anchor)
3894 {
3895 break;
3896 }
3897 goto fail;
3898
3899
3900 /* Match at the very beginning of the data. */
3901 case begbuf:
3902 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3903 if (AT_STRINGS_BEG (d))
3904 break;
3905 goto fail;
3906
3907
3908 /* Match at the very end of the data. */
3909 case endbuf:
3910 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3911 if (AT_STRINGS_END (d))
3912 break;
3913 goto fail;
3914
3915
3916 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3917 pushes NULL as the value for the string on the stack. Then
3918 `pop_failure_point' will keep the current value for the
3919 string, instead of restoring it. To see why, consider
3920 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3921 then the . fails against the \n. But the next thing we want
3922 to do is match the \n against the \n; if we restored the
3923 string value, we would be back at the foo.
3924
3925 Because this is used only in specific cases, we don't need to
3926 check all the things that `on_failure_jump' does, to make
3927 sure the right things get saved on the stack. Hence we don't
3928 share its code. The only reason to push anything on the
3929 stack at all is that otherwise we would have to change
3930 `anychar's code to do something besides goto fail in this
3931 case; that seems worse than this. */
3932 case on_failure_keep_string_jump:
3933 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3934
3935 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3936 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3937
3938 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3939 break;
3940
3941
3942 /* Uses of on_failure_jump:
3943
3944 Each alternative starts with an on_failure_jump that points
3945 to the beginning of the next alternative. Each alternative
3946 except the last ends with a jump that in effect jumps past
3947 the rest of the alternatives. (They really jump to the
3948 ending jump of the following alternative, because tensioning
3949 these jumps is a hassle.)
3950
3951 Repeats start with an on_failure_jump that points past both
3952 the repetition text and either the following jump or
3953 pop_failure_jump back to this on_failure_jump. */
3954 case on_failure_jump:
3955 on_failure:
3956 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3957
3958 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3959 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3960
3961 /* If this on_failure_jump comes right before a group (i.e.,
3962 the original * applied to a group), save the information
3963 for that group and all inner ones, so that if we fail back
3964 to this point, the group's information will be correct.
3965 For example, in \(a*\)*\1, we need the preceding group,
3966 and in \(\(a*\)b*\)\2, we need the inner group. */
3967
3968 /* We can't use `p' to check ahead because we push
3969 a failure point to `p + mcnt' after we do this. */
3970 p1 = p;
3971
3972 /* We need to skip no_op's before we look for the
3973 start_memory in case this on_failure_jump is happening as
3974 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3975 against aba. */
3976 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3977 p1++;
3978
3979 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3980 {
3981 /* We have a new highest active register now. This will
3982 get reset at the start_memory we are about to get to,
3983 but we will have saved all the registers relevant to
3984 this repetition op, as described above. */
3985 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3986 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3987 lowest_active_reg = *(p1 + 1);
3988 }
3989
3990 DEBUG_PRINT1 (":\n");
3991 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3992 break;
3993
3994
3995 /* A smart repeat ends with `maybe_pop_jump'.
3996 We change it to either `pop_failure_jump' or `jump'. */
3997 case maybe_pop_jump:
3998 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3999 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4000 {
4001 register unsigned char *p2 = p;
4002
4003 /* Compare the beginning of the repeat with what in the
4004 pattern follows its end. If we can establish that there
4005 is nothing that they would both match, i.e., that we
4006 would have to backtrack because of (as in, e.g., `a*a')
4007 then we can change to pop_failure_jump, because we'll
4008 never have to backtrack.
4009
4010 This is not true in the case of alternatives: in
4011 `(a|ab)*' we do need to backtrack to the `ab' alternative
4012 (e.g., if the string was `ab'). But instead of trying to
4013 detect that here, the alternative has put on a dummy
4014 failure point which is what we will end up popping. */
4015
4016 /* Skip over open/close-group commands. */
4017 while (p2 + 2 < pend
4018 && ((re_opcode_t) *p2 == stop_memory
4019 || (re_opcode_t) *p2 == start_memory))
4020 p2 += 3; /* Skip over args, too. */
4021
4022 /* If we're at the end of the pattern, we can change. */
4023 if (p2 == pend)
4024 {
4025 /* Consider what happens when matching ":\(.*\)"
4026 against ":/". I don't really understand this code
4027 yet. */
4028 p[-3] = (unsigned char) pop_failure_jump;
4029 DEBUG_PRINT1
4030 (" End of pattern: change to `pop_failure_jump'.\n");
4031 }
4032
4033 else if ((re_opcode_t) *p2 == exactn
4034 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4035 {
4036 register unsigned char c
4037 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4038 p1 = p + mcnt;
4039
4040 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4041 to the `maybe_finalize_jump' of this case. Examine what
4042 follows. */
4043 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4044 {
4045 p[-3] = (unsigned char) pop_failure_jump;
4046 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4047 c, p1[5]);
4048 }
4049
4050 else if ((re_opcode_t) p1[3] == charset
4051 || (re_opcode_t) p1[3] == charset_not)
4052 {
4053 int not = (re_opcode_t) p1[3] == charset_not;
4054
4055 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4056 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4057 not = !not;
4058
4059 /* `not' is equal to 1 if c would match, which means
4060 that we can't change to pop_failure_jump. */
4061 if (!not)
4062 {
4063 p[-3] = (unsigned char) pop_failure_jump;
4064 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4065 }
4066 }
4067 }
4068 }
4069 p -= 2; /* Point at relative address again. */
4070 if ((re_opcode_t) p[-1] != pop_failure_jump)
4071 {
4072 p[-1] = (unsigned char) jump;
4073 DEBUG_PRINT1 (" Match => jump.\n");
4074 goto unconditional_jump;
4075 }
4076 /* Note fall through. */
4077
4078
4079 /* The end of a simple repeat has a pop_failure_jump back to
4080 its matching on_failure_jump, where the latter will push a
4081 failure point. The pop_failure_jump takes off failure
4082 points put on by this pop_failure_jump's matching
4083 on_failure_jump; we got through the pattern to here from the
4084 matching on_failure_jump, so didn't fail. */
4085 case pop_failure_jump:
4086 {
4087 /* We need to pass separate storage for the lowest and
4088 highest registers, even though we don't care about the
4089 actual values. Otherwise, we will restore only one
4090 register from the stack, since lowest will == highest in
4091 `pop_failure_point'. */
4092 unsigned long dummy_low_reg, dummy_high_reg;
4093 unsigned char *pdummy;
4094 const char *sdummy;
4095
4096 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4097 POP_FAILURE_POINT (sdummy, pdummy,
4098 dummy_low_reg, dummy_high_reg,
4099 reg_dummy, reg_dummy, reg_info_dummy);
4100 (void)pdummy;
4101 (void)sdummy;
4102 }
4103 /* Note fall through. */
4104
4105
4106 /* Unconditionally jump (without popping any failure points). */
4107 case jump:
4108 unconditional_jump:
4109 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4110 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4111 p += mcnt; /* Do the jump. */
4112 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4113 break;
4114
4115
4116 /* We need this opcode so we can detect where alternatives end
4117 in `group_match_null_string_p' et al. */
4118 case jump_past_alt:
4119 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4120 goto unconditional_jump;
4121
4122
4123 /* Normally, the on_failure_jump pushes a failure point, which
4124 then gets popped at pop_failure_jump. We will end up at
4125 pop_failure_jump, also, and with a pattern of, say, `a+', we
4126 are skipping over the on_failure_jump, so we have to push
4127 something meaningless for pop_failure_jump to pop. */
4128 case dummy_failure_jump:
4129 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4130 /* It doesn't matter what we push for the string here. What
4131 the code at `fail' tests is the value for the pattern. */
4132 PUSH_FAILURE_POINT (0, 0, -2);
4133 goto unconditional_jump;
4134
4135
4136 /* At the end of an alternative, we need to push a dummy failure
4137 point in case we are followed by a `pop_failure_jump', because
4138 we don't want the failure point for the alternative to be
4139 popped. For example, matching `(a|ab)*' against `aab'
4140 requires that we match the `ab' alternative. */
4141 case push_dummy_failure:
4142 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4143 /* See comments just above at `dummy_failure_jump' about the
4144 two zeroes. */
4145 PUSH_FAILURE_POINT (0, 0, -2);
4146 break;
4147
4148 /* Have to succeed matching what follows at least n times.
4149 After that, handle like `on_failure_jump'. */
4150 case succeed_n:
4151 EXTRACT_NUMBER (mcnt, p + 2);
4152 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4153
4154 assert (mcnt >= 0);
4155 /* Originally, this is how many times we HAVE to succeed. */
4156 if (mcnt > 0)
4157 {
4158 mcnt--;
4159 p += 2;
4160 STORE_NUMBER_AND_INCR (p, mcnt);
4161 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4162 }
4163 else if (mcnt == 0)
4164 {
4165 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4166 p[2] = (unsigned char) no_op;
4167 p[3] = (unsigned char) no_op;
4168 goto on_failure;
4169 }
4170 break;
4171
4172 case jump_n:
4173 EXTRACT_NUMBER (mcnt, p + 2);
4174 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4175
4176 /* Originally, this is how many times we CAN jump. */
4177 if (mcnt)
4178 {
4179 mcnt--;
4180 STORE_NUMBER (p + 2, mcnt);
4181 goto unconditional_jump;
4182 }
4183 /* If don't have to jump any more, skip over the rest of command. */
4184 else
4185 p += 4;
4186 break;
4187
4188 case set_number_at:
4189 {
4190 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4191
4192 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4193 p1 = p + mcnt;
4194 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4195 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4196 STORE_NUMBER (p1, mcnt);
4197 break;
4198 }
4199
4200 case wordbound:
4201 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4202 if (AT_WORD_BOUNDARY (d))
4203 break;
4204 goto fail;
4205
4206 case notwordbound:
4207 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4208 if (AT_WORD_BOUNDARY (d))
4209 goto fail;
4210 break;
4211
4212 case wordbeg:
4213 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4214 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4215 break;
4216 goto fail;
4217
4218 case wordend:
4219 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4220 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4221 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4222 break;
4223 goto fail;
4224
4225 #ifdef emacs
4226 #ifdef emacs19
4227 case before_dot:
4228 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4229 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4230 goto fail;
4231 break;
4232
4233 case at_dot:
4234 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4235 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4236 goto fail;
4237 break;
4238
4239 case after_dot:
4240 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4241 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4242 goto fail;
4243 break;
4244 #else /* not emacs19 */
4245 case at_dot:
4246 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4247 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4248 goto fail;
4249 break;
4250 #endif /* not emacs19 */
4251
4252 case syntaxspec:
4253 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4254 mcnt = *p++;
4255 goto matchsyntax;
4256
4257 case wordchar:
4258 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4259 mcnt = (int) Sword;
4260 matchsyntax:
4261 PREFETCH ();
4262 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4263 goto fail;
4264 SET_REGS_MATCHED ();
4265 break;
4266
4267 case notsyntaxspec:
4268 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4269 mcnt = *p++;
4270 goto matchnotsyntax;
4271
4272 case notwordchar:
4273 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4274 mcnt = (int) Sword;
4275 matchnotsyntax:
4276 PREFETCH ();
4277 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4278 goto fail;
4279 SET_REGS_MATCHED ();
4280 break;
4281
4282 #else /* not emacs */
4283 case wordchar:
4284 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4285 PREFETCH ();
4286 if (!WORDCHAR_P (d))
4287 goto fail;
4288 SET_REGS_MATCHED ();
4289 d++;
4290 break;
4291
4292 case notwordchar:
4293 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4294 PREFETCH ();
4295 if (WORDCHAR_P (d))
4296 goto fail;
4297 SET_REGS_MATCHED ();
4298 d++;
4299 break;
4300 #endif /* not emacs */
4301
4302 default:
4303 abort ();
4304 }
4305 continue; /* Successfully executed one pattern command; keep going. */
4306
4307
4308 /* We goto here if a matching operation fails. */
4309 fail:
4310 if (!FAIL_STACK_EMPTY ())
4311 { /* A restart point is known. Restore to that state. */
4312 DEBUG_PRINT1 ("\nFAIL:\n");
4313 POP_FAILURE_POINT (d, p,
4314 lowest_active_reg, highest_active_reg,
4315 regstart, regend, reg_info);
4316
4317 /* If this failure point is a dummy, try the next one. */
4318 if (!p)
4319 goto fail;
4320
4321 /* If we failed to the end of the pattern, don't examine *p. */
4322 assert (p <= pend);
4323 if (p < pend)
4324 {
4325 boolean is_a_jump_n = false;
4326
4327 /* If failed to a backwards jump that's part of a repetition
4328 loop, need to pop this failure point and use the next one. */
4329 switch ((re_opcode_t) *p)
4330 {
4331 case jump_n:
4332 is_a_jump_n = true;
4333 case maybe_pop_jump:
4334 case pop_failure_jump:
4335 case jump:
4336 p1 = p + 1;
4337 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4338 p1 += mcnt;
4339
4340 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4341 || (!is_a_jump_n
4342 && (re_opcode_t) *p1 == on_failure_jump))
4343 goto fail;
4344 break;
4345 default:
4346 /* do nothing */ ;
4347 }
4348 }
4349
4350 if (d >= string1 && d <= end1)
4351 dend = end_match_1;
4352 }
4353 else
4354 break; /* Matching at this starting point really fails. */
4355 } /* for (;;) */
4356
4357 if (best_regs_set)
4358 goto restore_best_regs;
4359
4360 FREE_VARIABLES ();
4361
4362 return -1; /* Failure to match. */
4363 } /* re_match_2 */
4364
4365 /* Subroutine definitions for re_match_2. */
4366
4367
4368 /* We are passed P pointing to a register number after a start_memory.
4369
4370 Return true if the pattern up to the corresponding stop_memory can
4371 match the empty string, and false otherwise.
4372
4373 If we find the matching stop_memory, sets P to point to one past its number.
4374 Otherwise, sets P to an undefined byte less than or equal to END.
4375
4376 We don't handle duplicates properly (yet). */
4377
4378 static boolean
group_match_null_string_p(p,end,reg_info)4379 group_match_null_string_p (p, end, reg_info)
4380 unsigned char **p, *end;
4381 register_info_type *reg_info;
4382 {
4383 int mcnt;
4384 /* Point to after the args to the start_memory. */
4385 unsigned char *p1 = *p + 2;
4386
4387 while (p1 < end)
4388 {
4389 /* Skip over opcodes that can match nothing, and return true or
4390 false, as appropriate, when we get to one that can't, or to the
4391 matching stop_memory. */
4392
4393 switch ((re_opcode_t) *p1)
4394 {
4395 /* Could be either a loop or a series of alternatives. */
4396 case on_failure_jump:
4397 p1++;
4398 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4399
4400 /* If the next operation is not a jump backwards in the
4401 pattern. */
4402
4403 if (mcnt >= 0)
4404 {
4405 /* Go through the on_failure_jumps of the alternatives,
4406 seeing if any of the alternatives cannot match nothing.
4407 The last alternative starts with only a jump,
4408 whereas the rest start with on_failure_jump and end
4409 with a jump, e.g., here is the pattern for `a|b|c':
4410
4411 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4412 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4413 /exactn/1/c
4414
4415 So, we have to first go through the first (n-1)
4416 alternatives and then deal with the last one separately. */
4417
4418
4419 /* Deal with the first (n-1) alternatives, which start
4420 with an on_failure_jump (see above) that jumps to right
4421 past a jump_past_alt. */
4422
4423 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4424 {
4425 /* `mcnt' holds how many bytes long the alternative
4426 is, including the ending `jump_past_alt' and
4427 its number. */
4428
4429 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4430 reg_info))
4431 return false;
4432
4433 /* Move to right after this alternative, including the
4434 jump_past_alt. */
4435 p1 += mcnt;
4436
4437 /* Break if it's the beginning of an n-th alternative
4438 that doesn't begin with an on_failure_jump. */
4439 if ((re_opcode_t) *p1 != on_failure_jump)
4440 break;
4441
4442 /* Still have to check that it's not an n-th
4443 alternative that starts with an on_failure_jump. */
4444 p1++;
4445 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4446 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4447 {
4448 /* Get to the beginning of the n-th alternative. */
4449 p1 -= 3;
4450 break;
4451 }
4452 }
4453
4454 /* Deal with the last alternative: go back and get number
4455 of the `jump_past_alt' just before it. `mcnt' contains
4456 the length of the alternative. */
4457 EXTRACT_NUMBER (mcnt, p1 - 2);
4458
4459 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4460 return false;
4461
4462 p1 += mcnt; /* Get past the n-th alternative. */
4463 } /* if mcnt > 0 */
4464 break;
4465
4466
4467 case stop_memory:
4468 assert (p1[1] == **p);
4469 *p = p1 + 2;
4470 return true;
4471
4472
4473 default:
4474 if (!common_op_match_null_string_p (&p1, end, reg_info))
4475 return false;
4476 }
4477 } /* while p1 < end */
4478
4479 return false;
4480 } /* group_match_null_string_p */
4481
4482
4483 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4484 It expects P to be the first byte of a single alternative and END one
4485 byte past the last. The alternative can contain groups. */
4486
4487 static boolean
alt_match_null_string_p(p,end,reg_info)4488 alt_match_null_string_p (p, end, reg_info)
4489 unsigned char *p, *end;
4490 register_info_type *reg_info;
4491 {
4492 int mcnt;
4493 unsigned char *p1 = p;
4494
4495 while (p1 < end)
4496 {
4497 /* Skip over opcodes that can match nothing, and break when we get
4498 to one that can't. */
4499
4500 switch ((re_opcode_t) *p1)
4501 {
4502 /* It's a loop. */
4503 case on_failure_jump:
4504 p1++;
4505 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4506 p1 += mcnt;
4507 break;
4508
4509 default:
4510 if (!common_op_match_null_string_p (&p1, end, reg_info))
4511 return false;
4512 }
4513 } /* while p1 < end */
4514
4515 return true;
4516 } /* alt_match_null_string_p */
4517
4518
4519 /* Deals with the ops common to group_match_null_string_p and
4520 alt_match_null_string_p.
4521
4522 Sets P to one after the op and its arguments, if any. */
4523
4524 static boolean
common_op_match_null_string_p(p,end,reg_info)4525 common_op_match_null_string_p (p, end, reg_info)
4526 unsigned char **p, *end;
4527 register_info_type *reg_info;
4528 {
4529 int mcnt;
4530 boolean ret;
4531 int reg_no;
4532 unsigned char *p1 = *p;
4533
4534 switch ((re_opcode_t) *p1++)
4535 {
4536 case no_op:
4537 case begline:
4538 case endline:
4539 case begbuf:
4540 case endbuf:
4541 case wordbeg:
4542 case wordend:
4543 case wordbound:
4544 case notwordbound:
4545 #ifdef emacs
4546 case before_dot:
4547 case at_dot:
4548 case after_dot:
4549 #endif
4550 break;
4551
4552 case start_memory:
4553 reg_no = *p1;
4554 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4555 ret = group_match_null_string_p (&p1, end, reg_info);
4556
4557 /* Have to set this here in case we're checking a group which
4558 contains a group and a back reference to it. */
4559
4560 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4561 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4562
4563 if (!ret)
4564 return false;
4565 break;
4566
4567 /* If this is an optimized succeed_n for zero times, make the jump. */
4568 case jump:
4569 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4570 if (mcnt >= 0)
4571 p1 += mcnt;
4572 else
4573 return false;
4574 break;
4575
4576 case succeed_n:
4577 /* Get to the number of times to succeed. */
4578 p1 += 2;
4579 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4580
4581 if (mcnt == 0)
4582 {
4583 p1 -= 4;
4584 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4585 p1 += mcnt;
4586 }
4587 else
4588 return false;
4589 break;
4590
4591 case duplicate:
4592 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4593 return false;
4594 break;
4595
4596 case set_number_at:
4597 p1 += 4;
4598
4599 default:
4600 /* All other opcodes mean we cannot match the empty string. */
4601 return false;
4602 }
4603
4604 *p = p1;
4605 return true;
4606 } /* common_op_match_null_string_p */
4607
4608
4609 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4610 bytes; nonzero otherwise. */
4611
4612 static int
bcmp_translate(s1,s2,len,translate)4613 bcmp_translate (s1, s2, len, translate)
4614 const unsigned char *s1, *s2;
4615 register int len;
4616 char *translate;
4617 {
4618 register const unsigned char *p1 = s1, *p2 = s2;
4619 while (len)
4620 {
4621 if (translate[*p1++] != translate[*p2++]) return 1;
4622 len--;
4623 }
4624 return 0;
4625 }
4626
4627 /* Entry points for GNU code. */
4628
4629 /* re_compile_pattern is the GNU regular expression compiler: it
4630 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4631 Returns 0 if the pattern was valid, otherwise an error string.
4632
4633 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4634 are set in BUFP on entry.
4635
4636 We call regex_compile to do the actual compilation. */
4637
4638 const char *
re_compile_pattern(pattern,length,bufp)4639 re_compile_pattern (pattern, length, bufp)
4640 const char *pattern;
4641 int length;
4642 struct re_pattern_buffer *bufp;
4643 {
4644 reg_errcode_t ret;
4645
4646 /* GNU code is written to assume at least RE_NREGS registers will be set
4647 (and at least one extra will be -1). */
4648 bufp->regs_allocated = REGS_UNALLOCATED;
4649
4650 /* And GNU code determines whether or not to get register information
4651 by passing null for the REGS argument to re_match, etc., not by
4652 setting no_sub. */
4653 bufp->no_sub = 0;
4654
4655 /* Match anchors at newline. */
4656 bufp->newline_anchor = 1;
4657
4658 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4659
4660 return re_error_msg[(int) ret];
4661 }
4662
4663 /* Entry points compatible with 4.2 BSD regex library. We don't define
4664 them if this is an Emacs or POSIX compilation. */
4665
4666 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4667
4668 /* BSD has one and only one pattern buffer. */
4669 static struct re_pattern_buffer re_comp_buf;
4670
4671 const char *
re_comp(s)4672 re_comp (s)
4673 const char *s;
4674 {
4675 reg_errcode_t ret;
4676
4677 if (!s)
4678 {
4679 if (!re_comp_buf.buffer)
4680 return "No previous regular expression";
4681 return 0;
4682 }
4683
4684 if (!re_comp_buf.buffer)
4685 {
4686 re_comp_buf.buffer = (unsigned char *) malloc (200);
4687 if (re_comp_buf.buffer == NULL)
4688 return "Memory exhausted";
4689 re_comp_buf.allocated = 200;
4690
4691 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4692 if (re_comp_buf.fastmap == NULL)
4693 return "Memory exhausted";
4694 }
4695
4696 /* Since `re_exec' always passes NULL for the `regs' argument, we
4697 don't need to initialize the pattern buffer fields which affect it. */
4698
4699 /* Match anchors at newlines. */
4700 re_comp_buf.newline_anchor = 1;
4701
4702 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4703
4704 /* Yes, we're discarding `const' here. */
4705 return (char *) re_error_msg[(int) ret];
4706 }
4707
4708
4709 int
re_exec(s)4710 re_exec (s)
4711 const char *s;
4712 {
4713 const int len = strlen (s);
4714 return
4715 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4716 }
4717 #endif /* not emacs and not _POSIX_SOURCE */
4718
4719 /* POSIX.2 functions. Don't define these for Emacs. */
4720
4721 #ifndef emacs
4722
4723 /* regcomp takes a regular expression as a string and compiles it.
4724
4725 PREG is a regex_t *. We do not expect any fields to be initialized,
4726 since POSIX says we shouldn't. Thus, we set
4727
4728 `buffer' to the compiled pattern;
4729 `used' to the length of the compiled pattern;
4730 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4731 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4732 RE_SYNTAX_POSIX_BASIC;
4733 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4734 `fastmap' and `fastmap_accurate' to zero;
4735 `re_nsub' to the number of subexpressions in PATTERN.
4736
4737 PATTERN is the address of the pattern string.
4738
4739 CFLAGS is a series of bits which affect compilation.
4740
4741 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4742 use POSIX basic syntax.
4743
4744 If REG_NEWLINE is set, then . and [^...] don't match newline.
4745 Also, regexec will try a match beginning after every newline.
4746
4747 If REG_ICASE is set, then we considers upper- and lowercase
4748 versions of letters to be equivalent when matching.
4749
4750 If REG_NOSUB is set, then when PREG is passed to regexec, that
4751 routine will report only success or failure, and nothing about the
4752 registers.
4753
4754 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4755 the return codes and their meanings.) */
4756
4757 int
regcomp(preg,pattern,cflags)4758 regcomp (preg, pattern, cflags)
4759 regex_t *preg;
4760 const char *pattern;
4761 int cflags;
4762 {
4763 reg_errcode_t ret;
4764 unsigned syntax
4765 = (cflags & REG_EXTENDED) ?
4766 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4767
4768 /* regex_compile will allocate the space for the compiled pattern. */
4769 preg->buffer = 0;
4770 preg->allocated = 0;
4771
4772 /* Don't bother to use a fastmap when searching. This simplifies the
4773 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4774 characters after newlines into the fastmap. This way, we just try
4775 every character. */
4776 preg->fastmap = 0;
4777
4778 if (cflags & REG_ICASE)
4779 {
4780 unsigned i;
4781
4782 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4783 if (preg->translate == NULL)
4784 return (int) REG_ESPACE;
4785
4786 /* Map uppercase characters to corresponding lowercase ones. */
4787 for (i = 0; i < CHAR_SET_SIZE; i++)
4788 preg->translate[i] = ISUPPER (i) ? (unsigned) tolower (i) : i;
4789 }
4790 else
4791 preg->translate = NULL;
4792
4793 /* If REG_NEWLINE is set, newlines are treated differently. */
4794 if (cflags & REG_NEWLINE)
4795 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4796 syntax &= ~RE_DOT_NEWLINE;
4797 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4798 /* It also changes the matching behavior. */
4799 preg->newline_anchor = 1;
4800 }
4801 else
4802 preg->newline_anchor = 0;
4803
4804 preg->no_sub = !!(cflags & REG_NOSUB);
4805
4806 /* POSIX says a null character in the pattern terminates it, so we
4807 can use strlen here in compiling the pattern. */
4808 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4809
4810 /* POSIX doesn't distinguish between an unmatched open-group and an
4811 unmatched close-group: both are REG_EPAREN. */
4812 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4813
4814 return (int) ret;
4815 }
4816
4817
4818 /* regexec searches for a given pattern, specified by PREG, in the
4819 string STRING.
4820
4821 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4822 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4823 least NMATCH elements, and we set them to the offsets of the
4824 corresponding matched substrings.
4825
4826 EFLAGS specifies `execution flags' which affect matching: if
4827 REG_NOTBOL is set, then ^ does not match at the beginning of the
4828 string; if REG_NOTEOL is set, then $ does not match at the end.
4829
4830 We return 0 if we find a match and REG_NOMATCH if not. */
4831
4832 int
regexec(preg,string,nmatch,pmatch,eflags)4833 regexec (preg, string, nmatch, pmatch, eflags)
4834 const regex_t *preg;
4835 const char *string;
4836 size_t nmatch;
4837 regmatch_t pmatch[];
4838 int eflags;
4839 {
4840 int ret;
4841 struct re_registers regs;
4842 regex_t private_preg;
4843 int len = strlen (string);
4844 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4845
4846 private_preg = *preg;
4847
4848 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4849 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4850
4851 /* The user has told us exactly how many registers to return
4852 information about, via `nmatch'. We have to pass that on to the
4853 matching routines. */
4854 private_preg.regs_allocated = REGS_FIXED;
4855
4856 if (want_reg_info)
4857 {
4858 regs.num_regs = nmatch;
4859 regs.start = TALLOC (nmatch, regoff_t);
4860 regs.end = TALLOC (nmatch, regoff_t);
4861 if (regs.start == NULL || regs.end == NULL)
4862 return (int) REG_NOMATCH;
4863 }
4864
4865 /* Perform the searching operation. */
4866 ret = re_search (&private_preg, string, len,
4867 /* start: */ 0, /* range: */ len,
4868 want_reg_info ? ®s : (struct re_registers *) 0);
4869
4870 /* Copy the register information to the POSIX structure. */
4871 if (want_reg_info)
4872 {
4873 if (ret >= 0)
4874 {
4875 unsigned r;
4876
4877 for (r = 0; r < nmatch; r++)
4878 {
4879 pmatch[r].rm_so = regs.start[r];
4880 pmatch[r].rm_eo = regs.end[r];
4881 }
4882 }
4883
4884 /* If we needed the temporary register info, free the space now. */
4885 free (regs.start);
4886 free (regs.end);
4887 }
4888
4889 /* We want zero return to mean success, unlike `re_search'. */
4890 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4891 }
4892
4893
4894 /* Returns a message corresponding to an error code, ERRCODE, returned
4895 from either regcomp or regexec. We don't use PREG here. */
4896
4897 size_t
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)4898 regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) {
4899 const char *msg;
4900 size_t msg_size;
4901
4902 if (errcode < 0
4903 || (size_t) errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4904 /* Only error codes returned by the rest of the code should be passed
4905 to this routine. If we are given anything else, or if other regex
4906 code generates an invalid error code, then the program has a bug.
4907 Dump core so we can fix it. */
4908 abort ();
4909
4910 msg = re_error_msg[errcode];
4911
4912 /* POSIX doesn't require that we do anything in this case, but why
4913 not be nice. */
4914 if (! msg)
4915 msg = "Success";
4916
4917 msg_size = strlen (msg) + 1; /* Includes the null. */
4918
4919 if (errbuf_size != 0)
4920 {
4921 if (msg_size > errbuf_size)
4922 {
4923 strncpy (errbuf, msg, errbuf_size - 1);
4924 errbuf[errbuf_size - 1] = 0;
4925 }
4926 else
4927 strcpy (errbuf, msg);
4928 }
4929
4930 return msg_size;
4931 }
4932
4933
4934 /* Free dynamically allocated space used by PREG. */
4935
4936 void
regfree(preg)4937 regfree (preg)
4938 regex_t *preg;
4939 {
4940 if (preg->buffer != NULL)
4941 free (preg->buffer);
4942 preg->buffer = NULL;
4943
4944 preg->allocated = 0;
4945 preg->used = 0;
4946
4947 if (preg->fastmap != NULL)
4948 free (preg->fastmap);
4949 preg->fastmap = NULL;
4950 preg->fastmap_accurate = 0;
4951
4952 if (preg->translate != NULL)
4953 free (preg->translate);
4954 preg->translate = NULL;
4955 }
4956
4957 #endif /* not emacs */
4958
4959 /*
4960 Local variables:
4961 make-backup-files: t
4962 version-control: t
4963 trim-versions-without-asking: nil
4964 End:
4965 */
4966