1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21
22 #ifdef DEBUG
23 #undef DEBUG
24 #endif
25
26 #if defined(_WIN64) && defined(_MSC_VER)
27 /* NOTE:
28
29 to compile this file for Windows AMD64 platform please
30 disable any optimisation of this file.
31
32 otherwise, on any calling of _alloca function, you can get fatal error
33 C1001: INTERNAL COMPILER ERROR */
34 # pragma optimize( "", off )
35 #endif
36
37 /* AIX requires this to be the first thing in the file. */
38 #if defined (_AIX) && !defined (REGEX_MALLOC)
39 #pragma alloca
40 #endif
41
42 #define _GNU_SOURCE
43
44 #include "common.h"
45
46 /* We need this for `regex.h', and perhaps for the Emacs include files. */
47 #include <sys/types.h>
48
49 /* The `emacs' switch turns on certain matching commands
50 that make sense only in Emacs. */
51 #ifdef emacs
52
53 #include "lisp.h"
54 #include "buffer.h"
55 #include "syntax.h"
56
57 /* Emacs uses `NULL' as a predicate. */
58 #undef NULL
59
60 #else /* not emacs */
61
62 /* We used to test for `BSTRING' here, but only GCC and Emacs define
63 `BSTRING', as far as I know, and neither of them use this code. */
64 #if HAVE_STRING_H || STDC_HEADERS
65 # include <string.h>
66 # ifndef bcmp
67 # define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
68 # endif
69 # ifndef bcopy
70 # define bcopy(s, d, n) memcpy ((d), (s), (n))
71 # endif
72 # ifndef bzero
73 # define bzero(s, n) memset ((s), 0, (n))
74 # endif
75 #else
76 # include <strings.h>
77 #endif
78
79 /* Define the syntax stuff for \<, \>, etc. */
80
81 /* This must be non-zero for the wordchar and notwordchar pattern
82 commands in re_match_2. */
83 #ifndef Sword
84 #define Sword 1
85 #endif
86
87 #ifdef SYNTAX_TABLE
88
89 extern char *re_syntax_table;
90
91 #else /* not SYNTAX_TABLE */
92
93 /* How many characters in the character set. */
94 #define CHAR_SET_SIZE 256
95
96 static char re_syntax_table[CHAR_SET_SIZE];
97
98 static void
init_syntax_once()99 init_syntax_once ()
100 {
101 register int c;
102 static int done = 0;
103
104 if (done)
105 return;
106
107 bzero (re_syntax_table, sizeof re_syntax_table);
108
109 for (c = 'a'; c <= 'z'; c++)
110 re_syntax_table[c] = Sword;
111
112 for (c = 'A'; c <= 'Z'; c++)
113 re_syntax_table[c] = Sword;
114
115 for (c = '0'; c <= '9'; c++)
116 re_syntax_table[c] = Sword;
117
118 re_syntax_table['_'] = Sword;
119
120 done = 1;
121 }
122
123 #endif /* not SYNTAX_TABLE */
124
125 #define SYNTAX(c) re_syntax_table[c]
126
127 #endif /* not emacs */
128
129 /* Get the interface, including the syntax bits. */
130 #include "gnuregex.h"
131
132 /* isalpha etc. are used for the character classes. */
133 #include <ctype.h>
134
135 #ifndef isascii
136 #define isascii(c) 1
137 #endif
138
139 #ifdef isblank
140 #define ISBLANK(c) (isascii (c) && isblank (c))
141 #else
142 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
143 #endif
144 #ifdef isgraph
145 #define ISGRAPH(c) (isascii (c) && isgraph (c))
146 #else
147 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
148 #endif
149
150 #define ISPRINT(c) (isascii (c) && isprint (c))
151 #define ISDIGIT(c) (isascii (c) && isdigit (c))
152 #define ISALNUM(c) (isascii (c) && isalnum (c))
153 #define ISALPHA(c) (isascii (c) && isalpha (c))
154 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
155 #define ISLOWER(c) (isascii (c) && islower (c))
156 #define ISPUNCT(c) (isascii (c) && ispunct (c))
157 #define ISSPACE(c) (isascii (c) && isspace (c))
158 #define ISUPPER(c) (isascii (c) && isupper (c))
159 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
160
161 #ifndef NULL
162 #define NULL 0
163 #endif
164
165 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
166 since ours (we hope) works properly with all combinations of
167 machines, compilers, `char' and `unsigned char' argument types.
168 (Per Bothner suggested the basic approach.) */
169 #undef SIGN_EXTEND_CHAR
170 #if __STDC__
171 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
172 #else /* not __STDC__ */
173 /* As in Harbison and Steele. */
174 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
175 #endif
176
177 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
178 use `alloca' instead of `malloc'. This is because using malloc in
179 re_search* or re_match* could cause memory leaks when C-g is used in
180 Emacs; also, malloc is slower and causes storage fragmentation. On
181 the other hand, malloc is more portable, and easier to debug.
182
183 Because we sometimes use alloca, some routines have to be macros,
184 not functions -- `alloca'-allocated space disappears at the end of the
185 function it is called in. */
186
187 #ifdef REGEX_MALLOC
188
189 #define REGEX_ALLOCATE malloc
190 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
191
192 #else /* not REGEX_MALLOC */
193
194 /* Emacs already defines alloca, sometimes. */
195 #ifndef alloca
196
197 /* Make alloca work the best possible way. */
198 #ifdef __GNUC__
199 #define alloca __builtin_alloca
200 #elif HAVE_ALLOCA_H
201 #include <alloca.h>
202 #elif _WINDOWS
203 #define alloca _alloca
204 #elif !defined(_AIX) /* Already did AIX, up at the top. */
205 char *alloca ();
206 #endif /* __GNUC__ */
207 #endif /* not alloca */
208
209 #define REGEX_ALLOCATE alloca
210
211 /* Assumes a `char *destination' variable. */
212 #define REGEX_REALLOCATE(source, osize, nsize) \
213 (destination = (char *) alloca (nsize), \
214 bcopy (source, destination, osize), \
215 destination)
216
217 #endif /* not REGEX_MALLOC */
218
219
220 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
221 `string1' or just past its end. This works if PTR is NULL, which is
222 a good thing. */
223 #define FIRST_STRING_P(ptr) \
224 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
225
226 /* (Re)Allocate N items of type T using malloc, or fail. */
227 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
228 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
229 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
230
231 #define BYTEWIDTH 8 /* In bits. */
232
233 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
234
235 #ifndef MAX
236 #define MAX(a, b) ((a) > (b) ? (a) : (b))
237 #endif
238 #ifndef MIN
239 #define MIN(a, b) ((a) < (b) ? (a) : (b))
240 #endif
241
242 #if defined(boolean)
243 typedef char boolean;
244 #endif
245
246 #define false 0
247 #define true 1
248
249 /* These are the command codes that appear in compiled regular
250 expressions. Some opcodes are followed by argument bytes. A
251 command code can specify any interpretation whatsoever for its
252 arguments. Zero bytes may appear in the compiled regular expression.
253
254 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
255 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
256 `exactn' we use here must also be 1. */
257
258 typedef enum
259 {
260 no_op = 0,
261
262 /* Followed by one byte giving n, then by n literal bytes. */
263 exactn = 1,
264
265 /* Matches any (more or less) character. */
266 anychar,
267
268 /* Matches any one char belonging to specified set. First
269 following byte is number of bitmap bytes. Then come bytes
270 for a bitmap saying which chars are in. Bits in each byte
271 are ordered low-bit-first. A character is in the set if its
272 bit is 1. A character too large to have a bit in the map is
273 automatically not in the set. */
274 charset,
275
276 /* Same parameters as charset, but match any character that is
277 not one of those specified. */
278 charset_not,
279
280 /* Start remembering the text that is matched, for storing in a
281 register. Followed by one byte with the register number, in
282 the range 0 to one less than the pattern buffer's re_nsub
283 field. Then followed by one byte with the number of groups
284 inner to this one. (This last has to be part of the
285 start_memory only because we need it in the on_failure_jump
286 of re_match_2.) */
287 start_memory,
288
289 /* Stop remembering the text that is matched and store it in a
290 memory register. Followed by one byte with the register
291 number, in the range 0 to one less than `re_nsub' in the
292 pattern buffer, and one byte with the number of inner groups,
293 just like `start_memory'. (We need the number of inner
294 groups here because we don't have any easy way of finding the
295 corresponding start_memory when we're at a stop_memory.) */
296 stop_memory,
297
298 /* Match a duplicate of something remembered. Followed by one
299 byte containing the register number. */
300 duplicate,
301
302 /* Fail unless at beginning of line. */
303 begline,
304
305 /* Fail unless at end of line. */
306 endline,
307
308 /* Succeeds if at beginning of buffer (if emacs) or at beginning
309 of string to be matched (if not). */
310 begbuf,
311
312 /* Analogously, for end of buffer/string. */
313 endbuf,
314
315 /* Followed by two byte relative address to which to jump. */
316 jump,
317
318 /* Same as jump, but marks the end of an alternative. */
319 jump_past_alt,
320
321 /* Followed by two-byte relative address of place to resume at
322 in case of failure. */
323 on_failure_jump,
324
325 /* Like on_failure_jump, but pushes a placeholder instead of the
326 current string position when executed. */
327 on_failure_keep_string_jump,
328
329 /* Throw away latest failure point and then jump to following
330 two-byte relative address. */
331 pop_failure_jump,
332
333 /* Change to pop_failure_jump if know won't have to backtrack to
334 match; otherwise change to jump. This is used to jump
335 back to the beginning of a repeat. If what follows this jump
336 clearly won't match what the repeat does, such that we can be
337 sure that there is no use backtracking out of repetitions
338 already matched, then we change it to a pop_failure_jump.
339 Followed by two-byte address. */
340 maybe_pop_jump,
341
342 /* Jump to following two-byte address, and push a dummy failure
343 point. This failure point will be thrown away if an attempt
344 is made to use it for a failure. A `+' construct makes this
345 before the first repeat. Also used as an intermediary kind
346 of jump when compiling an alternative. */
347 dummy_failure_jump,
348
349 /* Push a dummy failure point and continue. Used at the end of
350 alternatives. */
351 push_dummy_failure,
352
353 /* Followed by two-byte relative address and two-byte number n.
354 After matching N times, jump to the address upon failure. */
355 succeed_n,
356
357 /* Followed by two-byte relative address, and two-byte number n.
358 Jump to the address N times, then fail. */
359 jump_n,
360
361 /* Set the following two-byte relative address to the
362 subsequent two-byte number. The address *includes* the two
363 bytes of number. */
364 set_number_at,
365
366 wordchar, /* Matches any word-constituent character. */
367 notwordchar, /* Matches any char that is not a word-constituent. */
368
369 wordbeg, /* Succeeds if at word beginning. */
370 wordend, /* Succeeds if at word end. */
371
372 wordbound, /* Succeeds if at a word boundary. */
373 notwordbound /* Succeeds if not at a word boundary. */
374
375 #ifdef emacs
376 ,before_dot, /* Succeeds if before point. */
377 at_dot, /* Succeeds if at point. */
378 after_dot, /* Succeeds if after point. */
379
380 /* Matches any character whose syntax is specified. Followed by
381 a byte which contains a syntax code, e.g., Sword. */
382 syntaxspec,
383
384 /* Matches any character whose syntax is not that specified. */
385 notsyntaxspec
386 #endif /* emacs */
387 } re_opcode_t;
388
389 /* Common operations on the compiled pattern. */
390
391 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
392
393 #define STORE_NUMBER(destination, number) \
394 do { \
395 (destination)[0] = (number) & 0377; \
396 (destination)[1] = (number) >> 8; \
397 } while (0)
398
399 /* Same as STORE_NUMBER, except increment DESTINATION to
400 the byte after where the number is stored. Therefore, DESTINATION
401 must be an lvalue. */
402
403 #define STORE_NUMBER_AND_INCR(destination, number) \
404 do { \
405 STORE_NUMBER (destination, number); \
406 (destination) += 2; \
407 } while (0)
408
409 /* Put into DESTINATION a number stored in two contiguous bytes starting
410 at SOURCE. */
411
412 #define EXTRACT_NUMBER(destination, source) \
413 do { \
414 (destination) = *(source) & 0377; \
415 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
416 } while (0)
417
418 #ifdef DEBUG
419 static void
extract_number(dest,source)420 extract_number (dest, source)
421 int *dest;
422 unsigned char *source;
423 {
424 int temp = SIGN_EXTEND_CHAR (*(source + 1));
425 *dest = *source & 0377;
426 *dest += temp << 8;
427 }
428
429 #ifndef EXTRACT_MACROS /* To debug the macros. */
430 #undef EXTRACT_NUMBER
431 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
432 #endif /* not EXTRACT_MACROS */
433
434 #endif /* DEBUG */
435
436 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
437 SOURCE must be an lvalue. */
438
439 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
440 do { \
441 EXTRACT_NUMBER (destination, source); \
442 (source) += 2; \
443 } while (0)
444
445 #ifdef DEBUG
446 static void
extract_number_and_incr(destination,source)447 extract_number_and_incr (destination, source)
448 int *destination;
449 unsigned char **source;
450 {
451 extract_number (destination, *source);
452 *source += 2;
453 }
454
455 #ifndef EXTRACT_MACROS
456 #undef EXTRACT_NUMBER_AND_INCR
457 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
458 extract_number_and_incr (&dest, &src)
459 #endif /* not EXTRACT_MACROS */
460
461 #endif /* DEBUG */
462
463 /* If DEBUG is defined, Regex prints many voluminous messages about what
464 it is doing (if the variable `debug' is non-zero). If linked with the
465 main program in `iregex.c', you can enter patterns and strings
466 interactively. And if linked with the main program in `main.c' and
467 the other test files, you can run the already-written tests. */
468
469 #ifdef DEBUG
470
471 /* We use standard I/O for debugging. */
472 #include <stdio.h>
473
474 /* It is useful to test things that ``must'' be true when debugging. */
475 #include <assert.h>
476
477 static int debug = 0;
478
479 #define DEBUG_STATEMENT(e) e
480 #define DEBUG_PRINT1(x) if (debug) printf (x)
481 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
482 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
483 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
484 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
485 if (debug) print_partial_compiled_pattern (s, e)
486 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
487 if (debug) print_double_string (w, s1, sz1, s2, sz2)
488
489
490 extern void printchar ();
491
492 /* Print the fastmap in human-readable form. */
493
494 void
print_fastmap(fastmap)495 print_fastmap (fastmap)
496 char *fastmap;
497 {
498 unsigned was_a_range = 0;
499 unsigned i = 0;
500
501 while (i < (1 << BYTEWIDTH))
502 {
503 if (fastmap[i++])
504 {
505 was_a_range = 0;
506 printchar (i - 1);
507 while (i < (1 << BYTEWIDTH) && fastmap[i])
508 {
509 was_a_range = 1;
510 i++;
511 }
512 if (was_a_range)
513 {
514 printf ("-");
515 printchar (i - 1);
516 }
517 }
518 }
519 putchar ('\n');
520 }
521
522
523 /* Print a compiled pattern string in human-readable form, starting at
524 the START pointer into it and ending just before the pointer END. */
525
526 void
print_partial_compiled_pattern(start,end)527 print_partial_compiled_pattern (start, end)
528 unsigned char *start;
529 unsigned char *end;
530 {
531 int mcnt, mcnt2;
532 unsigned char *p = start;
533 unsigned char *pend = end;
534
535 if (start == NULL)
536 {
537 printf ("(null)\n");
538 return;
539 }
540
541 /* Loop over pattern commands. */
542 while (p < pend)
543 {
544 switch ((re_opcode_t) *p++)
545 {
546 case no_op:
547 printf ("/no_op");
548 break;
549
550 case exactn:
551 mcnt = *p++;
552 printf ("/exactn/%d", mcnt);
553 do
554 {
555 putchar ('/');
556 printchar (*p++);
557 }
558 while (--mcnt);
559 break;
560
561 case start_memory:
562 mcnt = *p++;
563 printf ("/start_memory/%d/%d", mcnt, *p++);
564 break;
565
566 case stop_memory:
567 mcnt = *p++;
568 printf ("/stop_memory/%d/%d", mcnt, *p++);
569 break;
570
571 case duplicate:
572 printf ("/duplicate/%d", *p++);
573 break;
574
575 case anychar:
576 printf ("/anychar");
577 break;
578
579 case charset:
580 case charset_not:
581 {
582 register int c;
583
584 printf ("/charset%s",
585 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
586
587 assert (p + *p < pend);
588
589 for (c = 0; c < *p; c++)
590 {
591 unsigned bit;
592 unsigned char map_byte = p[1 + c];
593
594 putchar ('/');
595
596 for (bit = 0; bit < BYTEWIDTH; bit++)
597 if (map_byte & (1 << bit))
598 printchar (c * BYTEWIDTH + bit);
599 }
600 p += 1 + *p;
601 break;
602 }
603
604 case begline:
605 printf ("/begline");
606 break;
607
608 case endline:
609 printf ("/endline");
610 break;
611
612 case on_failure_jump:
613 extract_number_and_incr (&mcnt, &p);
614 printf ("/on_failure_jump/0/%d", mcnt);
615 break;
616
617 case on_failure_keep_string_jump:
618 extract_number_and_incr (&mcnt, &p);
619 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
620 break;
621
622 case dummy_failure_jump:
623 extract_number_and_incr (&mcnt, &p);
624 printf ("/dummy_failure_jump/0/%d", mcnt);
625 break;
626
627 case push_dummy_failure:
628 printf ("/push_dummy_failure");
629 break;
630
631 case maybe_pop_jump:
632 extract_number_and_incr (&mcnt, &p);
633 printf ("/maybe_pop_jump/0/%d", mcnt);
634 break;
635
636 case pop_failure_jump:
637 extract_number_and_incr (&mcnt, &p);
638 printf ("/pop_failure_jump/0/%d", mcnt);
639 break;
640
641 case jump_past_alt:
642 extract_number_and_incr (&mcnt, &p);
643 printf ("/jump_past_alt/0/%d", mcnt);
644 break;
645
646 case jump:
647 extract_number_and_incr (&mcnt, &p);
648 printf ("/jump/0/%d", mcnt);
649 break;
650
651 case succeed_n:
652 extract_number_and_incr (&mcnt, &p);
653 extract_number_and_incr (&mcnt2, &p);
654 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
655 break;
656
657 case jump_n:
658 extract_number_and_incr (&mcnt, &p);
659 extract_number_and_incr (&mcnt2, &p);
660 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
661 break;
662
663 case set_number_at:
664 extract_number_and_incr (&mcnt, &p);
665 extract_number_and_incr (&mcnt2, &p);
666 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
667 break;
668
669 case wordbound:
670 printf ("/wordbound");
671 break;
672
673 case notwordbound:
674 printf ("/notwordbound");
675 break;
676
677 case wordbeg:
678 printf ("/wordbeg");
679 break;
680
681 case wordend:
682 printf ("/wordend");
683
684 #ifdef emacs
685 case before_dot:
686 printf ("/before_dot");
687 break;
688
689 case at_dot:
690 printf ("/at_dot");
691 break;
692
693 case after_dot:
694 printf ("/after_dot");
695 break;
696
697 case syntaxspec:
698 printf ("/syntaxspec");
699 mcnt = *p++;
700 printf ("/%d", mcnt);
701 break;
702
703 case notsyntaxspec:
704 printf ("/notsyntaxspec");
705 mcnt = *p++;
706 printf ("/%d", mcnt);
707 break;
708 #endif /* emacs */
709
710 case wordchar:
711 printf ("/wordchar");
712 break;
713
714 case notwordchar:
715 printf ("/notwordchar");
716 break;
717
718 case begbuf:
719 printf ("/begbuf");
720 break;
721
722 case endbuf:
723 printf ("/endbuf");
724 break;
725
726 default:
727 printf ("?%d", *(p-1));
728 }
729 }
730 printf ("/\n");
731 }
732
733
734 void
print_compiled_pattern(bufp)735 print_compiled_pattern (bufp)
736 struct re_pattern_buffer *bufp;
737 {
738 unsigned char *buffer = bufp->buffer;
739
740 print_partial_compiled_pattern (buffer, buffer + bufp->used);
741 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
742
743 if (bufp->fastmap_accurate && bufp->fastmap)
744 {
745 printf ("fastmap: ");
746 print_fastmap (bufp->fastmap);
747 }
748
749 printf ("re_nsub: %d\t", bufp->re_nsub);
750 printf ("regs_alloc: %d\t", bufp->regs_allocated);
751 printf ("can_be_null: %d\t", bufp->can_be_null);
752 printf ("newline_anchor: %d\n", bufp->newline_anchor);
753 printf ("no_sub: %d\t", bufp->no_sub);
754 printf ("not_bol: %d\t", bufp->not_bol);
755 printf ("not_eol: %d\t", bufp->not_eol);
756 printf ("syntax: %d\n", bufp->syntax);
757 /* Perhaps we should print the translate table? */
758 }
759
760
761 void
print_double_string(where,string1,size1,string2,size2)762 print_double_string (where, string1, size1, string2, size2)
763 const char *where;
764 const char *string1;
765 const char *string2;
766 int size1;
767 int size2;
768 {
769 unsigned this_char;
770
771 if (where == NULL)
772 printf ("(null)");
773 else
774 {
775 if (FIRST_STRING_P (where))
776 {
777 for (this_char = where - string1; this_char < size1; this_char++)
778 printchar (string1[this_char]);
779
780 where = string2;
781 }
782
783 for (this_char = where - string2; this_char < size2; this_char++)
784 printchar (string2[this_char]);
785 }
786 }
787
788 #else /* not DEBUG */
789
790 #undef assert
791 #define assert(e)
792
793 #define DEBUG_STATEMENT(e)
794 #define DEBUG_PRINT1(x)
795 #define DEBUG_PRINT2(x1, x2)
796 #define DEBUG_PRINT3(x1, x2, x3)
797 #define DEBUG_PRINT4(x1, x2, x3, x4)
798 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
799 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
800
801 #endif /* not DEBUG */
802
803 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
804 also be assigned to arbitrarily: each pattern buffer stores its own
805 syntax, so it can be changed between regex compilations. */
806 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
807
808
809 /* Specify the precise syntax of regexps for compilation. This provides
810 for compatibility for various utilities which historically have
811 different, incompatible syntaxes.
812
813 The argument SYNTAX is a bit mask comprised of the various bits
814 defined in regex.h. We return the old syntax. */
815
816 reg_syntax_t
re_set_syntax(syntax)817 re_set_syntax (syntax)
818 reg_syntax_t syntax;
819 {
820 reg_syntax_t ret = re_syntax_options;
821
822 re_syntax_options = syntax;
823 return ret;
824 }
825
826 /* This table gives an error message for each of the error codes listed
827 in regex.h. Obviously the order here has to be same as there. */
828
829 static const char *re_error_msg[] =
830 { NULL, /* REG_NOERROR */
831 "No match", /* REG_NOMATCH */
832 "Invalid regular expression", /* REG_BADPAT */
833 "Invalid collation character", /* REG_ECOLLATE */
834 "Invalid character class name", /* REG_ECTYPE */
835 "Trailing backslash", /* REG_EESCAPE */
836 "Invalid back reference", /* REG_ESUBREG */
837 "Unmatched [ or [^", /* REG_EBRACK */
838 "Unmatched ( or \\(", /* REG_EPAREN */
839 "Unmatched \\{", /* REG_EBRACE */
840 "Invalid content of \\{\\}", /* REG_BADBR */
841 "Invalid range end", /* REG_ERANGE */
842 "Memory exhausted", /* REG_ESPACE */
843 "Invalid preceding regular expression", /* REG_BADRPT */
844 "Premature end of regular expression", /* REG_EEND */
845 "Regular expression too big", /* REG_ESIZE */
846 "Unmatched ) or \\)", /* REG_ERPAREN */
847 };
848
849 /* Subroutine declarations and macros for regex_compile. */
850
851 static void store_op1 (), store_op2 ();
852 static void insert_op1 (), insert_op2 ();
853 static boolean at_begline_loc_p (), at_endline_loc_p ();
854 static boolean group_in_compile_stack ();
855 static reg_errcode_t compile_range ();
856
857 #define FREE_MEM(var) if (var) free (var); var = NULL
858
859 /* Fetch the next character in the uncompiled pattern---translating it
860 if necessary. Also cast from a signed character in the constant
861 string passed to us by the user to an unsigned char that we can use
862 as an array index (in, e.g., `translate'). */
863 #define PATFETCH(c) \
864 do {if (p == pend) {FREE_MEM (compile_stack.stack); return REG_EEND;} \
865 c = (unsigned char) *p++; \
866 if (translate) c = translate[c]; \
867 } while (0)
868
869 /* Fetch the next character in the uncompiled pattern, with no
870 translation. */
871 #define PATFETCH_RAW(c) \
872 do {if (p == pend) {FREE_MEM (compile_stack.stack); return REG_EEND;} \
873 c = (unsigned char) *p++; \
874 } while (0)
875
876 /* Go backwards one character in the pattern. */
877 #define PATUNFETCH p--
878
879
880 /* If `translate' is non-null, return translate[D], else just D. We
881 cast the subscript to translate because some data is declared as
882 `char *', to avoid warnings when a string constant is passed. But
883 when we use a character as a subscript we must make it unsigned. */
884 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
885
886
887 /* Macros for outputting the compiled pattern into `buffer'. */
888
889 /* If the buffer isn't allocated when it comes in, use this. */
890 #define INIT_BUF_SIZE 32
891
892 /* Make sure we have at least N more bytes of space in buffer. */
893 #define GET_BUFFER_SPACE(n) \
894 while ((unsigned long)(b - bufp->buffer + (n)) > bufp->allocated) \
895 EXTEND_BUFFER ()
896
897 /* Make sure we have one more byte of buffer space and then add C to it. */
898 #define BUF_PUSH(c) \
899 do { \
900 GET_BUFFER_SPACE (1); \
901 *b++ = (unsigned char) (c); \
902 } while (0)
903
904
905 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
906 #define BUF_PUSH_2(c1, c2) \
907 do { \
908 GET_BUFFER_SPACE (2); \
909 *b++ = (unsigned char) (c1); \
910 *b++ = (unsigned char) (c2); \
911 } while (0)
912
913
914 /* As with BUF_PUSH_2, except for three bytes. */
915 #define BUF_PUSH_3(c1, c2, c3) \
916 do { \
917 GET_BUFFER_SPACE (3); \
918 *b++ = (unsigned char) (c1); \
919 *b++ = (unsigned char) (c2); \
920 *b++ = (unsigned char) (c3); \
921 } while (0)
922
923
924 /* Store a jump with opcode OP at LOC to location TO. We store a
925 relative address offset by the three bytes the jump itself occupies. */
926 #define STORE_JUMP(op, loc, to) \
927 store_op1 (op, loc, (to) - (loc) - 3)
928
929 /* Likewise, for a two-argument jump. */
930 #define STORE_JUMP2(op, loc, to, arg) \
931 store_op2 (op, loc, (to) - (loc) - 3, arg)
932
933 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
934 #define INSERT_JUMP(op, loc, to) \
935 insert_op1 (op, loc, (to) - (loc) - 3, b)
936
937 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
938 #define INSERT_JUMP2(op, loc, to, arg) \
939 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
940
941
942 /* This is not an arbitrary limit: the arguments which represent offsets
943 into the pattern are two bytes long. So if 2^16 bytes turns out to
944 be too small, many things would have to change. */
945 #define MAX_BUF_SIZE (1L << 16)
946
947
948 /* Extend the buffer by twice its current size via realloc and
949 reset the pointers that pointed into the old block to point to the
950 correct places in the new one. If extending the buffer results in it
951 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
952 #define EXTEND_BUFFER() \
953 do { \
954 unsigned char *old_buffer = bufp->buffer; \
955 if (bufp->allocated == MAX_BUF_SIZE) \
956 { \
957 FREE_MEM (compile_stack.stack); \
958 return REG_ESIZE; \
959 } \
960 bufp->allocated <<= 1; \
961 if (bufp->allocated > MAX_BUF_SIZE) \
962 bufp->allocated = MAX_BUF_SIZE; \
963 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
964 if (bufp->buffer == NULL) \
965 { \
966 FREE_MEM (compile_stack.stack); \
967 return REG_ESPACE; \
968 } \
969 /* If the buffer moved, move all the pointers into it. */ \
970 if (old_buffer != bufp->buffer) \
971 { \
972 b = (b - old_buffer) + bufp->buffer; \
973 begalt = (begalt - old_buffer) + bufp->buffer; \
974 if (fixup_alt_jump) \
975 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
976 if (laststart) \
977 laststart = (laststart - old_buffer) + bufp->buffer; \
978 if (pending_exact) \
979 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
980 } \
981 } while (0)
982
983
984 /* Since we have one byte reserved for the register number argument to
985 {start,stop}_memory, the maximum number of groups we can report
986 things about is what fits in that byte. */
987 #define MAX_REGNUM 255
988
989 /* But patterns can have more than `MAX_REGNUM' registers. We just
990 ignore the excess. */
991 typedef unsigned regnum_t;
992
993
994 /* Macros for the compile stack. */
995
996 /* Since offsets can go either forwards or backwards, this type needs to
997 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
998 typedef int pattern_offset_t;
999
1000 typedef struct
1001 {
1002 pattern_offset_t begalt_offset;
1003 pattern_offset_t fixup_alt_jump;
1004 pattern_offset_t inner_group_offset;
1005 pattern_offset_t laststart_offset;
1006 regnum_t regnum;
1007 } compile_stack_elt_t;
1008
1009
1010 typedef struct
1011 {
1012 compile_stack_elt_t *stack;
1013 unsigned size;
1014 unsigned avail; /* Offset of next open position. */
1015 } compile_stack_type;
1016
1017
1018 #define INIT_COMPILE_STACK_SIZE 32
1019
1020 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1021 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1022
1023 /* The next available element. */
1024 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1025
1026
1027 /* Set the bit for character C in a list. */
1028 #define SET_LIST_BIT(c) \
1029 (b[((unsigned char) (c)) / BYTEWIDTH] \
1030 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1031
1032
1033 /* Get the next unsigned number in the uncompiled pattern. */
1034 #define GET_UNSIGNED_NUMBER(num) \
1035 { if (p != pend) \
1036 { \
1037 PATFETCH (c); \
1038 while (ISDIGIT (c)) \
1039 { \
1040 if (num < 0) \
1041 num = 0; \
1042 num = num * 10 + c - '0'; \
1043 if (p == pend) \
1044 break; \
1045 PATFETCH (c); \
1046 } \
1047 } \
1048 }
1049
1050 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1051
1052 #define IS_CHAR_CLASS(string) \
1053 (STREQ (string, "alpha") || STREQ (string, "upper") \
1054 || STREQ (string, "lower") || STREQ (string, "digit") \
1055 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1056 || STREQ (string, "space") || STREQ (string, "print") \
1057 || STREQ (string, "punct") || STREQ (string, "graph") \
1058 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1059
1060 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1061 Returns one of error codes defined in `regex.h', or zero for success.
1062
1063 Assumes the `allocated' (and perhaps `buffer') and `translate'
1064 fields are set in BUFP on entry.
1065
1066 If it succeeds, results are put in BUFP (if it returns an error, the
1067 contents of BUFP are undefined):
1068 `buffer' is the compiled pattern;
1069 `syntax' is set to SYNTAX;
1070 `used' is set to the length of the compiled pattern;
1071 `fastmap_accurate' is zero;
1072 `re_nsub' is the number of subexpressions in PATTERN;
1073 `not_bol' and `not_eol' are zero;
1074
1075 The `fastmap' and `newline_anchor' fields are neither
1076 examined nor set. */
1077
1078 static reg_errcode_t
regex_compile(pattern,size,syntax,bufp)1079 regex_compile (pattern, size, syntax, bufp)
1080 const char *pattern;
1081 int size;
1082 reg_syntax_t syntax;
1083 struct re_pattern_buffer *bufp;
1084 {
1085 /* We fetch characters from PATTERN here. Even though PATTERN is
1086 `char *' (i.e., signed), we declare these variables as unsigned, so
1087 they can be reliably used as array indices. */
1088 register unsigned char c, c1;
1089
1090 /* A random tempory spot in PATTERN. */
1091 const char *p1;
1092
1093 /* Points to the end of the buffer, where we should append. */
1094 register unsigned char *b;
1095
1096 /* Keeps track of unclosed groups. */
1097 compile_stack_type compile_stack;
1098
1099 /* Points to the current (ending) position in the pattern. */
1100 const char *p = pattern;
1101 const char *pend = pattern + size;
1102
1103 /* How to translate the characters in the pattern. */
1104 char *translate = bufp->translate;
1105
1106 /* Address of the count-byte of the most recently inserted `exactn'
1107 command. This makes it possible to tell if a new exact-match
1108 character can be added to that command or if the character requires
1109 a new `exactn' command. */
1110 unsigned char *pending_exact = 0;
1111
1112 /* Address of start of the most recently finished expression.
1113 This tells, e.g., postfix * where to find the start of its
1114 operand. Reset at the beginning of groups and alternatives. */
1115 unsigned char *laststart = 0;
1116
1117 /* Address of beginning of regexp, or inside of last group. */
1118 unsigned char *begalt;
1119
1120 /* Place in the uncompiled pattern (i.e., the {) to
1121 which to go back if the interval is invalid. */
1122 const char *beg_interval;
1123
1124 /* Address of the place where a forward jump should go to the end of
1125 the containing expression. Each alternative of an `or' -- except the
1126 last -- ends with a forward jump of this sort. */
1127 unsigned char *fixup_alt_jump = 0;
1128
1129 /* Counts open-groups as they are encountered. Remembered for the
1130 matching close-group on the compile stack, so the same register
1131 number is put in the stop_memory as the start_memory. */
1132 regnum_t regnum = 0;
1133
1134 #ifdef DEBUG
1135 DEBUG_PRINT1 ("\nCompiling pattern: ");
1136 if (debug)
1137 {
1138 unsigned debug_count;
1139
1140 for (debug_count = 0; debug_count < size; debug_count++)
1141 printchar (pattern[debug_count]);
1142 putchar ('\n');
1143 }
1144 #endif /* DEBUG */
1145
1146 /* Initialize the compile stack. */
1147 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1148 if (compile_stack.stack == NULL)
1149 return REG_ESPACE;
1150
1151 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1152 compile_stack.avail = 0;
1153
1154 /* Initialize the pattern buffer. */
1155 bufp->syntax = syntax;
1156 bufp->fastmap_accurate = 0;
1157 bufp->not_bol = bufp->not_eol = 0;
1158
1159 /* Set `used' to zero, so that if we return an error, the pattern
1160 printer (for debugging) will think there's no pattern. We reset it
1161 at the end. */
1162 bufp->used = 0;
1163
1164 /* Always count groups, whether or not bufp->no_sub is set. */
1165 bufp->re_nsub = 0;
1166
1167 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1168 /* Initialize the syntax table. */
1169 init_syntax_once ();
1170 #endif
1171
1172 if (bufp->allocated == 0)
1173 {
1174 if (bufp->buffer)
1175 { /* If zero allocated, but buffer is non-null, try to realloc
1176 enough space. This loses if buffer's address is bogus, but
1177 that is the user's responsibility. */
1178 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1179 }
1180 else
1181 { /* Caller did not allocate a buffer. Do it for them. */
1182 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1183 }
1184 if (!bufp->buffer)
1185 {
1186 FREE_MEM (compile_stack.stack);
1187 return REG_ESPACE;
1188 }
1189 bufp->allocated = INIT_BUF_SIZE;
1190 }
1191
1192 begalt = b = bufp->buffer;
1193
1194 /* Loop through the uncompiled pattern until we're at the end. */
1195 while (p != pend)
1196 {
1197 PATFETCH (c);
1198
1199 switch (c)
1200 {
1201 case '^':
1202 {
1203 if ( /* If at start of pattern, it's an operator. */
1204 p == pattern + 1
1205 /* If context independent, it's an operator. */
1206 || syntax & RE_CONTEXT_INDEP_ANCHORS
1207 /* Otherwise, depends on what's come before. */
1208 || at_begline_loc_p (pattern, p, syntax))
1209 BUF_PUSH (begline);
1210 else
1211 goto normal_char;
1212 }
1213 break;
1214
1215
1216 case '$':
1217 {
1218 if ( /* If at end of pattern, it's an operator. */
1219 p == pend
1220 /* If context independent, it's an operator. */
1221 || syntax & RE_CONTEXT_INDEP_ANCHORS
1222 /* Otherwise, depends on what's next. */
1223 || at_endline_loc_p (p, pend, syntax))
1224 BUF_PUSH (endline);
1225 else
1226 goto normal_char;
1227 }
1228 break;
1229
1230
1231 case '+':
1232 case '?':
1233 if ((syntax & RE_BK_PLUS_QM)
1234 || (syntax & RE_LIMITED_OPS))
1235 goto normal_char;
1236 handle_plus:
1237 case '*':
1238 /* If there is no previous pattern... */
1239 if (!laststart)
1240 {
1241 if (syntax & RE_CONTEXT_INVALID_OPS)
1242 {
1243 FREE_MEM (compile_stack.stack);
1244 return REG_BADRPT;
1245 }
1246 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1247 goto normal_char;
1248 }
1249
1250 {
1251 /* Are we optimizing this jump? */
1252 boolean keep_string_p = false;
1253
1254 /* 1 means zero (many) matches is allowed. */
1255 char zero_times_ok = 0, many_times_ok = 0;
1256
1257 /* If there is a sequence of repetition chars, collapse it
1258 down to just one (the right one). We can't combine
1259 interval operators with these because of, e.g., `a{2}*',
1260 which should only match an even number of `a's. */
1261
1262 for (;;)
1263 {
1264 zero_times_ok |= c != '+';
1265 many_times_ok |= c != '?';
1266
1267 if (p == pend)
1268 break;
1269
1270 PATFETCH (c);
1271
1272 if (c == '*'
1273 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1274 ;
1275
1276 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1277 {
1278 if (p == pend)
1279 {
1280 FREE_MEM (compile_stack.stack);
1281 return REG_EESCAPE;
1282 }
1283
1284 PATFETCH (c1);
1285 if (!(c1 == '+' || c1 == '?'))
1286 {
1287 PATUNFETCH;
1288 PATUNFETCH;
1289 break;
1290 }
1291
1292 c = c1;
1293 }
1294 else
1295 {
1296 PATUNFETCH;
1297 break;
1298 }
1299
1300 /* If we get here, we found another repeat character. */
1301 }
1302
1303 /* Star, etc. applied to an empty pattern is equivalent
1304 to an empty pattern. */
1305 if (!laststart)
1306 break;
1307
1308 /* Now we know whether or not zero matches is allowed
1309 and also whether or not two or more matches is allowed. */
1310 if (many_times_ok)
1311 { /* More than one repetition is allowed, so put in at the
1312 end a backward relative jump from `b' to before the next
1313 jump we're going to put in below (which jumps from
1314 laststart to after this jump).
1315
1316 But if we are at the `*' in the exact sequence `.*\n',
1317 insert an unconditional jump backwards to the .,
1318 instead of the beginning of the loop. This way we only
1319 push a failure point once, instead of every time
1320 through the loop. */
1321 assert (p - 1 > pattern);
1322
1323 /* Allocate the space for the jump. */
1324 GET_BUFFER_SPACE (3);
1325
1326 /* We know we are not at the first character of the pattern,
1327 because laststart was non-zero. And we've already
1328 incremented `p', by the way, to be the character after
1329 the `*'. Do we have to do something analogous here
1330 for null bytes, because of RE_DOT_NOT_NULL? */
1331 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1332 && zero_times_ok
1333 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1334 && !(syntax & RE_DOT_NEWLINE))
1335 { /* We have .*\n. */
1336 STORE_JUMP (jump, b, laststart);
1337 keep_string_p = true;
1338 }
1339 else
1340 /* Anything else. */
1341 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1342
1343 /* We've added more stuff to the buffer. */
1344 b += 3;
1345 }
1346
1347 /* On failure, jump from laststart to b + 3, which will be the
1348 end of the buffer after this jump is inserted. */
1349 GET_BUFFER_SPACE (3);
1350 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1351 : on_failure_jump,
1352 laststart, b + 3);
1353 pending_exact = 0;
1354 b += 3;
1355
1356 if (!zero_times_ok)
1357 {
1358 /* At least one repetition is required, so insert a
1359 `dummy_failure_jump' before the initial
1360 `on_failure_jump' instruction of the loop. This
1361 effects a skip over that instruction the first time
1362 we hit that loop. */
1363 GET_BUFFER_SPACE (3);
1364 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1365 b += 3;
1366 }
1367 }
1368 break;
1369
1370
1371 case '.':
1372 laststart = b;
1373 BUF_PUSH (anychar);
1374 break;
1375
1376
1377 case '[':
1378 {
1379 boolean had_char_class = false;
1380
1381 if (p == pend)
1382 {
1383 FREE_MEM (compile_stack.stack);
1384 return REG_EBRACK;
1385 }
1386 /* Ensure that we have enough space to push a charset: the
1387 opcode, the length count, and the bitset; 34 bytes in all. */
1388 GET_BUFFER_SPACE (34);
1389
1390 laststart = b;
1391
1392 /* We test `*p == '^' twice, instead of using an if
1393 statement, so we only need one BUF_PUSH. */
1394 BUF_PUSH (*p == '^' ? charset_not : charset);
1395 if (*p == '^')
1396 p++;
1397
1398 /* Remember the first position in the bracket expression. */
1399 p1 = p;
1400
1401 /* Push the number of bytes in the bitmap. */
1402 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1403
1404 /* Clear the whole map. */
1405 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1406
1407 /* charset_not matches newline according to a syntax bit. */
1408 if ((re_opcode_t) b[-2] == charset_not
1409 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1410 SET_LIST_BIT ('\n');
1411
1412 /* Read in characters and ranges, setting map bits. */
1413 for (;;)
1414 {
1415 if (p == pend)
1416 {
1417 FREE_MEM (compile_stack.stack);
1418 return REG_EBRACK;
1419 }
1420
1421 PATFETCH (c);
1422
1423 /* \ might escape characters inside [...] and [^...]. */
1424 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1425 {
1426 if (p == pend)
1427 {
1428 FREE_MEM (compile_stack.stack);
1429 return REG_EESCAPE;
1430 }
1431
1432 PATFETCH (c1);
1433 SET_LIST_BIT (c1);
1434 continue;
1435 }
1436
1437 /* Could be the end of the bracket expression. If it's
1438 not (i.e., when the bracket expression is `[]' so
1439 far), the ']' character bit gets set way below. */
1440 if (c == ']' && p != p1 + 1)
1441 break;
1442
1443 /* Look ahead to see if it's a range when the last thing
1444 was a character class. */
1445 if (had_char_class && c == '-' && *p != ']')
1446 {
1447 FREE_MEM (compile_stack.stack);
1448 return REG_ERANGE;
1449 }
1450
1451 /* Look ahead to see if it's a range when the last thing
1452 was a character: if this is a hyphen not at the
1453 beginning or the end of a list, then it's the range
1454 operator. */
1455 if (c == '-'
1456 && !(p - 2 >= pattern && p[-2] == '[')
1457 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1458 && *p != ']')
1459 {
1460 reg_errcode_t ret
1461 = compile_range (&p, pend, translate, syntax, b);
1462 if (ret != REG_NOERROR)
1463 {
1464 FREE_MEM (compile_stack.stack);
1465 return ret;
1466 }
1467 }
1468
1469 else if (p[0] == '-' && p[1] != ']')
1470 { /* This handles ranges made up of characters only. */
1471 reg_errcode_t ret;
1472
1473 /* Move past the `-'. */
1474 PATFETCH (c1);
1475
1476 ret = compile_range (&p, pend, translate, syntax, b);
1477 if (ret != REG_NOERROR)
1478 {
1479 FREE_MEM (compile_stack.stack);
1480 return ret;
1481 }
1482 }
1483
1484 /* See if we're at the beginning of a possible character
1485 class. */
1486
1487 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1488 { /* Leave room for the null. */
1489 char str[CHAR_CLASS_MAX_LENGTH + 1];
1490
1491 PATFETCH (c);
1492 c1 = 0;
1493
1494 /* If pattern is `[[:'. */
1495 if (p == pend)
1496 {
1497 FREE_MEM (compile_stack.stack);
1498 return REG_EBRACK;
1499 }
1500
1501 for (;;)
1502 {
1503 PATFETCH (c);
1504 if (c == ':' || c == ']' || p == pend
1505 || c1 == CHAR_CLASS_MAX_LENGTH)
1506 break;
1507 str[c1++] = c;
1508 }
1509 str[c1] = '\0';
1510
1511 /* If isn't a word bracketed by `[:' and:`]':
1512 undo the ending character, the letters, and leave
1513 the leading `:' and `[' (but set bits for them). */
1514 if (c == ':' && *p == ']')
1515 {
1516 int ch;
1517 boolean is_alnum = STREQ (str, "alnum");
1518 boolean is_alpha = STREQ (str, "alpha");
1519 boolean is_blank = STREQ (str, "blank");
1520 boolean is_cntrl = STREQ (str, "cntrl");
1521 boolean is_digit = STREQ (str, "digit");
1522 boolean is_graph = STREQ (str, "graph");
1523 boolean is_lower = STREQ (str, "lower");
1524 boolean is_print = STREQ (str, "print");
1525 boolean is_punct = STREQ (str, "punct");
1526 boolean is_space = STREQ (str, "space");
1527 boolean is_upper = STREQ (str, "upper");
1528 boolean is_xdigit = STREQ (str, "xdigit");
1529
1530 if (!IS_CHAR_CLASS (str))
1531 {
1532 FREE_MEM (compile_stack.stack);
1533 return REG_ECTYPE;
1534 }
1535
1536 /* Throw away the ] at the end of the character
1537 class. */
1538 PATFETCH (c);
1539
1540 if (p == pend)
1541 {
1542 FREE_MEM (compile_stack.stack);
1543 return REG_EBRACK;
1544 }
1545
1546 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1547 {
1548 if ( (is_alnum && ISALNUM (ch))
1549 || (is_alpha && ISALPHA (ch))
1550 || (is_blank && ISBLANK (ch))
1551 || (is_cntrl && ISCNTRL (ch))
1552 || (is_digit && ISDIGIT (ch))
1553 || (is_graph && ISGRAPH (ch))
1554 || (is_lower && ISLOWER (ch))
1555 || (is_print && ISPRINT (ch))
1556 || (is_punct && ISPUNCT (ch))
1557 || (is_space && ISSPACE (ch))
1558 || (is_upper && ISUPPER (ch))
1559 || (is_xdigit && ISXDIGIT (ch)))
1560 SET_LIST_BIT (ch);
1561 }
1562 had_char_class = true;
1563 }
1564 else
1565 {
1566 c1++;
1567 while (c1--)
1568 PATUNFETCH;
1569 SET_LIST_BIT ('[');
1570 SET_LIST_BIT (':');
1571 had_char_class = false;
1572 }
1573 }
1574 else
1575 {
1576 had_char_class = false;
1577 SET_LIST_BIT (c);
1578 }
1579 }
1580
1581 /* Discard any (non)matching list bytes that are all 0 at the
1582 end of the map. Decrease the map-length byte too. */
1583 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1584 b[-1]--;
1585 b += b[-1];
1586 }
1587 break;
1588
1589
1590 case '(':
1591 if (syntax & RE_NO_BK_PARENS)
1592 goto handle_open;
1593 else
1594 goto normal_char;
1595
1596
1597 case ')':
1598 if (syntax & RE_NO_BK_PARENS)
1599 goto handle_close;
1600 else
1601 goto normal_char;
1602
1603
1604 case '\n':
1605 if (syntax & RE_NEWLINE_ALT)
1606 goto handle_alt;
1607 else
1608 goto normal_char;
1609
1610
1611 case '|':
1612 if (syntax & RE_NO_BK_VBAR)
1613 goto handle_alt;
1614 else
1615 goto normal_char;
1616
1617
1618 case '{':
1619 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1620 goto handle_interval;
1621 else
1622 goto normal_char;
1623
1624
1625 case '\\':
1626 if (p == pend)
1627 {
1628 FREE_MEM (compile_stack.stack);
1629 return REG_EESCAPE;
1630 }
1631
1632 /* Do not translate the character after the \, so that we can
1633 distinguish, e.g., \B from \b, even if we normally would
1634 translate, e.g., B to b. */
1635 PATFETCH_RAW (c);
1636
1637 switch (c)
1638 {
1639 case '(':
1640 if (syntax & RE_NO_BK_PARENS)
1641 goto normal_backslash;
1642
1643 handle_open:
1644 bufp->re_nsub++;
1645 regnum++;
1646
1647 if (COMPILE_STACK_FULL)
1648 {
1649 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1650 compile_stack_elt_t);
1651 if (compile_stack.stack == NULL) return REG_ESPACE;
1652
1653 compile_stack.size <<= 1;
1654 }
1655
1656 /* These are the values to restore when we hit end of this
1657 group. They are all relative offsets, so that if the
1658 whole pattern moves because of realloc, they will still
1659 be valid. */
1660 COMPILE_STACK_TOP.begalt_offset = (pattern_offset_t)(begalt - bufp->buffer);
1661 COMPILE_STACK_TOP.fixup_alt_jump
1662 = (pattern_offset_t)(fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0);
1663 COMPILE_STACK_TOP.laststart_offset = (pattern_offset_t)(b - bufp->buffer);
1664 COMPILE_STACK_TOP.regnum = regnum;
1665
1666 /* We will eventually replace the 0 with the number of
1667 groups inner to this one. But do not push a
1668 start_memory for groups beyond the last one we can
1669 represent in the compiled pattern. */
1670 if (regnum <= MAX_REGNUM)
1671 {
1672 COMPILE_STACK_TOP.inner_group_offset = (pattern_offset_t)(b - bufp->buffer + 2);
1673 BUF_PUSH_3 (start_memory, regnum, 0);
1674 }
1675
1676 compile_stack.avail++;
1677
1678 fixup_alt_jump = 0;
1679 laststart = 0;
1680 begalt = b;
1681 /* If we've reached MAX_REGNUM groups, then this open
1682 won't actually generate any code, so we'll have to
1683 clear pending_exact explicitly. */
1684 pending_exact = 0;
1685 break;
1686
1687
1688 case ')':
1689 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1690
1691 if (COMPILE_STACK_EMPTY)
1692 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1693 goto normal_backslash;
1694 else
1695 {
1696 FREE_MEM (compile_stack.stack);
1697 return REG_ERPAREN;
1698 }
1699
1700 handle_close:
1701 if (fixup_alt_jump)
1702 { /* Push a dummy failure point at the end of the
1703 alternative for a possible future
1704 `pop_failure_jump' to pop. See comments at
1705 `push_dummy_failure' in `re_match_2'. */
1706 BUF_PUSH (push_dummy_failure);
1707
1708 /* We allocated space for this jump when we assigned
1709 to `fixup_alt_jump', in the `handle_alt' case below. */
1710 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1711 }
1712
1713 /* See similar code for backslashed left paren above. */
1714 if (COMPILE_STACK_EMPTY)
1715 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1716 goto normal_char;
1717 else
1718 {
1719 FREE_MEM (compile_stack.stack);
1720 return REG_ERPAREN;
1721 }
1722
1723 /* Since we just checked for an empty stack above, this
1724 ``can't happen''. */
1725 assert (compile_stack.avail != 0);
1726 {
1727 /* We don't just want to restore into `regnum', because
1728 later groups should continue to be numbered higher,
1729 as in `(ab)c(de)' -- the second group is #2. */
1730 regnum_t this_group_regnum;
1731
1732 compile_stack.avail--;
1733 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1734 fixup_alt_jump
1735 = COMPILE_STACK_TOP.fixup_alt_jump
1736 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1737 : 0;
1738 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1739 this_group_regnum = COMPILE_STACK_TOP.regnum;
1740 /* If we've reached MAX_REGNUM groups, then this open
1741 won't actually generate any code, so we'll have to
1742 clear pending_exact explicitly. */
1743 pending_exact = 0;
1744
1745 /* We're at the end of the group, so now we know how many
1746 groups were inside this one. */
1747 if (this_group_regnum <= MAX_REGNUM)
1748 {
1749 unsigned char *inner_group_loc
1750 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1751
1752 *inner_group_loc = regnum - this_group_regnum;
1753 BUF_PUSH_3 (stop_memory, this_group_regnum,
1754 regnum - this_group_regnum);
1755 }
1756 }
1757 break;
1758
1759
1760 case '|': /* `\|'. */
1761 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1762 goto normal_backslash;
1763 handle_alt:
1764 if (syntax & RE_LIMITED_OPS)
1765 goto normal_char;
1766
1767 /* Insert before the previous alternative a jump which
1768 jumps to this alternative if the former fails. */
1769 GET_BUFFER_SPACE (3);
1770 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1771 pending_exact = 0;
1772 b += 3;
1773
1774 /* The alternative before this one has a jump after it
1775 which gets executed if it gets matched. Adjust that
1776 jump so it will jump to this alternative's analogous
1777 jump (put in below, which in turn will jump to the next
1778 (if any) alternative's such jump, etc.). The last such
1779 jump jumps to the correct final destination. A picture:
1780 _____ _____
1781 | | | |
1782 | v | v
1783 a | b | c
1784
1785 If we are at `b', then fixup_alt_jump right now points to a
1786 three-byte space after `a'. We'll put in the jump, set
1787 fixup_alt_jump to right after `b', and leave behind three
1788 bytes which we'll fill in when we get to after `c'. */
1789
1790 if (fixup_alt_jump)
1791 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1792
1793 /* Mark and leave space for a jump after this alternative,
1794 to be filled in later either by next alternative or
1795 when know we're at the end of a series of alternatives. */
1796 fixup_alt_jump = b;
1797 GET_BUFFER_SPACE (3);
1798 b += 3;
1799
1800 laststart = 0;
1801 begalt = b;
1802 break;
1803
1804
1805 case '{':
1806 /* If \{ is a literal. */
1807 if (!(syntax & RE_INTERVALS)
1808 /* If we're at `\{' and it's not the open-interval
1809 operator. */
1810 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1811 || (p - 2 == pattern && p == pend))
1812 goto normal_backslash;
1813
1814 handle_interval:
1815 {
1816 /* If got here, then the syntax allows intervals. */
1817
1818 /* At least (most) this many matches must be made. */
1819 int lower_bound = -1, upper_bound = -1;
1820
1821 beg_interval = p - 1;
1822
1823 if (p == pend)
1824 {
1825 if (syntax & RE_NO_BK_BRACES)
1826 goto unfetch_interval;
1827 else
1828 {
1829 FREE_MEM (compile_stack.stack);
1830 return REG_EBRACE;
1831 }
1832 }
1833
1834 GET_UNSIGNED_NUMBER (lower_bound);
1835
1836 if (c == ',')
1837 {
1838 GET_UNSIGNED_NUMBER (upper_bound);
1839 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1840 }
1841 else
1842 /* Interval such as `{1}' => match exactly once. */
1843 upper_bound = lower_bound;
1844
1845 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1846 || lower_bound > upper_bound)
1847 {
1848 if (syntax & RE_NO_BK_BRACES)
1849 goto unfetch_interval;
1850 else
1851 {
1852 FREE_MEM (compile_stack.stack);
1853 return REG_BADBR;
1854 }
1855 }
1856
1857 if (!(syntax & RE_NO_BK_BRACES))
1858 {
1859 if (c != '\\')
1860 {
1861 FREE_MEM (compile_stack.stack);
1862 return REG_EBRACE;
1863 }
1864 PATFETCH (c);
1865 }
1866
1867 if (c != '}')
1868 {
1869 if (syntax & RE_NO_BK_BRACES)
1870 goto unfetch_interval;
1871 else
1872 {
1873 FREE_MEM (compile_stack.stack);
1874 return REG_BADBR;
1875 }
1876 }
1877
1878 /* We just parsed a valid interval. */
1879
1880 /* If it's invalid to have no preceding re. */
1881 if (!laststart)
1882 {
1883 if (syntax & RE_CONTEXT_INVALID_OPS)
1884 {
1885 FREE_MEM (compile_stack.stack);
1886 return REG_BADRPT;
1887 }
1888 else if (syntax & RE_CONTEXT_INDEP_OPS)
1889 laststart = b;
1890 else
1891 goto unfetch_interval;
1892 }
1893
1894 /* If the upper bound is zero, don't want to succeed at
1895 all; jump from `laststart' to `b + 3', which will be
1896 the end of the buffer after we insert the jump. */
1897 if (upper_bound == 0)
1898 {
1899 GET_BUFFER_SPACE (3);
1900 INSERT_JUMP (jump, laststart, b + 3);
1901 b += 3;
1902 }
1903
1904 /* Otherwise, we have a nontrivial interval. When
1905 we're all done, the pattern will look like:
1906 set_number_at <jump count> <upper bound>
1907 set_number_at <succeed_n count> <lower bound>
1908 succeed_n <after jump addr> <succed_n count>
1909 <body of loop>
1910 jump_n <succeed_n addr> <jump count>
1911 (The upper bound and `jump_n' are omitted if
1912 `upper_bound' is 1, though.) */
1913 else
1914 { /* If the upper bound is > 1, we need to insert
1915 more at the end of the loop. */
1916 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1917
1918 GET_BUFFER_SPACE (nbytes);
1919
1920 /* Initialize lower bound of the `succeed_n', even
1921 though it will be set during matching by its
1922 attendant `set_number_at' (inserted next),
1923 because `re_compile_fastmap' needs to know.
1924 Jump to the `jump_n' we might insert below. */
1925 INSERT_JUMP2 (succeed_n, laststart,
1926 b + 5 + (upper_bound > 1) * 5,
1927 lower_bound);
1928 b += 5;
1929
1930 /* Code to initialize the lower bound. Insert
1931 before the `succeed_n'. The `5' is the last two
1932 bytes of this `set_number_at', plus 3 bytes of
1933 the following `succeed_n'. */
1934 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1935 b += 5;
1936
1937 if (upper_bound > 1)
1938 { /* More than one repetition is allowed, so
1939 append a backward jump to the `succeed_n'
1940 that starts this interval.
1941
1942 When we've reached this during matching,
1943 we'll have matched the interval once, so
1944 jump back only `upper_bound - 1' times. */
1945 STORE_JUMP2 (jump_n, b, laststart + 5,
1946 upper_bound - 1);
1947 b += 5;
1948
1949 /* The location we want to set is the second
1950 parameter of the `jump_n'; that is `b-2' as
1951 an absolute address. `laststart' will be
1952 the `set_number_at' we're about to insert;
1953 `laststart+3' the number to set, the source
1954 for the relative address. But we are
1955 inserting into the middle of the pattern --
1956 so everything is getting moved up by 5.
1957 Conclusion: (b - 2) - (laststart + 3) + 5,
1958 i.e., b - laststart.
1959
1960 We insert this at the beginning of the loop
1961 so that if we fail during matching, we'll
1962 reinitialize the bounds. */
1963 insert_op2 (set_number_at, laststart, b - laststart,
1964 upper_bound - 1, b);
1965 b += 5;
1966 }
1967 }
1968 pending_exact = 0;
1969 beg_interval = NULL;
1970 }
1971 break;
1972
1973 unfetch_interval:
1974 /* If an invalid interval, match the characters as literals. */
1975 assert (beg_interval);
1976 p = beg_interval;
1977 beg_interval = NULL;
1978
1979 /* normal_char and normal_backslash need `c'. */
1980 PATFETCH (c);
1981
1982 if (!(syntax & RE_NO_BK_BRACES))
1983 {
1984 if (p > pattern && p[-1] == '\\')
1985 goto normal_backslash;
1986 }
1987 goto normal_char;
1988
1989 #ifdef emacs
1990 /* There is no way to specify the before_dot and after_dot
1991 operators. rms says this is ok. --karl */
1992 case '=':
1993 BUF_PUSH (at_dot);
1994 break;
1995
1996 case 's':
1997 laststart = b;
1998 PATFETCH (c);
1999 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2000 break;
2001
2002 case 'S':
2003 laststart = b;
2004 PATFETCH (c);
2005 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2006 break;
2007 #endif /* emacs */
2008
2009
2010 case 'w':
2011 laststart = b;
2012 BUF_PUSH (wordchar);
2013 break;
2014
2015
2016 case 'W':
2017 laststart = b;
2018 BUF_PUSH (notwordchar);
2019 break;
2020
2021
2022 case '<':
2023 BUF_PUSH (wordbeg);
2024 break;
2025
2026 case '>':
2027 BUF_PUSH (wordend);
2028 break;
2029
2030 case 'b':
2031 BUF_PUSH (wordbound);
2032 break;
2033
2034 case 'B':
2035 BUF_PUSH (notwordbound);
2036 break;
2037
2038 case '`':
2039 BUF_PUSH (begbuf);
2040 break;
2041
2042 case '\'':
2043 BUF_PUSH (endbuf);
2044 break;
2045
2046 case '1': case '2': case '3': case '4': case '5':
2047 case '6': case '7': case '8': case '9':
2048 if (syntax & RE_NO_BK_REFS)
2049 goto normal_char;
2050
2051 c1 = c - '0';
2052
2053 if (c1 > regnum)
2054 {
2055 FREE_MEM (compile_stack.stack);
2056 return REG_ESUBREG;
2057 }
2058
2059 /* Can't back reference to a subexpression if inside of it. */
2060 if (group_in_compile_stack (compile_stack, c1))
2061 goto normal_char;
2062
2063 laststart = b;
2064 BUF_PUSH_2 (duplicate, c1);
2065 break;
2066
2067
2068 case '+':
2069 case '?':
2070 if (syntax & RE_BK_PLUS_QM)
2071 goto handle_plus;
2072 else
2073 goto normal_backslash;
2074
2075 default:
2076 normal_backslash:
2077 /* You might think it would be useful for \ to mean
2078 not to translate; but if we don't translate it
2079 it will never match anything. */
2080 c = TRANSLATE (c);
2081 goto normal_char;
2082 }
2083 break;
2084
2085
2086 default:
2087 /* Expects the character in `c'. */
2088 normal_char:
2089 /* If no exactn currently being built. */
2090 if (!pending_exact
2091
2092 /* If last exactn not at current position. */
2093 || pending_exact + *pending_exact + 1 != b
2094
2095 /* We have only one byte following the exactn for the count. */
2096 || *pending_exact == (1 << BYTEWIDTH) - 1
2097
2098 /* If followed by a repetition operator. */
2099 || *p == '*' || *p == '^'
2100 || ((syntax & RE_BK_PLUS_QM)
2101 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2102 : (*p == '+' || *p == '?'))
2103 || ((syntax & RE_INTERVALS)
2104 && ((syntax & RE_NO_BK_BRACES)
2105 ? *p == '{'
2106 : (p[0] == '\\' && p[1] == '{'))))
2107 {
2108 /* Start building a new exactn. */
2109
2110 laststart = b;
2111
2112 BUF_PUSH_2 (exactn, 0);
2113 pending_exact = b - 1;
2114 }
2115
2116 BUF_PUSH (c);
2117 (*pending_exact)++;
2118 break;
2119 } /* switch (c) */
2120 } /* while p != pend */
2121
2122
2123 /* Through the pattern now. */
2124
2125 if (fixup_alt_jump)
2126 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2127
2128 if (!COMPILE_STACK_EMPTY)
2129 {
2130 FREE_MEM (compile_stack.stack);
2131 return REG_EPAREN;
2132 }
2133
2134 free (compile_stack.stack);
2135
2136 /* We have succeeded; set the length of the buffer. */
2137 bufp->used = (unsigned long)(b - bufp->buffer);
2138
2139 #ifdef DEBUG
2140 if (debug)
2141 {
2142 DEBUG_PRINT1 ("\nCompiled pattern: ");
2143 print_compiled_pattern (bufp);
2144 }
2145 #endif /* DEBUG */
2146
2147 return REG_NOERROR;
2148 } /* regex_compile */
2149
2150 /* Subroutines for `regex_compile'. */
2151
2152 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2153
2154 static void
store_op1(op,loc,arg)2155 store_op1 (op, loc, arg)
2156 re_opcode_t op;
2157 unsigned char *loc;
2158 int arg;
2159 {
2160 *loc = (unsigned char) op;
2161 STORE_NUMBER (loc + 1, arg);
2162 }
2163
2164
2165 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2166
2167 static void
store_op2(op,loc,arg1,arg2)2168 store_op2 (op, loc, arg1, arg2)
2169 re_opcode_t op;
2170 unsigned char *loc;
2171 int arg1, arg2;
2172 {
2173 *loc = (unsigned char) op;
2174 STORE_NUMBER (loc + 1, arg1);
2175 STORE_NUMBER (loc + 3, arg2);
2176 }
2177
2178
2179 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2180 for OP followed by two-byte integer parameter ARG. */
2181
2182 static void
insert_op1(op,loc,arg,end)2183 insert_op1 (op, loc, arg, end)
2184 re_opcode_t op;
2185 unsigned char *loc;
2186 int arg;
2187 unsigned char *end;
2188 {
2189 register unsigned char *pfrom = end;
2190 register unsigned char *pto = end + 3;
2191
2192 while (pfrom != loc)
2193 *--pto = *--pfrom;
2194
2195 store_op1 (op, loc, arg);
2196 }
2197
2198
2199 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2200
2201 static void
insert_op2(op,loc,arg1,arg2,end)2202 insert_op2 (op, loc, arg1, arg2, end)
2203 re_opcode_t op;
2204 unsigned char *loc;
2205 int arg1, arg2;
2206 unsigned char *end;
2207 {
2208 register unsigned char *pfrom = end;
2209 register unsigned char *pto = end + 5;
2210
2211 while (pfrom != loc)
2212 *--pto = *--pfrom;
2213
2214 store_op2 (op, loc, arg1, arg2);
2215 }
2216
2217
2218 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2219 after an alternative or a begin-subexpression. We assume there is at
2220 least one character before the ^. */
2221
2222 static boolean
at_begline_loc_p(pattern,p,syntax)2223 at_begline_loc_p (pattern, p, syntax)
2224 const char *pattern, *p;
2225 reg_syntax_t syntax;
2226 {
2227 const char *prev = p - 2;
2228 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2229
2230 return
2231 /* After a subexpression? */
2232 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2233 /* After an alternative? */
2234 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2235 }
2236
2237
2238 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2239 at least one character after the $, i.e., `P < PEND'. */
2240
2241 static boolean
at_endline_loc_p(p,pend,syntax)2242 at_endline_loc_p (p, pend, syntax)
2243 const char *p, *pend;
2244 int syntax;
2245 {
2246 const char *next = p;
2247 boolean next_backslash = *next == '\\';
2248 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2249
2250 return
2251 /* Before a subexpression? */
2252 (syntax & RE_NO_BK_PARENS ? *next == ')'
2253 : next_backslash && next_next && *next_next == ')')
2254 /* Before an alternative? */
2255 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2256 : next_backslash && next_next && *next_next == '|');
2257 }
2258
2259
2260 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2261 false if it's not. */
2262
2263 static boolean
group_in_compile_stack(compile_stack,regnum)2264 group_in_compile_stack (compile_stack, regnum)
2265 compile_stack_type compile_stack;
2266 regnum_t regnum;
2267 {
2268 int this_element;
2269
2270 for (this_element = compile_stack.avail - 1;
2271 this_element >= 0;
2272 this_element--)
2273 if (compile_stack.stack[this_element].regnum == regnum)
2274 return true;
2275
2276 return false;
2277 }
2278
2279
2280 /* Read the ending character of a range (in a bracket expression) from the
2281 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2282 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2283 Then we set the translation of all bits between the starting and
2284 ending characters (inclusive) in the compiled pattern B.
2285
2286 Return an error code.
2287
2288 We use these short variable names so we can use the same macros as
2289 `regex_compile' itself. */
2290
2291 static reg_errcode_t
compile_range(p_ptr,pend,translate,syntax,b)2292 compile_range (p_ptr, pend, translate, syntax, b)
2293 const char **p_ptr, *pend;
2294 char *translate;
2295 reg_syntax_t syntax;
2296 unsigned char *b;
2297 {
2298 unsigned this_char;
2299
2300 const char *p = *p_ptr;
2301 int range_start, range_end;
2302
2303 if (p == pend)
2304 return REG_ERANGE;
2305
2306 /* Even though the pattern is a signed `char *', we need to fetch
2307 with unsigned char *'s; if the high bit of the pattern character
2308 is set, the range endpoints will be negative if we fetch using a
2309 signed char *.
2310
2311 We also want to fetch the endpoints without translating them; the
2312 appropriate translation is done in the bit-setting loop below. */
2313 range_start = ((unsigned char *) p)[-2];
2314 range_end = ((unsigned char *) p)[0];
2315
2316 /* Have to increment the pointer into the pattern string, so the
2317 caller isn't still at the ending character. */
2318 (*p_ptr)++;
2319
2320 /* If the start is after the end, the range is empty. */
2321 if (range_start > range_end)
2322 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2323
2324 /* Here we see why `this_char' has to be larger than an `unsigned
2325 char' -- the range is inclusive, so if `range_end' == 0xff
2326 (assuming 8-bit characters), we would otherwise go into an infinite
2327 loop, since all characters <= 0xff. */
2328 for (this_char = range_start; this_char <= (unsigned)range_end; this_char++)
2329 {
2330 SET_LIST_BIT (TRANSLATE (this_char));
2331 }
2332
2333 return REG_NOERROR;
2334 }
2335
2336 /* Failure stack declarations and macros; both re_compile_fastmap and
2337 re_match_2 use a failure stack. These have to be macros because of
2338 REGEX_ALLOCATE. */
2339
2340
2341 /* Number of failure points for which to initially allocate space
2342 when matching. If this number is exceeded, we allocate more
2343 space, so it is not a hard limit. */
2344 #ifndef INIT_FAILURE_ALLOC
2345 #define INIT_FAILURE_ALLOC 5
2346 #endif
2347
2348 /* Roughly the maximum number of failure points on the stack. Would be
2349 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2350 This is a variable only so users of regex can assign to it; we never
2351 change it ourselves. */
2352 int re_max_failures = 2000;
2353
2354 typedef const unsigned char *fail_stack_elt_t;
2355
2356 typedef struct
2357 {
2358 fail_stack_elt_t *stack;
2359 unsigned size;
2360 unsigned avail; /* Offset of next open position. */
2361 } fail_stack_type;
2362
2363 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2364 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2365 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2366 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2367
2368
2369 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2370
2371 #define INIT_FAIL_STACK() \
2372 do { \
2373 fail_stack.stack = (fail_stack_elt_t *) \
2374 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2375 \
2376 if (fail_stack.stack == NULL) \
2377 return -2; \
2378 \
2379 fail_stack.size = INIT_FAILURE_ALLOC; \
2380 fail_stack.avail = 0; \
2381 } while (0)
2382
2383
2384 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2385
2386 Return 1 if succeeds, and 0 if either ran out of memory
2387 allocating space for it or it was already too large.
2388
2389 REGEX_REALLOCATE requires `destination' be declared. */
2390
2391 #define DOUBLE_FAIL_STACK(fail_stack) \
2392 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2393 ? 0 \
2394 : ((fail_stack).stack = (fail_stack_elt_t *) \
2395 REGEX_REALLOCATE ((fail_stack).stack, \
2396 (fail_stack).size * sizeof (fail_stack_elt_t), \
2397 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2398 \
2399 (fail_stack).stack == NULL \
2400 ? 0 \
2401 : ((fail_stack).size <<= 1, \
2402 1)))
2403
2404
2405 /* Push PATTERN_OP on FAIL_STACK.
2406
2407 Return 1 if was able to do so and 0 if ran out of memory allocating
2408 space to do so. */
2409 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2410 ((FAIL_STACK_FULL () \
2411 && !DOUBLE_FAIL_STACK (fail_stack)) \
2412 ? 0 \
2413 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2414 1))
2415
2416 /* This pushes an item onto the failure stack. Must be a four-byte
2417 value. Assumes the variable `fail_stack'. Probably should only
2418 be called from within `PUSH_FAILURE_POINT'. */
2419 #define PUSH_FAILURE_ITEM(item) \
2420 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2421
2422 /* The complement operation. Assumes `fail_stack' is nonempty. */
2423 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2424
2425 /* Used to omit pushing failure point id's when we're not debugging. */
2426 #ifdef DEBUG
2427 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2428 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2429 #else
2430 #define DEBUG_PUSH(item)
2431 #define DEBUG_POP(item_addr)
2432 #endif
2433
2434
2435 /* Push the information about the state we will need
2436 if we ever fail back to it.
2437
2438 Requires variables fail_stack, regstart, regend, reg_info, and
2439 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2440 declared.
2441
2442 Does `return FAILURE_CODE' if runs out of memory. */
2443
2444 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2445 do { \
2446 char *destination; \
2447 /* Must be int, so when we don't save any registers, the arithmetic \
2448 of 0 + -1 isn't done as unsigned. */ \
2449 int this_reg; \
2450 \
2451 DEBUG_STATEMENT (failure_id++); \
2452 DEBUG_STATEMENT (nfailure_points_pushed++); \
2453 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2454 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2455 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2456 \
2457 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2458 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2459 \
2460 /* Ensure we have enough space allocated for what we will push. */ \
2461 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2462 { \
2463 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2464 return failure_code; \
2465 \
2466 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2467 (fail_stack).size); \
2468 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2469 } \
2470 \
2471 /* Push the info, starting with the registers. */ \
2472 DEBUG_PRINT1 ("\n"); \
2473 \
2474 for (this_reg = lowest_active_reg; (unsigned)this_reg <= highest_active_reg; \
2475 this_reg++) \
2476 { \
2477 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2478 DEBUG_STATEMENT (num_regs_pushed++); \
2479 \
2480 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2481 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2482 \
2483 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2484 PUSH_FAILURE_ITEM (regend[this_reg]); \
2485 \
2486 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2487 DEBUG_PRINT2 (" match_null=%d", \
2488 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2489 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2490 DEBUG_PRINT2 (" matched_something=%d", \
2491 MATCHED_SOMETHING (reg_info[this_reg])); \
2492 DEBUG_PRINT2 (" ever_matched=%d", \
2493 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2494 DEBUG_PRINT1 ("\n"); \
2495 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2496 } \
2497 \
2498 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2499 PUSH_FAILURE_ITEM (lowest_active_reg); \
2500 \
2501 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2502 PUSH_FAILURE_ITEM (highest_active_reg); \
2503 \
2504 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2505 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2506 PUSH_FAILURE_ITEM (pattern_place); \
2507 \
2508 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2509 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2510 size2); \
2511 DEBUG_PRINT1 ("'\n"); \
2512 PUSH_FAILURE_ITEM (string_place); \
2513 \
2514 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2515 DEBUG_PUSH (failure_id); \
2516 } while (0)
2517
2518 /* This is the number of items that are pushed and popped on the stack
2519 for each register. */
2520 #define NUM_REG_ITEMS 3
2521
2522 /* Individual items aside from the registers. */
2523 #ifdef DEBUG
2524 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2525 #else
2526 #define NUM_NONREG_ITEMS 4
2527 #endif
2528
2529 /* We push at most this many items on the stack. */
2530 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2531
2532 /* We actually push this many items. */
2533 #define NUM_FAILURE_ITEMS \
2534 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2535 + NUM_NONREG_ITEMS)
2536
2537 /* How many items can still be added to the stack without overflowing it. */
2538 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2539
2540
2541 /* Pops what PUSH_FAIL_STACK pushes.
2542
2543 We restore into the parameters, all of which should be lvalues:
2544 STR -- the saved data position.
2545 PAT -- the saved pattern position.
2546 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2547 REGSTART, REGEND -- arrays of string positions.
2548 REG_INFO -- array of information about each subexpression.
2549
2550 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2551 `pend', `string1', `size1', `string2', and `size2'. */
2552
2553 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2554 { \
2555 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2556 int this_reg; \
2557 const unsigned char *string_temp; \
2558 \
2559 assert (!FAIL_STACK_EMPTY ()); \
2560 \
2561 /* Remove failure points and point to how many regs pushed. */ \
2562 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2563 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2564 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2565 \
2566 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2567 \
2568 DEBUG_POP (&failure_id); \
2569 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2570 \
2571 /* If the saved string location is NULL, it came from an \
2572 on_failure_keep_string_jump opcode, and we want to throw away the \
2573 saved NULL, thus retaining our current position in the string. */ \
2574 string_temp = POP_FAILURE_ITEM (); \
2575 if (string_temp != NULL) \
2576 str = (const char *) string_temp; \
2577 \
2578 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2579 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2580 DEBUG_PRINT1 ("'\n"); \
2581 \
2582 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2583 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2584 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2585 \
2586 /* Restore register info. */ \
2587 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2588 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2589 \
2590 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2591 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2592 \
2593 for (this_reg = high_reg; (unsigned)this_reg >= low_reg; this_reg--) \
2594 { \
2595 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2596 \
2597 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2598 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2599 \
2600 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2601 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2602 \
2603 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2604 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2605 } \
2606 \
2607 DEBUG_STATEMENT (nfailure_points_popped++); \
2608 } /* POP_FAILURE_POINT */
2609
2610 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2611 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2612 characters can start a string that matches the pattern. This fastmap
2613 is used by re_search to skip quickly over impossible starting points.
2614
2615 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2616 area as BUFP->fastmap.
2617
2618 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2619 the pattern buffer.
2620
2621 Returns 0 if we succeed, -2 if an internal error. */
2622
2623 int
re_compile_fastmap(bufp)2624 re_compile_fastmap (bufp)
2625 struct re_pattern_buffer *bufp;
2626 {
2627 int j, k;
2628 fail_stack_type fail_stack;
2629 #ifndef REGEX_MALLOC
2630 char *destination;
2631 #endif
2632 /* We don't push any register information onto the failure stack. */
2633 unsigned num_regs = 0;
2634
2635 register char *fastmap = bufp->fastmap;
2636 unsigned char *pattern = bufp->buffer;
2637 unsigned long size = bufp->used;
2638 const unsigned char *p = pattern;
2639 register unsigned char *pend = pattern + size;
2640
2641 /* Assume that each path through the pattern can be null until
2642 proven otherwise. We set this false at the bottom of switch
2643 statement, to which we get only if a particular path doesn't
2644 match the empty string. */
2645 boolean path_can_be_null = true;
2646
2647 /* We aren't doing a `succeed_n' to begin with. */
2648 boolean succeed_n_p = false;
2649
2650 assert (fastmap != NULL && p != NULL);
2651
2652 INIT_FAIL_STACK ();
2653 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2654 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2655 bufp->can_be_null = 0;
2656
2657 while (p != pend || !FAIL_STACK_EMPTY ())
2658 {
2659 if (p == pend)
2660 {
2661 bufp->can_be_null |= path_can_be_null;
2662
2663 /* Reset for next path. */
2664 path_can_be_null = true;
2665
2666 p = fail_stack.stack[--fail_stack.avail];
2667 }
2668
2669 /* We should never be about to go beyond the end of the pattern. */
2670 assert (p < pend);
2671
2672 #ifdef SWITCH_ENUM_BUG
2673 switch ((int) ((re_opcode_t) *p++))
2674 #else
2675 switch ((re_opcode_t) *p++)
2676 #endif
2677 {
2678
2679 /* I guess the idea here is to simply not bother with a fastmap
2680 if a backreference is used, since it's too hard to figure out
2681 the fastmap for the corresponding group. Setting
2682 `can_be_null' stops `re_search_2' from using the fastmap, so
2683 that is all we do. */
2684 case duplicate:
2685 bufp->can_be_null = 1;
2686 return 0;
2687
2688
2689 /* Following are the cases which match a character. These end
2690 with `break'. */
2691
2692 case exactn:
2693 fastmap[p[1]] = 1;
2694 break;
2695
2696
2697 case charset:
2698 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2699 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2700 fastmap[j] = 1;
2701 break;
2702
2703
2704 case charset_not:
2705 /* Chars beyond end of map must be allowed. */
2706 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2707 fastmap[j] = 1;
2708
2709 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2710 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2711 fastmap[j] = 1;
2712 break;
2713
2714
2715 case wordchar:
2716 for (j = 0; j < (1 << BYTEWIDTH); j++)
2717 if (SYNTAX (j) == Sword)
2718 fastmap[j] = 1;
2719 break;
2720
2721
2722 case notwordchar:
2723 for (j = 0; j < (1 << BYTEWIDTH); j++)
2724 if (SYNTAX (j) != Sword)
2725 fastmap[j] = 1;
2726 break;
2727
2728
2729 case anychar:
2730 /* `.' matches anything ... */
2731 for (j = 0; j < (1 << BYTEWIDTH); j++)
2732 fastmap[j] = 1;
2733
2734 /* ... except perhaps newline. */
2735 if (!(bufp->syntax & RE_DOT_NEWLINE))
2736 fastmap['\n'] = 0;
2737
2738 /* Return if we have already set `can_be_null'; if we have,
2739 then the fastmap is irrelevant. Something's wrong here. */
2740 else if (bufp->can_be_null)
2741 return 0;
2742
2743 /* Otherwise, have to check alternative paths. */
2744 break;
2745
2746
2747 #ifdef emacs
2748 case syntaxspec:
2749 k = *p++;
2750 for (j = 0; j < (1 << BYTEWIDTH); j++)
2751 if (SYNTAX (j) == (enum syntaxcode) k)
2752 fastmap[j] = 1;
2753 break;
2754
2755
2756 case notsyntaxspec:
2757 k = *p++;
2758 for (j = 0; j < (1 << BYTEWIDTH); j++)
2759 if (SYNTAX (j) != (enum syntaxcode) k)
2760 fastmap[j] = 1;
2761 break;
2762
2763
2764 /* All cases after this match the empty string. These end with
2765 `continue'. */
2766
2767
2768 case before_dot:
2769 case at_dot:
2770 case after_dot:
2771 continue;
2772 #endif /* not emacs */
2773
2774
2775 case no_op:
2776 case begline:
2777 case endline:
2778 case begbuf:
2779 case endbuf:
2780 case wordbound:
2781 case notwordbound:
2782 case wordbeg:
2783 case wordend:
2784 case push_dummy_failure:
2785 continue;
2786
2787
2788 case jump_n:
2789 case pop_failure_jump:
2790 case maybe_pop_jump:
2791 case jump:
2792 case jump_past_alt:
2793 case dummy_failure_jump:
2794 EXTRACT_NUMBER_AND_INCR (j, p);
2795 p += j;
2796 if (j > 0)
2797 continue;
2798
2799 /* Jump backward implies we just went through the body of a
2800 loop and matched nothing. Opcode jumped to should be
2801 `on_failure_jump' or `succeed_n'. Just treat it like an
2802 ordinary jump. For a * loop, it has pushed its failure
2803 point already; if so, discard that as redundant. */
2804 if ((re_opcode_t) *p != on_failure_jump
2805 && (re_opcode_t) *p != succeed_n)
2806 continue;
2807
2808 p++;
2809 EXTRACT_NUMBER_AND_INCR (j, p);
2810 p += j;
2811
2812 /* If what's on the stack is where we are now, pop it. */
2813 if (!FAIL_STACK_EMPTY ()
2814 && fail_stack.stack[fail_stack.avail - 1] == p)
2815 fail_stack.avail--;
2816
2817 continue;
2818
2819
2820 case on_failure_jump:
2821 case on_failure_keep_string_jump:
2822 handle_on_failure_jump:
2823 EXTRACT_NUMBER_AND_INCR (j, p);
2824
2825 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2826 end of the pattern. We don't want to push such a point,
2827 since when we restore it above, entering the switch will
2828 increment `p' past the end of the pattern. We don't need
2829 to push such a point since we obviously won't find any more
2830 fastmap entries beyond `pend'. Such a pattern can match
2831 the null string, though. */
2832 if (p + j < pend)
2833 {
2834 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2835 return -2;
2836 }
2837 else
2838 bufp->can_be_null = 1;
2839
2840 if (succeed_n_p)
2841 {
2842 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2843 succeed_n_p = false;
2844 }
2845
2846 continue;
2847
2848
2849 case succeed_n:
2850 /* Get to the number of times to succeed. */
2851 p += 2;
2852
2853 /* Increment p past the n for when k != 0. */
2854 EXTRACT_NUMBER_AND_INCR (k, p);
2855 if (k == 0)
2856 {
2857 p -= 4;
2858 succeed_n_p = true; /* Spaghetti code alert. */
2859 goto handle_on_failure_jump;
2860 }
2861 continue;
2862
2863
2864 case set_number_at:
2865 p += 4;
2866 continue;
2867
2868
2869 case start_memory:
2870 case stop_memory:
2871 p += 2;
2872 continue;
2873
2874
2875 default:
2876 abort (); /* We have listed all the cases. */
2877 } /* switch *p++ */
2878
2879 /* Getting here means we have found the possible starting
2880 characters for one path of the pattern -- and that the empty
2881 string does not match. We need not follow this path further.
2882 Instead, look at the next alternative (remembered on the
2883 stack), or quit if no more. The test at the top of the loop
2884 does these things. */
2885 path_can_be_null = false;
2886 p = pend;
2887 } /* while p */
2888
2889 /* Set `can_be_null' for the last path (also the first path, if the
2890 pattern is empty). */
2891 bufp->can_be_null |= path_can_be_null;
2892 return 0;
2893 } /* re_compile_fastmap */
2894
2895 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2896 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2897 this memory for recording register information. STARTS and ENDS
2898 must be allocated using the malloc library routine, and must each
2899 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2900
2901 If NUM_REGS == 0, then subsequent matches should allocate their own
2902 register data.
2903
2904 Unless this function is called, the first search or match using
2905 PATTERN_BUFFER will allocate its own register data, without
2906 freeing the old data. */
2907
2908 void
re_set_registers(bufp,regs,num_regs,starts,ends)2909 re_set_registers (bufp, regs, num_regs, starts, ends)
2910 struct re_pattern_buffer *bufp;
2911 struct re_registers *regs;
2912 unsigned num_regs;
2913 regoff_t *starts, *ends;
2914 {
2915 if (num_regs)
2916 {
2917 bufp->regs_allocated = REGS_REALLOCATE;
2918 regs->num_regs = num_regs;
2919 regs->start = starts;
2920 regs->end = ends;
2921 }
2922 else
2923 {
2924 bufp->regs_allocated = REGS_UNALLOCATED;
2925 regs->num_regs = 0;
2926 regs->start = regs->end = (regoff_t*) 0;
2927 }
2928 }
2929
2930 /* Searching routines. */
2931
2932 /* Like re_search_2, below, but only one string is specified, and
2933 doesn't let you say where to stop matching. */
2934
2935 int
re_search(bufp,string,size,startpos,range,regs)2936 re_search (bufp, string, size, startpos, range, regs)
2937 struct re_pattern_buffer *bufp;
2938 const char *string;
2939 int size, startpos, range;
2940 struct re_registers *regs;
2941 {
2942 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2943 regs, size);
2944 }
2945
2946
2947 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2948 virtual concatenation of STRING1 and STRING2, starting first at index
2949 STARTPOS, then at STARTPOS + 1, and so on.
2950
2951 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2952
2953 RANGE is how far to scan while trying to match. RANGE = 0 means try
2954 only at STARTPOS; in general, the last start tried is STARTPOS +
2955 RANGE.
2956
2957 In REGS, return the indices of the virtual concatenation of STRING1
2958 and STRING2 that matched the entire BUFP->buffer and its contained
2959 subexpressions.
2960
2961 Do not consider matching one past the index STOP in the virtual
2962 concatenation of STRING1 and STRING2.
2963
2964 We return either the position in the strings at which the match was
2965 found, -1 if no match, or -2 if error (such as failure
2966 stack overflow). */
2967
2968 int
re_search_2(bufp,string1,size1,string2,size2,startpos,range,regs,stop)2969 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2970 struct re_pattern_buffer *bufp;
2971 const char *string1, *string2;
2972 int size1, size2;
2973 int startpos;
2974 int range;
2975 struct re_registers *regs;
2976 int stop;
2977 {
2978 int val;
2979 register char *fastmap = bufp->fastmap;
2980 register char *translate = bufp->translate;
2981 int total_size = size1 + size2;
2982 int endpos = startpos + range;
2983
2984 /* Check for out-of-range STARTPOS. */
2985 if (startpos < 0 || startpos > total_size)
2986 return -1;
2987
2988 /* Fix up RANGE if it might eventually take us outside
2989 the virtual concatenation of STRING1 and STRING2. */
2990 if (endpos < -1)
2991 range = -1 - startpos;
2992 else if (endpos > total_size)
2993 range = total_size - startpos;
2994
2995 /* If the search isn't to be a backwards one, don't waste time in a
2996 search for a pattern that must be anchored. */
2997 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2998 {
2999 if (startpos > 0)
3000 return -1;
3001 else
3002 range = 1;
3003 }
3004
3005 /* Update the fastmap now if not correct already. */
3006 if (fastmap && !bufp->fastmap_accurate)
3007 if (re_compile_fastmap (bufp) == -2)
3008 return -2;
3009
3010 /* Loop through the string, looking for a place to start matching. */
3011 for (;;)
3012 {
3013 /* If a fastmap is supplied, skip quickly over characters that
3014 cannot be the start of a match. If the pattern can match the
3015 null string, however, we don't need to skip characters; we want
3016 the first null string. */
3017 if (fastmap && startpos < total_size && !bufp->can_be_null)
3018 {
3019 if (range > 0) /* Searching forwards. */
3020 {
3021 register const char *d;
3022 register int lim = 0;
3023 int irange = range;
3024
3025 if (startpos < size1 && startpos + range >= size1)
3026 lim = range - (size1 - startpos);
3027
3028 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3029
3030 /* Written out as an if-else to avoid testing `translate'
3031 inside the loop. */
3032 if (translate)
3033 while (range > lim
3034 && !fastmap[(unsigned char)
3035 translate[(unsigned char) *d++]])
3036 range--;
3037 else
3038 while (range > lim && !fastmap[(unsigned char) *d++])
3039 range--;
3040
3041 startpos += irange - range;
3042 }
3043 else /* Searching backwards. */
3044 {
3045 register char c = (size1 == 0 || startpos >= size1
3046 ? string2[startpos - size1]
3047 : string1[startpos]);
3048
3049 if (!fastmap[(unsigned char) TRANSLATE (c)])
3050 goto advance;
3051 }
3052 }
3053
3054 /* If can't match the null string, and that's all we have left, fail. */
3055 if (range >= 0 && startpos == total_size && fastmap
3056 && !bufp->can_be_null)
3057 return -1;
3058
3059 val = re_match_2 (bufp, string1, size1, string2, size2,
3060 startpos, regs, stop);
3061 if (val >= 0)
3062 return startpos;
3063
3064 if (val == -2)
3065 return -2;
3066
3067 advance:
3068 if (!range)
3069 break;
3070 else if (range > 0)
3071 {
3072 range--;
3073 startpos++;
3074 }
3075 else
3076 {
3077 range++;
3078 startpos--;
3079 }
3080 }
3081 return -1;
3082 } /* re_search_2 */
3083
3084 /* Declarations and macros for re_match_2. */
3085
3086 static int bcmp_translate ();
3087 static boolean alt_match_null_string_p (),
3088 common_op_match_null_string_p (),
3089 group_match_null_string_p ();
3090
3091 /* Structure for per-register (a.k.a. per-group) information.
3092 This must not be longer than one word, because we push this value
3093 onto the failure stack. Other register information, such as the
3094 starting and ending positions (which are addresses), and the list of
3095 inner groups (which is a bits list) are maintained in separate
3096 variables.
3097
3098 We are making a (strictly speaking) nonportable assumption here: that
3099 the compiler will pack our bit fields into something that fits into
3100 the type of `word', i.e., is something that fits into one item on the
3101 failure stack. */
3102 typedef union
3103 {
3104 fail_stack_elt_t word;
3105 struct
3106 {
3107 /* This field is one if this group can match the empty string,
3108 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3109 #define MATCH_NULL_UNSET_VALUE 3
3110 unsigned match_null_string_p : 2;
3111 unsigned is_active : 1;
3112 unsigned matched_something : 1;
3113 unsigned ever_matched_something : 1;
3114 } bits;
3115 } register_info_type;
3116
3117 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3118 #define IS_ACTIVE(R) ((R).bits.is_active)
3119 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3120 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3121
3122
3123 /* Call this when have matched a real character; it sets `matched' flags
3124 for the subexpressions which we are currently inside. Also records
3125 that those subexprs have matched. */
3126 #define SET_REGS_MATCHED() \
3127 do \
3128 { \
3129 unsigned r; \
3130 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3131 { \
3132 MATCHED_SOMETHING (reg_info[r]) \
3133 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3134 = 1; \
3135 } \
3136 } \
3137 while (0)
3138
3139
3140 /* This converts PTR, a pointer into one of the search strings `string1'
3141 and `string2' into an offset from the beginning of that string. */
3142 #define POINTER_TO_OFFSET(ptr) \
3143 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3144
3145 /* Registers are set to a sentinel when they haven't yet matched. */
3146 #define REG_UNSET_VALUE ((char *) -1)
3147 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3148
3149
3150 /* Macros for dealing with the split strings in re_match_2. */
3151
3152 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3153
3154 /* Call before fetching a character with *d. This switches over to
3155 string2 if necessary. */
3156 #define PREFETCH() \
3157 while (d == dend) \
3158 { \
3159 /* End of string2 => fail. */ \
3160 if (dend == end_match_2) \
3161 goto fail; \
3162 /* End of string1 => advance to string2. */ \
3163 d = string2; \
3164 dend = end_match_2; \
3165 }
3166
3167
3168 /* Test if at very beginning or at very end of the virtual concatenation
3169 of `string1' and `string2'. If only one string, it's `string2'. */
3170 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3171 #define AT_STRINGS_END(d) ((d) == end2)
3172
3173
3174 /* Test if D points to a character which is word-constituent. We have
3175 two special cases to check for: if past the end of string1, look at
3176 the first character in string2; and if before the beginning of
3177 string2, look at the last character in string1. */
3178 #define WORDCHAR_P(d) \
3179 (SYNTAX ((d) == end1 ? *string2 \
3180 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3181 == Sword)
3182
3183 /* Test if the character before D and the one at D differ with respect
3184 to being word-constituent. */
3185 #define AT_WORD_BOUNDARY(d) \
3186 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3187 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3188
3189
3190 /* Free everything we malloc. */
3191 #ifdef REGEX_MALLOC
3192 #define FREE_VAR(var) if (var) free (var); var = NULL
3193 #define FREE_VARIABLES() \
3194 do { \
3195 FREE_VAR (fail_stack.stack); \
3196 FREE_VAR (regstart); \
3197 FREE_VAR (regend); \
3198 FREE_VAR (old_regstart); \
3199 FREE_VAR (old_regend); \
3200 FREE_VAR (best_regstart); \
3201 FREE_VAR (best_regend); \
3202 FREE_VAR (reg_info); \
3203 FREE_VAR (reg_dummy); \
3204 FREE_VAR (reg_info_dummy); \
3205 } while (0)
3206 #else /* not REGEX_MALLOC */
3207 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3208 #define FREE_VARIABLES() alloca (0)
3209 #endif /* not REGEX_MALLOC */
3210
3211
3212 /* These values must meet several constraints. They must not be valid
3213 register values; since we have a limit of 255 registers (because
3214 we use only one byte in the pattern for the register number), we can
3215 use numbers larger than 255. They must differ by 1, because of
3216 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3217 be larger than the value for the highest register, so we do not try
3218 to actually save any registers when none are active. */
3219 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3220 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3221
3222 /* Matching routines. */
3223
3224 #ifndef emacs /* Emacs never uses this. */
3225 /* re_match is like re_match_2 except it takes only a single string. */
3226
3227 int
re_match(bufp,string,size,pos,regs)3228 re_match (bufp, string, size, pos, regs)
3229 struct re_pattern_buffer *bufp;
3230 const char *string;
3231 int size, pos;
3232 struct re_registers *regs;
3233 {
3234 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3235 }
3236 #endif /* not emacs */
3237
3238
3239 /* re_match_2 matches the compiled pattern in BUFP against the
3240 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3241 and SIZE2, respectively). We start matching at POS, and stop
3242 matching at STOP.
3243
3244 If REGS is non-null and the `no_sub' field of BUFP is non-zero, we
3245 store offsets for the substring each group matched in REGS. See the
3246 documentation for exactly how many groups we fill.
3247
3248 We return -1 if no match, -2 if an internal error (such as the
3249 failure stack overflowing). Otherwise, we return the length of the
3250 matched substring. */
3251
3252 int
re_match_2(bufp,string1,size1,string2,size2,pos,regs,stop)3253 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3254 struct re_pattern_buffer *bufp;
3255 const char *string1, *string2;
3256 int size1, size2;
3257 int pos;
3258 struct re_registers *regs;
3259 int stop;
3260 {
3261 /* General temporaries. */
3262 int mcnt;
3263 unsigned char *p1;
3264
3265 /* Just past the end of the corresponding string. */
3266 const char *end1, *end2;
3267
3268 /* Pointers into string1 and string2, just past the last characters in
3269 each to consider matching. */
3270 const char *end_match_1, *end_match_2;
3271
3272 /* Where we are in the data, and the end of the current string. */
3273 const char *d, *dend;
3274
3275 /* Where we are in the pattern, and the end of the pattern. */
3276 unsigned char *p = bufp->buffer;
3277 register unsigned char *pend = p + bufp->used;
3278
3279 /* We use this to map every character in the string. */
3280 char *translate = bufp->translate;
3281
3282 /* Failure point stack. Each place that can handle a failure further
3283 down the line pushes a failure point on this stack. It consists of
3284 restart, regend, and reg_info for all registers corresponding to
3285 the subexpressions we're currently inside, plus the number of such
3286 registers, and, finally, two char *'s. The first char * is where
3287 to resume scanning the pattern; the second one is where to resume
3288 scanning the strings. If the latter is zero, the failure point is
3289 a ``dummy''; if a failure happens and the failure point is a dummy,
3290 it gets discarded and the next next one is tried. */
3291 fail_stack_type fail_stack;
3292 #ifdef DEBUG
3293 static unsigned failure_id = 0;
3294 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3295 #endif
3296
3297 /* We fill all the registers internally, independent of what we
3298 return, for use in backreferences. The number here includes
3299 an element for register zero. */
3300 unsigned num_regs = (unsigned)bufp->re_nsub + 1;
3301
3302 /* The currently active registers. */
3303 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3304 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3305
3306 /* Information on the contents of registers. These are pointers into
3307 the input strings; they record just what was matched (on this
3308 attempt) by a subexpression part of the pattern, that is, the
3309 regnum-th regstart pointer points to where in the pattern we began
3310 matching and the regnum-th regend points to right after where we
3311 stopped matching the regnum-th subexpression. (The zeroth register
3312 keeps track of what the whole pattern matches.) */
3313 const char **regstart, **regend;
3314
3315 /* If a group that's operated upon by a repetition operator fails to
3316 match anything, then the register for its start will need to be
3317 restored because it will have been set to wherever in the string we
3318 are when we last see its open-group operator. Similarly for a
3319 register's end. */
3320 const char **old_regstart, **old_regend;
3321
3322 /* The is_active field of reg_info helps us keep track of which (possibly
3323 nested) subexpressions we are currently in. The matched_something
3324 field of reg_info[reg_num] helps us tell whether or not we have
3325 matched any of the pattern so far this time through the reg_num-th
3326 subexpression. These two fields get reset each time through any
3327 loop their register is in. */
3328 register_info_type *reg_info;
3329
3330 /* The following record the register info as found in the above
3331 variables when we find a match better than any we've seen before.
3332 This happens as we backtrack through the failure points, which in
3333 turn happens only if we have not yet matched the entire string. */
3334 unsigned best_regs_set = false;
3335 const char **best_regstart, **best_regend;
3336
3337 /* Logically, this is `best_regend[0]'. But we don't want to have to
3338 allocate space for that if we're not allocating space for anything
3339 else (see below). Also, we never need info about register 0 for
3340 any of the other register vectors, and it seems rather a kludge to
3341 treat `best_regend' differently than the rest. So we keep track of
3342 the end of the best match so far in a separate variable. We
3343 initialize this to NULL so that when we backtrack the first time
3344 and need to test it, it's not garbage. */
3345 const char *match_end = NULL;
3346
3347 /* Used when we pop values we don't care about. */
3348 const char **reg_dummy;
3349 register_info_type *reg_info_dummy;
3350
3351 #ifdef DEBUG
3352 /* Counts the total number of registers pushed. */
3353 unsigned num_regs_pushed = 0;
3354 #endif
3355
3356 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3357
3358 INIT_FAIL_STACK ();
3359
3360 /* Do not bother to initialize all the register variables if there are
3361 no groups in the pattern, as it takes a fair amount of time. If
3362 there are groups, we include space for register 0 (the whole
3363 pattern), even though we never use it, since it simplifies the
3364 array indexing. We should fix this. */
3365 if (bufp->re_nsub)
3366 {
3367 regstart = REGEX_TALLOC (num_regs, const char *);
3368 regend = REGEX_TALLOC (num_regs, const char *);
3369 old_regstart = REGEX_TALLOC (num_regs, const char *);
3370 old_regend = REGEX_TALLOC (num_regs, const char *);
3371 best_regstart = REGEX_TALLOC (num_regs, const char *);
3372 best_regend = REGEX_TALLOC (num_regs, const char *);
3373 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3374 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3375 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3376
3377 if (!(regstart && regend && old_regstart && old_regend && reg_info
3378 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3379 {
3380 FREE_VARIABLES ();
3381 return -2;
3382 }
3383 }
3384 #ifdef REGEX_MALLOC
3385 else
3386 {
3387 /* We must initialize all our variables to NULL, so that
3388 `FREE_VARIABLES' doesn't try to free them. */
3389 regstart = regend = old_regstart = old_regend = best_regstart
3390 = best_regend = reg_dummy = NULL;
3391 reg_info = reg_info_dummy = (register_info_type *) NULL;
3392 }
3393 #endif /* REGEX_MALLOC */
3394
3395 /* The starting position is bogus. */
3396 if (pos < 0 || pos > size1 + size2)
3397 {
3398 FREE_VARIABLES ();
3399 return -1;
3400 }
3401
3402 /* Initialize subexpression text positions to -1 to mark ones that no
3403 start_memory/stop_memory has been seen for. Also initialize the
3404 register information struct. */
3405 for (mcnt = 1; (unsigned)mcnt < num_regs; mcnt++)
3406 {
3407 regstart[mcnt] = regend[mcnt]
3408 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3409
3410 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3411 IS_ACTIVE (reg_info[mcnt]) = 0;
3412 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3413 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3414 }
3415
3416 /* We move `string1' into `string2' if the latter's empty -- but not if
3417 `string1' is null. */
3418 if (size2 == 0 && string1 != NULL)
3419 {
3420 string2 = string1;
3421 size2 = size1;
3422 string1 = 0;
3423 size1 = 0;
3424 }
3425 end1 = string1 + size1;
3426 end2 = string2 + size2;
3427
3428 /* Compute where to stop matching, within the two strings. */
3429 if (stop <= size1)
3430 {
3431 end_match_1 = string1 + stop;
3432 end_match_2 = string2;
3433 }
3434 else
3435 {
3436 end_match_1 = end1;
3437 end_match_2 = string2 + stop - size1;
3438 }
3439
3440 /* `p' scans through the pattern as `d' scans through the data.
3441 `dend' is the end of the input string that `d' points within. `d'
3442 is advanced into the following input string whenever necessary, but
3443 this happens before fetching; therefore, at the beginning of the
3444 loop, `d' can be pointing at the end of a string, but it cannot
3445 equal `string2'. */
3446 if (size1 > 0 && pos <= size1)
3447 {
3448 d = string1 + pos;
3449 dend = end_match_1;
3450 }
3451 else
3452 {
3453 d = string2 + pos - size1;
3454 dend = end_match_2;
3455 }
3456
3457 DEBUG_PRINT1 ("The compiled pattern is: ");
3458 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3459 DEBUG_PRINT1 ("The string to match is: `");
3460 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3461 DEBUG_PRINT1 ("'\n");
3462
3463 /* This loops over pattern commands. It exits by returning from the
3464 function if the match is complete, or it drops through if the match
3465 fails at this starting point in the input data. */
3466 for (;;)
3467 {
3468 DEBUG_PRINT2 ("\n0x%x: ", p);
3469
3470 if (p == pend)
3471 { /* End of pattern means we might have succeeded. */
3472 DEBUG_PRINT1 ("end of pattern ... ");
3473
3474 /* If we haven't matched the entire string, and we want the
3475 longest match, try backtracking. */
3476 if (d != end_match_2)
3477 {
3478 DEBUG_PRINT1 ("backtracking.\n");
3479
3480 if (!FAIL_STACK_EMPTY ())
3481 { /* More failure points to try. */
3482 boolean same_str_p = (FIRST_STRING_P (match_end)
3483 == MATCHING_IN_FIRST_STRING);
3484
3485 /* If exceeds best match so far, save it. */
3486 if (!best_regs_set
3487 || (same_str_p && d > match_end)
3488 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3489 {
3490 best_regs_set = true;
3491 match_end = d;
3492
3493 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3494
3495 for (mcnt = 1; (unsigned)mcnt < num_regs; mcnt++)
3496 {
3497 best_regstart[mcnt] = regstart[mcnt];
3498 best_regend[mcnt] = regend[mcnt];
3499 }
3500 }
3501 goto fail;
3502 }
3503
3504 /* If no failure points, don't restore garbage. */
3505 else if (best_regs_set)
3506 {
3507 restore_best_regs:
3508 /* Restore best match. It may happen that `dend ==
3509 end_match_1' while the restored d is in string2.
3510 For example, the pattern `x.*y.*z' against the
3511 strings `x-' and `y-z-', if the two strings are
3512 not consecutive in memory. */
3513 DEBUG_PRINT1 ("Restoring best registers.\n");
3514
3515 d = match_end;
3516 dend = ((d >= string1 && d <= end1)
3517 ? end_match_1 : end_match_2);
3518
3519 for (mcnt = 1; (unsigned)mcnt < num_regs; mcnt++)
3520 {
3521 regstart[mcnt] = best_regstart[mcnt];
3522 regend[mcnt] = best_regend[mcnt];
3523 }
3524 }
3525 } /* d != end_match_2 */
3526
3527 DEBUG_PRINT1 ("Accepting match.\n");
3528
3529 /* If caller wants register contents data back, do it. */
3530 if (regs && !bufp->no_sub)
3531 {
3532 /* Have the register data arrays been allocated? */
3533 if (bufp->regs_allocated == REGS_UNALLOCATED)
3534 { /* No. So allocate them with malloc. We need one
3535 extra element beyond `num_regs' for the `-1' marker
3536 GNU code uses. */
3537 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3538 regs->start = TALLOC (regs->num_regs, regoff_t);
3539 regs->end = TALLOC (regs->num_regs, regoff_t);
3540 if (regs->start == NULL || regs->end == NULL)
3541 return -2;
3542 bufp->regs_allocated = REGS_REALLOCATE;
3543 }
3544 else if (bufp->regs_allocated == REGS_REALLOCATE)
3545 { /* Yes. If we need more elements than were already
3546 allocated, reallocate them. If we need fewer, just
3547 leave it alone. */
3548 if (regs->num_regs < num_regs + 1)
3549 {
3550 regs->num_regs = num_regs + 1;
3551 RETALLOC (regs->start, regs->num_regs, regoff_t);
3552 RETALLOC (regs->end, regs->num_regs, regoff_t);
3553 if (regs->start == NULL || regs->end == NULL)
3554 return -2;
3555 }
3556 }
3557 else
3558 assert (bufp->regs_allocated == REGS_FIXED);
3559
3560 /* Convert the pointer data in `regstart' and `regend' to
3561 indices. Register zero has to be set differently,
3562 since we haven't kept track of any info for it. */
3563 if (regs->num_regs > 0)
3564 {
3565 regs->start[0] = pos;
3566 regs->end[0] = (regoff_t)(MATCHING_IN_FIRST_STRING ? d - string1
3567 : d - string2 + size1);
3568 }
3569
3570 /* Go through the first `min (num_regs, regs->num_regs)'
3571 registers, since that is all we initialized. */
3572 for (mcnt = 1; (unsigned)mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3573 {
3574 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3575 regs->start[mcnt] = regs->end[mcnt] = -1;
3576 else
3577 {
3578 regs->start[mcnt] = (regoff_t)POINTER_TO_OFFSET (regstart[mcnt]);
3579 regs->end[mcnt] = (regoff_t)POINTER_TO_OFFSET (regend[mcnt]);
3580 }
3581 }
3582
3583 /* If the regs structure we return has more elements than
3584 were in the pattern, set the extra elements to -1. If
3585 we (re)allocated the registers, this is the case,
3586 because we always allocate enough to have at least one
3587 -1 at the end. */
3588 for (mcnt = num_regs; (unsigned)mcnt < regs->num_regs; mcnt++)
3589 regs->start[mcnt] = regs->end[mcnt] = -1;
3590 } /* regs && !bufp->no_sub */
3591
3592 FREE_VARIABLES ();
3593 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3594 nfailure_points_pushed, nfailure_points_popped,
3595 nfailure_points_pushed - nfailure_points_popped);
3596 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3597
3598 mcnt = (int)(d - pos - (MATCHING_IN_FIRST_STRING
3599 ? string1
3600 : string2 - size1));
3601
3602 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3603
3604 return mcnt;
3605 }
3606
3607 /* Otherwise match next pattern command. */
3608 #ifdef SWITCH_ENUM_BUG
3609 switch ((int) ((re_opcode_t) *p++))
3610 #else
3611 switch ((re_opcode_t) *p++)
3612 #endif
3613 {
3614 /* Ignore these. Used to ignore the n of succeed_n's which
3615 currently have n == 0. */
3616 case no_op:
3617 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3618 break;
3619
3620
3621 /* Match the next n pattern characters exactly. The following
3622 byte in the pattern defines n, and the n bytes after that
3623 are the characters to match. */
3624 case exactn:
3625 mcnt = *p++;
3626 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3627
3628 /* This is written out as an if-else so we don't waste time
3629 testing `translate' inside the loop. */
3630 if (translate)
3631 {
3632 do
3633 {
3634 PREFETCH ();
3635 if (translate[(unsigned char) *d++] != (char) *p++)
3636 goto fail;
3637 }
3638 while (--mcnt);
3639 }
3640 else
3641 {
3642 do
3643 {
3644 PREFETCH ();
3645 if (*d++ != (char) *p++) goto fail;
3646 }
3647 while (--mcnt);
3648 }
3649 SET_REGS_MATCHED ();
3650 break;
3651
3652
3653 /* Match any character except possibly a newline or a null. */
3654 case anychar:
3655 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3656
3657 PREFETCH ();
3658
3659 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3660 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3661 goto fail;
3662
3663 SET_REGS_MATCHED ();
3664 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3665 d++;
3666 break;
3667
3668
3669 case charset:
3670 case charset_not:
3671 {
3672 register unsigned char c;
3673 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3674
3675 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3676
3677 PREFETCH ();
3678 c = TRANSLATE (*d); /* The character to match. */
3679
3680 /* Cast to `unsigned' instead of `unsigned char' in case the
3681 bit list is a full 32 bytes long. */
3682 if (c < (unsigned) (*p * BYTEWIDTH)
3683 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3684 not = !not;
3685
3686 p += 1 + *p;
3687
3688 if (!not) goto fail;
3689
3690 SET_REGS_MATCHED ();
3691 d++;
3692 break;
3693 }
3694
3695
3696 /* The beginning of a group is represented by start_memory.
3697 The arguments are the register number in the next byte, and the
3698 number of groups inner to this one in the next. The text
3699 matched within the group is recorded (in the internal
3700 registers data structure) under the register number. */
3701 case start_memory:
3702 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3703
3704 /* Find out if this group can match the empty string. */
3705 p1 = p; /* To send to group_match_null_string_p. */
3706
3707 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3708 REG_MATCH_NULL_STRING_P (reg_info[*p])
3709 = group_match_null_string_p (&p1, pend, reg_info);
3710
3711 /* Save the position in the string where we were the last time
3712 we were at this open-group operator in case the group is
3713 operated upon by a repetition operator, e.g., with `(a*)*b'
3714 against `ab'; then we want to ignore where we are now in
3715 the string in case this attempt to match fails. */
3716 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3717 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3718 : regstart[*p];
3719 DEBUG_PRINT2 (" old_regstart: %d\n",
3720 POINTER_TO_OFFSET (old_regstart[*p]));
3721
3722 regstart[*p] = d;
3723 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3724
3725 IS_ACTIVE (reg_info[*p]) = 1;
3726 MATCHED_SOMETHING (reg_info[*p]) = 0;
3727
3728 /* This is the new highest active register. */
3729 highest_active_reg = *p;
3730
3731 /* If nothing was active before, this is the new lowest active
3732 register. */
3733 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3734 lowest_active_reg = *p;
3735
3736 /* Move past the register number and inner group count. */
3737 p += 2;
3738 break;
3739
3740
3741 /* The stop_memory opcode represents the end of a group. Its
3742 arguments are the same as start_memory's: the register
3743 number, and the number of inner groups. */
3744 case stop_memory:
3745 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3746
3747 /* We need to save the string position the last time we were at
3748 this close-group operator in case the group is operated
3749 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3750 against `aba'; then we want to ignore where we are now in
3751 the string in case this attempt to match fails. */
3752 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3753 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3754 : regend[*p];
3755 DEBUG_PRINT2 (" old_regend: %d\n",
3756 POINTER_TO_OFFSET (old_regend[*p]));
3757
3758 regend[*p] = d;
3759 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3760
3761 /* This register isn't active anymore. */
3762 IS_ACTIVE (reg_info[*p]) = 0;
3763
3764 /* If this was the only register active, nothing is active
3765 anymore. */
3766 if (lowest_active_reg == highest_active_reg)
3767 {
3768 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3769 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3770 }
3771 else
3772 { /* We must scan for the new highest active register, since
3773 it isn't necessarily one less than now: consider
3774 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3775 new highest active register is 1. */
3776 unsigned char r = *p - 1;
3777 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3778 r--;
3779
3780 /* If we end up at register zero, that means that we saved
3781 the registers as the result of an `on_failure_jump', not
3782 a `start_memory', and we jumped to past the innermost
3783 `stop_memory'. For example, in ((.)*) we save
3784 registers 1 and 2 as a result of the *, but when we pop
3785 back to the second ), we are at the stop_memory 1.
3786 Thus, nothing is active. */
3787 if (r == 0)
3788 {
3789 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3790 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3791 }
3792 else
3793 highest_active_reg = r;
3794 }
3795
3796 /* If just failed to match something this time around with a
3797 group that's operated on by a repetition operator, try to
3798 force exit from the ``loop'', and restore the register
3799 information for this group that we had before trying this
3800 last match. */
3801 if ((!MATCHED_SOMETHING (reg_info[*p])
3802 || (re_opcode_t) p[-3] == start_memory)
3803 && (p + 2) < pend)
3804 {
3805 boolean is_a_jump_n = false;
3806
3807 p1 = p + 2;
3808 mcnt = 0;
3809 switch ((re_opcode_t) *p1++)
3810 {
3811 case jump_n:
3812 is_a_jump_n = true;
3813 case pop_failure_jump:
3814 case maybe_pop_jump:
3815 case jump:
3816 case dummy_failure_jump:
3817 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3818 if (is_a_jump_n)
3819 p1 += 2;
3820 break;
3821
3822 default:
3823 /* do nothing */ ;
3824 }
3825 p1 += mcnt;
3826
3827 /* If the next operation is a jump backwards in the pattern
3828 to an on_failure_jump right before the start_memory
3829 corresponding to this stop_memory, exit from the loop
3830 by forcing a failure after pushing on the stack the
3831 on_failure_jump's jump in the pattern, and d. */
3832 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3833 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3834 {
3835 /* If this group ever matched anything, then restore
3836 what its registers were before trying this last
3837 failed match, e.g., with `(a*)*b' against `ab' for
3838 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3839 against `aba' for regend[3].
3840
3841 Also restore the registers for inner groups for,
3842 e.g., `((a*)(b*))*' against `aba' (register 3 would
3843 otherwise get trashed). */
3844
3845 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3846 {
3847 unsigned r;
3848
3849 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3850
3851 /* Restore this and inner groups' (if any) registers. */
3852 for (r = *p; r < (unsigned)(*p + *(p + 1)); r++)
3853 {
3854 regstart[r] = old_regstart[r];
3855
3856 /* xx why this test? */
3857 if (old_regend[r] >= regstart[r])
3858 regend[r] = old_regend[r];
3859 }
3860 }
3861 p1++;
3862 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3863 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3864
3865 goto fail;
3866 }
3867 }
3868
3869 /* Move past the register number and the inner group count. */
3870 p += 2;
3871 break;
3872
3873
3874 /* \<digit> has been turned into a `duplicate' command which is
3875 followed by the numeric value of <digit> as the register number. */
3876 case duplicate:
3877 {
3878 register const char *d2, *dend2;
3879 int regno = *p++; /* Get which register to match against. */
3880 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3881
3882 /* Can't back reference a group which we've never matched. */
3883 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3884 goto fail;
3885
3886 /* Where in input to try to start matching. */
3887 d2 = regstart[regno];
3888
3889 /* Where to stop matching; if both the place to start and
3890 the place to stop matching are in the same string, then
3891 set to the place to stop, otherwise, for now have to use
3892 the end of the first string. */
3893
3894 dend2 = ((FIRST_STRING_P (regstart[regno])
3895 == FIRST_STRING_P (regend[regno]))
3896 ? regend[regno] : end_match_1);
3897 for (;;)
3898 {
3899 /* If necessary, advance to next segment in register
3900 contents. */
3901 while (d2 == dend2)
3902 {
3903 if (dend2 == end_match_2) break;
3904 if (dend2 == regend[regno]) break;
3905
3906 /* End of string1 => advance to string2. */
3907 d2 = string2;
3908 dend2 = regend[regno];
3909 }
3910 /* At end of register contents => success */
3911 if (d2 == dend2) break;
3912
3913 /* If necessary, advance to next segment in data. */
3914 PREFETCH ();
3915
3916 /* How many characters left in this segment to match. */
3917 mcnt = (int)(dend - d);
3918
3919 /* Want how many consecutive characters we can match in
3920 one shot, so, if necessary, adjust the count. */
3921 if (mcnt > dend2 - d2)
3922 mcnt = (int)(dend2 - d2);
3923
3924 /* Compare that many; failure if mismatch, else move
3925 past them. */
3926 if (translate
3927 ? bcmp_translate (d, d2, mcnt, translate)
3928 : bcmp (d, d2, mcnt))
3929 goto fail;
3930 d += mcnt, d2 += mcnt;
3931 }
3932 }
3933 break;
3934
3935
3936 /* begline matches the empty string at the beginning of the string
3937 (unless `not_bol' is set in `bufp'), and, if
3938 `newline_anchor' is set, after newlines. */
3939 case begline:
3940 DEBUG_PRINT1 ("EXECUTING begline.\n");
3941
3942 if (AT_STRINGS_BEG (d))
3943 {
3944 if (!bufp->not_bol) break;
3945 }
3946 else if (d[-1] == '\n' && bufp->newline_anchor)
3947 {
3948 break;
3949 }
3950 /* In all other cases, we fail. */
3951 goto fail;
3952
3953
3954 /* endline is the dual of begline. */
3955 case endline:
3956 DEBUG_PRINT1 ("EXECUTING endline.\n");
3957
3958 if (AT_STRINGS_END (d))
3959 {
3960 if (!bufp->not_eol) break;
3961 }
3962
3963 /* We have to ``prefetch'' the next character. */
3964 else if ((d == end1 ? *string2 : *d) == '\n'
3965 && bufp->newline_anchor)
3966 {
3967 break;
3968 }
3969 goto fail;
3970
3971
3972 /* Match at the very beginning of the data. */
3973 case begbuf:
3974 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3975 if (AT_STRINGS_BEG (d))
3976 break;
3977 goto fail;
3978
3979
3980 /* Match at the very end of the data. */
3981 case endbuf:
3982 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3983 if (AT_STRINGS_END (d))
3984 break;
3985 goto fail;
3986
3987
3988 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3989 pushes NULL as the value for the string on the stack. Then
3990 `pop_failure_point' will keep the current value for the
3991 string, instead of restoring it. To see why, consider
3992 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3993 then the . fails against the \n. But the next thing we want
3994 to do is match the \n against the \n; if we restored the
3995 string value, we would be back at the foo.
3996
3997 Because this is used only in specific cases, we don't need to
3998 check all the things that `on_failure_jump' does, to make
3999 sure the right things get saved on the stack. Hence we don't
4000 share its code. The only reason to push anything on the
4001 stack at all is that otherwise we would have to change
4002 `anychar's code to do something besides goto fail in this
4003 case; that seems worse than this. */
4004 case on_failure_keep_string_jump:
4005 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4006
4007 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4008 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4009
4010 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4011 break;
4012
4013
4014 /* Uses of on_failure_jump:
4015
4016 Each alternative starts with an on_failure_jump that points
4017 to the beginning of the next alternative. Each alternative
4018 except the last ends with a jump that in effect jumps past
4019 the rest of the alternatives. (They really jump to the
4020 ending jump of the following alternative, because tensioning
4021 these jumps is a hassle.)
4022
4023 Repeats start with an on_failure_jump that points past both
4024 the repetition text and either the following jump or
4025 pop_failure_jump back to this on_failure_jump. */
4026 case on_failure_jump:
4027 on_failure:
4028 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4029
4030 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4031 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4032
4033 /* If this on_failure_jump comes right before a group (i.e.,
4034 the original * applied to a group), save the information
4035 for that group and all inner ones, so that if we fail back
4036 to this point, the group's information will be correct.
4037 For example, in \(a*\)*\1, we need the preceding group,
4038 and in \(\(a*\)b*\)\2, we need the inner group. */
4039
4040 /* We can't use `p' to check ahead because we push
4041 a failure point to `p + mcnt' after we do this. */
4042 p1 = p;
4043
4044 /* We need to skip no_op's before we look for the
4045 start_memory in case this on_failure_jump is happening as
4046 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4047 against aba. */
4048 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4049 p1++;
4050
4051 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4052 {
4053 /* We have a new highest active register now. This will
4054 get reset at the start_memory we are about to get to,
4055 but we will have saved all the registers relevant to
4056 this repetition op, as described above. */
4057 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4058 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4059 lowest_active_reg = *(p1 + 1);
4060 }
4061
4062 DEBUG_PRINT1 (":\n");
4063 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4064 break;
4065
4066
4067 /* A smart repeat ends with `maybe_pop_jump'.
4068 We change it to either `pop_failure_jump' or `jump'. */
4069 case maybe_pop_jump:
4070 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4071 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4072 {
4073 register unsigned char *p2 = p;
4074
4075 /* Compare the beginning of the repeat with what in the
4076 pattern follows its end. If we can establish that there
4077 is nothing that they would both match, i.e., that we
4078 would have to backtrack because of (as in, e.g., `a*a')
4079 then we can change to pop_failure_jump, because we'll
4080 never have to backtrack.
4081
4082 This is not true in the case of alternatives: in
4083 `(a|ab)*' we do need to backtrack to the `ab' alternative
4084 (e.g., if the string was `ab'). But instead of trying to
4085 detect that here, the alternative has put on a dummy
4086 failure point which is what we will end up popping. */
4087
4088 /* Skip over open/close-group commands. */
4089 while (p2 + 2 < pend
4090 && ((re_opcode_t) *p2 == stop_memory
4091 || (re_opcode_t) *p2 == start_memory))
4092 p2 += 3; /* Skip over args, too. */
4093
4094 /* If we're at the end of the pattern, we can change. */
4095 if (p2 == pend)
4096 {
4097 /* Consider what happens when matching ":\(.*\)"
4098 against ":/". I don't really understand this code
4099 yet. */
4100 p[-3] = (unsigned char) pop_failure_jump;
4101 DEBUG_PRINT1
4102 (" End of pattern: change to `pop_failure_jump'.\n");
4103 }
4104
4105 else if ((re_opcode_t) *p2 == exactn
4106 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4107 {
4108 register unsigned char c
4109 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4110 p1 = p + mcnt;
4111
4112 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4113 to the `maybe_finalize_jump' of this case. Examine what
4114 follows. */
4115 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4116 {
4117 p[-3] = (unsigned char) pop_failure_jump;
4118 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4119 c, p1[5]);
4120 }
4121
4122 else if ((re_opcode_t) p1[3] == charset
4123 || (re_opcode_t) p1[3] == charset_not)
4124 {
4125 int not = (re_opcode_t) p1[3] == charset_not;
4126
4127 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4128 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4129 not = !not;
4130
4131 /* `not' is equal to 1 if c would match, which means
4132 that we can't change to pop_failure_jump. */
4133 if (!not)
4134 {
4135 p[-3] = (unsigned char) pop_failure_jump;
4136 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4137 }
4138 }
4139 }
4140 }
4141 p -= 2; /* Point at relative address again. */
4142 if ((re_opcode_t) p[-1] != pop_failure_jump)
4143 {
4144 p[-1] = (unsigned char) jump;
4145 DEBUG_PRINT1 (" Match => jump.\n");
4146 goto unconditional_jump;
4147 }
4148 /* Note fall through. */
4149
4150
4151 /* The end of a simple repeat has a pop_failure_jump back to
4152 its matching on_failure_jump, where the latter will push a
4153 failure point. The pop_failure_jump takes off failure
4154 points put on by this pop_failure_jump's matching
4155 on_failure_jump; we got through the pattern to here from the
4156 matching on_failure_jump, so didn't fail. */
4157 case pop_failure_jump:
4158 {
4159 /* We need to pass separate storage for the lowest and
4160 highest registers, even though we don't care about the
4161 actual values. Otherwise, we will restore only one
4162 register from the stack, since lowest will == highest in
4163 `pop_failure_point'. */
4164 unsigned dummy_low_reg, dummy_high_reg;
4165 unsigned char *pdummy;
4166 const char *sdummy;
4167
4168 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4169 POP_FAILURE_POINT (sdummy, pdummy,
4170 dummy_low_reg, dummy_high_reg,
4171 reg_dummy, reg_dummy, reg_info_dummy);
4172 }
4173 /* Note fall through. */
4174
4175
4176 /* Unconditionally jump (without popping any failure points). */
4177 case jump:
4178 unconditional_jump:
4179 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4180 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4181 p += mcnt; /* Do the jump. */
4182 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4183 break;
4184
4185
4186 /* We need this opcode so we can detect where alternatives end
4187 in `group_match_null_string_p' et al. */
4188 case jump_past_alt:
4189 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4190 goto unconditional_jump;
4191
4192
4193 /* Normally, the on_failure_jump pushes a failure point, which
4194 then gets popped at pop_failure_jump. We will end up at
4195 pop_failure_jump, also, and with a pattern of, say, `a+', we
4196 are skipping over the on_failure_jump, so we have to push
4197 something meaningless for pop_failure_jump to pop. */
4198 case dummy_failure_jump:
4199 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4200 /* It doesn't matter what we push for the string here. What
4201 the code at `fail' tests is the value for the pattern. */
4202 PUSH_FAILURE_POINT (0, 0, -2);
4203 goto unconditional_jump;
4204
4205
4206 /* At the end of an alternative, we need to push a dummy failure
4207 point in case we are followed by a `pop_failure_jump', because
4208 we don't want the failure point for the alternative to be
4209 popped. For example, matching `(a|ab)*' against `aab'
4210 requires that we match the `ab' alternative. */
4211 case push_dummy_failure:
4212 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4213 /* See comments just above at `dummy_failure_jump' about the
4214 two zeroes. */
4215 PUSH_FAILURE_POINT (0, 0, -2);
4216 break;
4217
4218 /* Have to succeed matching what follows at least n times.
4219 After that, handle like `on_failure_jump'. */
4220 case succeed_n:
4221 EXTRACT_NUMBER (mcnt, p + 2);
4222 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4223
4224 assert (mcnt >= 0);
4225 /* Originally, this is how many times we HAVE to succeed. */
4226 if (mcnt > 0)
4227 {
4228 mcnt--;
4229 p += 2;
4230 STORE_NUMBER_AND_INCR (p, mcnt);
4231 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4232 }
4233 else if (mcnt == 0)
4234 {
4235 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4236 p[2] = (unsigned char) no_op;
4237 p[3] = (unsigned char) no_op;
4238 goto on_failure;
4239 }
4240 break;
4241
4242 case jump_n:
4243 EXTRACT_NUMBER (mcnt, p + 2);
4244 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4245
4246 /* Originally, this is how many times we CAN jump. */
4247 if (mcnt)
4248 {
4249 mcnt--;
4250 STORE_NUMBER (p + 2, mcnt);
4251 goto unconditional_jump;
4252 }
4253 /* If don't have to jump any more, skip over the rest of command. */
4254 else
4255 p += 4;
4256 break;
4257
4258 case set_number_at:
4259 {
4260 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4261
4262 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4263 p1 = p + mcnt;
4264 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4265 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4266 STORE_NUMBER (p1, mcnt);
4267 break;
4268 }
4269
4270 case wordbound:
4271 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4272 if (AT_WORD_BOUNDARY (d))
4273 break;
4274 goto fail;
4275
4276 case notwordbound:
4277 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4278 if (AT_WORD_BOUNDARY (d))
4279 goto fail;
4280 break;
4281
4282 case wordbeg:
4283 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4284 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4285 break;
4286 goto fail;
4287
4288 case wordend:
4289 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4290 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4291 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4292 break;
4293 goto fail;
4294
4295 #ifdef emacs
4296 #ifdef emacs19
4297 case before_dot:
4298 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4299 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4300 goto fail;
4301 break;
4302
4303 case at_dot:
4304 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4305 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4306 goto fail;
4307 break;
4308
4309 case after_dot:
4310 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4311 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4312 goto fail;
4313 break;
4314 #else /* not emacs19 */
4315 case at_dot:
4316 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4317 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4318 goto fail;
4319 break;
4320 #endif /* not emacs19 */
4321
4322 case syntaxspec:
4323 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4324 mcnt = *p++;
4325 goto matchsyntax;
4326
4327 case wordchar:
4328 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4329 mcnt = (int) Sword;
4330 matchsyntax:
4331 PREFETCH ();
4332 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4333 goto fail;
4334 SET_REGS_MATCHED ();
4335 break;
4336
4337 case notsyntaxspec:
4338 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4339 mcnt = *p++;
4340 goto matchnotsyntax;
4341
4342 case notwordchar:
4343 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4344 mcnt = (int) Sword;
4345 matchnotsyntax:
4346 PREFETCH ();
4347 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4348 goto fail;
4349 SET_REGS_MATCHED ();
4350 break;
4351
4352 #else /* not emacs */
4353 case wordchar:
4354 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4355 PREFETCH ();
4356 if (!WORDCHAR_P (d))
4357 goto fail;
4358 SET_REGS_MATCHED ();
4359 d++;
4360 break;
4361
4362 case notwordchar:
4363 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4364 PREFETCH ();
4365 if (WORDCHAR_P (d))
4366 goto fail;
4367 SET_REGS_MATCHED ();
4368 d++;
4369 break;
4370 #endif /* not emacs */
4371
4372 default:
4373 abort ();
4374 }
4375 continue; /* Successfully executed one pattern command; keep going. */
4376
4377
4378 /* We goto here if a matching operation fails. */
4379 fail:
4380 if (!FAIL_STACK_EMPTY ())
4381 { /* A restart point is known. Restore to that state. */
4382 DEBUG_PRINT1 ("\nFAIL:\n");
4383 POP_FAILURE_POINT (d, p,
4384 lowest_active_reg, highest_active_reg,
4385 regstart, regend, reg_info);
4386
4387 /* If this failure point is a dummy, try the next one. */
4388 if (!p)
4389 goto fail;
4390
4391 /* If we failed to the end of the pattern, don't examine *p. */
4392 assert (p <= pend);
4393 if (p < pend)
4394 {
4395 boolean is_a_jump_n = false;
4396
4397 /* If failed to a backwards jump that's part of a repetition
4398 loop, need to pop this failure point and use the next one. */
4399 switch ((re_opcode_t) *p)
4400 {
4401 case jump_n:
4402 is_a_jump_n = true;
4403 case maybe_pop_jump:
4404 case pop_failure_jump:
4405 case jump:
4406 p1 = p + 1;
4407 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4408 p1 += mcnt;
4409
4410 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4411 || (!is_a_jump_n
4412 && (re_opcode_t) *p1 == on_failure_jump))
4413 goto fail;
4414 break;
4415 default:
4416 /* do nothing */ ;
4417 }
4418 }
4419
4420 if (d >= string1 && d <= end1)
4421 dend = end_match_1;
4422 }
4423 else
4424 break; /* Matching at this starting point really fails. */
4425 } /* for (;;) */
4426
4427 if (best_regs_set)
4428 goto restore_best_regs;
4429
4430 FREE_VARIABLES ();
4431
4432 return -1; /* Failure to match. */
4433 } /* re_match_2 */
4434
4435 /* Subroutine definitions for re_match_2. */
4436
4437
4438 /* We are passed P pointing to a register number after a start_memory.
4439
4440 Return true if the pattern up to the corresponding stop_memory can
4441 match the empty string, and false otherwise.
4442
4443 If we find the matching stop_memory, sets P to point to one past its number.
4444 Otherwise, sets P to an undefined byte less than or equal to END.
4445
4446 We don't handle duplicates properly (yet). */
4447
4448 static boolean
group_match_null_string_p(p,end,reg_info)4449 group_match_null_string_p (p, end, reg_info)
4450 unsigned char **p, *end;
4451 register_info_type *reg_info;
4452 {
4453 int mcnt;
4454 /* Point to after the args to the start_memory. */
4455 unsigned char *p1 = *p + 2;
4456
4457 while (p1 < end)
4458 {
4459 /* Skip over opcodes that can match nothing, and return true or
4460 false, as appropriate, when we get to one that can't, or to the
4461 matching stop_memory. */
4462
4463 switch ((re_opcode_t) *p1)
4464 {
4465 /* Could be either a loop or a series of alternatives. */
4466 case on_failure_jump:
4467 p1++;
4468 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4469
4470 /* If the next operation is not a jump backwards in the
4471 pattern. */
4472
4473 if (mcnt >= 0)
4474 {
4475 /* Go through the on_failure_jumps of the alternatives,
4476 seeing if any of the alternatives cannot match nothing.
4477 The last alternative starts with only a jump,
4478 whereas the rest start with on_failure_jump and end
4479 with a jump, e.g., here is the pattern for `a|b|c':
4480
4481 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4482 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4483 /exactn/1/c
4484
4485 So, we have to first go through the first (n-1)
4486 alternatives and then deal with the last one separately. */
4487
4488
4489 /* Deal with the first (n-1) alternatives, which start
4490 with an on_failure_jump (see above) that jumps to right
4491 past a jump_past_alt. */
4492
4493 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4494 {
4495 /* `mcnt' holds how many bytes long the alternative
4496 is, including the ending `jump_past_alt' and
4497 its number. */
4498
4499 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4500 reg_info))
4501 return false;
4502
4503 /* Move to right after this alternative, including the
4504 jump_past_alt. */
4505 p1 += mcnt;
4506
4507 /* Break if it's the beginning of an n-th alternative
4508 that doesn't begin with an on_failure_jump. */
4509 if ((re_opcode_t) *p1 != on_failure_jump)
4510 break;
4511
4512 /* Still have to check that it's not an n-th
4513 alternative that starts with an on_failure_jump. */
4514 p1++;
4515 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4516 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4517 {
4518 /* Get to the beginning of the n-th alternative. */
4519 p1 -= 3;
4520 break;
4521 }
4522 }
4523
4524 /* Deal with the last alternative: go back and get number
4525 of the `jump_past_alt' just before it. `mcnt' contains
4526 the length of the alternative. */
4527 EXTRACT_NUMBER (mcnt, p1 - 2);
4528
4529 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4530 return false;
4531
4532 p1 += mcnt; /* Get past the n-th alternative. */
4533 } /* if mcnt > 0 */
4534 break;
4535
4536
4537 case stop_memory:
4538 assert (p1[1] == **p);
4539 *p = p1 + 2;
4540 return true;
4541
4542
4543 default:
4544 if (!common_op_match_null_string_p (&p1, end, reg_info))
4545 return false;
4546 }
4547 } /* while p1 < end */
4548
4549 return false;
4550 } /* group_match_null_string_p */
4551
4552
4553 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4554 It expects P to be the first byte of a single alternative and END one
4555 byte past the last. The alternative can contain groups. */
4556
4557 static boolean
alt_match_null_string_p(p,end,reg_info)4558 alt_match_null_string_p (p, end, reg_info)
4559 unsigned char *p, *end;
4560 register_info_type *reg_info;
4561 {
4562 int mcnt;
4563 unsigned char *p1 = p;
4564
4565 while (p1 < end)
4566 {
4567 /* Skip over opcodes that can match nothing, and break when we get
4568 to one that can't. */
4569
4570 switch ((re_opcode_t) *p1)
4571 {
4572 /* It's a loop. */
4573 case on_failure_jump:
4574 p1++;
4575 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4576 p1 += mcnt;
4577 break;
4578
4579 default:
4580 if (!common_op_match_null_string_p (&p1, end, reg_info))
4581 return false;
4582 }
4583 } /* while p1 < end */
4584
4585 return true;
4586 } /* alt_match_null_string_p */
4587
4588
4589 /* Deals with the ops common to group_match_null_string_p and
4590 alt_match_null_string_p.
4591
4592 Sets P to one after the op and its arguments, if any. */
4593
4594 static boolean
common_op_match_null_string_p(p,end,reg_info)4595 common_op_match_null_string_p (p, end, reg_info)
4596 unsigned char **p, *end;
4597 register_info_type *reg_info;
4598 {
4599 int mcnt;
4600 boolean ret;
4601 int reg_no;
4602 unsigned char *p1 = *p;
4603
4604 switch ((re_opcode_t) *p1++)
4605 {
4606 case no_op:
4607 case begline:
4608 case endline:
4609 case begbuf:
4610 case endbuf:
4611 case wordbeg:
4612 case wordend:
4613 case wordbound:
4614 case notwordbound:
4615 #ifdef emacs
4616 case before_dot:
4617 case at_dot:
4618 case after_dot:
4619 #endif
4620 break;
4621
4622 case start_memory:
4623 reg_no = *p1;
4624 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4625 ret = group_match_null_string_p (&p1, end, reg_info);
4626
4627 /* Have to set this here in case we're checking a group which
4628 contains a group and a back reference to it. */
4629
4630 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4631 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4632
4633 if (!ret)
4634 return false;
4635 break;
4636
4637 /* If this is an optimized succeed_n for zero times, make the jump. */
4638 case jump:
4639 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4640 if (mcnt >= 0)
4641 p1 += mcnt;
4642 else
4643 return false;
4644 break;
4645
4646 case succeed_n:
4647 /* Get to the number of times to succeed. */
4648 p1 += 2;
4649 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4650
4651 if (mcnt == 0)
4652 {
4653 p1 -= 4;
4654 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4655 p1 += mcnt;
4656 }
4657 else
4658 return false;
4659 break;
4660
4661 case duplicate:
4662 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4663 return false;
4664 break;
4665
4666 case set_number_at:
4667 p1 += 4;
4668
4669 default:
4670 /* All other opcodes mean we cannot match the empty string. */
4671 return false;
4672 }
4673
4674 *p = p1;
4675 return true;
4676 } /* common_op_match_null_string_p */
4677
4678
4679 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4680 bytes; non-zero otherwise. */
4681
4682 static int
bcmp_translate(s1,s2,len,translate)4683 bcmp_translate (s1, s2, len, translate)
4684 unsigned char *s1, *s2;
4685 register int len;
4686 char *translate;
4687 {
4688 register unsigned char *p1 = s1, *p2 = s2;
4689 while (len)
4690 {
4691 if (translate[*p1++] != translate[*p2++]) return 1;
4692 len--;
4693 }
4694 return 0;
4695 }
4696
4697 /* Entry points for GNU code. */
4698
4699 /* re_compile_pattern is the GNU regular expression compiler: it
4700 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4701 Returns 0 if the pattern was valid, otherwise an error string.
4702
4703 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4704 are set in BUFP on entry.
4705
4706 We call regex_compile to do the actual compilation. */
4707
4708 const char *
re_compile_pattern(pattern,length,bufp)4709 re_compile_pattern (pattern, length, bufp)
4710 const char *pattern;
4711 int length;
4712 struct re_pattern_buffer *bufp;
4713 {
4714 reg_errcode_t ret;
4715
4716 /* GNU code is written to assume at least RE_NREGS registers will be set
4717 (and at least one extra will be -1). */
4718 bufp->regs_allocated = REGS_UNALLOCATED;
4719
4720 /* And GNU code determines whether or not to get register information
4721 by passing null for the REGS argument to re_match, etc., not by
4722 setting no_sub. */
4723 bufp->no_sub = 0;
4724
4725 /* Match anchors at newline. */
4726 bufp->newline_anchor = 1;
4727
4728 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4729
4730 return re_error_msg[(int) ret];
4731 }
4732
4733 /* Entry points compatible with 4.2 BSD regex library. We don't define
4734 them if this is an Emacs or POSIX compilation. */
4735
4736 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4737
4738 /* BSD has one and only one pattern buffer. */
4739 static struct re_pattern_buffer re_comp_buf;
4740
4741 char *
re_comp(s)4742 re_comp (s)
4743 const char *s;
4744 {
4745 reg_errcode_t ret;
4746
4747 if (!s)
4748 {
4749 if (!re_comp_buf.buffer)
4750 return "No previous regular expression";
4751 return 0;
4752 }
4753
4754 if (!re_comp_buf.buffer)
4755 {
4756 re_comp_buf.buffer = (unsigned char *) malloc (200);
4757 if (re_comp_buf.buffer == NULL)
4758 return "Memory exhausted";
4759 re_comp_buf.allocated = 200;
4760
4761 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4762 if (re_comp_buf.fastmap == NULL)
4763 return "Memory exhausted";
4764 }
4765
4766 /* Since `re_exec' always passes NULL for the `regs' argument, we
4767 don't need to initialize the pattern buffer fields which affect it. */
4768
4769 /* Match anchors at newlines. */
4770 re_comp_buf.newline_anchor = 1;
4771
4772 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4773
4774 /* Yes, we're discarding `const' here. */
4775 return (char *) re_error_msg[(int) ret];
4776 }
4777
4778
4779 int
re_exec(s)4780 re_exec (s)
4781 const char *s;
4782 {
4783 const int len = (int)strlen (s);
4784 return
4785 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4786 }
4787 #endif /* not emacs and not _POSIX_SOURCE */
4788
4789 /* POSIX.2 functions. Don't define these for Emacs. */
4790
4791 #ifndef emacs
4792
4793 /* regcomp takes a regular expression as a string and compiles it.
4794
4795 PREG is a regex_t *. We do not expect any fields to be initialized,
4796 since POSIX says we shouldn't. Thus, we set
4797
4798 `buffer' to the compiled pattern;
4799 `used' to the length of the compiled pattern;
4800 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4801 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4802 RE_SYNTAX_POSIX_BASIC;
4803 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4804 `fastmap' and `fastmap_accurate' to zero;
4805 `re_nsub' to the number of subexpressions in PATTERN.
4806
4807 PATTERN is the address of the pattern string.
4808
4809 CFLAGS is a series of bits which affect compilation.
4810
4811 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4812 use POSIX basic syntax.
4813
4814 If REG_NEWLINE is set, then . and [^...] don't match newline.
4815 Also, regexec will try a match beginning after every newline.
4816
4817 If REG_ICASE is set, then we considers upper- and lowercase
4818 versions of letters to be equivalent when matching.
4819
4820 If REG_NOSUB is set, then when PREG is passed to regexec, that
4821 routine will report only success or failure, and nothing about the
4822 registers.
4823
4824 It returns 0 if it succeeds, non-zero if it doesn't. (See regex.h for
4825 the return codes and their meanings.) */
4826
4827 int
regcomp(preg,pattern,cflags)4828 regcomp (preg, pattern, cflags)
4829 regex_t *preg;
4830 const char *pattern;
4831 int cflags;
4832 {
4833 reg_errcode_t ret;
4834 unsigned syntax
4835 = (cflags & REG_EXTENDED) ?
4836 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4837
4838 /* regex_compile will allocate the space for the compiled pattern. */
4839 preg->buffer = 0;
4840 preg->allocated = 0;
4841
4842 /* Don't bother to use a fastmap when searching. This simplifies the
4843 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4844 characters after newlines into the fastmap. This way, we just try
4845 every character. */
4846 preg->fastmap = 0;
4847
4848 if (cflags & REG_ICASE)
4849 {
4850 unsigned i;
4851
4852 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4853 if (preg->translate == NULL)
4854 return (int) REG_ESPACE;
4855
4856 /* Map uppercase characters to corresponding lowercase ones. */
4857 for (i = 0; i < CHAR_SET_SIZE; i++)
4858 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4859 }
4860 else
4861 preg->translate = NULL;
4862
4863 /* If REG_NEWLINE is set, newlines are treated differently. */
4864 if (cflags & REG_NEWLINE)
4865 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4866 syntax &= ~RE_DOT_NEWLINE;
4867 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4868 /* It also changes the matching behavior. */
4869 preg->newline_anchor = 1;
4870 }
4871 else
4872 preg->newline_anchor = 0;
4873
4874 preg->no_sub = !!(cflags & REG_NOSUB);
4875
4876 /* POSIX says a null character in the pattern terminates it, so we
4877 can use strlen here in compiling the pattern. */
4878 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4879
4880 /* POSIX doesn't distinguish between an unmatched open-group and an
4881 unmatched close-group: both are REG_EPAREN. */
4882 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4883
4884 return (int) ret;
4885 }
4886
4887
4888 /* regexec searches for a given pattern, specified by PREG, in the
4889 string STRING.
4890
4891 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4892 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4893 least NMATCH elements, and we set them to the offsets of the
4894 corresponding matched substrings.
4895
4896 EFLAGS specifies `execution flags' which affect matching: if
4897 REG_NOTBOL is set, then ^ does not match at the beginning of the
4898 string; if REG_NOTEOL is set, then $ does not match at the end.
4899
4900 We return 0 if we find a match and REG_NOMATCH if not. */
4901
4902 int
regexec(preg,string,nmatch,pmatch,eflags)4903 regexec (preg, string, nmatch, pmatch, eflags)
4904 const regex_t *preg;
4905 const char *string;
4906 size_t nmatch;
4907 regmatch_t pmatch[];
4908 int eflags;
4909 {
4910 int ret;
4911 struct re_registers regs;
4912 regex_t private_preg;
4913 int len = (int)strlen (string);
4914 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4915
4916 private_preg = *preg;
4917
4918 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4919 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4920
4921 /* The user has told us exactly how many registers to return
4922 information about, via `nmatch'. We have to pass that on to the
4923 matching routines. */
4924 private_preg.regs_allocated = REGS_FIXED;
4925
4926 if (want_reg_info)
4927 {
4928 regs.num_regs = (unsigned)nmatch;
4929 regs.start = TALLOC (nmatch, regoff_t);
4930 regs.end = TALLOC (nmatch, regoff_t);
4931
4932 if (regs.start == NULL)
4933 {
4934 if (NULL != regs.end)
4935 free (regs.end);
4936 return (int) REG_NOMATCH;
4937 }
4938 if (regs.end == NULL)
4939 {
4940 free (regs.start);
4941 return (int) REG_NOMATCH;
4942 }
4943 }
4944
4945 /* Perform the searching operation. */
4946 ret = re_search (&private_preg, string, len,
4947 /* start: */ 0, /* range: */ len,
4948 want_reg_info ? ®s : (struct re_registers *) 0);
4949
4950 /* Copy the register information to the POSIX structure. */
4951 if (want_reg_info)
4952 {
4953 if (ret >= 0)
4954 {
4955 unsigned r;
4956
4957 for (r = 0; r < nmatch; r++)
4958 {
4959 pmatch[r].rm_so = regs.start[r];
4960 pmatch[r].rm_eo = regs.end[r];
4961 }
4962 }
4963
4964 /* If we needed the temporary register info, free the space now. */
4965 free (regs.start);
4966 free (regs.end);
4967 }
4968
4969 /* We want zero return to mean success, unlike `re_search'. */
4970 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4971 }
4972
4973
4974 /* Returns a message corresponding to an error code, ERRCODE, returned
4975 from either regcomp or regexec. We don't use PREG here. */
4976
4977 size_t
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)4978 regerror (
4979 int errcode,
4980 const regex_t *preg,
4981 char *errbuf,
4982 size_t errbuf_size)
4983 {
4984 const char *msg;
4985
4986 if (errcode < 0
4987 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4988 /* Only error codes returned by the rest of the code should be passed
4989 to this routine. If we are given anything else, or if other regex
4990 code generates an invalid error code, then the program has a bug.
4991 Dump core so we can fix it. */
4992 abort ();
4993
4994 msg = re_error_msg[errcode];
4995
4996 /* POSIX doesn't require that we do anything in this case, but why
4997 not be nice. */
4998 if (NULL == msg)
4999 msg = "Success";
5000
5001 return zbx_strlcpy(errbuf, msg, errbuf_size) + 1; /* include the null */
5002 }
5003
5004
5005 /* Free dynamically allocated space used by PREG. */
5006
5007 void
regfree(preg)5008 regfree (preg)
5009 regex_t *preg;
5010 {
5011 if (preg->buffer != NULL)
5012 free (preg->buffer);
5013 preg->buffer = NULL;
5014
5015 preg->allocated = 0;
5016 preg->used = 0;
5017
5018 if (preg->fastmap != NULL)
5019 free (preg->fastmap);
5020 preg->fastmap = NULL;
5021 preg->fastmap_accurate = 0;
5022
5023 if (preg->translate != NULL)
5024 free (preg->translate);
5025 preg->translate = NULL;
5026 }
5027
5028 #endif /* not emacs */
5029
5030 #if defined(_WIN64) && defined(_MSC_VER)
5031 /* restore optimization options */
5032 # pragma optimize( "", on )
5033 #endif
5034
5035
5036 /*
5037 Local variables:
5038 make-backup-files: t
5039 version-control: t
5040 trim-versions-without-asking: nil
5041 End:
5042 */
5043