1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5
6 Copyright (C) 1993,1994,1995,1996,1997,1999,2000 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software Foundation,
20 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21
22 /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto)
23 Last change: Dec 1, 2000 by okabe katsuyuki */
24
25 /* AIX requires this to be the first thing in the file. */
26 #if defined (_AIX) && !defined (REGEX_MALLOC)
27 #pragma alloca
28 #endif
29
30 #undef _GNU_SOURCE
31 #define _GNU_SOURCE
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif
36
37 #if defined(STDC_HEADERS) && !defined(emacs)
38 #include <stddef.h>
39 #else
40 /* We need this for `regex.h', and perhaps for the Emacs include files. */
41 #include <sys/types.h>
42 #endif
43
44 /* For platform which support the ISO C amendement 1 functionality we
45 support user defined character classes. */
46 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
47 # include <wctype.h>
48 # include <wchar.h>
49 #endif
50
51 /* This is for other GNU distributions with internationalized messages. */
52 #if HAVE_LIBINTL_H || defined (_LIBC)
53 # include <libintl.h>
54 #else
55 # define gettext(msgid) (msgid)
56 #endif
57
58 #ifndef gettext_noop
59 /* This define is so xgettext can find the internationalizable
60 strings. */
61 #define gettext_noop(String) String
62 #endif
63
64 /* The `emacs' switch turns on certain matching commands
65 that make sense only in Emacs. */
66 #ifdef emacs
67
68 #include "lisp.h"
69 #include "buffer.h"
70 #include "syntax.h"
71
72 #else /* not emacs */
73
74 /* If we are not linking with Emacs proper,
75 we can't use the relocating allocator
76 even if config.h says that we can. */
77 #undef REL_ALLOC
78
79 #if defined (STDC_HEADERS) || defined (_LIBC)
80 #include <stdlib.h>
81 #else
82 char *malloc ();
83 char *realloc ();
84 #endif
85
86 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
87 If nothing else has been done, use the method below. */
88 #ifdef INHIBIT_STRING_HEADER
89 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
90 #if !defined (bzero) && !defined (bcopy)
91 #undef INHIBIT_STRING_HEADER
92 #endif
93 #endif
94 #endif
95
96 /* This is the normal way of making sure we have a bcopy and a bzero.
97 This is used in most programs--a few other programs avoid this
98 by defining INHIBIT_STRING_HEADER. */
99 #ifndef INHIBIT_STRING_HEADER
100 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
101 #include <string.h>
102 #ifndef bcmp
103 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
104 #endif
105 #ifndef bcopy
106 #define bcopy(s, d, n) memmove ((d), (s), (n))
107 #endif
108 #ifndef bzero
109 #define bzero(s, n) memset ((s), 0, (n))
110 #endif
111 #else
112 #include <strings.h>
113 #endif
114 #endif
115
116 /* Define the syntax stuff for \<, \>, etc. */
117
118 /* This must be nonzero for the wordchar and notwordchar pattern
119 commands in re_match_2. */
120 #ifndef Sword
121 #define Sword 1
122 #endif
123
124 #ifdef SWITCH_ENUM_BUG
125 #define SWITCH_ENUM_CAST(x) ((int)(x))
126 #else
127 #define SWITCH_ENUM_CAST(x) (x)
128 #endif
129
130 #endif /* not emacs */
131
132 /* Get the interface, including the syntax bits. */
133 #include "regex.h"
134 #include "mbc.h"
135
136 /* isalpha etc. are used for the character classes. */
137 #include <ctype.h>
138
139 /* Jim Meyering writes:
140
141 "... Some ctype macros are valid only for character codes that
142 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
143 using /bin/cc or gcc but without giving an ansi option). So, all
144 ctype uses should be through macros like ISPRINT... If
145 STDC_HEADERS is defined, then autoconf has verified that the ctype
146 macros don't need to be guarded with references to isascii. ...
147 Defining isascii to 1 should let any compiler worth its salt
148 eliminate the && through constant folding." */
149
150 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
151 #define ISASCII(c) 1
152 #else
153 #define ISASCII(c) isascii(c)
154 #endif
155
156 #ifdef isblank
157 #define ISBLANK(c) (ISASCII (c) && isblank (c))
158 #else
159 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
160 #endif
161 #ifdef isgraph
162 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
163 #else
164 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
165 #endif
166
167 #define ISPRINT(c) (ISASCII (c) && isprint (c))
168 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
169 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
170 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
171 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
172 #define ISLOWER(c) (ISASCII (c) && islower (c))
173 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
174 #define ISSPACE(c) (ISASCII (c) && isspace (c))
175 #define ISUPPER(c) (ISASCII (c) && isupper (c))
176 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
177
178 #ifndef NULL
179 #define NULL (void *)0
180 #endif
181
182 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
183 since ours (we hope) works properly with all combinations of
184 machines, compilers, `char' and `unsigned char' argument types.
185 (Per Bothner suggested the basic approach.) */
186 #undef SIGN_EXTEND_CHAR
187 #if __STDC__ || defined(MSDOS)
188 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
189 #else /* not __STDC__ */
190 /* As in Harbison and Steele. */
191 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
192 #endif
193
194 #ifndef emacs
195 /* How many characters in the character set. */
196 #define CHAR_SET_SIZE 256
197
198 #ifdef SYNTAX_TABLE
199
200 extern char *re_syntax_table;
201
202 #else /* not SYNTAX_TABLE */
203
204 static char re_syntax_table[CHAR_SET_SIZE];
205
206 static void
init_syntax_once()207 init_syntax_once ()
208 {
209 register int c;
210 static int done = 0;
211
212 if (done)
213 return;
214 bzero (re_syntax_table, sizeof re_syntax_table);
215
216 for (c = 0; c < CHAR_SET_SIZE; c++)
217 if (ISALNUM (c))
218 re_syntax_table[c] = Sword;
219
220 re_syntax_table['_'] = Sword;
221
222 done = 1;
223 }
224
225 #endif /* not SYNTAX_TABLE */
226
227 #define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
228
229 #endif /* emacs */
230
231 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
232 use `alloca' instead of `malloc'. This is because using malloc in
233 re_search* or re_match* could cause memory leaks when C-g is used in
234 Emacs; also, malloc is slower and causes storage fragmentation. On
235 the other hand, malloc is more portable, and easier to debug.
236
237 Because we sometimes use alloca, some routines have to be macros,
238 not functions -- `alloca'-allocated space disappears at the end of the
239 function it is called in. */
240
241 #ifdef REGEX_MALLOC
242
243 #define REGEX_ALLOCATE malloc
244 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
245 #define REGEX_FREE free
246
247 #else /* not REGEX_MALLOC */
248
249 /* Emacs already defines alloca, sometimes. */
250 #ifndef alloca
251
252 /* Make alloca work the best possible way. */
253 #ifdef __GNUC__
254 #define alloca __builtin_alloca
255 #else /* not __GNUC__ */
256 #if HAVE_ALLOCA_H
257 #include <alloca.h>
258 #else /* not __GNUC__ or HAVE_ALLOCA_H */
259 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
260 #ifndef _AIX /* Already did AIX, up at the top. */
261 #ifdef __TURBOC__
262 #include <malloc.h>
263 #else
264 char *alloca ();
265 #endif
266 #endif /* not _AIX */
267 #endif
268 #endif /* not HAVE_ALLOCA_H */
269 #endif /* not __GNUC__ */
270
271 #endif /* not alloca */
272
273 #define REGEX_ALLOCATE alloca
274
275 /* Assumes a `char *destination' variable. */
276 #define REGEX_REALLOCATE(source, osize, nsize) \
277 (destination = (char *) alloca (nsize), \
278 bcopy (source, destination, osize), \
279 destination)
280
281 /* No need to do anything to free, after alloca. */
282 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
283
284 #endif /* not REGEX_MALLOC */
285
286 /* Define how to allocate the failure stack. */
287
288 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
289
290 #define REGEX_ALLOCATE_STACK(size) \
291 r_alloc (&failure_stack_ptr, (size))
292 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
293 r_re_alloc (&failure_stack_ptr, (nsize))
294 #define REGEX_FREE_STACK(ptr) \
295 r_alloc_free (&failure_stack_ptr)
296
297 #else /* not using relocating allocator */
298
299 #ifdef REGEX_MALLOC
300
301 #define REGEX_ALLOCATE_STACK malloc
302 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
303 #define REGEX_FREE_STACK free
304
305 #else /* not REGEX_MALLOC */
306
307 #define REGEX_ALLOCATE_STACK alloca
308
309 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
310 REGEX_REALLOCATE (source, osize, nsize)
311 /* No need to explicitly free anything. */
312 #define REGEX_FREE_STACK(arg)
313
314 #endif /* not REGEX_MALLOC */
315 #endif /* not using relocating allocator */
316
317
318 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
319 `string1' or just past its end. This works if PTR is NULL, which is
320 a good thing. */
321 #define FIRST_STRING_P(ptr) \
322 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
323
324 /* (Re)Allocate N items of type T using malloc, or fail. */
325 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
326 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
327 #define RETALLOC_IF(addr, n, t) \
328 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
329 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
330
331 #define BYTEWIDTH 8 /* In bits. */
332
333 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
334
335 #undef MAX
336 #undef MIN
337 #define MAX(a, b) ((a) > (b) ? (a) : (b))
338 #define MIN(a, b) ((a) < (b) ? (a) : (b))
339
340 typedef char boolean;
341 #define false 0
342 #define true 1
343
344 static int re_match_2_internal ();
345
346 /* These are the command codes that appear in compiled regular
347 expressions. Some opcodes are followed by argument bytes. A
348 command code can specify any interpretation whatsoever for its
349 arguments. Zero bytes may appear in the compiled regular expression. */
350
351 typedef enum
352 {
353 no_op = 0,
354
355 /* Succeed right away--no more backtracking. */
356 succeed,
357
358 /* Followed by one byte giving n, then by n literal bytes. */
359 exactn,
360
361 /* Matches any (more or less) character. */
362 anychar,
363
364 /* Matches any one char belonging to specified set. First
365 following byte is number of bitmap bytes. Then come bytes
366 for a bitmap saying which chars are in. Bits in each byte
367 are ordered low-bit-first. A character is in the set if its
368 bit is 1. A character too large to have a bit in the map is
369 automatically not in the set. */
370 charset,
371
372 /* Same parameters as charset, but match any character that is
373 not one of those specified. */
374 charset_not,
375
376 /* Start remembering the text that is matched, for storing in a
377 register. Followed by one byte with the register number, in
378 the range 0 to one less than the pattern buffer's re_nsub
379 field. Then followed by one byte with the number of groups
380 inner to this one. (This last has to be part of the
381 start_memory only because we need it in the on_failure_jump
382 of re_match_2.) */
383 start_memory,
384
385 /* Stop remembering the text that is matched and store it in a
386 memory register. Followed by one byte with the register
387 number, in the range 0 to one less than `re_nsub' in the
388 pattern buffer, and one byte with the number of inner groups,
389 just like `start_memory'. (We need the number of inner
390 groups here because we don't have any easy way of finding the
391 corresponding start_memory when we're at a stop_memory.) */
392 stop_memory,
393
394 /* Match a duplicate of something remembered. Followed by one
395 byte containing the register number. */
396 duplicate,
397
398 /* Fail unless at beginning of line. */
399 begline,
400
401 /* Fail unless at end of line. */
402 endline,
403
404 /* Succeeds if at beginning of buffer (if emacs) or at beginning
405 of string to be matched (if not). */
406 begbuf,
407
408 /* Analogously, for end of buffer/string. */
409 endbuf,
410
411 /* Followed by two byte relative address to which to jump. */
412 jump,
413
414 /* Same as jump, but marks the end of an alternative. */
415 jump_past_alt,
416
417 /* Followed by two-byte relative address of place to resume at
418 in case of failure. */
419 on_failure_jump,
420
421 /* Like on_failure_jump, but pushes a placeholder instead of the
422 current string position when executed. */
423 on_failure_keep_string_jump,
424
425 /* Throw away latest failure point and then jump to following
426 two-byte relative address. */
427 pop_failure_jump,
428
429 /* Change to pop_failure_jump if know won't have to backtrack to
430 match; otherwise change to jump. This is used to jump
431 back to the beginning of a repeat. If what follows this jump
432 clearly won't match what the repeat does, such that we can be
433 sure that there is no use backtracking out of repetitions
434 already matched, then we change it to a pop_failure_jump.
435 Followed by two-byte address. */
436 maybe_pop_jump,
437
438 /* Jump to following two-byte address, and push a dummy failure
439 point. This failure point will be thrown away if an attempt
440 is made to use it for a failure. A `+' construct makes this
441 before the first repeat. Also used as an intermediary kind
442 of jump when compiling an alternative. */
443 dummy_failure_jump,
444
445 /* Push a dummy failure point and continue. Used at the end of
446 alternatives. */
447 push_dummy_failure,
448
449 /* Followed by two-byte relative address and two-byte number n.
450 After matching N times, jump to the address upon failure. */
451 succeed_n,
452
453 /* Followed by two-byte relative address, and two-byte number n.
454 Jump to the address N times, then fail. */
455 jump_n,
456
457 /* Set the following two-byte relative address to the
458 subsequent two-byte number. The address *includes* the two
459 bytes of number. */
460 set_number_at,
461
462 wordchar, /* Matches any word-constituent character. */
463 notwordchar, /* Matches any char that is not a word-constituent. */
464
465 wordbeg, /* Succeeds if at word beginning. */
466 wordend, /* Succeeds if at word end. */
467
468 wordbound, /* Succeeds if at a word boundary. */
469 notwordbound /* Succeeds if not at a word boundary. */
470
471 #ifdef emacs
472 ,before_dot, /* Succeeds if before point. */
473 at_dot, /* Succeeds if at point. */
474 after_dot, /* Succeeds if after point. */
475
476 /* Matches any character whose syntax is specified. Followed by
477 a byte which contains a syntax code, e.g., Sword. */
478 syntaxspec,
479
480 /* Matches any character whose syntax is not that specified. */
481 notsyntaxspec
482 #endif /* emacs */
483 } re_opcode_t;
484
485 /* Common operations on the compiled pattern. */
486
487 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
488
489 #define STORE_NUMBER(destination, number) \
490 do { \
491 (destination)[0] = (number) & 0377; \
492 (destination)[1] = (number) >> 8; \
493 } while (0)
494
495 /* Same as STORE_NUMBER, except increment DESTINATION to
496 the byte after where the number is stored. Therefore, DESTINATION
497 must be an lvalue. */
498
499 #define STORE_NUMBER_AND_INCR(destination, number) \
500 do { \
501 STORE_NUMBER (destination, number); \
502 (destination) += 2; \
503 } while (0)
504
505 /* Put into DESTINATION a number stored in two contiguous bytes starting
506 at SOURCE. */
507
508 #define EXTRACT_NUMBER(destination, source) \
509 do { \
510 (destination) = *(source) & 0377; \
511 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
512 } while (0)
513
514 #ifdef DEBUG
515 static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
516 static void
extract_number(dest,source)517 extract_number (dest, source)
518 int *dest;
519 unsigned char *source;
520 {
521 int temp = SIGN_EXTEND_CHAR (*(source + 1));
522 *dest = *source & 0377;
523 *dest += temp << 8;
524 }
525
526 #ifndef EXTRACT_MACROS /* To debug the macros. */
527 #undef EXTRACT_NUMBER
528 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
529 #endif /* not EXTRACT_MACROS */
530
531 #endif /* DEBUG */
532
533 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
534 SOURCE must be an lvalue. */
535
536 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
537 do { \
538 EXTRACT_NUMBER (destination, source); \
539 (source) += 2; \
540 } while (0)
541
542 #ifdef DEBUG
543 static void extract_number_and_incr _RE_ARGS ((int *destination,
544 unsigned char **source));
545 static void
extract_number_and_incr(destination,source)546 extract_number_and_incr (destination, source)
547 int *destination;
548 unsigned char **source;
549 {
550 extract_number (destination, *source);
551 *source += 2;
552 }
553
554 #ifndef EXTRACT_MACROS
555 #undef EXTRACT_NUMBER_AND_INCR
556 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
557 extract_number_and_incr (&dest, &src)
558 #endif /* not EXTRACT_MACROS */
559
560 #endif /* DEBUG */
561
562 #define STORE_MBC(p, wc) do { \
563 unsigned long _wc = (wc); \
564 (p)[3] = (unsigned char) _wc; \
565 (p)[2] = (unsigned char) (_wc >>= 8); \
566 (p)[1] = (unsigned char) (_wc >>= 8); \
567 (p)[0] = (unsigned char) (_wc >> 8); \
568 } while (0)
569 #define STORE_MBC_AND_INCR(p, wc) do { \
570 STORE_MBC((p), (wc)); \
571 (p) += 4; \
572 } while (0)
573
574 #define EXTRACT_MBC(p) \
575 ((((mbchar_t) (unsigned char) (p)[0]) << 24) \
576 | (((mbchar_t) (unsigned char) (p)[1]) << 16) \
577 | (((mbchar_t) (unsigned char) (p)[2]) << 8) \
578 | ((mbchar_t) (unsigned char) (p)[3]))
579 #define EXTRACT_MBC_AND_INCR(p) ((p) += 4, EXTRACT_MBC(p - 4))
580
581 #define EXTRACT_UNSIGNED(p) \
582 ((unsigned char) (p)[0] | (unsigned char) (p)[1] << 8)
583 #define EXTRACT_UNSIGNED_AND_INCR(p) \
584 ((p) += 2, (unsigned char) (p)[-2] | (unsigned char) (p)[-1] << 8)
585
586 /* If DEBUG is defined, Regex prints many voluminous messages about what
587 it is doing (if the variable `debug' is nonzero). If linked with the
588 main program in `iregex.c', you can enter patterns and strings
589 interactively. And if linked with the main program in `main.c' and
590 the other test files, you can run the already-written tests. */
591
592 #ifdef DEBUG
593
594 /* We use standard I/O for debugging. */
595 #include <stdio.h>
596
597 /* It is useful to test things that ``must'' be true when debugging. */
598 #include <assert.h>
599
600 static int debug = 0;
601
602 #define DEBUG_STATEMENT(e) e
603 #define DEBUG_PRINT1(x) if (debug) printf (x)
604 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
605 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
606 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
607 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
608 if (debug) print_partial_compiled_pattern (s, e)
609 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
610 if (debug) print_double_string (w, s1, sz1, s2, sz2)
611
612
613 /* Print the fastmap in human-readable form. */
614
615 void
print_fastmap(fastmap)616 print_fastmap (fastmap)
617 char *fastmap;
618 {
619 unsigned was_a_range = 0;
620 unsigned i = 0;
621
622 while (i < (1 << BYTEWIDTH))
623 {
624 if (fastmap[i++])
625 {
626 was_a_range = 0;
627 putchar (i - 1);
628 while (i < (1 << BYTEWIDTH) && fastmap[i])
629 {
630 was_a_range = 1;
631 i++;
632 }
633 if (was_a_range)
634 {
635 printf ("-");
636 putchar (i - 1);
637 }
638 }
639 }
640 putchar ('\n');
641 }
642
643
644 /* Print a compiled pattern string in human-readable form, starting at
645 the START pointer into it and ending just before the pointer END. */
646
647 void
print_partial_compiled_pattern(start,end)648 print_partial_compiled_pattern (start, end)
649 unsigned char *start;
650 unsigned char *end;
651 {
652 int mcnt, mcnt2;
653 unsigned char *p1;
654 unsigned char *p = start;
655 unsigned char *pend = end;
656
657 if (start == NULL)
658 {
659 printf ("(null)\n");
660 return;
661 }
662
663 /* Loop over pattern commands. */
664 while (p < pend)
665 {
666 printf ("%d:\t", p - start);
667
668 switch ((re_opcode_t) *p++)
669 {
670 case no_op:
671 printf ("/no_op");
672 break;
673
674 case exactn:
675 mcnt = *p++;
676 printf ("/exactn/%d", mcnt);
677 do
678 {
679 size_t n;
680 putchar ('/');
681 if (ismbchar (*p) && (n = mbclen(*p)) <= mcnt) {
682 printf ("/%.*s", n, (char *) p);
683 p += n;
684 mcnt -= (n - 1);
685 continue;
686 }
687 putchar (*p++);
688 }
689 while (--mcnt);
690 break;
691
692 case start_memory:
693 mcnt = *p++;
694 printf ("/start_memory/%d/%d", mcnt, *p++);
695 break;
696
697 case stop_memory:
698 mcnt = *p++;
699 printf ("/stop_memory/%d/%d", mcnt, *p++);
700 break;
701
702 case duplicate:
703 printf ("/duplicate/%d", *p++);
704 break;
705
706 case anychar:
707 printf ("/anychar");
708 break;
709
710 case charset:
711 case charset_not:
712 {
713 register int c, last = -100;
714 register int in_range = 0;
715
716 printf ("/charset [%s",
717 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
718
719 assert (p + *p < pend);
720
721 for (c = 0; c < 256; c++)
722 if (c / 8 < *p
723 && (p[1 + (c/8)] & (1 << (c % 8))))
724 {
725 /* Are we starting a range? */
726 if (last + 1 == c && ! in_range)
727 {
728 putchar ('-');
729 in_range = 1;
730 }
731 /* Have we broken a range? */
732 else if (last + 1 != c && in_range)
733 {
734 putchar (last);
735 in_range = 0;
736 }
737
738 if (! in_range)
739 putchar (c);
740
741 last = c;
742 }
743
744 if (in_range)
745 putchar (last);
746
747 p += 1 + *p;
748 {
749 unsigned short i, size;
750 char buf0[8];
751 char buf1[8];
752
753 size = EXTRACT_UNSIGNED_AND_INCR (p);
754 for (i = 0; i < size; i++) {
755 bzero (buf0, sizeof buf0);
756 bzero (buf1, sizeof buf1);
757 mbcstore (buf0, EXTRACT_MBC_AND_INCR (p));
758 mbcstore (buf1, EXTRACT_MBC_AND_INCR (p));
759 printf ("%s-%s", buf0, buf1);
760 }
761 }
762 putchar (']');
763 }
764 break;
765
766 case begline:
767 printf ("/begline");
768 break;
769
770 case endline:
771 printf ("/endline");
772 break;
773
774 case on_failure_jump:
775 extract_number_and_incr (&mcnt, &p);
776 printf ("/on_failure_jump to %d", p + mcnt - start);
777 break;
778
779 case on_failure_keep_string_jump:
780 extract_number_and_incr (&mcnt, &p);
781 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
782 break;
783
784 case dummy_failure_jump:
785 extract_number_and_incr (&mcnt, &p);
786 printf ("/dummy_failure_jump to %d", p + mcnt - start);
787 break;
788
789 case push_dummy_failure:
790 printf ("/push_dummy_failure");
791 break;
792
793 case maybe_pop_jump:
794 extract_number_and_incr (&mcnt, &p);
795 printf ("/maybe_pop_jump to %d", p + mcnt - start);
796 break;
797
798 case pop_failure_jump:
799 extract_number_and_incr (&mcnt, &p);
800 printf ("/pop_failure_jump to %d", p + mcnt - start);
801 break;
802
803 case jump_past_alt:
804 extract_number_and_incr (&mcnt, &p);
805 printf ("/jump_past_alt to %d", p + mcnt - start);
806 break;
807
808 case jump:
809 extract_number_and_incr (&mcnt, &p);
810 printf ("/jump to %d", p + mcnt - start);
811 break;
812
813 case succeed_n:
814 extract_number_and_incr (&mcnt, &p);
815 p1 = p + mcnt;
816 extract_number_and_incr (&mcnt2, &p);
817 printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
818 break;
819
820 case jump_n:
821 extract_number_and_incr (&mcnt, &p);
822 p1 = p + mcnt;
823 extract_number_and_incr (&mcnt2, &p);
824 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
825 break;
826
827 case set_number_at:
828 extract_number_and_incr (&mcnt, &p);
829 p1 = p + mcnt;
830 extract_number_and_incr (&mcnt2, &p);
831 printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
832 break;
833
834 case wordbound:
835 printf ("/wordbound");
836 break;
837
838 case notwordbound:
839 printf ("/notwordbound");
840 break;
841
842 case wordbeg:
843 printf ("/wordbeg");
844 break;
845
846 case wordend:
847 printf ("/wordend");
848
849 #ifdef emacs
850 case before_dot:
851 printf ("/before_dot");
852 break;
853
854 case at_dot:
855 printf ("/at_dot");
856 break;
857
858 case after_dot:
859 printf ("/after_dot");
860 break;
861
862 case syntaxspec:
863 printf ("/syntaxspec");
864 mcnt = *p++;
865 printf ("/%d", mcnt);
866 break;
867
868 case notsyntaxspec:
869 printf ("/notsyntaxspec");
870 mcnt = *p++;
871 printf ("/%d", mcnt);
872 break;
873 #endif /* emacs */
874
875 case wordchar:
876 printf ("/wordchar");
877 break;
878
879 case notwordchar:
880 printf ("/notwordchar");
881 break;
882
883 case begbuf:
884 printf ("/begbuf");
885 break;
886
887 case endbuf:
888 printf ("/endbuf");
889 break;
890
891 default:
892 printf ("?%d", *(p-1));
893 }
894
895 putchar ('\n');
896 }
897
898 printf ("%d:\tend of pattern.\n", p - start);
899 }
900
901
902 void
print_compiled_pattern(bufp)903 print_compiled_pattern (bufp)
904 struct re_pattern_buffer *bufp;
905 {
906 unsigned char *buffer = bufp->buffer;
907
908 print_partial_compiled_pattern (buffer, buffer + bufp->used);
909 printf ("%ld bytes used/%ld bytes allocated.\n",
910 bufp->used, bufp->allocated);
911
912 if (bufp->fastmap_accurate && bufp->fastmap)
913 {
914 printf ("fastmap: ");
915 print_fastmap (bufp->fastmap);
916 }
917
918 printf ("re_nsub: %d\t", bufp->re_nsub);
919 printf ("regs_alloc: %d\t", bufp->regs_allocated);
920 printf ("can_be_null: %d\t", bufp->can_be_null);
921 printf ("newline_anchor: %d\n", bufp->newline_anchor);
922 printf ("no_sub: %d\t", bufp->no_sub);
923 printf ("not_bol: %d\t", bufp->not_bol);
924 printf ("not_eol: %d\t", bufp->not_eol);
925 printf ("syntax: %lx\n", bufp->syntax);
926 /* Perhaps we should print the translate table? */
927 }
928
929
930 void
print_double_string(where,string1,size1,string2,size2)931 print_double_string (where, string1, size1, string2, size2)
932 const char *where;
933 const char *string1;
934 const char *string2;
935 int size1;
936 int size2;
937 {
938 int this_char;
939
940 if (where == NULL)
941 printf ("(null)");
942 else
943 {
944 if (FIRST_STRING_P (where))
945 {
946 for (this_char = where - string1; this_char < size1; this_char++)
947 putchar (string1[this_char]);
948
949 where = string2;
950 }
951
952 for (this_char = where - string2; this_char < size2; this_char++)
953 putchar (string2[this_char]);
954 }
955 }
956
957 void
printchar(c)958 printchar (c)
959 int c;
960 {
961 putc (c, stderr);
962 }
963
964 #else /* not DEBUG */
965
966 #undef assert
967 #define assert(e)
968
969 #define DEBUG_STATEMENT(e)
970 #define DEBUG_PRINT1(x)
971 #define DEBUG_PRINT2(x1, x2)
972 #define DEBUG_PRINT3(x1, x2, x3)
973 #define DEBUG_PRINT4(x1, x2, x3, x4)
974 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
975 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
976
977 #endif /* not DEBUG */
978
979 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
980 also be assigned to arbitrarily: each pattern buffer stores its own
981 syntax, so it can be changed between regex compilations. */
982 /* This has no initializer because initialized variables in Emacs
983 become read-only after dumping. */
984 reg_syntax_t re_syntax_options;
985
986
987 /* Specify the precise syntax of regexps for compilation. This provides
988 for compatibility for various utilities which historically have
989 different, incompatible syntaxes.
990
991 The argument SYNTAX is a bit mask comprised of the various bits
992 defined in regex.h. We return the old syntax. */
993
994 reg_syntax_t
re_set_syntax(syntax)995 re_set_syntax (syntax)
996 reg_syntax_t syntax;
997 {
998 reg_syntax_t ret = re_syntax_options;
999
1000 re_syntax_options = syntax;
1001 #ifdef DEBUG
1002 if (syntax & RE_DEBUG)
1003 debug = 1;
1004 else if (debug) /* was on but now is not */
1005 debug = 0;
1006 #endif /* DEBUG */
1007 return ret;
1008 }
1009
1010 /* This table gives an error message for each of the error codes listed
1011 in regex.h. Obviously the order here has to be same as there.
1012 POSIX doesn't require that we do anything for REG_NOERROR,
1013 but why not be nice? */
1014
1015 static const char *re_error_msgid[] =
1016 {
1017 gettext_noop ("Success"), /* REG_NOERROR */
1018 gettext_noop ("No match"), /* REG_NOMATCH */
1019 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1020 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1021 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1022 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1023 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1024 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1025 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1026 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1027 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1028 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1029 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1030 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1031 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1032 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1033 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1034 };
1035
1036 /* Avoiding alloca during matching, to placate r_alloc. */
1037
1038 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1039 searching and matching functions should not call alloca. On some
1040 systems, alloca is implemented in terms of malloc, and if we're
1041 using the relocating allocator routines, then malloc could cause a
1042 relocation, which might (if the strings being searched are in the
1043 ralloc heap) shift the data out from underneath the regexp
1044 routines.
1045
1046 Here's another reason to avoid allocation: Emacs
1047 processes input from X in a signal handler; processing X input may
1048 call malloc; if input arrives while a matching routine is calling
1049 malloc, then we're scrod. But Emacs can't just block input while
1050 calling matching routines; then we don't notice interrupts when
1051 they come in. So, Emacs blocks input around all regexp calls
1052 except the matching calls, which it leaves unprotected, in the
1053 faith that they will not malloc. */
1054
1055 /* Normally, this is fine. */
1056 #define MATCH_MAY_ALLOCATE
1057
1058 /* When using GNU C, we are not REALLY using the C alloca, no matter
1059 what config.h may say. So don't take precautions for it. */
1060 #ifdef __GNUC__
1061 #undef C_ALLOCA
1062 #endif
1063
1064 /* The match routines may not allocate if (1) they would do it with malloc
1065 and (2) it's not safe for them to use malloc.
1066 Note that if REL_ALLOC is defined, matching would not use malloc for the
1067 failure stack, but we would still use it for the register vectors;
1068 so REL_ALLOC should not affect this. */
1069 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1070 #undef MATCH_MAY_ALLOCATE
1071 #endif
1072
1073
1074 /* Failure stack declarations and macros; both re_compile_fastmap and
1075 re_match_2 use a failure stack. These have to be macros because of
1076 REGEX_ALLOCATE_STACK. */
1077
1078
1079 /* Number of failure points for which to initially allocate space
1080 when matching. If this number is exceeded, we allocate more
1081 space, so it is not a hard limit. */
1082 #ifndef INIT_FAILURE_ALLOC
1083 #define INIT_FAILURE_ALLOC 5
1084 #endif
1085
1086 /* Roughly the maximum number of failure points on the stack. Would be
1087 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1088 This is a variable only so users of regex can assign to it; we never
1089 change it ourselves. */
1090
1091 #ifndef INT_IS_16BIT
1092 #ifndef WIN32
1093 #if defined (_MSC_VER) || defined (__TURBOC__)
1094 #define INT_IS_16BIT
1095 #endif
1096 #endif
1097 #endif
1098
1099 #ifdef INT_IS_16BIT
1100
1101 #if defined (MATCH_MAY_ALLOCATE)
1102 /* 4400 was enough to cause a crash on Alpha OSF/1,
1103 whose default stack limit is 2mb. */
1104 long int re_max_failures = 4000;
1105 #else
1106 long int re_max_failures = 2000;
1107 #endif
1108
1109 union fail_stack_elt
1110 {
1111 unsigned char *pointer;
1112 long int integer;
1113 };
1114
1115 typedef union fail_stack_elt fail_stack_elt_t;
1116
1117 typedef struct
1118 {
1119 fail_stack_elt_t *stack;
1120 unsigned long int size;
1121 unsigned long int avail; /* Offset of next open position. */
1122 } fail_stack_type;
1123
1124 #else /* not INT_IS_16BIT */
1125
1126 #if defined (MATCH_MAY_ALLOCATE)
1127 /* 4400 was enough to cause a crash on Alpha OSF/1,
1128 whose default stack limit is 2mb. */
1129 int re_max_failures = 20000;
1130 #else
1131 int re_max_failures = 2000;
1132 #endif
1133
1134 union fail_stack_elt
1135 {
1136 unsigned char *pointer;
1137 int integer;
1138 };
1139
1140 typedef union fail_stack_elt fail_stack_elt_t;
1141
1142 typedef struct
1143 {
1144 fail_stack_elt_t *stack;
1145 unsigned size;
1146 unsigned avail; /* Offset of next open position. */
1147 } fail_stack_type;
1148
1149 #endif /* INT_IS_16BIT */
1150
1151 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1152 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1153 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1154
1155
1156 /* Define macros to initialize and free the failure stack.
1157 Do `return -2' if the alloc fails. */
1158
1159 #ifdef MATCH_MAY_ALLOCATE
1160 #define INIT_FAIL_STACK() \
1161 do { \
1162 fail_stack.stack = (fail_stack_elt_t *) \
1163 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1164 \
1165 if (fail_stack.stack == NULL) \
1166 return -2; \
1167 \
1168 fail_stack.size = INIT_FAILURE_ALLOC; \
1169 fail_stack.avail = 0; \
1170 } while (0)
1171
1172 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1173 #else
1174 #define INIT_FAIL_STACK() \
1175 do { \
1176 fail_stack.avail = 0; \
1177 } while (0)
1178
1179 #define RESET_FAIL_STACK()
1180 #endif
1181
1182
1183 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1184
1185 Return 1 if succeeds, and 0 if either ran out of memory
1186 allocating space for it or it was already too large.
1187
1188 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1189
1190 #define DOUBLE_FAIL_STACK(fail_stack) \
1191 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1192 ? 0 \
1193 : ((fail_stack).stack = (fail_stack_elt_t *) \
1194 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1195 (fail_stack).size * sizeof (fail_stack_elt_t), \
1196 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1197 \
1198 (fail_stack).stack == NULL \
1199 ? 0 \
1200 : ((fail_stack).size <<= 1, \
1201 1)))
1202
1203
1204 /* Push pointer POINTER on FAIL_STACK.
1205 Return 1 if was able to do so and 0 if ran out of memory allocating
1206 space to do so. */
1207 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1208 ((FAIL_STACK_FULL () \
1209 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1210 ? 0 \
1211 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1212 1))
1213
1214 /* Push a pointer value onto the failure stack.
1215 Assumes the variable `fail_stack'. Probably should only
1216 be called from within `PUSH_FAILURE_POINT'. */
1217 #define PUSH_FAILURE_POINTER(item) \
1218 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1219
1220 /* This pushes an integer-valued item onto the failure stack.
1221 Assumes the variable `fail_stack'. Probably should only
1222 be called from within `PUSH_FAILURE_POINT'. */
1223 #define PUSH_FAILURE_INT(item) \
1224 fail_stack.stack[fail_stack.avail++].integer = (item)
1225
1226 /* Push a fail_stack_elt_t value onto the failure stack.
1227 Assumes the variable `fail_stack'. Probably should only
1228 be called from within `PUSH_FAILURE_POINT'. */
1229 #define PUSH_FAILURE_ELT(item) \
1230 fail_stack.stack[fail_stack.avail++] = (item)
1231
1232 /* These three POP... operations complement the three PUSH... operations.
1233 All assume that `fail_stack' is nonempty. */
1234 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1235 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1236 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1237
1238 /* Used to omit pushing failure point id's when we're not debugging. */
1239 #ifdef DEBUG
1240 #define DEBUG_PUSH PUSH_FAILURE_INT
1241 #define DEBUG_POP(item_addr) (item_addr)->integer = POP_FAILURE_INT ()
1242 #else
1243 #define DEBUG_PUSH(item)
1244 #define DEBUG_POP(item_addr)
1245 #endif
1246
1247
1248 /* Push the information about the state we will need
1249 if we ever fail back to it.
1250
1251 Requires variables fail_stack, regstart, regend, reg_info, and
1252 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1253 declared.
1254
1255 Does `return FAILURE_CODE' if runs out of memory. */
1256
1257 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1258 do { \
1259 char *destination; \
1260 /* Must be int, so when we don't save any registers, the arithmetic \
1261 of 0 + -1 isn't done as unsigned. */ \
1262 /* Can't be int, since there is not a shred of a guarantee that int \
1263 is wide enough to hold a value of something to which pointer can \
1264 be assigned */ \
1265 s_reg_t this_reg; \
1266 \
1267 DEBUG_STATEMENT (failure_id++); \
1268 DEBUG_STATEMENT (nfailure_points_pushed++); \
1269 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1270 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1271 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1272 \
1273 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1274 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1275 \
1276 /* Ensure we have enough space allocated for what we will push. */ \
1277 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1278 { \
1279 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1280 return failure_code; \
1281 \
1282 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1283 (fail_stack).size); \
1284 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1285 } \
1286 \
1287 /* Push the info, starting with the registers. */ \
1288 DEBUG_PRINT1 ("\n"); \
1289 \
1290 if (1) \
1291 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1292 this_reg++) \
1293 { \
1294 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1295 DEBUG_STATEMENT (num_regs_pushed++); \
1296 \
1297 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1298 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1299 \
1300 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1301 PUSH_FAILURE_POINTER (regend[this_reg]); \
1302 \
1303 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1304 DEBUG_PRINT2 (" match_null=%d", \
1305 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1306 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1307 DEBUG_PRINT2 (" matched_something=%d", \
1308 MATCHED_SOMETHING (reg_info[this_reg])); \
1309 DEBUG_PRINT2 (" ever_matched=%d", \
1310 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1311 DEBUG_PRINT1 ("\n"); \
1312 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1313 } \
1314 \
1315 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1316 PUSH_FAILURE_INT (lowest_active_reg); \
1317 \
1318 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1319 PUSH_FAILURE_INT (highest_active_reg); \
1320 \
1321 DEBUG_PRINT2 (" Pushing pattern 0x%x:\n", pattern_place); \
1322 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1323 PUSH_FAILURE_POINTER (pattern_place); \
1324 \
1325 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1326 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1327 size2); \
1328 DEBUG_PRINT1 ("'\n"); \
1329 PUSH_FAILURE_POINTER (string_place); \
1330 \
1331 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1332 DEBUG_PUSH (failure_id); \
1333 } while (0)
1334
1335 /* This is the number of items that are pushed and popped on the stack
1336 for each register. */
1337 #define NUM_REG_ITEMS 3
1338
1339 /* Individual items aside from the registers. */
1340 #ifdef DEBUG
1341 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1342 #else
1343 #define NUM_NONREG_ITEMS 4
1344 #endif
1345
1346 /* We push at most this many items on the stack. */
1347 /* We used to use (num_regs - 1), which is the number of registers
1348 this regexp will save; but that was changed to 5
1349 to avoid stack overflow for a regexp with lots of parens. */
1350 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1351
1352 /* We actually push this many items. */
1353 #define NUM_FAILURE_ITEMS \
1354 (((0 \
1355 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1356 * NUM_REG_ITEMS) \
1357 + NUM_NONREG_ITEMS)
1358
1359 /* How many items can still be added to the stack without overflowing it. */
1360 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1361
1362
1363 /* Pops what PUSH_FAIL_STACK pushes.
1364
1365 We restore into the parameters, all of which should be lvalues:
1366 STR -- the saved data position.
1367 PAT -- the saved pattern position.
1368 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1369 REGSTART, REGEND -- arrays of string positions.
1370 REG_INFO -- array of information about each subexpression.
1371
1372 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1373 `pend', `string1', `size1', `string2', and `size2'. */
1374
1375 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1376 { \
1377 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1378 s_reg_t this_reg; \
1379 const unsigned char *string_temp; \
1380 \
1381 assert (!FAIL_STACK_EMPTY ()); \
1382 \
1383 /* Remove failure points and point to how many regs pushed. */ \
1384 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1385 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1386 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1387 \
1388 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1389 \
1390 DEBUG_POP (&failure_id); \
1391 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1392 \
1393 /* If the saved string location is NULL, it came from an \
1394 on_failure_keep_string_jump opcode, and we want to throw away the \
1395 saved NULL, thus retaining our current position in the string. */ \
1396 string_temp = POP_FAILURE_POINTER (); \
1397 if (string_temp != NULL) \
1398 str = (const char *) string_temp; \
1399 \
1400 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1401 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1402 DEBUG_PRINT1 ("'\n"); \
1403 \
1404 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1405 DEBUG_PRINT2 (" Popping pattern 0x%x:\n", pat); \
1406 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1407 \
1408 /* Restore register info. */ \
1409 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1410 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1411 \
1412 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1413 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1414 \
1415 if (1) \
1416 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1417 { \
1418 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1419 \
1420 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1421 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1422 \
1423 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1424 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1425 \
1426 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1427 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1428 } \
1429 else \
1430 { \
1431 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1432 { \
1433 reg_info[this_reg].word.integer = 0; \
1434 regend[this_reg] = 0; \
1435 regstart[this_reg] = 0; \
1436 } \
1437 highest_active_reg = high_reg; \
1438 } \
1439 \
1440 set_regs_matched_done = 0; \
1441 DEBUG_STATEMENT (nfailure_points_popped++); \
1442 } /* POP_FAILURE_POINT */
1443
1444
1445
1446 /* Structure for per-register (a.k.a. per-group) information.
1447 Other register information, such as the
1448 starting and ending positions (which are addresses), and the list of
1449 inner groups (which is a bits list) are maintained in separate
1450 variables.
1451
1452 We are making a (strictly speaking) nonportable assumption here: that
1453 the compiler will pack our bit fields into something that fits into
1454 the type of `word', i.e., is something that fits into one item on the
1455 failure stack. */
1456
1457
1458 /* Declarations and macros for re_match_2. */
1459
1460 typedef union
1461 {
1462 fail_stack_elt_t word;
1463 struct
1464 {
1465 /* This field is one if this group can match the empty string,
1466 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1467 #define MATCH_NULL_UNSET_VALUE 3
1468 unsigned match_null_string_p : 2;
1469 unsigned is_active : 1;
1470 unsigned matched_something : 1;
1471 unsigned ever_matched_something : 1;
1472 } bits;
1473 } register_info_type;
1474
1475 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1476 #define IS_ACTIVE(R) ((R).bits.is_active)
1477 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1478 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1479
1480
1481 /* Call this when have matched a real character; it sets `matched' flags
1482 for the subexpressions which we are currently inside. Also records
1483 that those subexprs have matched. */
1484 #define SET_REGS_MATCHED() \
1485 do \
1486 { \
1487 if (!set_regs_matched_done) \
1488 { \
1489 active_reg_t r; \
1490 set_regs_matched_done = 1; \
1491 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1492 { \
1493 MATCHED_SOMETHING (reg_info[r]) \
1494 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1495 = 1; \
1496 } \
1497 } \
1498 } \
1499 while (0)
1500
1501 /* Registers are set to a sentinel when they haven't yet matched. */
1502 static char reg_unset_dummy;
1503 #define REG_UNSET_VALUE (®_unset_dummy)
1504 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1505
1506 /* Subroutine declarations and macros for regex_compile. */
1507
1508 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1509 reg_syntax_t syntax,
1510 struct re_pattern_buffer *bufp));
1511 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1512 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1513 int arg1, int arg2));
1514 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1515 int arg, unsigned char *end));
1516 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1517 int arg1, int arg2, unsigned char *end));
1518 static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
1519 reg_syntax_t syntax));
1520 static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
1521 reg_syntax_t syntax));
1522 #if 0 /* We don't use this. */
1523 static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
1524 const char *pend,
1525 char *translate,
1526 reg_syntax_t syntax,
1527 unsigned char *b));
1528 #endif
1529
1530 /* Fetch the next character in the uncompiled pattern---translating it
1531 if necessary. Also cast from a signed character in the constant
1532 string passed to us by the user to an unsigned char that we can use
1533 as an array index (in, e.g., `translate'). */
1534 #ifndef PATFETCH
1535 #define PATFETCH(c) \
1536 do {if (p == pend) return REG_EEND; \
1537 c = (unsigned char) *p++; \
1538 if (translate && !ismbchar (c)) \
1539 c = (unsigned char) translate[(unsigned char) (c)]; \
1540 } while (0)
1541 #endif
1542
1543 /* Fetch the next character in the uncompiled pattern, with no
1544 translation. */
1545 #define PATFETCH_RAW(c) \
1546 do {if (p == pend) return REG_EEND; \
1547 c = (unsigned char) *p++; \
1548 } while (0)
1549
1550 /* Go backwards one character in the pattern. */
1551 #define PATUNFETCH p--
1552
1553 #define PATFETCH_MBC(wc) \
1554 do {if (p == pend || p + mbclen(*p) > pend) return REG_EEND; \
1555 { char *end; wc = mbcfetch (p, &end); p = end; } \
1556 if (translate && wc < (1 << BYTEWIDTH)) \
1557 wc = (mbchar_t) translate[(unsigned char) (wc)]; \
1558 } while (0)
1559
1560
1561
1562 /* If `translate' is non-null, return translate[D], else just D. We
1563 cast the subscript to translate because some data is declared as
1564 `char *', to avoid warnings when a string constant is passed. But
1565 when we use a character as a subscript we must make it unsigned. */
1566 #ifndef TRANSLATE
1567 #define TRANSLATE(d) (translate \
1568 ? (unsigned char) translate[(unsigned char) (d)] \
1569 : (d))
1570 #endif
1571
1572
1573 /* Macros for outputting the compiled pattern into `buffer'. */
1574
1575 /* If the buffer isn't allocated when it comes in, use this. */
1576 #define INIT_BUF_SIZE 32
1577
1578 /* Make sure we have at least N more bytes of space in buffer. */
1579 #define GET_BUFFER_SPACE(n) \
1580 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1581 EXTEND_BUFFER ()
1582
1583 /* Make sure we have one more byte of buffer space and then add C to it. */
1584 #define BUF_PUSH(c) \
1585 do { \
1586 GET_BUFFER_SPACE (1); \
1587 *b++ = (unsigned char) (c); \
1588 } while (0)
1589
1590
1591 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1592 #define BUF_PUSH_2(c1, c2) \
1593 do { \
1594 GET_BUFFER_SPACE (2); \
1595 *b++ = (unsigned char) (c1); \
1596 *b++ = (unsigned char) (c2); \
1597 } while (0)
1598
1599
1600 /* As with BUF_PUSH_2, except for three bytes. */
1601 #define BUF_PUSH_3(c1, c2, c3) \
1602 do { \
1603 GET_BUFFER_SPACE (3); \
1604 *b++ = (unsigned char) (c1); \
1605 *b++ = (unsigned char) (c2); \
1606 *b++ = (unsigned char) (c3); \
1607 } while (0)
1608
1609
1610 /* Store a jump with opcode OP at LOC to location TO. We store a
1611 relative address offset by the three bytes the jump itself occupies. */
1612 #define STORE_JUMP(op, loc, to) \
1613 store_op1 (op, loc, (int) ((to) - (loc) - 3))
1614
1615 /* Likewise, for a two-argument jump. */
1616 #define STORE_JUMP2(op, loc, to, arg) \
1617 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
1618
1619 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1620 #define INSERT_JUMP(op, loc, to) \
1621 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
1622
1623 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1624 #define INSERT_JUMP2(op, loc, to, arg) \
1625 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
1626
1627
1628 /* This is not an arbitrary limit: the arguments which represent offsets
1629 into the pattern are two bytes long. So if 2^16 bytes turns out to
1630 be too small, many things would have to change. */
1631 /* Any other compiler which, like MSC, has allocation limit below 2^16
1632 bytes will have to use approach similar to what was done below for
1633 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1634 reallocating to 0 bytes. Such thing is not going to work too well.
1635 You have been warned!! */
1636 #if (defined(_MSC_VER) || defined(__TURBOC__)) && !defined(WIN32)
1637 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1638 The REALLOC define eliminates a flurry of conversion warnings,
1639 but is not required. */
1640 #define MAX_BUF_SIZE 65500L
1641 #define REALLOC(p,s) realloc ((p), (size_t) (s))
1642 #else
1643 #define MAX_BUF_SIZE (1L << 16)
1644 #define REALLOC(p,s) realloc ((p), (s))
1645 #endif
1646
1647 /* Extend the buffer by twice its current size via realloc and
1648 reset the pointers that pointed into the old block to point to the
1649 correct places in the new one. If extending the buffer results in it
1650 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1651 #if __BOUNDED_POINTERS__
1652 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1653 # define MOVE_BUFFER_POINTER(P) \
1654 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1655 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1656 else \
1657 { \
1658 SET_HIGH_BOUND (b); \
1659 SET_HIGH_BOUND (begalt); \
1660 if (fixup_alt_jump) \
1661 SET_HIGH_BOUND (fixup_alt_jump); \
1662 if (laststart) \
1663 SET_HIGH_BOUND (laststart); \
1664 if (pending_exact) \
1665 SET_HIGH_BOUND (pending_exact); \
1666 }
1667 #else
1668 # define MOVE_BUFFER_POINTER(P) (P) += incr
1669 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1670 #endif
1671 #define EXTEND_BUFFER() \
1672 do { \
1673 unsigned char *old_buffer = bufp->buffer; \
1674 if (bufp->allocated == MAX_BUF_SIZE) \
1675 return REG_ESIZE; \
1676 bufp->allocated <<= 1; \
1677 if (bufp->allocated > MAX_BUF_SIZE) \
1678 bufp->allocated = MAX_BUF_SIZE; \
1679 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
1680 if (bufp->buffer == NULL) \
1681 return REG_ESPACE; \
1682 /* If the buffer moved, move all the pointers into it. */ \
1683 if (old_buffer != bufp->buffer) \
1684 { \
1685 int incr = bufp->buffer - old_buffer; \
1686 MOVE_BUFFER_POINTER (b); \
1687 MOVE_BUFFER_POINTER (begalt); \
1688 if (fixup_alt_jump) \
1689 MOVE_BUFFER_POINTER (fixup_alt_jump); \
1690 if (laststart) \
1691 MOVE_BUFFER_POINTER (laststart); \
1692 if (pending_exact) \
1693 MOVE_BUFFER_POINTER (pending_exact); \
1694 } \
1695 ELSE_EXTEND_BUFFER_HIGH_BOUND \
1696 } while (0)
1697
1698
1699 /* Since we have one byte reserved for the register number argument to
1700 {start,stop}_memory, the maximum number of groups we can report
1701 things about is what fits in that byte. */
1702 #define MAX_REGNUM 255
1703
1704 /* But patterns can have more than `MAX_REGNUM' registers. We just
1705 ignore the excess. */
1706 typedef unsigned regnum_t;
1707
1708
1709 /* Macros for the compile stack. */
1710
1711 /* Since offsets can go either forwards or backwards, this type needs to
1712 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1713 /* int may be not enough when sizeof(int) == 2. */
1714 typedef long pattern_offset_t;
1715
1716 typedef struct
1717 {
1718 pattern_offset_t begalt_offset;
1719 pattern_offset_t fixup_alt_jump;
1720 pattern_offset_t inner_group_offset;
1721 pattern_offset_t laststart_offset;
1722 regnum_t regnum;
1723 } compile_stack_elt_t;
1724
1725
1726 typedef struct
1727 {
1728 compile_stack_elt_t *stack;
1729 unsigned size;
1730 unsigned avail; /* Offset of next open position. */
1731 } compile_stack_type;
1732
1733
1734 #define INIT_COMPILE_STACK_SIZE 32
1735
1736 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1737 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1738
1739 /* The next available element. */
1740 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1741
1742
1743 /* Set the bit for character C in a list. */
1744 #define SET_LIST_BIT(c) \
1745 (b[((unsigned char) (c)) / BYTEWIDTH] \
1746 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1747
1748
1749 /* Get the next unsigned number in the uncompiled pattern. */
1750 #define GET_UNSIGNED_NUMBER(num) \
1751 { if (p != pend) \
1752 { \
1753 PATFETCH (c); \
1754 while (ISDIGIT (c)) \
1755 { \
1756 if (num < 0) \
1757 num = 0; \
1758 num = num * 10 + c - '0'; \
1759 if (p == pend) \
1760 break; \
1761 PATFETCH (c); \
1762 } \
1763 } \
1764 }
1765
1766 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
1767 /* The GNU C library provides support for user-defined character classes
1768 and the functions from ISO C amendement 1. */
1769 # ifdef CHARCLASS_NAME_MAX
1770 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1771 # else
1772 /* This shouldn't happen but some implementation might still have this
1773 problem. Use a reasonable default value. */
1774 # define CHAR_CLASS_MAX_LENGTH 256
1775 # endif
1776
1777 # define IS_CHAR_CLASS(string) wctype (string)
1778 #else
1779 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1780
1781 # define IS_CHAR_CLASS(string) \
1782 (STREQ (string, "alpha") || STREQ (string, "upper") \
1783 || STREQ (string, "lower") || STREQ (string, "digit") \
1784 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1785 || STREQ (string, "space") || STREQ (string, "print") \
1786 || STREQ (string, "punct") || STREQ (string, "graph") \
1787 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1788 #endif
1789
1790 /* Handle charset(_not)?.
1791
1792 Structure of charset(_not)? in compiled pattern.
1793
1794 struct {
1795 unsinged char id; charset(_not)?
1796 unsigned char sbc_size;
1797 unsigned char sbc_map[sbc_size]; same as original up to here.
1798 unsigned short mbc_size; number of intervals.
1799 struct {
1800 uint32_t beg; beginning of interval.
1801 uint32_t end; end of interval.
1802 } intervals[mbc_size];
1803 }; */
1804 #define UTF8_CHARSET_SIZE 128
1805
1806 static reg_errcode_t
1807 #if defined(__STDC__) || defined(MSDOS)
set_list_bits(mbchar_t c1,mbchar_t c2,reg_syntax_t syntax,unsigned char * b,const char * translate)1808 set_list_bits (mbchar_t c1, mbchar_t c2,
1809 reg_syntax_t syntax, unsigned char *b, const char *translate)
1810 #else
1811 set_list_bits (c1, c2, syntax, b, translate)
1812 mbchar_t c1, c2;
1813 reg_syntax_t syntax;
1814 unsigned char *b;
1815 const char *translate;
1816 #endif
1817 {
1818 unsigned char sbc_size = b[-1];
1819 unsigned short mbc_size = EXTRACT_UNSIGNED (&b[sbc_size]);
1820 mbchar_t beg, end, upb;
1821 boolean isutf8 = current_mbctype == MBCTYPE_UTF8;
1822 size_t charset_size;
1823
1824 if (c1 > c2)
1825 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
1826 charset_size = isutf8 ? UTF8_CHARSET_SIZE : (1 << BYTEWIDTH);
1827 if (c1 < charset_size) {
1828 upb = c2;
1829 if (charset_size <= upb)
1830 upb = charset_size - 1; /* The last single-byte char */
1831 if (sbc_size <= upb / BYTEWIDTH) {
1832 /* Allocate maximum size so it never happens again. */
1833 /* NOTE: memcpy() would not work here. */
1834 bcopy (&b[sbc_size], &b[charset_size / BYTEWIDTH], 2 + mbc_size*8);
1835 bzero (&b[sbc_size], charset_size / BYTEWIDTH - sbc_size);
1836 b[-1] = sbc_size = charset_size / BYTEWIDTH;
1837 }
1838 if (!translate) {
1839 for (; c1 <= upb; c1++)
1840 if (!ismbchar (c1))
1841 SET_LIST_BIT (c1);
1842 }
1843 else
1844 for (; c1 <= upb; c1++)
1845 if (!ismbchar (c1))
1846 SET_LIST_BIT (TRANSLATE (c1));
1847 if (c2 < charset_size)
1848 return REG_NOERROR;
1849 c1 = isutf8 ? 0x80 : 0x8000 ; /* The first wide char */
1850 }
1851 b = &b[sbc_size + 2];
1852
1853 /* intervals[beg]
1854 ��----------�� ��----------��
1855 c1
1856 ��----------------------��
1857
1858 ��ޤΤ褦�ʶ�֤Υ���ǥå��� beg ����ꤹ��. */
1859 for (beg = 0, upb = mbc_size; beg < upb; ) {
1860 mbchar_t mid = (beg + upb) >> 1;
1861
1862 if (c1 - 1 > EXTRACT_MBC (&b[mid*8 + 4]))
1863 beg = mid + 1;
1864 else
1865 upb = mid;
1866 }
1867
1868 /* intervals[end]
1869 ��-------�� ��----------��
1870 c2
1871 ��---------------��
1872
1873 ��ޤΤ褦�ʶ�֤Υ���ǥå��� end ����ꤹ��. */
1874 for (end = beg, upb = mbc_size; end < upb; ) {
1875 mbchar_t mid = (end + upb) >> 1;
1876
1877 if (c2 >= EXTRACT_MBC (&b[mid*8]) - 1)
1878 end = mid + 1;
1879 else
1880 upb = mid;
1881 }
1882
1883 if (beg != end) {
1884 /* ��¸�ζ�֤ʤ��Ȥ�1�����礹����,
1885 ��֤λ���, ������������. */
1886 if (c1 > EXTRACT_MBC (&b[beg*8]))
1887 c1 = EXTRACT_MBC (&b[beg*8]);
1888 if (c2 < EXTRACT_MBC (&b[end*8 - 4]))
1889 c2 = EXTRACT_MBC (&b[end*8 - 4]);
1890 }
1891 if (end < mbc_size && end != beg + 1)
1892 /* �ɲä�����֤θ��˴�¸�ζ�֤��ư����. */
1893 /* NOTE: memcpy() would not work here. */
1894 bcopy (&b[end*8], &b[(beg + 1)*8], (mbc_size - end)*8);
1895 STORE_MBC (&b[beg*8 + 0], c1);
1896 STORE_MBC (&b[beg*8 + 4], c2);
1897 mbc_size += beg + 1 - end;
1898 STORE_NUMBER (&b[-2], mbc_size);
1899 return REG_NOERROR;
1900 }
1901
1902 static int
1903 #if defined(__STDC__) || defined(MSDOS)
is_in_list(mbchar_t c,const unsigned char * b)1904 is_in_list (mbchar_t c, const unsigned char *b)
1905 #else
1906 is_in_list (c, b)
1907 mbchar_t c;
1908 const unsigned char *b;
1909 #endif
1910 {
1911 unsigned short size;
1912 int in = (re_opcode_t) b[-1] == charset_not;
1913 boolean isutf8 = current_mbctype == MBCTYPE_UTF8;
1914
1915 size = *b++;
1916 if (c < (isutf8 ? UTF8_CHARSET_SIZE : (1 << BYTEWIDTH))) {
1917 if (c / BYTEWIDTH < size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH)
1918 in = !in;
1919 }
1920 else {
1921 mbchar_t i, j;
1922
1923 b += size + 2;
1924 size = EXTRACT_UNSIGNED (&b[-2]);
1925
1926 /* intervals[i]
1927 ��-------�� ��--------��
1928 c
1929 ��----------------��
1930
1931 ��ޤΤ褦�ʶ�֤Υ���ǥå��� i ����ꤹ��. */
1932 for (i = 0, j = size; i < j; ) {
1933 mbchar_t k = (i + j) >> 1;
1934
1935 if (c > EXTRACT_MBC (&b[k*8 + 4]))
1936 i = k + 1;
1937 else
1938 j = k;
1939 }
1940 if (i < size && EXTRACT_MBC (&b[i*8]) <= c
1941 /* [...] ����, ̵���ʥޥ���Х���ʸ�����������. �����Ǥϴ�ñ��
1942 ���ᣲ�Х����ܤ� '\n' �ޤ��� '\0' ������̵���Ȥ���. [^...]
1943 �ξ���, �դ�̵���ʥޥ���Х���ʸ����ޥå�������. */
1944 && (((unsigned char) c != '\n' && (unsigned char) c != '\0') || isutf8))
1945 in = !in;
1946 }
1947 return in;
1948 }
1949
1950 static boolean
1951 #if defined(__STDC__) || defined(MSDOS)
is_mblist_nooverlap(const unsigned char * b1,const unsigned char * b2)1952 is_mblist_nooverlap (const unsigned char *b1, const unsigned char *b2)
1953 #else
1954 is_mblist_nooverlap (b1, b2)
1955 const unsigned char *b1;
1956 const unsigned char *b2;
1957 #endif
1958 {
1959 unsigned short size1;
1960 mbchar_t c11;
1961 mbchar_t c12;
1962 unsigned short size2;
1963 mbchar_t c21;
1964 mbchar_t c22;
1965 unsigned short i;
1966 unsigned short j;
1967
1968 b1 = &b1[2];
1969 size1 = EXTRACT_UNSIGNED (&b1[-2]);
1970 b2 = &b2[2];
1971 size2 = EXTRACT_UNSIGNED (&b2[-2]);
1972 for (i = 0; i < size1; i++)
1973 {
1974 c11 = EXTRACT_MBC (&b1[i * 8]);
1975 c12 = EXTRACT_MBC (&b1[i * 8 + 4]);
1976 for (j = 0; j < size2; j++)
1977 {
1978 c21 = EXTRACT_MBC (&b2[j * 8]);
1979 c22 = EXTRACT_MBC (&b2[j * 8 + 4]);
1980 if (! ((c22 < c11) || (c12 < c21)))
1981 return false;
1982 }
1983 }
1984 return true;
1985 }
1986
1987 static boolean
1988 #if defined(__STDC__) || defined(MSDOS)
is_invmblist_nooverlap(const unsigned char * b1,const unsigned char * b2)1989 is_invmblist_nooverlap (const unsigned char *b1, const unsigned char *b2)
1990 #else
1991 is_invmblist_nooverlap (b1, b2)
1992 const unsigned char *b1;
1993 const unsigned char *b2;
1994 #endif
1995 {
1996 unsigned short size2;
1997 unsigned char *inv_b2;
1998 unsigned short inv_size2;
1999 boolean isutf8 = current_mbctype == MBCTYPE_UTF8;
2000 unsigned short i;
2001 mbchar_t c2;
2002 mbchar_t c;
2003 boolean ret;
2004
2005 b2 = &b2[2];
2006 size2 = EXTRACT_UNSIGNED (&b2[-2]);
2007 inv_b2 = REGEX_ALLOCATE (2 + 8 * (size2 + 2));
2008 inv_b2 = &inv_b2[2];
2009
2010 c = isutf8 ? 0x80 : 0x8000; /* The first wide char */
2011 inv_size2 = 0;
2012 for (i = 0; i < size2; i++) {
2013 c2 = EXTRACT_MBC (&b2[i * 8]) - 1;
2014 if (c <= c2) {
2015 STORE_MBC (&inv_b2[inv_size2 * 8], c);
2016 STORE_MBC (&inv_b2[inv_size2 * 8 + 4], c2);
2017 inv_size2++;
2018 }
2019 c = EXTRACT_MBC (&b2[i * 8 + 4]) + 1;
2020 }
2021 c2 = 0x7fffffffL;
2022 STORE_MBC (&inv_b2[inv_size2 * 8], c);
2023 STORE_MBC (&inv_b2[inv_size2 * 8 + 4], c2);
2024 inv_size2++;
2025 STORE_NUMBER (&inv_b2[-2], inv_size2);
2026 inv_b2 = &inv_b2[-2];
2027
2028 ret = is_mblist_nooverlap(b1, inv_b2);
2029
2030 REGEX_FREE (inv_b2);
2031
2032 return ret;
2033 }
2034
2035 #ifndef MATCH_MAY_ALLOCATE
2036
2037 /* If we cannot allocate large objects within re_match_2_internal,
2038 we make the fail stack and register vectors global.
2039 The fail stack, we grow to the maximum size when a regexp
2040 is compiled.
2041 The register vectors, we adjust in size each time we
2042 compile a regexp, according to the number of registers it needs. */
2043
2044 static fail_stack_type fail_stack;
2045
2046 /* Size with which the following vectors are currently allocated.
2047 That is so we can make them bigger as needed,
2048 but never make them smaller. */
2049 static int regs_allocated_size;
2050
2051 static const char ** regstart, ** regend;
2052 static const char ** old_regstart, ** old_regend;
2053 static const char **best_regstart, **best_regend;
2054 static register_info_type *reg_info;
2055 static const char **reg_dummy;
2056 static register_info_type *reg_info_dummy;
2057
2058 /* Make the register vectors big enough for NUM_REGS registers,
2059 but don't make them smaller. */
2060
2061 static
regex_grow_registers(num_regs)2062 regex_grow_registers (num_regs)
2063 int num_regs;
2064 {
2065 if (num_regs > regs_allocated_size)
2066 {
2067 RETALLOC_IF (regstart, num_regs, const char *);
2068 RETALLOC_IF (regend, num_regs, const char *);
2069 RETALLOC_IF (old_regstart, num_regs, const char *);
2070 RETALLOC_IF (old_regend, num_regs, const char *);
2071 RETALLOC_IF (best_regstart, num_regs, const char *);
2072 RETALLOC_IF (best_regend, num_regs, const char *);
2073 RETALLOC_IF (reg_info, num_regs, register_info_type);
2074 RETALLOC_IF (reg_dummy, num_regs, const char *);
2075 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
2076
2077 regs_allocated_size = num_regs;
2078 }
2079 }
2080
2081 #endif /* not MATCH_MAY_ALLOCATE */
2082
2083 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2084 compile_stack,
2085 regnum_t regnum));
2086
2087 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2088 Returns one of error codes defined in `regex.h', or zero for success.
2089
2090 Assumes the `allocated' (and perhaps `buffer') and `translate'
2091 fields are set in BUFP on entry.
2092
2093 If it succeeds, results are put in BUFP (if it returns an error, the
2094 contents of BUFP are undefined):
2095 `buffer' is the compiled pattern;
2096 `syntax' is set to SYNTAX;
2097 `used' is set to the length of the compiled pattern;
2098 `fastmap_accurate' is zero;
2099 `re_nsub' is the number of subexpressions in PATTERN;
2100 `not_bol' and `not_eol' are zero;
2101
2102 The `fastmap' and `newline_anchor' fields are neither
2103 examined nor set. */
2104
2105 /* Return, freeing storage we allocated. */
2106 #define FREE_STACK_RETURN(value) \
2107 return (free (compile_stack.stack), value)
2108
2109 static reg_errcode_t
regex_compile(pattern,size,syntax,bufp)2110 regex_compile (pattern, size, syntax, bufp)
2111 const char *pattern;
2112 size_t size;
2113 reg_syntax_t syntax;
2114 struct re_pattern_buffer *bufp;
2115 {
2116 /* We fetch characters from PATTERN here. Even though PATTERN is
2117 `char *' (i.e., signed), we declare these variables as unsigned, so
2118 they can be reliably used as array indices. */
2119 register unsigned char c, c1;
2120
2121 /* A random temporary spot in PATTERN. */
2122 const char *p1;
2123
2124 /* Points to the end of the buffer, where we should append. */
2125 register unsigned char *b;
2126
2127 /* Keeps track of unclosed groups. */
2128 compile_stack_type compile_stack;
2129
2130 /* Points to the current (ending) position in the pattern. */
2131 const char *p = pattern;
2132 const char *pend = pattern + size;
2133
2134 /* How to translate the characters in the pattern. */
2135 RE_TRANSLATE_TYPE translate = bufp->translate;
2136
2137 /* Address of the count-byte of the most recently inserted `exactn'
2138 command. This makes it possible to tell if a new exact-match
2139 character can be added to that command or if the character requires
2140 a new `exactn' command. */
2141 unsigned char *pending_exact = 0;
2142
2143 /* Address of start of the most recently finished expression.
2144 This tells, e.g., postfix * where to find the start of its
2145 operand. Reset at the beginning of groups and alternatives. */
2146 unsigned char *laststart = 0;
2147
2148 /* Address of beginning of regexp, or inside of last group. */
2149 unsigned char *begalt;
2150
2151 /* Place in the uncompiled pattern (i.e., the {) to
2152 which to go back if the interval is invalid. */
2153 const char *beg_interval;
2154
2155 /* Address of the place where a forward jump should go to the end of
2156 the containing expression. Each alternative of an `or' -- except the
2157 last -- ends with a forward jump of this sort. */
2158 unsigned char *fixup_alt_jump = 0;
2159
2160 /* Counts open-groups as they are encountered. Remembered for the
2161 matching close-group on the compile stack, so the same register
2162 number is put in the stop_memory as the start_memory. */
2163 regnum_t regnum = 0;
2164
2165 #ifdef DEBUG
2166 DEBUG_PRINT1 ("\nCompiling pattern: ");
2167 if (debug)
2168 {
2169 unsigned debug_count;
2170
2171 for (debug_count = 0; debug_count < size; debug_count++)
2172 putchar (pattern[debug_count]);
2173 putchar ('\n');
2174 }
2175 #endif /* DEBUG */
2176
2177 /* Initialize the compile stack. */
2178 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2179 if (compile_stack.stack == NULL)
2180 return REG_ESPACE;
2181
2182 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2183 compile_stack.avail = 0;
2184
2185 /* Initialize the pattern buffer. */
2186 bufp->syntax = syntax;
2187 bufp->fastmap_accurate = 0;
2188 bufp->not_bol = bufp->not_eol = 0;
2189
2190 /* Set `used' to zero, so that if we return an error, the pattern
2191 printer (for debugging) will think there's no pattern. We reset it
2192 at the end. */
2193 bufp->used = 0;
2194
2195 /* Always count groups, whether or not bufp->no_sub is set. */
2196 bufp->re_nsub = 0;
2197
2198 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2199 /* Initialize the syntax table. */
2200 init_syntax_once ();
2201 #endif
2202
2203 if (bufp->allocated == 0)
2204 {
2205 if (bufp->buffer)
2206 { /* If zero allocated, but buffer is non-null, try to realloc
2207 enough space. This loses if buffer's address is bogus, but
2208 that is the user's responsibility. */
2209 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2210 }
2211 else
2212 { /* Caller did not allocate a buffer. Do it for them. */
2213 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2214 }
2215 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2216
2217 bufp->allocated = INIT_BUF_SIZE;
2218 }
2219
2220 begalt = b = bufp->buffer;
2221
2222 /* Loop through the uncompiled pattern until we're at the end. */
2223 while (p != pend)
2224 {
2225 PATFETCH (c);
2226
2227 switch (c)
2228 {
2229 case '^':
2230 {
2231 if ( /* If at start of pattern, it's an operator. */
2232 p == pattern + 1
2233 /* If context independent, it's an operator. */
2234 || syntax & RE_CONTEXT_INDEP_ANCHORS
2235 /* Otherwise, depends on what's come before. */
2236 || at_begline_loc_p (pattern, p, syntax))
2237 BUF_PUSH (begline);
2238 else
2239 goto normal_char;
2240 }
2241 break;
2242
2243
2244 case '$':
2245 {
2246 if ( /* If at end of pattern, it's an operator. */
2247 p == pend
2248 /* If context independent, it's an operator. */
2249 || syntax & RE_CONTEXT_INDEP_ANCHORS
2250 /* Otherwise, depends on what's next. */
2251 || at_endline_loc_p (p, pend, syntax))
2252 BUF_PUSH (endline);
2253 else
2254 goto normal_char;
2255 }
2256 break;
2257
2258
2259 case '+':
2260 case '?':
2261 if ((syntax & RE_BK_PLUS_QM)
2262 || (syntax & RE_LIMITED_OPS))
2263 goto normal_char;
2264 handle_plus:
2265 case '*':
2266 /* If there is no previous pattern... */
2267 if (!laststart)
2268 {
2269 if (syntax & RE_CONTEXT_INVALID_OPS)
2270 FREE_STACK_RETURN (REG_BADRPT);
2271 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2272 goto normal_char;
2273 }
2274
2275 {
2276 /* Are we optimizing this jump? */
2277 boolean keep_string_p = false;
2278
2279 /* 1 means zero (many) matches is allowed. */
2280 char zero_times_ok = 0, many_times_ok = 0;
2281
2282 /* If there is a sequence of repetition chars, collapse it
2283 down to just one (the right one). We can't combine
2284 interval operators with these because of, e.g., `a{2}*',
2285 which should only match an even number of `a's. */
2286
2287 for (;;)
2288 {
2289 zero_times_ok |= c != '+';
2290 many_times_ok |= c != '?';
2291
2292 if (p == pend)
2293 break;
2294
2295 PATFETCH (c);
2296
2297 if (c == '*'
2298 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2299 ;
2300
2301 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2302 {
2303 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2304
2305 PATFETCH (c1);
2306 if (!(c1 == '+' || c1 == '?'))
2307 {
2308 PATUNFETCH;
2309 PATUNFETCH;
2310 break;
2311 }
2312
2313 c = c1;
2314 }
2315 else
2316 {
2317 PATUNFETCH;
2318 break;
2319 }
2320
2321 /* If we get here, we found another repeat character. */
2322 }
2323
2324 /* Star, etc. applied to an empty pattern is equivalent
2325 to an empty pattern. */
2326 if (!laststart)
2327 break;
2328
2329 /* Now we know whether or not zero matches is allowed
2330 and also whether or not two or more matches is allowed. */
2331 if (many_times_ok)
2332 { /* More than one repetition is allowed, so put in at the
2333 end a backward relative jump from `b' to before the next
2334 jump we're going to put in below (which jumps from
2335 laststart to after this jump).
2336
2337 But if we are at the `*' in the exact sequence `.*\n',
2338 insert an unconditional jump backwards to the .,
2339 instead of the beginning of the loop. This way we only
2340 push a failure point once, instead of every time
2341 through the loop. */
2342 assert (p - 1 > pattern);
2343
2344 /* Allocate the space for the jump. */
2345 GET_BUFFER_SPACE (3);
2346
2347 /* We know we are not at the first character of the pattern,
2348 because laststart was nonzero. And we've already
2349 incremented `p', by the way, to be the character after
2350 the `*'. Do we have to do something analogous here
2351 for null bytes, because of RE_DOT_NOT_NULL? */
2352 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2353 && zero_times_ok
2354 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2355 && !(syntax & RE_DOT_NEWLINE))
2356 { /* We have .*\n. */
2357 STORE_JUMP (jump, b, laststart);
2358 keep_string_p = true;
2359 }
2360 else
2361 /* Anything else. */
2362 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2363
2364 /* We've added more stuff to the buffer. */
2365 b += 3;
2366 }
2367
2368 /* On failure, jump from laststart to b + 3, which will be the
2369 end of the buffer after this jump is inserted. */
2370 GET_BUFFER_SPACE (3);
2371 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2372 : on_failure_jump,
2373 laststart, b + 3);
2374 pending_exact = 0;
2375 b += 3;
2376
2377 if (!zero_times_ok)
2378 {
2379 /* At least one repetition is required, so insert a
2380 `dummy_failure_jump' before the initial
2381 `on_failure_jump' instruction of the loop. This
2382 effects a skip over that instruction the first time
2383 we hit that loop. */
2384 GET_BUFFER_SPACE (3);
2385 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2386 b += 3;
2387 }
2388 }
2389 break;
2390
2391
2392 case '.':
2393 laststart = b;
2394 BUF_PUSH (anychar);
2395 break;
2396
2397
2398 case '[':
2399 {
2400 boolean had_char_class = false;
2401 mbchar_t c, c1;
2402 long last_char = -1L;
2403
2404 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2405
2406 /* Ensure that we have enough space to push a charset: the
2407 opcode, the length count, and the bitset; 34 bytes in all. */
2408 /* + 2 + 8 for mbcharset(_not)? with just one interval. */
2409 GET_BUFFER_SPACE (34 + 2 + 8);
2410
2411 laststart = b;
2412
2413 /* We test `*p == '^' twice, instead of using an if
2414 statement, so we only need one BUF_PUSH. */
2415 BUF_PUSH (*p == '^' ? charset_not : charset);
2416 if (*p == '^')
2417 p++;
2418
2419 /* Remember the first position in the bracket expression. */
2420 p1 = p;
2421
2422 /* Push the number of bytes in the bitmap. */
2423 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2424
2425 /* Clear the whole map. */
2426 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
2427
2428 /* charset_not matches newline according to a syntax bit. */
2429 if ((re_opcode_t) b[-2] == charset_not
2430 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2431 SET_LIST_BIT ('\n');
2432
2433 /* Read in characters and ranges, setting map bits. */
2434 for (;;)
2435 {
2436 int size;
2437
2438 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2439
2440 if ((size = EXTRACT_UNSIGNED (&b[(1 << BYTEWIDTH) / BYTEWIDTH])))
2441 /* Ensure the space is enough to hold another interval
2442 of multi-byte chars in charset(_not)?. */
2443 GET_BUFFER_SPACE (32 + 2 + size*8 + 8);
2444
2445 PATFETCH_RAW (c);
2446
2447 /* \ might escape characters inside [...] and [^...]. */
2448 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2449 {
2450 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2451
2452 PATFETCH_RAW (c1);
2453 if (ismbchar (c1)) {
2454 PATUNFETCH;
2455 PATFETCH_MBC (c1);
2456 (void) set_list_bits (c1, c1, syntax, b, translate);
2457 last_char = c1;
2458 continue;
2459 }
2460 SET_LIST_BIT (TRANSLATE (c1));
2461 last_char = c1;
2462 continue;
2463 }
2464
2465 /* Could be the end of the bracket expression. If it's
2466 not (i.e., when the bracket expression is `[]' so
2467 far), the ']' character bit gets set way below. */
2468 if (c == ']' && p != p1 + 1)
2469 break;
2470
2471 /* Look ahead to see if it's a range when the last thing
2472 was a character class. */
2473 if (had_char_class && c == '-' && *p != ']')
2474 FREE_STACK_RETURN (REG_ERANGE);
2475
2476 if (ismbchar (c)) {
2477 PATUNFETCH;
2478 PATFETCH_MBC (c);
2479 }
2480
2481 /* Look ahead to see if it's a range when the last thing
2482 was a character: if this is a hyphen not at the
2483 beginning or the end of a list, then it's the range
2484 operator. */
2485 if (c == '-'
2486 #if 0 /* The original was: */
2487 && !(p - 2 >= pattern && p[-2] == '[')
2488 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2489 #else /* I wonder why he did not write like this.
2490 Have we got any problems? */
2491 && p != p1 + 1
2492 #endif
2493 && *p != ']')
2494 {
2495 reg_errcode_t ret;
2496
2497 assert (last_char >= 0);
2498 PATFETCH_RAW (c1);
2499 if (ismbchar (c1)) {
2500 PATUNFETCH;
2501 PATFETCH_MBC (c1);
2502 }
2503 ret = set_list_bits (last_char, c1, syntax, b, translate);
2504 last_char = c1;
2505 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2506 }
2507
2508 else if (p[0] == '-' && p[1] != ']')
2509 { /* This handles ranges made up of characters only. */
2510 reg_errcode_t ret;
2511
2512 /* Move past the `-'. */
2513 PATFETCH_RAW (c1);
2514
2515 PATFETCH_RAW (c1);
2516 if (ismbchar (c1)) {
2517 PATUNFETCH;
2518 PATFETCH_MBC (c1);
2519 }
2520 ret = set_list_bits (c, c1, syntax, b, translate);
2521 last_char = c1;
2522 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2523 }
2524
2525 /* See if we're at the beginning of a possible character
2526 class. */
2527
2528 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2529 { /* Leave room for the null. */
2530 char str[CHAR_CLASS_MAX_LENGTH + 1];
2531
2532 PATFETCH_RAW (c);
2533 c1 = 0;
2534
2535 /* If pattern is `[[:'. */
2536 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2537
2538 for (;;)
2539 {
2540 PATFETCH (c);
2541 if (c == ':' || c == ']' || p == pend
2542 || c1 == CHAR_CLASS_MAX_LENGTH)
2543 break;
2544 str[c1++] = c;
2545 }
2546 str[c1] = '\0';
2547
2548 /* If isn't a word bracketed by `[:' and:`]':
2549 undo the ending character, the letters, and leave
2550 the leading `:' and `[' (but set bits for them). */
2551 if (c == ':' && *p == ']')
2552 {
2553 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
2554 boolean is_lower = STREQ (str, "lower");
2555 boolean is_upper = STREQ (str, "upper");
2556 wctype_t wt;
2557 int ch;
2558
2559 wt = wctype (str);
2560 if (wt == 0)
2561 FREE_STACK_RETURN (REG_ECTYPE);
2562
2563 /* Throw away the ] at the end of the character
2564 class. */
2565 PATFETCH (c);
2566
2567 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2568
2569 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2570 {
2571 if (iswctype (btowc (ch), wt))
2572 SET_LIST_BIT (ch);
2573
2574 if (translate && (is_upper || is_lower)
2575 && (ISUPPER (ch) || ISLOWER (ch)))
2576 SET_LIST_BIT (ch);
2577 }
2578
2579 had_char_class = true;
2580 #else
2581 int ch;
2582 boolean is_alnum = STREQ (str, "alnum");
2583 boolean is_alpha = STREQ (str, "alpha");
2584 boolean is_blank = STREQ (str, "blank");
2585 boolean is_cntrl = STREQ (str, "cntrl");
2586 boolean is_digit = STREQ (str, "digit");
2587 boolean is_graph = STREQ (str, "graph");
2588 boolean is_lower = STREQ (str, "lower");
2589 boolean is_print = STREQ (str, "print");
2590 boolean is_punct = STREQ (str, "punct");
2591 boolean is_space = STREQ (str, "space");
2592 boolean is_upper = STREQ (str, "upper");
2593 boolean is_xdigit = STREQ (str, "xdigit");
2594
2595 if (!IS_CHAR_CLASS (str))
2596 FREE_STACK_RETURN (REG_ECTYPE);
2597
2598 /* Throw away the ] at the end of the character
2599 class. */
2600 PATFETCH (c);
2601
2602 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2603
2604 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2605 {
2606 /* This was split into 3 if's to
2607 avoid an arbitrary limit in some compiler. */
2608 if ( (is_alnum && ISALNUM (ch))
2609 || (is_alpha && ISALPHA (ch))
2610 || (is_blank && ISBLANK (ch))
2611 || (is_cntrl && ISCNTRL (ch)))
2612 SET_LIST_BIT (ch);
2613 if ( (is_digit && ISDIGIT (ch))
2614 || (is_graph && ISGRAPH (ch))
2615 || (is_lower && ISLOWER (ch))
2616 || (is_print && ISPRINT (ch)))
2617 SET_LIST_BIT (ch);
2618 if ( (is_punct && ISPUNCT (ch))
2619 || (is_space && ISSPACE (ch))
2620 || (is_upper && ISUPPER (ch))
2621 || (is_xdigit && ISXDIGIT (ch)))
2622 SET_LIST_BIT (ch);
2623 if ( translate && (is_upper || is_lower)
2624 && (ISUPPER (ch) || ISLOWER (ch)))
2625 SET_LIST_BIT (ch);
2626 }
2627 had_char_class = true;
2628 #endif /* libc || wctype.h */
2629 #ifdef DEBUG
2630 last_char = -1;
2631 #endif
2632 }
2633 else
2634 {
2635 c1++;
2636 while (c1--)
2637 PATUNFETCH;
2638 #if 0 /* The original was: */
2639 SET_LIST_BIT ('[');
2640 SET_LIST_BIT (':');
2641 #else /* I think this is the right way. */
2642 SET_LIST_BIT (TRANSLATE ('['));
2643 SET_LIST_BIT (TRANSLATE (':'));
2644 #endif
2645 had_char_class = false;
2646 last_char = ':';
2647 }
2648 }
2649 else
2650 {
2651 had_char_class = false;
2652 (void) set_list_bits (c, c, syntax, b, translate);
2653 last_char = c;
2654 }
2655 }
2656
2657 /* Discard any (non)matching list bytes that are all 0 at the
2658 end of the map. Decrease the map-length byte too. */
2659 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2660 b[-1]--;
2661 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
2662 bcopy (&b[(1 << BYTEWIDTH) / BYTEWIDTH], &b[b[-1]],
2663 2 + EXTRACT_UNSIGNED (&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8);
2664 b += b[-1] + 2 + EXTRACT_UNSIGNED (&b[b[-1]])*8;
2665 break;
2666 }
2667 break;
2668
2669
2670 case '(':
2671 if (syntax & RE_NO_BK_PARENS)
2672 goto handle_open;
2673 else
2674 goto normal_char;
2675
2676
2677 case ')':
2678 if (syntax & RE_NO_BK_PARENS)
2679 goto handle_close;
2680 else
2681 goto normal_char;
2682
2683
2684 case '\n':
2685 if (syntax & RE_NEWLINE_ALT)
2686 goto handle_alt;
2687 else
2688 goto normal_char;
2689
2690
2691 case '|':
2692 if (syntax & RE_NO_BK_VBAR)
2693 goto handle_alt;
2694 else
2695 goto normal_char;
2696
2697
2698 case '{':
2699 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2700 goto handle_interval;
2701 else
2702 goto normal_char;
2703
2704
2705 case '\\':
2706 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2707
2708 /* Do not translate the character after the \, so that we can
2709 distinguish, e.g., \B from \b, even if we normally would
2710 translate, e.g., B to b. */
2711 PATFETCH_RAW (c);
2712
2713 switch (c)
2714 {
2715 case '(':
2716 if (syntax & RE_NO_BK_PARENS)
2717 goto normal_backslash;
2718
2719 handle_open:
2720 bufp->re_nsub++;
2721 regnum++;
2722
2723 if (COMPILE_STACK_FULL)
2724 {
2725 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2726 compile_stack_elt_t);
2727 if (compile_stack.stack == NULL) return REG_ESPACE;
2728
2729 compile_stack.size <<= 1;
2730 }
2731
2732 /* These are the values to restore when we hit end of this
2733 group. They are all relative offsets, so that if the
2734 whole pattern moves because of realloc, they will still
2735 be valid. */
2736 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2737 COMPILE_STACK_TOP.fixup_alt_jump
2738 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2739 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2740 COMPILE_STACK_TOP.regnum = regnum;
2741
2742 /* We will eventually replace the 0 with the number of
2743 groups inner to this one. But do not push a
2744 start_memory for groups beyond the last one we can
2745 represent in the compiled pattern. */
2746 if (regnum <= MAX_REGNUM)
2747 {
2748 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2749 BUF_PUSH_3 (start_memory, regnum, 0);
2750 }
2751
2752 compile_stack.avail++;
2753
2754 fixup_alt_jump = 0;
2755 laststart = 0;
2756 begalt = b;
2757 /* If we've reached MAX_REGNUM groups, then this open
2758 won't actually generate any code, so we'll have to
2759 clear pending_exact explicitly. */
2760 pending_exact = 0;
2761 break;
2762
2763
2764 case ')':
2765 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2766
2767 if (COMPILE_STACK_EMPTY)
2768 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2769 goto normal_backslash;
2770 else
2771 FREE_STACK_RETURN (REG_ERPAREN);
2772
2773 handle_close:
2774 if (fixup_alt_jump)
2775 { /* Push a dummy failure point at the end of the
2776 alternative for a possible future
2777 `pop_failure_jump' to pop. See comments at
2778 `push_dummy_failure' in `re_match_2'. */
2779 BUF_PUSH (push_dummy_failure);
2780
2781 /* We allocated space for this jump when we assigned
2782 to `fixup_alt_jump', in the `handle_alt' case below. */
2783 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2784 }
2785
2786 /* See similar code for backslashed left paren above. */
2787 if (COMPILE_STACK_EMPTY)
2788 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2789 goto normal_char;
2790 else
2791 FREE_STACK_RETURN (REG_ERPAREN);
2792
2793 /* Since we just checked for an empty stack above, this
2794 ``can't happen''. */
2795 assert (compile_stack.avail != 0);
2796 {
2797 /* We don't just want to restore into `regnum', because
2798 later groups should continue to be numbered higher,
2799 as in `(ab)c(de)' -- the second group is #2. */
2800 regnum_t this_group_regnum;
2801
2802 compile_stack.avail--;
2803 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2804 fixup_alt_jump
2805 = COMPILE_STACK_TOP.fixup_alt_jump
2806 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2807 : 0;
2808 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2809 this_group_regnum = COMPILE_STACK_TOP.regnum;
2810 /* If we've reached MAX_REGNUM groups, then this open
2811 won't actually generate any code, so we'll have to
2812 clear pending_exact explicitly. */
2813 pending_exact = 0;
2814
2815 /* We're at the end of the group, so now we know how many
2816 groups were inside this one. */
2817 if (this_group_regnum <= MAX_REGNUM)
2818 {
2819 unsigned char *inner_group_loc
2820 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2821
2822 *inner_group_loc = regnum - this_group_regnum;
2823 BUF_PUSH_3 (stop_memory, this_group_regnum,
2824 regnum - this_group_regnum);
2825 }
2826 }
2827 break;
2828
2829
2830 case '|': /* `\|'. */
2831 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2832 goto normal_backslash;
2833 handle_alt:
2834 if (syntax & RE_LIMITED_OPS)
2835 goto normal_char;
2836
2837 /* Insert before the previous alternative a jump which
2838 jumps to this alternative if the former fails. */
2839 GET_BUFFER_SPACE (3);
2840 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2841 pending_exact = 0;
2842 b += 3;
2843
2844 /* The alternative before this one has a jump after it
2845 which gets executed if it gets matched. Adjust that
2846 jump so it will jump to this alternative's analogous
2847 jump (put in below, which in turn will jump to the next
2848 (if any) alternative's such jump, etc.). The last such
2849 jump jumps to the correct final destination. A picture:
2850 _____ _____
2851 | | | |
2852 | v | v
2853 a | b | c
2854
2855 If we are at `b', then fixup_alt_jump right now points to a
2856 three-byte space after `a'. We'll put in the jump, set
2857 fixup_alt_jump to right after `b', and leave behind three
2858 bytes which we'll fill in when we get to after `c'. */
2859
2860 if (fixup_alt_jump)
2861 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2862
2863 /* Mark and leave space for a jump after this alternative,
2864 to be filled in later either by next alternative or
2865 when know we're at the end of a series of alternatives. */
2866 fixup_alt_jump = b;
2867 GET_BUFFER_SPACE (3);
2868 b += 3;
2869
2870 laststart = 0;
2871 begalt = b;
2872 break;
2873
2874
2875 case '{':
2876 /* If \{ is a literal. */
2877 if (!(syntax & RE_INTERVALS)
2878 /* If we're at `\{' and it's not the open-interval
2879 operator. */
2880 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2881 || (p - 2 == pattern && p == pend))
2882 goto normal_backslash;
2883
2884 handle_interval:
2885 {
2886 /* If got here, then the syntax allows intervals. */
2887
2888 /* At least (most) this many matches must be made. */
2889 int lower_bound = -1, upper_bound = -1;
2890
2891 beg_interval = p - 1;
2892
2893 if (p == pend)
2894 {
2895 if (syntax & RE_NO_BK_BRACES)
2896 goto unfetch_interval;
2897 else
2898 FREE_STACK_RETURN (REG_EBRACE);
2899 }
2900
2901 GET_UNSIGNED_NUMBER (lower_bound);
2902
2903 if (c == ',')
2904 {
2905 GET_UNSIGNED_NUMBER (upper_bound);
2906 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2907 }
2908 else
2909 /* Interval such as `{1}' => match exactly once. */
2910 upper_bound = lower_bound;
2911
2912 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2913 || lower_bound > upper_bound)
2914 {
2915 if (syntax & RE_NO_BK_BRACES)
2916 goto unfetch_interval;
2917 else
2918 FREE_STACK_RETURN (REG_BADBR);
2919 }
2920
2921 if (!(syntax & RE_NO_BK_BRACES))
2922 {
2923 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2924
2925 PATFETCH (c);
2926 }
2927
2928 if (c != '}')
2929 {
2930 if (syntax & RE_NO_BK_BRACES)
2931 goto unfetch_interval;
2932 else
2933 FREE_STACK_RETURN (REG_BADBR);
2934 }
2935
2936 /* We just parsed a valid interval. */
2937
2938 /* If it's invalid to have no preceding re. */
2939 if (!laststart)
2940 {
2941 if (syntax & RE_CONTEXT_INVALID_OPS)
2942 FREE_STACK_RETURN (REG_BADRPT);
2943 else if (syntax & RE_CONTEXT_INDEP_OPS)
2944 laststart = b;
2945 else
2946 goto unfetch_interval;
2947 }
2948
2949 /* If the upper bound is zero, don't want to succeed at
2950 all; jump from `laststart' to `b + 3', which will be
2951 the end of the buffer after we insert the jump. */
2952 if (upper_bound == 0)
2953 {
2954 GET_BUFFER_SPACE (3);
2955 INSERT_JUMP (jump, laststart, b + 3);
2956 b += 3;
2957 }
2958
2959 /* Otherwise, we have a nontrivial interval. When
2960 we're all done, the pattern will look like:
2961 set_number_at <jump count> <upper bound>
2962 set_number_at <succeed_n count> <lower bound>
2963 succeed_n <after jump addr> <succeed_n count>
2964 <body of loop>
2965 jump_n <succeed_n addr> <jump count>
2966 (The upper bound and `jump_n' are omitted if
2967 `upper_bound' is 1, though.) */
2968 else
2969 { /* If the upper bound is > 1, we need to insert
2970 more at the end of the loop. */
2971 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2972
2973 GET_BUFFER_SPACE (nbytes);
2974
2975 /* Initialize lower bound of the `succeed_n', even
2976 though it will be set during matching by its
2977 attendant `set_number_at' (inserted next),
2978 because `re_compile_fastmap' needs to know.
2979 Jump to the `jump_n' we might insert below. */
2980 INSERT_JUMP2 (succeed_n, laststart,
2981 b + 5 + (upper_bound > 1) * 5,
2982 lower_bound);
2983 b += 5;
2984
2985 /* Code to initialize the lower bound. Insert
2986 before the `succeed_n'. The `5' is the last two
2987 bytes of this `set_number_at', plus 3 bytes of
2988 the following `succeed_n'. */
2989 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2990 b += 5;
2991
2992 if (upper_bound > 1)
2993 { /* More than one repetition is allowed, so
2994 append a backward jump to the `succeed_n'
2995 that starts this interval.
2996
2997 When we've reached this during matching,
2998 we'll have matched the interval once, so
2999 jump back only `upper_bound - 1' times. */
3000 STORE_JUMP2 (jump_n, b, laststart + 5,
3001 upper_bound - 1);
3002 b += 5;
3003
3004 /* The location we want to set is the second
3005 parameter of the `jump_n'; that is `b-2' as
3006 an absolute address. `laststart' will be
3007 the `set_number_at' we're about to insert;
3008 `laststart+3' the number to set, the source
3009 for the relative address. But we are
3010 inserting into the middle of the pattern --
3011 so everything is getting moved up by 5.
3012 Conclusion: (b - 2) - (laststart + 3) + 5,
3013 i.e., b - laststart.
3014
3015 We insert this at the beginning of the loop
3016 so that if we fail during matching, we'll
3017 reinitialize the bounds. */
3018 insert_op2 (set_number_at, laststart, b - laststart,
3019 upper_bound - 1, b);
3020 b += 5;
3021 }
3022 }
3023 pending_exact = 0;
3024 beg_interval = NULL;
3025 }
3026 break;
3027
3028 unfetch_interval:
3029 /* If an invalid interval, match the characters as literals. */
3030 assert (beg_interval);
3031 p = beg_interval;
3032 beg_interval = NULL;
3033
3034 /* normal_char and normal_backslash need `c'. */
3035 PATFETCH (c);
3036
3037 if (!(syntax & RE_NO_BK_BRACES))
3038 {
3039 if (p > pattern && p[-1] == '\\')
3040 goto normal_backslash;
3041 }
3042 goto normal_char;
3043
3044 #ifdef emacs
3045 /* There is no way to specify the before_dot and after_dot
3046 operators. rms says this is ok. --karl */
3047 case '=':
3048 BUF_PUSH (at_dot);
3049 break;
3050
3051 case 's':
3052 laststart = b;
3053 PATFETCH (c);
3054 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3055 break;
3056
3057 case 'S':
3058 laststart = b;
3059 PATFETCH (c);
3060 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3061 break;
3062 #endif /* emacs */
3063
3064
3065 case 'w':
3066 if (re_syntax_options & RE_NO_GNU_OPS)
3067 goto normal_char;
3068 laststart = b;
3069 BUF_PUSH (wordchar);
3070 break;
3071
3072
3073 case 'W':
3074 if (re_syntax_options & RE_NO_GNU_OPS)
3075 goto normal_char;
3076 laststart = b;
3077 BUF_PUSH (notwordchar);
3078 break;
3079
3080
3081 case '<':
3082 if (re_syntax_options & RE_NO_GNU_OPS)
3083 goto normal_char;
3084 BUF_PUSH (wordbeg);
3085 break;
3086
3087 case '>':
3088 if (re_syntax_options & RE_NO_GNU_OPS)
3089 goto normal_char;
3090 BUF_PUSH (wordend);
3091 break;
3092
3093 case 'b':
3094 if (re_syntax_options & RE_NO_GNU_OPS)
3095 goto normal_char;
3096 BUF_PUSH (wordbound);
3097 break;
3098
3099 case 'B':
3100 if (re_syntax_options & RE_NO_GNU_OPS)
3101 goto normal_char;
3102 BUF_PUSH (notwordbound);
3103 break;
3104
3105 case '`':
3106 if (re_syntax_options & RE_NO_GNU_OPS)
3107 goto normal_char;
3108 BUF_PUSH (begbuf);
3109 break;
3110
3111 case '\'':
3112 if (re_syntax_options & RE_NO_GNU_OPS)
3113 goto normal_char;
3114 BUF_PUSH (endbuf);
3115 break;
3116
3117 case '1': case '2': case '3': case '4': case '5':
3118 case '6': case '7': case '8': case '9':
3119 if (syntax & RE_NO_BK_REFS)
3120 goto normal_char;
3121
3122 c1 = c - '0';
3123
3124 if (c1 > regnum)
3125 FREE_STACK_RETURN (REG_ESUBREG);
3126
3127 /* Can't back reference to a subexpression if inside of it. */
3128 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
3129 goto normal_char;
3130
3131 laststart = b;
3132 BUF_PUSH_2 (duplicate, c1);
3133 break;
3134
3135
3136 case '+':
3137 case '?':
3138 if (syntax & RE_BK_PLUS_QM)
3139 goto handle_plus;
3140 else
3141 goto normal_backslash;
3142
3143 default:
3144 normal_backslash:
3145 /* You might think it would be useful for \ to mean
3146 not to translate; but if we don't translate it
3147 it will never match anything. */
3148 if (!ismbchar (c))
3149 c = TRANSLATE (c);
3150 goto normal_char;
3151 }
3152 break;
3153
3154
3155 default:
3156 /* Expects the character in `c'. */
3157 normal_char:
3158
3159 p1 = p;
3160 if (ismbchar (c)) {
3161 p += mbclen (c) - 1;
3162 if (p > pend)
3163 return REG_EEND;
3164 }
3165
3166 /* If no exactn currently being built. */
3167 if (!pending_exact
3168
3169 /* If last exactn not at current position. */
3170 || pending_exact + *pending_exact + 1 != b
3171
3172 /* We have only one byte following the exactn for the count. */
3173 || *pending_exact >= (1 << BYTEWIDTH) - mbclen(c)
3174
3175 /* If followed by a repetition operator. */
3176 || *p == '*' || *p == '^'
3177 || ((syntax & RE_BK_PLUS_QM)
3178 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
3179 : (*p == '+' || *p == '?'))
3180 || ((syntax & RE_INTERVALS)
3181 && ((syntax & RE_NO_BK_BRACES)
3182 ? *p == '{'
3183 : (p[0] == '\\' && p[1] == '{'))))
3184 {
3185 /* Start building a new exactn. */
3186
3187 laststart = b;
3188
3189 BUF_PUSH_2 (exactn, 0);
3190 pending_exact = b - 1;
3191 }
3192
3193 BUF_PUSH (c);
3194 (*pending_exact)++;
3195 while (p1 < p) {
3196 BUF_PUSH (*p1++);
3197 (*pending_exact)++;
3198 }
3199 break;
3200 } /* switch (c) */
3201 } /* while p != pend */
3202
3203
3204 /* Through the pattern now. */
3205
3206 if (fixup_alt_jump)
3207 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3208
3209 if (!COMPILE_STACK_EMPTY)
3210 FREE_STACK_RETURN (REG_EPAREN);
3211
3212 /* If we don't want backtracking, force success
3213 the first time we reach the end of the compiled pattern. */
3214 if (syntax & RE_NO_POSIX_BACKTRACKING)
3215 BUF_PUSH (succeed);
3216
3217 free (compile_stack.stack);
3218
3219 /* We have succeeded; set the length of the buffer. */
3220 bufp->used = b - bufp->buffer;
3221
3222 #ifdef DEBUG
3223 if (debug)
3224 {
3225 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3226 print_compiled_pattern (bufp);
3227 }
3228 #endif /* DEBUG */
3229
3230 #ifndef MATCH_MAY_ALLOCATE
3231 /* Initialize the failure stack to the largest possible stack. This
3232 isn't necessary unless we're trying to avoid calling alloca in
3233 the search and match routines. */
3234 {
3235 int num_regs = bufp->re_nsub + 1;
3236
3237 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3238 is strictly greater than re_max_failures, the largest possible stack
3239 is 2 * re_max_failures failure points. */
3240 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3241 {
3242 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3243
3244 #ifdef emacs
3245 if (! fail_stack.stack)
3246 fail_stack.stack
3247 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3248 * sizeof (fail_stack_elt_t));
3249 else
3250 fail_stack.stack
3251 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3252 (fail_stack.size
3253 * sizeof (fail_stack_elt_t)));
3254 #else /* not emacs */
3255 if (! fail_stack.stack)
3256 fail_stack.stack
3257 = (fail_stack_elt_t *) malloc (fail_stack.size
3258 * sizeof (fail_stack_elt_t));
3259 else
3260 fail_stack.stack
3261 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3262 (fail_stack.size
3263 * sizeof (fail_stack_elt_t)));
3264 #endif /* not emacs */
3265 }
3266
3267 regex_grow_registers (num_regs);
3268 }
3269 #endif /* not MATCH_MAY_ALLOCATE */
3270
3271 return REG_NOERROR;
3272 } /* regex_compile */
3273
3274 /* Subroutines for `regex_compile'. */
3275
3276 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3277
3278 static void
store_op1(op,loc,arg)3279 store_op1 (op, loc, arg)
3280 re_opcode_t op;
3281 unsigned char *loc;
3282 int arg;
3283 {
3284 *loc = (unsigned char) op;
3285 STORE_NUMBER (loc + 1, arg);
3286 }
3287
3288
3289 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3290
3291 static void
store_op2(op,loc,arg1,arg2)3292 store_op2 (op, loc, arg1, arg2)
3293 re_opcode_t op;
3294 unsigned char *loc;
3295 int arg1, arg2;
3296 {
3297 *loc = (unsigned char) op;
3298 STORE_NUMBER (loc + 1, arg1);
3299 STORE_NUMBER (loc + 3, arg2);
3300 }
3301
3302
3303 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3304 for OP followed by two-byte integer parameter ARG. */
3305
3306 static void
insert_op1(op,loc,arg,end)3307 insert_op1 (op, loc, arg, end)
3308 re_opcode_t op;
3309 unsigned char *loc;
3310 int arg;
3311 unsigned char *end;
3312 {
3313 register unsigned char *pfrom = end;
3314 register unsigned char *pto = end + 3;
3315
3316 while (pfrom != loc)
3317 *--pto = *--pfrom;
3318
3319 store_op1 (op, loc, arg);
3320 }
3321
3322
3323 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3324
3325 static void
insert_op2(op,loc,arg1,arg2,end)3326 insert_op2 (op, loc, arg1, arg2, end)
3327 re_opcode_t op;
3328 unsigned char *loc;
3329 int arg1, arg2;
3330 unsigned char *end;
3331 {
3332 register unsigned char *pfrom = end;
3333 register unsigned char *pto = end + 5;
3334
3335 while (pfrom != loc)
3336 *--pto = *--pfrom;
3337
3338 store_op2 (op, loc, arg1, arg2);
3339 }
3340
3341
3342 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3343 after an alternative or a begin-subexpression. We assume there is at
3344 least one character before the ^. */
3345
3346 static boolean
at_begline_loc_p(pattern,p,syntax)3347 at_begline_loc_p (pattern, p, syntax)
3348 const char *pattern, *p;
3349 reg_syntax_t syntax;
3350 {
3351 const char *prev = p - 2;
3352 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3353
3354 return
3355 /* After a subexpression? */
3356 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3357 /* After an alternative? */
3358 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3359 }
3360
3361
3362 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3363 at least one character after the $, i.e., `P < PEND'. */
3364
3365 static boolean
at_endline_loc_p(p,pend,syntax)3366 at_endline_loc_p (p, pend, syntax)
3367 const char *p, *pend;
3368 reg_syntax_t syntax;
3369 {
3370 const char *next = p;
3371 boolean next_backslash = *next == '\\';
3372 const char *next_next = p + 1 < pend ? p + 1 : 0;
3373
3374 return
3375 /* Before a subexpression? */
3376 (syntax & RE_NO_BK_PARENS ? *next == ')'
3377 : next_backslash && next_next && *next_next == ')')
3378 /* Before an alternative? */
3379 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3380 : next_backslash && next_next && *next_next == '|');
3381 }
3382
3383
3384 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3385 false if it's not. */
3386
3387 static boolean
group_in_compile_stack(compile_stack,regnum)3388 group_in_compile_stack (compile_stack, regnum)
3389 compile_stack_type compile_stack;
3390 regnum_t regnum;
3391 {
3392 int this_element;
3393
3394 for (this_element = compile_stack.avail - 1;
3395 this_element >= 0;
3396 this_element--)
3397 if (compile_stack.stack[this_element].regnum == regnum)
3398 return true;
3399
3400 return false;
3401 }
3402
3403
3404 #if 0 /* We use set_list_bits() now. */
3405 /* Read the ending character of a range (in a bracket expression) from the
3406 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3407 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3408 Then we set the translation of all bits between the starting and
3409 ending characters (inclusive) in the compiled pattern B.
3410
3411 Return an error code.
3412
3413 We use these short variable names so we can use the same macros as
3414 `regex_compile' itself. */
3415
3416 static reg_errcode_t
3417 compile_range (p_ptr, pend, translate, syntax, b)
3418 const char **p_ptr, *pend;
3419 RE_TRANSLATE_TYPE translate;
3420 reg_syntax_t syntax;
3421 unsigned char *b;
3422 {
3423 unsigned this_char;
3424
3425 const char *p = *p_ptr;
3426 unsigned int range_start, range_end;
3427
3428 if (p == pend)
3429 return REG_ERANGE;
3430
3431 /* Even though the pattern is a signed `char *', we need to fetch
3432 with unsigned char *'s; if the high bit of the pattern character
3433 is set, the range endpoints will be negative if we fetch using a
3434 signed char *.
3435
3436 We also want to fetch the endpoints without translating them; the
3437 appropriate translation is done in the bit-setting loop below. */
3438 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3439 range_start = ((const unsigned char *) p)[-2];
3440 range_end = ((const unsigned char *) p)[0];
3441
3442 /* Have to increment the pointer into the pattern string, so the
3443 caller isn't still at the ending character. */
3444 (*p_ptr)++;
3445
3446 /* If the start is after the end, the range is empty. */
3447 if (range_start > range_end)
3448 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3449
3450 /* Here we see why `this_char' has to be larger than an `unsigned
3451 char' -- the range is inclusive, so if `range_end' == 0xff
3452 (assuming 8-bit characters), we would otherwise go into an infinite
3453 loop, since all characters <= 0xff. */
3454 for (this_char = range_start; this_char <= range_end; this_char++)
3455 {
3456 SET_LIST_BIT (TRANSLATE (this_char));
3457 }
3458
3459 return REG_NOERROR;
3460 }
3461 #endif
3462
3463 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3464 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3465 characters can start a string that matches the pattern. This fastmap
3466 is used by re_search to skip quickly over impossible starting points.
3467
3468 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3469 area as BUFP->fastmap.
3470
3471 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3472 the pattern buffer.
3473
3474 Returns 0 if we succeed, -2 if an internal error. */
3475
3476 int
re_compile_fastmap(bufp)3477 re_compile_fastmap (bufp)
3478 struct re_pattern_buffer *bufp;
3479 {
3480 int j, k;
3481 #ifdef MATCH_MAY_ALLOCATE
3482 fail_stack_type fail_stack;
3483 #endif
3484 #ifndef REGEX_MALLOC
3485 char *destination;
3486 #endif
3487
3488 register char *fastmap = bufp->fastmap;
3489 unsigned char *pattern = bufp->buffer;
3490 unsigned char *p = pattern;
3491 register unsigned char *pend = pattern + bufp->used;
3492
3493 #ifdef REL_ALLOC
3494 /* This holds the pointer to the failure stack, when
3495 it is allocated relocatably. */
3496 fail_stack_elt_t *failure_stack_ptr;
3497 #endif
3498
3499 /* Assume that each path through the pattern can be null until
3500 proven otherwise. We set this false at the bottom of switch
3501 statement, to which we get only if a particular path doesn't
3502 match the empty string. */
3503 boolean path_can_be_null = true;
3504
3505 /* We aren't doing a `succeed_n' to begin with. */
3506 boolean succeed_n_p = false;
3507
3508 assert (fastmap != NULL && p != NULL);
3509
3510 INIT_FAIL_STACK ();
3511 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3512 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3513 bufp->can_be_null = 0;
3514
3515 while (1)
3516 {
3517 if (p == pend || *p == succeed)
3518 {
3519 /* We have reached the (effective) end of pattern. */
3520 if (!FAIL_STACK_EMPTY ())
3521 {
3522 bufp->can_be_null |= path_can_be_null;
3523
3524 /* Reset for next path. */
3525 path_can_be_null = true;
3526
3527 p = fail_stack.stack[--fail_stack.avail].pointer;
3528
3529 continue;
3530 }
3531 else
3532 break;
3533 }
3534
3535 /* We should never be about to go beyond the end of the pattern. */
3536 assert (p < pend);
3537
3538 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3539 {
3540
3541 /* I guess the idea here is to simply not bother with a fastmap
3542 if a backreference is used, since it's too hard to figure out
3543 the fastmap for the corresponding group. Setting
3544 `can_be_null' stops `re_search_2' from using the fastmap, so
3545 that is all we do. */
3546 case duplicate:
3547 bufp->can_be_null = 1;
3548 goto done;
3549
3550
3551 /* Following are the cases which match a character. These end
3552 with `break'. */
3553
3554 case exactn:
3555 fastmap[p[1]] = 1;
3556 break;
3557
3558
3559 case charset:
3560 /* NOTE: Charset for single-byte chars never contain
3561 multi-byte char. See set_list_bits(). */
3562 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3563 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3564 fastmap[j] = 1;
3565 {
3566 unsigned short size;
3567 unsigned char c, end;
3568 char buf[8], bufend[8];
3569
3570 p += p[-1] + 2;
3571 size = EXTRACT_UNSIGNED (&p[-2]);
3572 for (j = 0; j < size; j++)
3573 {
3574 /* set bits for 1st bytes of multi-byte chars. */
3575 mbcstore (buf, EXTRACT_MBC (&p[j*8]));
3576 mbcstore (bufend, EXTRACT_MBC (&p[j*8 + 4]));
3577 for (c = (unsigned char) buf[0], end = (unsigned char) bufend[0];
3578 c <= end;
3579 c++)
3580 /* NOTE: Charset for multi-byte chars might contain
3581 single-byte chars. We must reject them. */
3582 if (ismbchar (c))
3583 fastmap[c] = 1;
3584 }
3585 }
3586 break;
3587
3588
3589 case charset_not:
3590 /* S: set of all single-byte chars.
3591 M: set of all first bytes that can start multi-byte chars.
3592 s: any set of single-byte chars.
3593 m: any set of first bytes that can start multi-byte chars.
3594
3595 We assume S+M = U.
3596 ___ _ _
3597 s+m = (S*s+M*m). */
3598 /* Chars beyond end of map must be allowed. */
3599 /* NOTE: Charset_not for single-byte chars might contain
3600 multi-byte chars. See set_list_bits(). */
3601 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3602 if (!ismbchar (j))
3603 fastmap[j] = 1;
3604
3605 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3606 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3607 if (!ismbchar (j))
3608 fastmap[j] = 1;
3609 {
3610 unsigned short size;
3611 unsigned c, beg;
3612 char buf[8];
3613
3614 p += p[-1] + 2;
3615 size = EXTRACT_UNSIGNED (&p[-2]);
3616 c = 0x00;
3617 for (j = 0; j < size; j++) {
3618 mbcstore (buf, EXTRACT_MBC (&p[j*8 + 0]));
3619 for (beg = (unsigned char) buf[0]; c <= beg; c++)
3620 if (ismbchar (c))
3621 fastmap[c] = 1;
3622 mbcstore (buf, EXTRACT_MBC (&p[j*8 + 4]));
3623 c = (unsigned char) buf[0];
3624 }
3625 for (beg = 0xff; c <= beg; c++)
3626 if (ismbchar (c))
3627 fastmap[c] = 1;
3628 }
3629 break;
3630
3631
3632 case wordchar:
3633 for (j = 0; j < (1 << BYTEWIDTH); j++)
3634 if (SYNTAX (j) == Sword)
3635 fastmap[j] = 1;
3636 break;
3637
3638
3639 case notwordchar:
3640 for (j = 0; j < (1 << BYTEWIDTH); j++)
3641 if (SYNTAX (j) != Sword)
3642 fastmap[j] = 1;
3643 break;
3644
3645
3646 case anychar:
3647 {
3648 int fastmap_newline = fastmap['\n'];
3649
3650 /* `.' matches anything ... */
3651 for (j = 0; j < (1 << BYTEWIDTH); j++)
3652 fastmap[j] = 1;
3653
3654 /* ... except perhaps newline. */
3655 if (!(bufp->syntax & RE_DOT_NEWLINE))
3656 fastmap['\n'] = fastmap_newline;
3657
3658 /* Return if we have already set `can_be_null'; if we have,
3659 then the fastmap is irrelevant. Something's wrong here. */
3660 else if (bufp->can_be_null)
3661 goto done;
3662
3663 /* Otherwise, have to check alternative paths. */
3664 break;
3665 }
3666
3667 #ifdef emacs
3668 case syntaxspec:
3669 k = *p++;
3670 for (j = 0; j < (1 << BYTEWIDTH); j++)
3671 if (SYNTAX (j) == (enum syntaxcode) k)
3672 fastmap[j] = 1;
3673 break;
3674
3675
3676 case notsyntaxspec:
3677 k = *p++;
3678 for (j = 0; j < (1 << BYTEWIDTH); j++)
3679 if (SYNTAX (j) != (enum syntaxcode) k)
3680 fastmap[j] = 1;
3681 break;
3682
3683
3684 /* All cases after this match the empty string. These end with
3685 `continue'. */
3686
3687
3688 case before_dot:
3689 case at_dot:
3690 case after_dot:
3691 continue;
3692 #endif /* emacs */
3693
3694
3695 case no_op:
3696 case begline:
3697 case endline:
3698 case begbuf:
3699 case endbuf:
3700 case wordbound:
3701 case notwordbound:
3702 case wordbeg:
3703 case wordend:
3704 case push_dummy_failure:
3705 continue;
3706
3707
3708 case jump_n:
3709 case pop_failure_jump:
3710 case maybe_pop_jump:
3711 case jump:
3712 case jump_past_alt:
3713 case dummy_failure_jump:
3714 EXTRACT_NUMBER_AND_INCR (j, p);
3715 p += j;
3716 if (j > 0)
3717 continue;
3718
3719 /* Jump backward implies we just went through the body of a
3720 loop and matched nothing. Opcode jumped to should be
3721 `on_failure_jump' or `succeed_n'. Just treat it like an
3722 ordinary jump. For a * loop, it has pushed its failure
3723 point already; if so, discard that as redundant. */
3724 if ((re_opcode_t) *p != on_failure_jump
3725 && (re_opcode_t) *p != succeed_n)
3726 continue;
3727
3728 p++;
3729 EXTRACT_NUMBER_AND_INCR (j, p);
3730 p += j;
3731
3732 /* If what's on the stack is where we are now, pop it. */
3733 if (!FAIL_STACK_EMPTY ()
3734 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3735 fail_stack.avail--;
3736
3737 continue;
3738
3739
3740 case on_failure_jump:
3741 case on_failure_keep_string_jump:
3742 handle_on_failure_jump:
3743 EXTRACT_NUMBER_AND_INCR (j, p);
3744
3745 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3746 end of the pattern. We don't want to push such a point,
3747 since when we restore it above, entering the switch will
3748 increment `p' past the end of the pattern. We don't need
3749 to push such a point since we obviously won't find any more
3750 fastmap entries beyond `pend'. Such a pattern can match
3751 the null string, though. */
3752 if (p + j < pend)
3753 {
3754 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3755 {
3756 RESET_FAIL_STACK ();
3757 return -2;
3758 }
3759 }
3760 else
3761 bufp->can_be_null = 1;
3762
3763 if (succeed_n_p)
3764 {
3765 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3766 succeed_n_p = false;
3767 }
3768
3769 continue;
3770
3771
3772 case succeed_n:
3773 /* Get to the number of times to succeed. */
3774 p += 2;
3775
3776 /* Increment p past the n for when k != 0. */
3777 EXTRACT_NUMBER_AND_INCR (k, p);
3778 if (k == 0)
3779 {
3780 p -= 4;
3781 succeed_n_p = true; /* Spaghetti code alert. */
3782 goto handle_on_failure_jump;
3783 }
3784 continue;
3785
3786
3787 case set_number_at:
3788 p += 4;
3789 continue;
3790
3791
3792 case start_memory:
3793 case stop_memory:
3794 p += 2;
3795 continue;
3796
3797
3798 default:
3799 abort (); /* We have listed all the cases. */
3800 } /* switch *p++ */
3801
3802 /* Getting here means we have found the possible starting
3803 characters for one path of the pattern -- and that the empty
3804 string does not match. We need not follow this path further.
3805 Instead, look at the next alternative (remembered on the
3806 stack), or quit if no more. The test at the top of the loop
3807 does these things. */
3808 path_can_be_null = false;
3809 p = pend;
3810 } /* while p */
3811
3812 /* Set `can_be_null' for the last path (also the first path, if the
3813 pattern is empty). */
3814 bufp->can_be_null |= path_can_be_null;
3815
3816 done:
3817 RESET_FAIL_STACK ();
3818 return 0;
3819 } /* re_compile_fastmap */
3820
3821 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3822 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3823 this memory for recording register information. STARTS and ENDS
3824 must be allocated using the malloc library routine, and must each
3825 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3826
3827 If NUM_REGS == 0, then subsequent matches should allocate their own
3828 register data.
3829
3830 Unless this function is called, the first search or match using
3831 PATTERN_BUFFER will allocate its own register data, without
3832 freeing the old data. */
3833
3834 void
re_set_registers(bufp,regs,num_regs,starts,ends)3835 re_set_registers (bufp, regs, num_regs, starts, ends)
3836 struct re_pattern_buffer *bufp;
3837 struct re_registers *regs;
3838 unsigned num_regs;
3839 regoff_t *starts, *ends;
3840 {
3841 if (num_regs)
3842 {
3843 bufp->regs_allocated = REGS_REALLOCATE;
3844 regs->num_regs = num_regs;
3845 regs->start = starts;
3846 regs->end = ends;
3847 }
3848 else
3849 {
3850 bufp->regs_allocated = REGS_UNALLOCATED;
3851 regs->num_regs = 0;
3852 regs->start = regs->end = (regoff_t *) 0;
3853 }
3854 }
3855
3856 /* Searching routines. */
3857
3858 /* Like re_search_2, below, but only one string is specified, and
3859 doesn't let you say where to stop matching. */
3860
3861 int
re_search(bufp,string,size,startpos,range,regs)3862 re_search (bufp, string, size, startpos, range, regs)
3863 struct re_pattern_buffer *bufp;
3864 const char *string;
3865 int size, startpos, range;
3866 struct re_registers *regs;
3867 {
3868 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3869 regs, size);
3870 }
3871
3872
3873 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3874 virtual concatenation of STRING1 and STRING2, starting first at index
3875 STARTPOS, then at STARTPOS + 1, and so on.
3876
3877 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3878
3879 RANGE is how far to scan while trying to match. RANGE = 0 means try
3880 only at STARTPOS; in general, the last start tried is STARTPOS +
3881 RANGE.
3882
3883 In REGS, return the indices of the virtual concatenation of STRING1
3884 and STRING2 that matched the entire BUFP->buffer and its contained
3885 subexpressions.
3886
3887 Do not consider matching one past the index STOP in the virtual
3888 concatenation of STRING1 and STRING2.
3889
3890 We return either the position in the strings at which the match was
3891 found, -1 if no match, or -2 if error (such as failure
3892 stack overflow). */
3893
3894 int
re_search_2(bufp,string1,size1,string2,size2,startpos,range,regs,stop)3895 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3896 struct re_pattern_buffer *bufp;
3897 const char *string1, *string2;
3898 int size1, size2;
3899 int startpos;
3900 int range;
3901 struct re_registers *regs;
3902 int stop;
3903 {
3904 int val;
3905 register char *fastmap = bufp->fastmap;
3906 register RE_TRANSLATE_TYPE translate = bufp->translate;
3907 int total_size = size1 + size2;
3908 int endpos = startpos + range;
3909
3910 /* Check for out-of-range STARTPOS. */
3911 if (startpos < 0 || startpos > total_size)
3912 return -1;
3913
3914 /* Fix up RANGE if it might eventually take us outside
3915 the virtual concatenation of STRING1 and STRING2.
3916 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3917 if (endpos < 0)
3918 range = 0 - startpos;
3919 else if (endpos > total_size)
3920 range = total_size - startpos;
3921
3922 /* If the search isn't to be a backwards one, don't waste time in a
3923 search for a pattern that must be anchored. */
3924 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3925 {
3926 if (startpos > 0)
3927 return -1;
3928 else
3929 range = 1;
3930 }
3931
3932 #ifdef emacs
3933 /* In a forward search for something that starts with \=.
3934 don't keep searching past point. */
3935 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3936 {
3937 range = PT - startpos;
3938 if (range <= 0)
3939 return -1;
3940 }
3941 #endif /* emacs */
3942
3943 /* Update the fastmap now if not correct already. */
3944 if (fastmap && !bufp->fastmap_accurate)
3945 if (re_compile_fastmap (bufp) == -2)
3946 return -2;
3947
3948 /* Loop through the string, looking for a place to start matching. */
3949 for (;;)
3950 {
3951 /* If a fastmap is supplied, skip quickly over characters that
3952 cannot be the start of a match. If the pattern can match the
3953 null string, however, we don't need to skip characters; we want
3954 the first null string. */
3955 if (fastmap && startpos < total_size && !bufp->can_be_null)
3956 {
3957 if (range > 0) /* Searching forwards. */
3958 {
3959 register const char *d;
3960 register int lim = 0;
3961 int irange = range;
3962 unsigned char c;
3963
3964 if (startpos < size1 && startpos + range >= size1)
3965 lim = range - (size1 - startpos);
3966
3967 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3968
3969 /* Written out as an if-else to avoid testing `translate'
3970 inside the loop. */
3971 if (translate)
3972 while (range > lim) {
3973 c = *d++;
3974 if (ismbchar (c)) {
3975 size_t n;
3976 if (fastmap[c])
3977 break;
3978 d += (n = mbclen (c)) - 1;
3979 range -= n;
3980 continue;
3981 }
3982 if (fastmap[(unsigned char) translate[c]])
3983 break;
3984 range--;
3985 }
3986 else
3987 while (range > lim && (c = *d++, !fastmap[c])) {
3988 if (ismbchar (c)) {
3989 size_t n;
3990 d += (n = mbclen (c)) - 1;
3991 range -= n;
3992 continue;
3993 }
3994 range--;
3995 }
3996
3997 startpos += irange - range;
3998 }
3999 else /* Searching backwards. */
4000 {
4001 register char c = (size1 == 0 || startpos >= size1
4002 ? string2[startpos - size1]
4003 : string1[startpos]);
4004
4005 if (!fastmap[(unsigned char) TRANSLATE (c)])
4006 goto advance;
4007 }
4008 }
4009
4010 /* If can't match the null string, and that's all we have left, fail. */
4011 if (range >= 0 && startpos == total_size && fastmap
4012 && !bufp->can_be_null)
4013 return -1;
4014
4015 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4016 startpos, regs, stop);
4017 #ifndef REGEX_MALLOC
4018 #ifdef C_ALLOCA
4019 alloca (0);
4020 #endif
4021 #endif
4022
4023 if (val >= 0)
4024 return startpos;
4025
4026 if (val == -2)
4027 return -2;
4028
4029 advance:
4030 if (!range)
4031 break;
4032 else if (range > 0)
4033 {
4034 const char *d = ((startpos >= size1 ? string2 - size1 : string1)
4035 + startpos);
4036
4037 if (ismbchar (*d)) {
4038 size_t n;
4039 range -= (n = mbclen (*d) - 1);
4040 startpos += n;
4041 if (range <= 0)
4042 break;
4043 }
4044 range--, startpos++;
4045 }
4046 else
4047 {
4048 range++, startpos--;
4049 {
4050 const char *s, *d, *p;
4051
4052 if (startpos < size1)
4053 s = string1, d = string1 + startpos;
4054 else
4055 s = string2, d = string2 + startpos - size1;
4056 if (current_mbctype == MBCTYPE_UTF8) {
4057 for (p = d; p > s && !ismbchar(*p) && !((unsigned char) *p < 0x80); p--) {
4058 range++;
4059 startpos--;
4060 }
4061 if (range > 0)
4062 break;
4063 } else {
4064 for (p = d; p-- > s && ismbchar(*p); )
4065 /* --p >= s ���� 80[12]?86 ��ư���ʤ���ǽ��������. (huge
4066 model �ʳ���, s �Υ��ե��åȤ� 0 ���ä����.) */
4067 ;
4068 if (!((d - p) & 1)) {
4069 if (!range)
4070 break;
4071 range++, startpos--;
4072 }
4073 }
4074 }
4075 }
4076 }
4077 return -1;
4078 } /* re_search_2 */
4079
4080 /* This converts PTR, a pointer into one of the search strings `string1'
4081 and `string2' into an offset from the beginning of that string. */
4082 #define POINTER_TO_OFFSET(ptr) \
4083 (FIRST_STRING_P (ptr) \
4084 ? ((regoff_t) ((ptr) - string1)) \
4085 : ((regoff_t) ((ptr) - string2 + size1)))
4086
4087 /* Macros for dealing with the split strings in re_match_2. */
4088
4089 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4090
4091 /* Call before fetching a character with *d. This switches over to
4092 string2 if necessary. */
4093 #define PREFETCH() \
4094 while (d == dend) \
4095 { \
4096 /* End of string2 => fail. */ \
4097 if (dend == end_match_2) \
4098 goto fail; \
4099 /* End of string1 => advance to string2. */ \
4100 d = string2; \
4101 dend = end_match_2; \
4102 }
4103
4104
4105 /* Test if at very beginning or at very end of the virtual concatenation
4106 of `string1' and `string2'. If only one string, it's `string2'. */
4107 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4108 #define AT_STRINGS_END(d) ((d) == end2)
4109
4110
4111 /* Test if D points to a character which is word-constituent. We have
4112 two special cases to check for: if past the end of string1, look at
4113 the first character in string2; and if before the beginning of
4114 string2, look at the last character in string1. */
4115 #define WORDCHAR_P(d) \
4116 (SYNTAX ((d) == end1 ? *string2 \
4117 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4118 == Sword)
4119
4120 /* Disabled due to a compiler bug -- see comment at case wordbound */
4121 #if 0
4122 /* Test if the character before D and the one at D differ with respect
4123 to being word-constituent. */
4124 #define AT_WORD_BOUNDARY(d) \
4125 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4126 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4127 #endif
4128
4129 /* Free everything we malloc. */
4130 #ifdef MATCH_MAY_ALLOCATE
4131 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4132 #define FREE_VARIABLES() \
4133 do { \
4134 REGEX_FREE_STACK (fail_stack.stack); \
4135 FREE_VAR (regstart); \
4136 FREE_VAR (regend); \
4137 FREE_VAR (old_regstart); \
4138 FREE_VAR (old_regend); \
4139 FREE_VAR (best_regstart); \
4140 FREE_VAR (best_regend); \
4141 FREE_VAR (reg_info); \
4142 FREE_VAR (reg_dummy); \
4143 FREE_VAR (reg_info_dummy); \
4144 } while (0)
4145 #else
4146 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4147 #endif /* not MATCH_MAY_ALLOCATE */
4148
4149 /* These values must meet several constraints. They must not be valid
4150 register values; since we have a limit of 255 registers (because
4151 we use only one byte in the pattern for the register number), we can
4152 use numbers larger than 255. They must differ by 1, because of
4153 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4154 be larger than the value for the highest register, so we do not try
4155 to actually save any registers when none are active. */
4156 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4157 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4158
4159 /* Matching routines. */
4160
4161 #ifndef emacs /* Emacs never uses this. */
4162 /* re_match is like re_match_2 except it takes only a single string. */
4163
4164 int
re_match(bufp,string,size,pos,regs)4165 re_match (bufp, string, size, pos, regs)
4166 struct re_pattern_buffer *bufp;
4167 const char *string;
4168 int size, pos;
4169 struct re_registers *regs;
4170 {
4171 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4172 pos, regs, size);
4173 #ifndef REGEX_MALLOC
4174 #ifdef C_ALLOCA
4175 alloca (0);
4176 #endif
4177 #endif
4178 return result;
4179 }
4180 #endif /* not emacs */
4181
4182 static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
4183 unsigned char *end,
4184 register_info_type *reg_info));
4185 static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
4186 unsigned char *end,
4187 register_info_type *reg_info));
4188 static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
4189 unsigned char *end,
4190 register_info_type *reg_info));
4191 static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
4192 int len, char *translate));
4193
4194 /* re_match_2 matches the compiled pattern in BUFP against the
4195 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4196 and SIZE2, respectively). We start matching at POS, and stop
4197 matching at STOP.
4198
4199 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4200 store offsets for the substring each group matched in REGS. See the
4201 documentation for exactly how many groups we fill.
4202
4203 We return -1 if no match, -2 if an internal error (such as the
4204 failure stack overflowing). Otherwise, we return the length of the
4205 matched substring. */
4206
4207 int
re_match_2(bufp,string1,size1,string2,size2,pos,regs,stop)4208 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4209 struct re_pattern_buffer *bufp;
4210 const char *string1, *string2;
4211 int size1, size2;
4212 int pos;
4213 struct re_registers *regs;
4214 int stop;
4215 {
4216 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
4217 pos, regs, stop);
4218 #ifndef REGEX_MALLOC
4219 #ifdef C_ALLOCA
4220 alloca (0);
4221 #endif
4222 #endif
4223 return result;
4224 }
4225
4226 /* This is a separate function so that we can force an alloca cleanup
4227 afterwards. */
4228 static int
re_match_2_internal(bufp,string1,size1,string2,size2,pos,regs,stop)4229 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4230 struct re_pattern_buffer *bufp;
4231 const char *string1, *string2;
4232 int size1, size2;
4233 int pos;
4234 struct re_registers *regs;
4235 int stop;
4236 {
4237 /* General temporaries. */
4238 int mcnt;
4239 unsigned char *p1;
4240
4241 /* Just past the end of the corresponding string. */
4242 const char *end1, *end2;
4243
4244 /* Pointers into string1 and string2, just past the last characters in
4245 each to consider matching. */
4246 const char *end_match_1, *end_match_2;
4247
4248 /* Where we are in the data, and the end of the current string. */
4249 const char *d, *dend;
4250
4251 /* Where we are in the pattern, and the end of the pattern. */
4252 unsigned char *p = bufp->buffer;
4253 register unsigned char *pend = p + bufp->used;
4254
4255 /* Mark the opcode just after a start_memory, so we can test for an
4256 empty subpattern when we get to the stop_memory. */
4257 unsigned char *just_past_start_mem = 0;
4258
4259 /* We use this to map every character in the string. */
4260 RE_TRANSLATE_TYPE translate = bufp->translate;
4261
4262 /* Failure point stack. Each place that can handle a failure further
4263 down the line pushes a failure point on this stack. It consists of
4264 restart, regend, and reg_info for all registers corresponding to
4265 the subexpressions we're currently inside, plus the number of such
4266 registers, and, finally, two char *'s. The first char * is where
4267 to resume scanning the pattern; the second one is where to resume
4268 scanning the strings. If the latter is zero, the failure point is
4269 a ``dummy''; if a failure happens and the failure point is a dummy,
4270 it gets discarded and the next next one is tried. */
4271 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4272 fail_stack_type fail_stack;
4273 #endif
4274 #ifdef DEBUG
4275 static unsigned failure_id = 0;
4276 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4277 #endif
4278
4279 #ifdef REL_ALLOC
4280 /* This holds the pointer to the failure stack, when
4281 it is allocated relocatably. */
4282 fail_stack_elt_t *failure_stack_ptr;
4283 #endif
4284
4285 /* We fill all the registers internally, independent of what we
4286 return, for use in backreferences. The number here includes
4287 an element for register zero. */
4288 size_t num_regs = bufp->re_nsub + 1;
4289
4290 /* The currently active registers. */
4291 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4292 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4293
4294 /* Information on the contents of registers. These are pointers into
4295 the input strings; they record just what was matched (on this
4296 attempt) by a subexpression part of the pattern, that is, the
4297 regnum-th regstart pointer points to where in the pattern we began
4298 matching and the regnum-th regend points to right after where we
4299 stopped matching the regnum-th subexpression. (The zeroth register
4300 keeps track of what the whole pattern matches.) */
4301 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4302 const char **regstart, **regend;
4303 #endif
4304
4305 /* If a group that's operated upon by a repetition operator fails to
4306 match anything, then the register for its start will need to be
4307 restored because it will have been set to wherever in the string we
4308 are when we last see its open-group operator. Similarly for a
4309 register's end. */
4310 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4311 const char **old_regstart, **old_regend;
4312 #endif
4313
4314 /* The is_active field of reg_info helps us keep track of which (possibly
4315 nested) subexpressions we are currently in. The matched_something
4316 field of reg_info[reg_num] helps us tell whether or not we have
4317 matched any of the pattern so far this time through the reg_num-th
4318 subexpression. These two fields get reset each time through any
4319 loop their register is in. */
4320 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4321 register_info_type *reg_info;
4322 #endif
4323
4324 /* The following record the register info as found in the above
4325 variables when we find a match better than any we've seen before.
4326 This happens as we backtrack through the failure points, which in
4327 turn happens only if we have not yet matched the entire string. */
4328 unsigned best_regs_set = false;
4329 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4330 const char **best_regstart, **best_regend;
4331 #endif
4332
4333 /* Logically, this is `best_regend[0]'. But we don't want to have to
4334 allocate space for that if we're not allocating space for anything
4335 else (see below). Also, we never need info about register 0 for
4336 any of the other register vectors, and it seems rather a kludge to
4337 treat `best_regend' differently than the rest. So we keep track of
4338 the end of the best match so far in a separate variable. We
4339 initialize this to NULL so that when we backtrack the first time
4340 and need to test it, it's not garbage. */
4341 const char *match_end = NULL;
4342
4343 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4344 int set_regs_matched_done = 0;
4345
4346 /* Used when we pop values we don't care about. */
4347 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4348 const char **reg_dummy;
4349 register_info_type *reg_info_dummy;
4350 #endif
4351
4352 #ifdef DEBUG
4353 /* Counts the total number of registers pushed. */
4354 unsigned num_regs_pushed = 0;
4355 #endif
4356
4357 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4358
4359 INIT_FAIL_STACK ();
4360
4361 #ifdef MATCH_MAY_ALLOCATE
4362 /* Do not bother to initialize all the register variables if there are
4363 no groups in the pattern, as it takes a fair amount of time. If
4364 there are groups, we include space for register 0 (the whole
4365 pattern), even though we never use it, since it simplifies the
4366 array indexing. We should fix this. */
4367 if (bufp->re_nsub)
4368 {
4369 regstart = REGEX_TALLOC (num_regs, const char *);
4370 regend = REGEX_TALLOC (num_regs, const char *);
4371 old_regstart = REGEX_TALLOC (num_regs, const char *);
4372 old_regend = REGEX_TALLOC (num_regs, const char *);
4373 best_regstart = REGEX_TALLOC (num_regs, const char *);
4374 best_regend = REGEX_TALLOC (num_regs, const char *);
4375 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4376 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4377 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4378
4379 if (!(regstart && regend && old_regstart && old_regend && reg_info
4380 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4381 {
4382 FREE_VARIABLES ();
4383 return -2;
4384 }
4385 }
4386 else
4387 {
4388 /* We must initialize all our variables to NULL, so that
4389 `FREE_VARIABLES' doesn't try to free them. */
4390 regstart = regend = old_regstart = old_regend = best_regstart
4391 = best_regend = reg_dummy = NULL;
4392 reg_info = reg_info_dummy = (register_info_type *) NULL;
4393 }
4394 #endif /* MATCH_MAY_ALLOCATE */
4395
4396 /* The starting position is bogus. */
4397 if (pos < 0 || pos > size1 + size2)
4398 {
4399 FREE_VARIABLES ();
4400 return -1;
4401 }
4402
4403 /* Initialize subexpression text positions to -1 to mark ones that no
4404 start_memory/stop_memory has been seen for. Also initialize the
4405 register information struct. */
4406 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4407 {
4408 regstart[mcnt] = regend[mcnt]
4409 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4410
4411 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4412 IS_ACTIVE (reg_info[mcnt]) = 0;
4413 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4414 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4415 }
4416
4417 /* We move `string1' into `string2' if the latter's empty -- but not if
4418 `string1' is null. */
4419 if (size2 == 0 && string1 != NULL)
4420 {
4421 string2 = string1;
4422 size2 = size1;
4423 string1 = 0;
4424 size1 = 0;
4425 }
4426 end1 = string1 + size1;
4427 end2 = string2 + size2;
4428
4429 /* Compute where to stop matching, within the two strings. */
4430 if (stop <= size1)
4431 {
4432 end_match_1 = string1 + stop;
4433 end_match_2 = string2;
4434 }
4435 else
4436 {
4437 end_match_1 = end1;
4438 end_match_2 = string2 + stop - size1;
4439 }
4440
4441 /* `p' scans through the pattern as `d' scans through the data.
4442 `dend' is the end of the input string that `d' points within. `d'
4443 is advanced into the following input string whenever necessary, but
4444 this happens before fetching; therefore, at the beginning of the
4445 loop, `d' can be pointing at the end of a string, but it cannot
4446 equal `string2'. */
4447 if (size1 > 0 && pos <= size1)
4448 {
4449 d = string1 + pos;
4450 dend = end_match_1;
4451 }
4452 else
4453 {
4454 d = string2 + pos - size1;
4455 dend = end_match_2;
4456 }
4457
4458 DEBUG_PRINT1 ("The compiled pattern is:\n");
4459 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4460 DEBUG_PRINT1 ("The string to match is: `");
4461 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4462 DEBUG_PRINT1 ("'\n");
4463
4464 /* This loops over pattern commands. It exits by returning from the
4465 function if the match is complete, or it drops through if the match
4466 fails at this starting point in the input data. */
4467 for (;;)
4468 {
4469 #if defined (_LIBC) || defined (MSDOS)
4470 DEBUG_PRINT2 ("\n%p: ", p);
4471 #else
4472 DEBUG_PRINT2 ("\n0x%x: ", p);
4473 #endif
4474
4475 if (p == pend)
4476 { /* End of pattern means we might have succeeded. */
4477 DEBUG_PRINT1 ("end of pattern ... ");
4478
4479 /* If we haven't matched the entire string, and we want the
4480 longest match, try backtracking. */
4481 if (d != end_match_2)
4482 {
4483 /* 1 if this match ends in the same string (string1 or string2)
4484 as the best previous match. */
4485 boolean same_str_p = (FIRST_STRING_P (match_end)
4486 == MATCHING_IN_FIRST_STRING);
4487 /* 1 if this match is the best seen so far. */
4488 boolean best_match_p;
4489
4490 /* AIX compiler got confused when this was combined
4491 with the previous declaration. */
4492 if (same_str_p)
4493 best_match_p = d > match_end;
4494 else
4495 best_match_p = !MATCHING_IN_FIRST_STRING;
4496
4497 DEBUG_PRINT1 ("backtracking.\n");
4498
4499 if (!FAIL_STACK_EMPTY ())
4500 { /* More failure points to try. */
4501
4502 /* If exceeds best match so far, save it. */
4503 if (!best_regs_set || best_match_p)
4504 {
4505 best_regs_set = true;
4506 match_end = d;
4507
4508 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4509
4510 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4511 {
4512 best_regstart[mcnt] = regstart[mcnt];
4513 best_regend[mcnt] = regend[mcnt];
4514 }
4515 }
4516 goto fail;
4517 }
4518
4519 /* If no failure points, don't restore garbage. And if
4520 last match is real best match, don't restore second
4521 best one. */
4522 else if (best_regs_set && !best_match_p)
4523 {
4524 restore_best_regs:
4525 /* Restore best match. It may happen that `dend ==
4526 end_match_1' while the restored d is in string2.
4527 For example, the pattern `x.*y.*z' against the
4528 strings `x-' and `y-z-', if the two strings are
4529 not consecutive in memory. */
4530 DEBUG_PRINT1 ("Restoring best registers.\n");
4531
4532 d = match_end;
4533 dend = ((d >= string1 && d <= end1)
4534 ? end_match_1 : end_match_2);
4535
4536 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4537 {
4538 regstart[mcnt] = best_regstart[mcnt];
4539 regend[mcnt] = best_regend[mcnt];
4540 }
4541 }
4542 } /* d != end_match_2 */
4543
4544 succeed_label:
4545 DEBUG_PRINT1 ("Accepting match.\n");
4546
4547 /* If caller wants register contents data back, do it. */
4548 if (regs && !bufp->no_sub)
4549 {
4550 /* Have the register data arrays been allocated? */
4551 if (bufp->regs_allocated == REGS_UNALLOCATED)
4552 { /* No. So allocate them with malloc. We need one
4553 extra element beyond `num_regs' for the `-1' marker
4554 GNU code uses. */
4555 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4556 regs->start = TALLOC (regs->num_regs, regoff_t);
4557 regs->end = TALLOC (regs->num_regs, regoff_t);
4558 if (regs->start == NULL || regs->end == NULL)
4559 {
4560 FREE_VARIABLES ();
4561 return -2;
4562 }
4563 bufp->regs_allocated = REGS_REALLOCATE;
4564 }
4565 else if (bufp->regs_allocated == REGS_REALLOCATE)
4566 { /* Yes. If we need more elements than were already
4567 allocated, reallocate them. If we need fewer, just
4568 leave it alone. */
4569 if (regs->num_regs < num_regs + 1)
4570 {
4571 regs->num_regs = num_regs + 1;
4572 RETALLOC (regs->start, regs->num_regs, regoff_t);
4573 RETALLOC (regs->end, regs->num_regs, regoff_t);
4574 if (regs->start == NULL || regs->end == NULL)
4575 {
4576 FREE_VARIABLES ();
4577 return -2;
4578 }
4579 }
4580 }
4581 else
4582 {
4583 /* These braces fend off a "empty body in an else-statement"
4584 warning under GCC when assert expands to nothing. */
4585 assert (bufp->regs_allocated == REGS_FIXED);
4586 }
4587
4588 /* Convert the pointer data in `regstart' and `regend' to
4589 indices. Register zero has to be set differently,
4590 since we haven't kept track of any info for it. */
4591 if (regs->num_regs > 0)
4592 {
4593 regs->start[0] = pos;
4594 regs->end[0] = (MATCHING_IN_FIRST_STRING
4595 ? ((regoff_t) (d - string1))
4596 : ((regoff_t) (d - string2 + size1)));
4597 }
4598
4599 /* Go through the first `min (num_regs, regs->num_regs)'
4600 registers, since that is all we initialized. */
4601 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
4602 mcnt++)
4603 {
4604 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4605 regs->start[mcnt] = regs->end[mcnt] = -1;
4606 else
4607 {
4608 regs->start[mcnt]
4609 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4610 regs->end[mcnt]
4611 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4612 }
4613 }
4614
4615 /* If the regs structure we return has more elements than
4616 were in the pattern, set the extra elements to -1. If
4617 we (re)allocated the registers, this is the case,
4618 because we always allocate enough to have at least one
4619 -1 at the end. */
4620 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
4621 regs->start[mcnt] = regs->end[mcnt] = -1;
4622 } /* regs && !bufp->no_sub */
4623
4624 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4625 nfailure_points_pushed, nfailure_points_popped,
4626 nfailure_points_pushed - nfailure_points_popped);
4627 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4628
4629 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4630 ? string1
4631 : string2 - size1);
4632
4633 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4634
4635 FREE_VARIABLES ();
4636 return mcnt;
4637 }
4638
4639 /* Otherwise match next pattern command. */
4640 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4641 {
4642 /* Ignore these. Used to ignore the n of succeed_n's which
4643 currently have n == 0. */
4644 case no_op:
4645 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4646 break;
4647
4648 case succeed:
4649 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4650 goto succeed_label;
4651
4652 /* Match the next n pattern characters exactly. The following
4653 byte in the pattern defines n, and the n bytes after that
4654 are the characters to match. */
4655 case exactn:
4656 mcnt = *p++;
4657 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4658
4659 /* This is written out as an if-else so we don't waste time
4660 testing `translate' inside the loop. */
4661 if (translate)
4662 {
4663 do
4664 {
4665 unsigned char c;
4666
4667 PREFETCH ();
4668 c = *d++;
4669 if (ismbchar (c)) {
4670 size_t n;
4671 if (c != (unsigned char) *p++)
4672 goto fail;
4673 for (n = mbclen (c) - 1; n > 0; n--)
4674 if (!--mcnt /* �ѥ���������������ѥ��뤵
4675 ��Ƥ���¤�, ���Υ����å���
4676 ��Ĺ����ǰ�Τ���. */
4677 || d == dend
4678 || (unsigned char) *d++ != (unsigned char) *p++)
4679 goto fail;
4680 continue;
4681 }
4682 if ((unsigned char) translate[c] != (unsigned char) *p++)
4683 goto fail;
4684 }
4685 while (--mcnt);
4686 }
4687 else
4688 {
4689 do
4690 {
4691 PREFETCH ();
4692 #if 0
4693 /* ¾����ʬ�Ǥ�, string1 �� string2 �˥ޥ���Х���ʸ��
4694 ���٤�Τ�����Ƥ��ʤ�. ���Τ��Ȥ�®�٤����ˤ���
4695 �������������, ������ `#if 0' �� `#if 1' ��
4696 �Ѥ��뤳��. */
4697 {
4698 unsigned char c;
4699 size_t n;
4700
4701 c = (unsigned char) *d++;
4702 if (c != (unsigned char) *p++)
4703 goto fail;
4704 if (ismbchar (c))
4705 for (n = mbclen (c) - 1; n > 0; n--)
4706 if (!--mcnt
4707 || d == dend
4708 || (unsigned char) *d++ != (unsigned char) *p++)
4709 goto fail;
4710 }
4711 #else
4712 if (*d++ != (char) *p++) goto fail;
4713 #endif
4714 }
4715 while (--mcnt);
4716 }
4717 SET_REGS_MATCHED ();
4718 break;
4719
4720
4721 /* Match any character except possibly a newline or a null. */
4722 case anychar:
4723 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4724
4725 PREFETCH ();
4726 if (ismbchar (*d)) {
4727 size_t i, n = mbclen (*d);
4728 const char *p;
4729
4730 if (d + n > dend)
4731 goto fail;
4732 for (p = d, i = n - 1; i > 0; i--)
4733 if (*++p == '\n' || *p == '\0')
4734 /* ̵���ʥޥ���Х���ʸ���ˤϥޥå������ʤ�. �����Ǥ�, ��
4735 ñ�Τ��ᣲ�Х����ܰʹߤ� '\n', '\0' �Τ�Τ�����̵����
4736 ����. */
4737 goto fail;
4738
4739 SET_REGS_MATCHED ();
4740 DEBUG_PRINT2 (" Matched `%ld'.\n", (long) mbcfetch (d, NULL));
4741 d += n;
4742 break;
4743 }
4744
4745 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4746 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4747 goto fail;
4748
4749 SET_REGS_MATCHED ();
4750 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4751 d++;
4752 break;
4753
4754
4755 case charset:
4756 case charset_not:
4757 {
4758 register mbchar_t c;
4759 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4760
4761 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4762
4763 PREFETCH ();
4764 c = (unsigned char) *d;
4765 if (ismbchar (c)) {
4766 if (d + mbclen (c) > dend)
4767 goto fail;
4768 c = mbcfetch (d, NULL);
4769 }
4770 else
4771 c = TRANSLATE (c); /* The character to match. */
4772
4773 not = is_in_list (c, p);
4774
4775 p += 1 + *p + 2 + EXTRACT_UNSIGNED (&p[1 + *p])*8;
4776
4777 if (!not) goto fail;
4778
4779 SET_REGS_MATCHED ();
4780 d += mbclen (*d);
4781 break;
4782 }
4783
4784
4785 /* The beginning of a group is represented by start_memory.
4786 The arguments are the register number in the next byte, and the
4787 number of groups inner to this one in the next. The text
4788 matched within the group is recorded (in the internal
4789 registers data structure) under the register number. */
4790 case start_memory:
4791 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4792
4793 /* Find out if this group can match the empty string. */
4794 p1 = p; /* To send to group_match_null_string_p. */
4795
4796 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4797 REG_MATCH_NULL_STRING_P (reg_info[*p])
4798 = group_match_null_string_p (&p1, pend, reg_info);
4799
4800 /* Save the position in the string where we were the last time
4801 we were at this open-group operator in case the group is
4802 operated upon by a repetition operator, e.g., with `(a*)*b'
4803 against `ab'; then we want to ignore where we are now in
4804 the string in case this attempt to match fails. */
4805 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4806 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4807 : regstart[*p];
4808 DEBUG_PRINT2 (" old_regstart: %d\n",
4809 POINTER_TO_OFFSET (old_regstart[*p]));
4810
4811 regstart[*p] = d;
4812 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4813
4814 IS_ACTIVE (reg_info[*p]) = 1;
4815 MATCHED_SOMETHING (reg_info[*p]) = 0;
4816
4817 /* Clear this whenever we change the register activity status. */
4818 set_regs_matched_done = 0;
4819
4820 /* This is the new highest active register. */
4821 highest_active_reg = *p;
4822
4823 /* If nothing was active before, this is the new lowest active
4824 register. */
4825 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4826 lowest_active_reg = *p;
4827
4828 /* Move past the register number and inner group count. */
4829 p += 2;
4830 just_past_start_mem = p;
4831
4832 break;
4833
4834
4835 /* The stop_memory opcode represents the end of a group. Its
4836 arguments are the same as start_memory's: the register
4837 number, and the number of inner groups. */
4838 case stop_memory:
4839 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4840
4841 /* We need to save the string position the last time we were at
4842 this close-group operator in case the group is operated
4843 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4844 against `aba'; then we want to ignore where we are now in
4845 the string in case this attempt to match fails. */
4846 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4847 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4848 : regend[*p];
4849 DEBUG_PRINT2 (" old_regend: %d\n",
4850 POINTER_TO_OFFSET (old_regend[*p]));
4851
4852 regend[*p] = d;
4853 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4854
4855 /* This register isn't active anymore. */
4856 IS_ACTIVE (reg_info[*p]) = 0;
4857
4858 /* Clear this whenever we change the register activity status. */
4859 set_regs_matched_done = 0;
4860
4861 /* If this was the only register active, nothing is active
4862 anymore. */
4863 if (lowest_active_reg == highest_active_reg)
4864 {
4865 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4866 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4867 }
4868 else
4869 { /* We must scan for the new highest active register, since
4870 it isn't necessarily one less than now: consider
4871 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4872 new highest active register is 1. */
4873 unsigned char r = *p - 1;
4874 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4875 r--;
4876
4877 /* If we end up at register zero, that means that we saved
4878 the registers as the result of an `on_failure_jump', not
4879 a `start_memory', and we jumped to past the innermost
4880 `stop_memory'. For example, in ((.)*) we save
4881 registers 1 and 2 as a result of the *, but when we pop
4882 back to the second ), we are at the stop_memory 1.
4883 Thus, nothing is active. */
4884 if (r == 0)
4885 {
4886 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4887 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4888 }
4889 else
4890 highest_active_reg = r;
4891 }
4892
4893 /* If just failed to match something this time around with a
4894 group that's operated on by a repetition operator, try to
4895 force exit from the ``loop'', and restore the register
4896 information for this group that we had before trying this
4897 last match. */
4898 if ((!MATCHED_SOMETHING (reg_info[*p])
4899 || just_past_start_mem == p - 1)
4900 && (p + 2) < pend)
4901 {
4902 boolean is_a_jump_n = false;
4903
4904 p1 = p + 2;
4905 mcnt = 0;
4906 switch ((re_opcode_t) *p1++)
4907 {
4908 case jump_n:
4909 is_a_jump_n = true;
4910 case pop_failure_jump:
4911 case maybe_pop_jump:
4912 case jump:
4913 case dummy_failure_jump:
4914 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4915 if (is_a_jump_n)
4916 p1 += 2;
4917 break;
4918
4919 default:
4920 /* do nothing */ ;
4921 }
4922 p1 += mcnt;
4923
4924 /* If the next operation is a jump backwards in the pattern
4925 to an on_failure_jump right before the start_memory
4926 corresponding to this stop_memory, exit from the loop
4927 by forcing a failure after pushing on the stack the
4928 on_failure_jump's jump in the pattern, and d. */
4929 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4930 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4931 {
4932 /* If this group ever matched anything, then restore
4933 what its registers were before trying this last
4934 failed match, e.g., with `(a*)*b' against `ab' for
4935 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4936 against `aba' for regend[3].
4937
4938 Also restore the registers for inner groups for,
4939 e.g., `((a*)(b*))*' against `aba' (register 3 would
4940 otherwise get trashed). */
4941
4942 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4943 {
4944 unsigned r;
4945
4946 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4947
4948 /* Restore this and inner groups' (if any) registers. */
4949 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
4950 r++)
4951 {
4952 regstart[r] = old_regstart[r];
4953
4954 /* xx why this test? */
4955 if (old_regend[r] >= regstart[r])
4956 regend[r] = old_regend[r];
4957 }
4958 }
4959 p1++;
4960 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4961 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4962
4963 goto fail;
4964 }
4965 }
4966
4967 /* Move past the register number and the inner group count. */
4968 p += 2;
4969 break;
4970
4971
4972 /* \<digit> has been turned into a `duplicate' command which is
4973 followed by the numeric value of <digit> as the register number. */
4974 case duplicate:
4975 {
4976 register const char *d2, *dend2;
4977 int regno = *p++; /* Get which register to match against. */
4978 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4979
4980 /* Can't back reference a group which we've never matched. */
4981 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4982 goto fail;
4983
4984 /* Where in input to try to start matching. */
4985 d2 = regstart[regno];
4986
4987 /* Where to stop matching; if both the place to start and
4988 the place to stop matching are in the same string, then
4989 set to the place to stop, otherwise, for now have to use
4990 the end of the first string. */
4991
4992 dend2 = ((FIRST_STRING_P (regstart[regno])
4993 == FIRST_STRING_P (regend[regno]))
4994 ? regend[regno] : end_match_1);
4995 for (;;)
4996 {
4997 /* If necessary, advance to next segment in register
4998 contents. */
4999 while (d2 == dend2)
5000 {
5001 if (dend2 == end_match_2) break;
5002 if (dend2 == regend[regno]) break;
5003
5004 /* End of string1 => advance to string2. */
5005 d2 = string2;
5006 dend2 = regend[regno];
5007 }
5008 /* At end of register contents => success */
5009 if (d2 == dend2) break;
5010
5011 /* If necessary, advance to next segment in data. */
5012 PREFETCH ();
5013
5014 /* How many characters left in this segment to match. */
5015 mcnt = dend - d;
5016
5017 /* Want how many consecutive characters we can match in
5018 one shot, so, if necessary, adjust the count. */
5019 if (mcnt > dend2 - d2)
5020 mcnt = dend2 - d2;
5021
5022 /* Compare that many; failure if mismatch, else move
5023 past them. */
5024 if (translate
5025 ? bcmp_translate (d, d2, mcnt, translate)
5026 : bcmp (d, d2, mcnt))
5027 goto fail;
5028 d += mcnt, d2 += mcnt;
5029
5030 /* Do this because we've match some characters. */
5031 SET_REGS_MATCHED ();
5032 }
5033 }
5034 break;
5035
5036
5037 /* begline matches the empty string at the beginning of the string
5038 (unless `not_bol' is set in `bufp'), and, if
5039 `newline_anchor' is set, after newlines. */
5040 case begline:
5041 DEBUG_PRINT1 ("EXECUTING begline.\n");
5042
5043 if (AT_STRINGS_BEG (d))
5044 {
5045 if (!bufp->not_bol) break;
5046 }
5047 else if (d[-1] == '\n' && bufp->newline_anchor)
5048 {
5049 break;
5050 }
5051 /* In all other cases, we fail. */
5052 goto fail;
5053
5054
5055 /* endline is the dual of begline. */
5056 case endline:
5057 DEBUG_PRINT1 ("EXECUTING endline.\n");
5058
5059 if (AT_STRINGS_END (d))
5060 {
5061 if (!bufp->not_eol) break;
5062 }
5063
5064 /* We have to ``prefetch'' the next character. */
5065 else if ((d == end1 ? *string2 : *d) == '\n'
5066 && bufp->newline_anchor)
5067 {
5068 break;
5069 }
5070 goto fail;
5071
5072
5073 /* Match at the very beginning of the data. */
5074 case begbuf:
5075 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5076 if (AT_STRINGS_BEG (d))
5077 break;
5078 goto fail;
5079
5080
5081 /* Match at the very end of the data. */
5082 case endbuf:
5083 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5084 if (AT_STRINGS_END (d))
5085 break;
5086 goto fail;
5087
5088
5089 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5090 pushes NULL as the value for the string on the stack. Then
5091 `pop_failure_point' will keep the current value for the
5092 string, instead of restoring it. To see why, consider
5093 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5094 then the . fails against the \n. But the next thing we want
5095 to do is match the \n against the \n; if we restored the
5096 string value, we would be back at the foo.
5097
5098 Because this is used only in specific cases, we don't need to
5099 check all the things that `on_failure_jump' does, to make
5100 sure the right things get saved on the stack. Hence we don't
5101 share its code. The only reason to push anything on the
5102 stack at all is that otherwise we would have to change
5103 `anychar's code to do something besides goto fail in this
5104 case; that seems worse than this. */
5105 case on_failure_keep_string_jump:
5106 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5107
5108 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5109 #if defined (_LIBC) || defined (MSDOS)
5110 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
5111 #else
5112 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
5113 #endif
5114
5115 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
5116 break;
5117
5118
5119 /* Uses of on_failure_jump:
5120
5121 Each alternative starts with an on_failure_jump that points
5122 to the beginning of the next alternative. Each alternative
5123 except the last ends with a jump that in effect jumps past
5124 the rest of the alternatives. (They really jump to the
5125 ending jump of the following alternative, because tensioning
5126 these jumps is a hassle.)
5127
5128 Repeats start with an on_failure_jump that points past both
5129 the repetition text and either the following jump or
5130 pop_failure_jump back to this on_failure_jump. */
5131 case on_failure_jump:
5132 on_failure:
5133 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5134
5135 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5136 #if defined (_LIBC) || defined (MSDOS)
5137 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
5138 #else
5139 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
5140 #endif
5141
5142 /* If this on_failure_jump comes right before a group (i.e.,
5143 the original * applied to a group), save the information
5144 for that group and all inner ones, so that if we fail back
5145 to this point, the group's information will be correct.
5146 For example, in \(a*\)*\1, we need the preceding group,
5147 and in \(zz\(a*\)b*\)\2, we need the inner group. */
5148
5149 /* We can't use `p' to check ahead because we push
5150 a failure point to `p + mcnt' after we do this. */
5151 p1 = p;
5152
5153 /* We need to skip no_op's before we look for the
5154 start_memory in case this on_failure_jump is happening as
5155 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5156 against aba. */
5157 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5158 p1++;
5159
5160 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5161 {
5162 /* We have a new highest active register now. This will
5163 get reset at the start_memory we are about to get to,
5164 but we will have saved all the registers relevant to
5165 this repetition op, as described above. */
5166 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5167 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5168 lowest_active_reg = *(p1 + 1);
5169 }
5170
5171 DEBUG_PRINT1 (":\n");
5172 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5173 break;
5174
5175
5176 /* A smart repeat ends with `maybe_pop_jump'.
5177 We change it to either `pop_failure_jump' or `jump'. */
5178 case maybe_pop_jump:
5179 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5180 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5181 {
5182 register unsigned char *p2 = p;
5183
5184 /* Compare the beginning of the repeat with what in the
5185 pattern follows its end. If we can establish that there
5186 is nothing that they would both match, i.e., that we
5187 would have to backtrack because of (as in, e.g., `a*a')
5188 then we can change to pop_failure_jump, because we'll
5189 never have to backtrack.
5190
5191 This is not true in the case of alternatives: in
5192 `(a|ab)*' we do need to backtrack to the `ab' alternative
5193 (e.g., if the string was `ab'). But instead of trying to
5194 detect that here, the alternative has put on a dummy
5195 failure point which is what we will end up popping. */
5196
5197 /* Skip over open/close-group commands.
5198 If what follows this loop is a ...+ construct,
5199 look at what begins its body, since we will have to
5200 match at least one of that. */
5201 while (1)
5202 {
5203 if (p2 + 2 < pend
5204 && ((re_opcode_t) *p2 == stop_memory
5205 || (re_opcode_t) *p2 == start_memory))
5206 p2 += 3;
5207 else if (p2 + 6 < pend
5208 && (re_opcode_t) *p2 == dummy_failure_jump)
5209 p2 += 6;
5210 else
5211 break;
5212 }
5213
5214 p1 = p + mcnt;
5215 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5216 to the `maybe_finalize_jump' of this case. Examine what
5217 follows. */
5218
5219 /* If we're at the end of the pattern, we can change. */
5220 if (p2 == pend)
5221 {
5222 /* Consider what happens when matching ":\(.*\)"
5223 against ":/". I don't really understand this code
5224 yet. */
5225 p[-3] = (unsigned char) pop_failure_jump;
5226 DEBUG_PRINT1
5227 (" End of pattern: change to `pop_failure_jump'.\n");
5228 }
5229
5230 else if ((re_opcode_t) *p2 == exactn
5231 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5232 {
5233 register mbchar_t c
5234 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5235
5236 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5237 {
5238 p[-3] = (unsigned char) pop_failure_jump;
5239 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5240 c, p1[5]);
5241 }
5242
5243 else if ((re_opcode_t) p1[3] == charset
5244 || (re_opcode_t) p1[3] == charset_not)
5245 {
5246 if (ismbchar (c))
5247 c = mbcfetch (&p2[2], NULL);
5248
5249 /* `is_in_list()' is TRUE if c would match, which means
5250 that we can't change to pop_failure_jump. */
5251 if (!is_in_list (c, p1 + 4))
5252 {
5253 p[-3] = (unsigned char) pop_failure_jump;
5254 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5255 }
5256 }
5257 }
5258 else if ((re_opcode_t) *p2 == charset)
5259 {
5260 #ifdef DEBUG
5261 register mbchar_t c
5262 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5263 #endif
5264
5265 mbchar_t wc = mbcfetch (&p1[5], NULL);
5266 if ((re_opcode_t) p1[3] == exactn && !is_in_list (wc, p2 + 1))
5267 {
5268 p[-3] = (unsigned char) pop_failure_jump;
5269 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5270 c, p1[5]);
5271 }
5272
5273 else if ((re_opcode_t) p1[3] == charset_not)
5274 {
5275 int idx;
5276 /* We win if the charset_not inside the loop
5277 lists every character listed in the charset after. */
5278 for (idx = 0; idx < (int) p2[1]; idx++)
5279 if (! (p2[2 + idx] == 0
5280 || (idx < (int) p1[4]
5281 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5282 break;
5283
5284 if (idx == p2[1]
5285 && is_invmblist_nooverlap (&p2[2 + p2[1]], &p1[5 + p1[4]]))
5286 {
5287 p[-3] = (unsigned char) pop_failure_jump;
5288 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5289 }
5290 }
5291 else if ((re_opcode_t) p1[3] == charset)
5292 {
5293 int idx;
5294 /* We win if the charset inside the loop
5295 has no overlap with the one after the loop. */
5296 for (idx = 0;
5297 idx < (int) p2[1] && idx < (int) p1[4];
5298 idx++)
5299 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5300 break;
5301
5302 if ((idx == p2[1] || idx == p1[4])
5303 && is_mblist_nooverlap (&p2[2 + p2[1]], &p1[5 + p1[4]]))
5304 {
5305 p[-3] = (unsigned char) pop_failure_jump;
5306 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5307 }
5308 }
5309 }
5310 }
5311 p -= 2; /* Point at relative address again. */
5312 if ((re_opcode_t) p[-1] != pop_failure_jump)
5313 {
5314 p[-1] = (unsigned char) jump;
5315 DEBUG_PRINT1 (" Match => jump.\n");
5316 goto unconditional_jump;
5317 }
5318 /* Note fall through. */
5319
5320
5321 /* The end of a simple repeat has a pop_failure_jump back to
5322 its matching on_failure_jump, where the latter will push a
5323 failure point. The pop_failure_jump takes off failure
5324 points put on by this pop_failure_jump's matching
5325 on_failure_jump; we got through the pattern to here from the
5326 matching on_failure_jump, so didn't fail. */
5327 case pop_failure_jump:
5328 {
5329 /* We need to pass separate storage for the lowest and
5330 highest registers, even though we don't care about the
5331 actual values. Otherwise, we will restore only one
5332 register from the stack, since lowest will == highest in
5333 `pop_failure_point'. */
5334 active_reg_t dummy_low_reg, dummy_high_reg;
5335 unsigned char *pdummy;
5336 const char *sdummy;
5337
5338 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5339 POP_FAILURE_POINT (sdummy, pdummy,
5340 dummy_low_reg, dummy_high_reg,
5341 reg_dummy, reg_dummy, reg_info_dummy);
5342 }
5343 /* Note fall through. */
5344
5345 unconditional_jump:
5346 #if defined (_LIBC) || defined (MSDOS)
5347 DEBUG_PRINT2 ("\n%p: ", p);
5348 #else
5349 DEBUG_PRINT2 ("\n0x%x: ", p);
5350 #endif
5351 /* Note fall through. */
5352
5353 /* Unconditionally jump (without popping any failure points). */
5354 case jump:
5355 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5356 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5357 p += mcnt; /* Do the jump. */
5358 #if defined (_LIBC) || defined (MSDOS)
5359 DEBUG_PRINT2 ("(to %p).\n", p);
5360 #else
5361 DEBUG_PRINT2 ("(to 0x%x).\n", p);
5362 #endif
5363 break;
5364
5365
5366 /* We need this opcode so we can detect where alternatives end
5367 in `group_match_null_string_p' et al. */
5368 case jump_past_alt:
5369 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5370 goto unconditional_jump;
5371
5372
5373 /* Normally, the on_failure_jump pushes a failure point, which
5374 then gets popped at pop_failure_jump. We will end up at
5375 pop_failure_jump, also, and with a pattern of, say, `a+', we
5376 are skipping over the on_failure_jump, so we have to push
5377 something meaningless for pop_failure_jump to pop. */
5378 case dummy_failure_jump:
5379 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5380 /* It doesn't matter what we push for the string here. What
5381 the code at `fail' tests is the value for the pattern. */
5382 PUSH_FAILURE_POINT (0, 0, -2);
5383 goto unconditional_jump;
5384
5385
5386 /* At the end of an alternative, we need to push a dummy failure
5387 point in case we are followed by a `pop_failure_jump', because
5388 we don't want the failure point for the alternative to be
5389 popped. For example, matching `(a|ab)*' against `aab'
5390 requires that we match the `ab' alternative. */
5391 case push_dummy_failure:
5392 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5393 /* See comments just above at `dummy_failure_jump' about the
5394 two zeroes. */
5395 PUSH_FAILURE_POINT (0, 0, -2);
5396 break;
5397
5398 /* Have to succeed matching what follows at least n times.
5399 After that, handle like `on_failure_jump'. */
5400 case succeed_n:
5401 EXTRACT_NUMBER (mcnt, p + 2);
5402 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5403
5404 assert (mcnt >= 0);
5405 /* Originally, this is how many times we HAVE to succeed. */
5406 if (mcnt > 0)
5407 {
5408 mcnt--;
5409 p += 2;
5410 STORE_NUMBER_AND_INCR (p, mcnt);
5411 #if defined (_LIBC) || defined (MSDOS)
5412 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
5413 #else
5414 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
5415 #endif
5416 }
5417 else if (mcnt == 0)
5418 {
5419 #if defined (_LIBC) || defined (MSDOS)
5420 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
5421 #else
5422 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
5423 #endif
5424 p[2] = (unsigned char) no_op;
5425 p[3] = (unsigned char) no_op;
5426 goto on_failure;
5427 }
5428 break;
5429
5430 case jump_n:
5431 EXTRACT_NUMBER (mcnt, p + 2);
5432 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5433
5434 /* Originally, this is how many times we CAN jump. */
5435 if (mcnt)
5436 {
5437 mcnt--;
5438 STORE_NUMBER (p + 2, mcnt);
5439 #if defined (_LIBC) || defined (MSDOS)
5440 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
5441 #else
5442 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
5443 #endif
5444 goto unconditional_jump;
5445 }
5446 /* If don't have to jump any more, skip over the rest of command. */
5447 else
5448 p += 4;
5449 break;
5450
5451 case set_number_at:
5452 {
5453 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5454
5455 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5456 p1 = p + mcnt;
5457 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5458 #if defined (_LIBC) || defined (MSDOS)
5459 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
5460 #else
5461 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
5462 #endif
5463 STORE_NUMBER (p1, mcnt);
5464 break;
5465 }
5466
5467 #if 0
5468 /* The DEC Alpha C compiler 3.x generates incorrect code for the
5469 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
5470 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
5471 macro and introducing temporary variables works around the bug. */
5472
5473 case wordbound:
5474 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5475 if (AT_WORD_BOUNDARY (d))
5476 break;
5477 goto fail;
5478
5479 case notwordbound:
5480 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5481 if (AT_WORD_BOUNDARY (d))
5482 goto fail;
5483 break;
5484 #else
5485 case wordbound:
5486 {
5487 boolean prevchar, thischar;
5488
5489 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5490 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5491 break;
5492
5493 prevchar = WORDCHAR_P (d - 1);
5494 thischar = WORDCHAR_P (d);
5495 if (prevchar != thischar)
5496 break;
5497 goto fail;
5498 }
5499
5500 case notwordbound:
5501 {
5502 boolean prevchar, thischar;
5503
5504 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5505 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5506 goto fail;
5507
5508 prevchar = WORDCHAR_P (d - 1);
5509 thischar = WORDCHAR_P (d);
5510 if (prevchar != thischar)
5511 goto fail;
5512 break;
5513 }
5514 #endif
5515
5516 case wordbeg:
5517 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5518 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5519 break;
5520 goto fail;
5521
5522 case wordend:
5523 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5524 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5525 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5526 break;
5527 goto fail;
5528
5529 #ifdef emacs
5530 case before_dot:
5531 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5532 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
5533 goto fail;
5534 break;
5535
5536 case at_dot:
5537 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5538 if (PTR_CHAR_POS ((unsigned char *) d) != point)
5539 goto fail;
5540 break;
5541
5542 case after_dot:
5543 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5544 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
5545 goto fail;
5546 break;
5547
5548 case syntaxspec:
5549 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5550 mcnt = *p++;
5551 goto matchsyntax;
5552
5553 case wordchar:
5554 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5555 mcnt = (int) Sword;
5556 matchsyntax:
5557 PREFETCH ();
5558 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5559 d++;
5560 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
5561 goto fail;
5562 SET_REGS_MATCHED ();
5563 break;
5564
5565 case notsyntaxspec:
5566 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5567 mcnt = *p++;
5568 goto matchnotsyntax;
5569
5570 case notwordchar:
5571 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5572 mcnt = (int) Sword;
5573 matchnotsyntax:
5574 PREFETCH ();
5575 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5576 d++;
5577 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
5578 goto fail;
5579 SET_REGS_MATCHED ();
5580 break;
5581
5582 #else /* not emacs */
5583 case wordchar:
5584 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5585 PREFETCH ();
5586 if (!WORDCHAR_P (d))
5587 goto fail;
5588 SET_REGS_MATCHED ();
5589 d++;
5590 break;
5591
5592 case notwordchar:
5593 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5594 PREFETCH ();
5595 if (WORDCHAR_P (d))
5596 goto fail;
5597 SET_REGS_MATCHED ();
5598 d++;
5599 break;
5600 #endif /* not emacs */
5601
5602 default:
5603 abort ();
5604 }
5605 continue; /* Successfully executed one pattern command; keep going. */
5606
5607
5608 /* We goto here if a matching operation fails. */
5609 fail:
5610 if (!FAIL_STACK_EMPTY ())
5611 { /* A restart point is known. Restore to that state. */
5612 DEBUG_PRINT1 ("\nFAIL:\n");
5613 POP_FAILURE_POINT (d, p,
5614 lowest_active_reg, highest_active_reg,
5615 regstart, regend, reg_info);
5616
5617 /* If this failure point is a dummy, try the next one. */
5618 if (!p)
5619 goto fail;
5620
5621 /* If we failed to the end of the pattern, don't examine *p. */
5622 assert (p <= pend);
5623 if (p < pend)
5624 {
5625 boolean is_a_jump_n = false;
5626
5627 /* If failed to a backwards jump that's part of a repetition
5628 loop, need to pop this failure point and use the next one. */
5629 switch ((re_opcode_t) *p)
5630 {
5631 case jump_n:
5632 is_a_jump_n = true;
5633 case maybe_pop_jump:
5634 case pop_failure_jump:
5635 case jump:
5636 p1 = p + 1;
5637 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5638 p1 += mcnt;
5639
5640 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5641 || (!is_a_jump_n
5642 && (re_opcode_t) *p1 == on_failure_jump))
5643 goto fail;
5644 break;
5645 default:
5646 /* do nothing */ ;
5647 }
5648 }
5649
5650 if (d >= string1 && d <= end1)
5651 dend = end_match_1;
5652 }
5653 else
5654 break; /* Matching at this starting point really fails. */
5655 } /* for (;;) */
5656
5657 if (best_regs_set)
5658 goto restore_best_regs;
5659
5660 FREE_VARIABLES ();
5661
5662 return -1; /* Failure to match. */
5663 } /* re_match_2 */
5664
5665 /* Subroutine definitions for re_match_2. */
5666
5667
5668 /* We are passed P pointing to a register number after a start_memory.
5669
5670 Return true if the pattern up to the corresponding stop_memory can
5671 match the empty string, and false otherwise.
5672
5673 If we find the matching stop_memory, sets P to point to one past its number.
5674 Otherwise, sets P to an undefined byte less than or equal to END.
5675
5676 We don't handle duplicates properly (yet). */
5677
5678 static boolean
group_match_null_string_p(p,end,reg_info)5679 group_match_null_string_p (p, end, reg_info)
5680 unsigned char **p, *end;
5681 register_info_type *reg_info;
5682 {
5683 int mcnt;
5684 /* Point to after the args to the start_memory. */
5685 unsigned char *p1 = *p + 2;
5686
5687 while (p1 < end)
5688 {
5689 /* Skip over opcodes that can match nothing, and return true or
5690 false, as appropriate, when we get to one that can't, or to the
5691 matching stop_memory. */
5692
5693 switch ((re_opcode_t) *p1)
5694 {
5695 /* Could be either a loop or a series of alternatives. */
5696 case on_failure_jump:
5697 p1++;
5698 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5699
5700 /* If the next operation is not a jump backwards in the
5701 pattern. */
5702
5703 if (mcnt >= 0)
5704 {
5705 /* Go through the on_failure_jumps of the alternatives,
5706 seeing if any of the alternatives cannot match nothing.
5707 The last alternative starts with only a jump,
5708 whereas the rest start with on_failure_jump and end
5709 with a jump, e.g., here is the pattern for `a|b|c':
5710
5711 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5712 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5713 /exactn/1/c
5714
5715 So, we have to first go through the first (n-1)
5716 alternatives and then deal with the last one separately. */
5717
5718
5719 /* Deal with the first (n-1) alternatives, which start
5720 with an on_failure_jump (see above) that jumps to right
5721 past a jump_past_alt. */
5722
5723 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5724 {
5725 /* `mcnt' holds how many bytes long the alternative
5726 is, including the ending `jump_past_alt' and
5727 its number. */
5728
5729 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5730 reg_info))
5731 return false;
5732
5733 /* Move to right after this alternative, including the
5734 jump_past_alt. */
5735 p1 += mcnt;
5736
5737 /* Break if it's the beginning of an n-th alternative
5738 that doesn't begin with an on_failure_jump. */
5739 if ((re_opcode_t) *p1 != on_failure_jump)
5740 break;
5741
5742 /* Still have to check that it's not an n-th
5743 alternative that starts with an on_failure_jump. */
5744 p1++;
5745 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5746 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5747 {
5748 /* Get to the beginning of the n-th alternative. */
5749 p1 -= 3;
5750 break;
5751 }
5752 }
5753
5754 /* Deal with the last alternative: go back and get number
5755 of the `jump_past_alt' just before it. `mcnt' contains
5756 the length of the alternative. */
5757 EXTRACT_NUMBER (mcnt, p1 - 2);
5758
5759 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5760 return false;
5761
5762 p1 += mcnt; /* Get past the n-th alternative. */
5763 } /* if mcnt > 0 */
5764 break;
5765
5766
5767 case stop_memory:
5768 assert (p1[1] == **p);
5769 *p = p1 + 2;
5770 return true;
5771
5772
5773 default:
5774 if (!common_op_match_null_string_p (&p1, end, reg_info))
5775 return false;
5776 }
5777 } /* while p1 < end */
5778
5779 return false;
5780 } /* group_match_null_string_p */
5781
5782
5783 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5784 It expects P to be the first byte of a single alternative and END one
5785 byte past the last. The alternative can contain groups. */
5786
5787 static boolean
alt_match_null_string_p(p,end,reg_info)5788 alt_match_null_string_p (p, end, reg_info)
5789 unsigned char *p, *end;
5790 register_info_type *reg_info;
5791 {
5792 int mcnt;
5793 unsigned char *p1 = p;
5794
5795 while (p1 < end)
5796 {
5797 /* Skip over opcodes that can match nothing, and break when we get
5798 to one that can't. */
5799
5800 switch ((re_opcode_t) *p1)
5801 {
5802 /* It's a loop. */
5803 case on_failure_jump:
5804 p1++;
5805 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5806 p1 += mcnt;
5807 break;
5808
5809 default:
5810 if (!common_op_match_null_string_p (&p1, end, reg_info))
5811 return false;
5812 }
5813 } /* while p1 < end */
5814
5815 return true;
5816 } /* alt_match_null_string_p */
5817
5818
5819 /* Deals with the ops common to group_match_null_string_p and
5820 alt_match_null_string_p.
5821
5822 Sets P to one after the op and its arguments, if any. */
5823
5824 static boolean
common_op_match_null_string_p(p,end,reg_info)5825 common_op_match_null_string_p (p, end, reg_info)
5826 unsigned char **p, *end;
5827 register_info_type *reg_info;
5828 {
5829 int mcnt;
5830 boolean ret;
5831 int reg_no;
5832 unsigned char *p1 = *p;
5833
5834 switch ((re_opcode_t) *p1++)
5835 {
5836 case no_op:
5837 case begline:
5838 case endline:
5839 case begbuf:
5840 case endbuf:
5841 case wordbeg:
5842 case wordend:
5843 case wordbound:
5844 case notwordbound:
5845 #ifdef emacs
5846 case before_dot:
5847 case at_dot:
5848 case after_dot:
5849 #endif
5850 break;
5851
5852 case start_memory:
5853 reg_no = *p1;
5854 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5855 ret = group_match_null_string_p (&p1, end, reg_info);
5856
5857 /* Have to set this here in case we're checking a group which
5858 contains a group and a back reference to it. */
5859
5860 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5861 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5862
5863 if (!ret)
5864 return false;
5865 break;
5866
5867 /* If this is an optimized succeed_n for zero times, make the jump. */
5868 case jump:
5869 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5870 if (mcnt >= 0)
5871 p1 += mcnt;
5872 else
5873 return false;
5874 break;
5875
5876 case succeed_n:
5877 /* Get to the number of times to succeed. */
5878 p1 += 2;
5879 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5880
5881 if (mcnt == 0)
5882 {
5883 p1 -= 4;
5884 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5885 p1 += mcnt;
5886 }
5887 else
5888 return false;
5889 break;
5890
5891 case duplicate:
5892 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5893 return false;
5894 break;
5895
5896 case set_number_at:
5897 p1 += 4;
5898
5899 default:
5900 /* All other opcodes mean we cannot match the empty string. */
5901 return false;
5902 }
5903
5904 *p = p1;
5905 return true;
5906 } /* common_op_match_null_string_p */
5907
5908
5909 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5910 bytes; nonzero otherwise. */
5911
5912 static int
bcmp_translate(s1,s2,len,translate)5913 bcmp_translate (s1, s2, len, translate)
5914 const char *s1, *s2;
5915 register int len;
5916 RE_TRANSLATE_TYPE translate;
5917 {
5918 register const unsigned char *p1 = (const unsigned char *) s1;
5919 register const unsigned char *p2 = (const unsigned char *) s2;
5920 unsigned char c;
5921
5922 while (len)
5923 {
5924 c = *p1++;
5925 if (ismbchar(c)) {
5926 size_t n;
5927 if (c != *p2++)
5928 return 1;
5929 for (n = mbclen (c) - 1; n > 0; n--)
5930 if (!--len || *p1++ != *p2++)
5931 return 1;
5932 }
5933 else
5934 if (translate[c] != translate[*p2++])
5935 return 1;
5936 len--;
5937 }
5938 return 0;
5939 }
5940
5941 /* Entry points for GNU code. */
5942
5943 /* re_compile_pattern is the GNU regular expression compiler: it
5944 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5945 Returns 0 if the pattern was valid, otherwise an error string.
5946
5947 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5948 are set in BUFP on entry.
5949
5950 We call regex_compile to do the actual compilation. */
5951
5952 const char *
re_compile_pattern(pattern,length,bufp)5953 re_compile_pattern (pattern, length, bufp)
5954 const char *pattern;
5955 size_t length;
5956 struct re_pattern_buffer *bufp;
5957 {
5958 reg_errcode_t ret;
5959
5960 /* GNU code is written to assume at least RE_NREGS registers will be set
5961 (and at least one extra will be -1). */
5962 bufp->regs_allocated = REGS_UNALLOCATED;
5963
5964 /* And GNU code determines whether or not to get register information
5965 by passing null for the REGS argument to re_match, etc., not by
5966 setting no_sub. */
5967 bufp->no_sub = 0;
5968
5969 /* Match anchors at newline. */
5970 bufp->newline_anchor = 1;
5971
5972 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5973
5974 if (!ret)
5975 return NULL;
5976 return gettext (re_error_msgid[(int) ret]);
5977 }
5978
5979 /* Entry points compatible with 4.2 BSD regex library. We don't define
5980 them unless specifically requested. */
5981
5982 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
5983
5984 /* BSD has one and only one pattern buffer. */
5985 static struct re_pattern_buffer re_comp_buf;
5986
5987 char *
5988 #ifdef _LIBC
5989 /* Make these definitions weak in libc, so POSIX programs can redefine
5990 these names if they don't use our functions, and still use
5991 regcomp/regexec below without link errors. */
5992 weak_function
5993 #endif
re_comp(s)5994 re_comp (s)
5995 const char *s;
5996 {
5997 reg_errcode_t ret;
5998
5999 if (!s)
6000 {
6001 if (!re_comp_buf.buffer)
6002 return gettext ("No previous regular expression");
6003 return 0;
6004 }
6005
6006 if (!re_comp_buf.buffer)
6007 {
6008 re_comp_buf.buffer = (unsigned char *) malloc (200);
6009 if (re_comp_buf.buffer == NULL)
6010 return gettext (re_error_msgid[(int) REG_ESPACE]);
6011 re_comp_buf.allocated = 200;
6012
6013 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6014 if (re_comp_buf.fastmap == NULL)
6015 return gettext (re_error_msgid[(int) REG_ESPACE]);
6016 }
6017
6018 /* Since `re_exec' always passes NULL for the `regs' argument, we
6019 don't need to initialize the pattern buffer fields which affect it. */
6020
6021 /* Match anchors at newlines. */
6022 re_comp_buf.newline_anchor = 1;
6023
6024 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6025
6026 if (!ret)
6027 return NULL;
6028
6029 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6030 return (char *) gettext (re_error_msgid[(int) ret]);
6031 }
6032
6033
6034 int
6035 #ifdef _LIBC
6036 weak_function
6037 #endif
re_exec(s)6038 re_exec (s)
6039 const char *s;
6040 {
6041 const int len = strlen (s);
6042 return
6043 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6044 }
6045
6046 #endif /* _REGEX_RE_COMP */
6047
6048 /* POSIX.2 functions. Don't define these for Emacs. */
6049
6050 #ifndef emacs
6051
6052 /* regcomp takes a regular expression as a string and compiles it.
6053
6054 PREG is a regex_t *. We do not expect any fields to be initialized,
6055 since POSIX says we shouldn't. Thus, we set
6056
6057 `buffer' to the compiled pattern;
6058 `used' to the length of the compiled pattern;
6059 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6060 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6061 RE_SYNTAX_POSIX_BASIC;
6062 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6063 `fastmap' and `fastmap_accurate' to zero;
6064 `re_nsub' to the number of subexpressions in PATTERN.
6065
6066 PATTERN is the address of the pattern string.
6067
6068 CFLAGS is a series of bits which affect compilation.
6069
6070 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6071 use POSIX basic syntax.
6072
6073 If REG_NEWLINE is set, then . and [^...] don't match newline.
6074 Also, regexec will try a match beginning after every newline.
6075
6076 If REG_ICASE is set, then we considers upper- and lowercase
6077 versions of letters to be equivalent when matching.
6078
6079 If REG_NOSUB is set, then when PREG is passed to regexec, that
6080 routine will report only success or failure, and nothing about the
6081 registers.
6082
6083 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6084 the return codes and their meanings.) */
6085
6086 #ifdef __APPLE__
6087 __private_extern__
6088 #endif
6089 int
regcomp(preg,pattern,cflags)6090 regcomp (preg, pattern, cflags)
6091 regex_t *preg;
6092 const char *pattern;
6093 int cflags;
6094 {
6095 reg_errcode_t ret;
6096 reg_syntax_t syntax
6097 = (cflags & REG_EXTENDED) ?
6098 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6099
6100 /* regex_compile will allocate the space for the compiled pattern. */
6101 preg->buffer = 0;
6102 preg->allocated = 0;
6103 preg->used = 0;
6104
6105 /* Don't bother to use a fastmap when searching. This simplifies the
6106 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6107 characters after newlines into the fastmap. This way, we just try
6108 every character. */
6109 preg->fastmap = 0;
6110
6111 if (cflags & REG_ICASE)
6112 {
6113 unsigned i;
6114
6115 preg->translate
6116 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6117 * sizeof (*(RE_TRANSLATE_TYPE)0));
6118 if (preg->translate == NULL)
6119 return (int) REG_ESPACE;
6120
6121 /* Map uppercase characters to corresponding lowercase ones. */
6122 for (i = 0; i < CHAR_SET_SIZE; i++)
6123 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6124 }
6125 else
6126 preg->translate = NULL;
6127
6128 /* If REG_NEWLINE is set, newlines are treated differently. */
6129 if (cflags & REG_NEWLINE)
6130 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6131 syntax &= ~RE_DOT_NEWLINE;
6132 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6133 /* It also changes the matching behavior. */
6134 preg->newline_anchor = 1;
6135 }
6136 else
6137 preg->newline_anchor = 0;
6138
6139 preg->no_sub = !!(cflags & REG_NOSUB);
6140
6141 /* POSIX says a null character in the pattern terminates it, so we
6142 can use strlen here in compiling the pattern. */
6143 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
6144
6145 /* POSIX doesn't distinguish between an unmatched open-group and an
6146 unmatched close-group: both are REG_EPAREN. */
6147 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6148
6149 return (int) ret;
6150 }
6151
6152
6153 /* regexec searches for a given pattern, specified by PREG, in the
6154 string STRING.
6155
6156 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6157 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6158 least NMATCH elements, and we set them to the offsets of the
6159 corresponding matched substrings.
6160
6161 EFLAGS specifies `execution flags' which affect matching: if
6162 REG_NOTBOL is set, then ^ does not match at the beginning of the
6163 string; if REG_NOTEOL is set, then $ does not match at the end.
6164
6165 We return 0 if we find a match and REG_NOMATCH if not. */
6166
6167 #ifdef __APPLE__
6168 __private_extern__
6169 #endif
6170 int
regexec(preg,string,nmatch,pmatch,eflags)6171 regexec (preg, string, nmatch, pmatch, eflags)
6172 const regex_t *preg;
6173 const char *string;
6174 size_t nmatch;
6175 regmatch_t pmatch[];
6176 int eflags;
6177 {
6178 int ret;
6179 struct re_registers regs;
6180 regex_t private_preg;
6181 int len = strlen (string);
6182 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6183
6184 private_preg = *preg;
6185
6186 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6187 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6188
6189 /* The user has told us exactly how many registers to return
6190 information about, via `nmatch'. We have to pass that on to the
6191 matching routines. */
6192 private_preg.regs_allocated = REGS_FIXED;
6193
6194 if (want_reg_info)
6195 {
6196 regs.num_regs = nmatch;
6197 regs.start = TALLOC (nmatch, regoff_t);
6198 regs.end = TALLOC (nmatch, regoff_t);
6199 if (regs.start == NULL || regs.end == NULL)
6200 return (int) REG_NOMATCH;
6201 }
6202
6203 /* Perform the searching operation. */
6204 ret = re_search (&private_preg, string, len,
6205 /* start: */ 0, /* range: */ len,
6206 want_reg_info ? ®s : (struct re_registers *) 0);
6207
6208 /* Copy the register information to the POSIX structure. */
6209 if (want_reg_info)
6210 {
6211 if (ret >= 0)
6212 {
6213 unsigned r;
6214
6215 for (r = 0; r < nmatch; r++)
6216 {
6217 pmatch[r].rm_so = regs.start[r];
6218 pmatch[r].rm_eo = regs.end[r];
6219 }
6220 }
6221
6222 /* If we needed the temporary register info, free the space now. */
6223 free (regs.start);
6224 free (regs.end);
6225 }
6226
6227 /* We want zero return to mean success, unlike `re_search'. */
6228 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6229 }
6230
6231
6232 /* Returns a message corresponding to an error code, ERRCODE, returned
6233 from either regcomp or regexec. We don't use PREG here. */
6234
6235 size_t
regerror(errcode,preg,errbuf,errbuf_size)6236 regerror (errcode, preg, errbuf, errbuf_size)
6237 int errcode;
6238 const regex_t *preg;
6239 char *errbuf;
6240 size_t errbuf_size;
6241 {
6242 const char *msg;
6243 size_t msg_size;
6244
6245 if (errcode < 0
6246 || errcode >= (int) (sizeof (re_error_msgid)
6247 / sizeof (re_error_msgid[0])))
6248 /* Only error codes returned by the rest of the code should be passed
6249 to this routine. If we are given anything else, or if other regex
6250 code generates an invalid error code, then the program has a bug.
6251 Dump core so we can fix it. */
6252 abort ();
6253
6254 msg = gettext (re_error_msgid[errcode]);
6255
6256 msg_size = strlen (msg) + 1; /* Includes the null. */
6257
6258 if (errbuf_size != 0)
6259 {
6260 if (msg_size > errbuf_size)
6261 {
6262 strncpy (errbuf, msg, errbuf_size - 1);
6263 errbuf[errbuf_size - 1] = 0;
6264 }
6265 else
6266 strcpy (errbuf, msg);
6267 }
6268
6269 return msg_size;
6270 }
6271
6272
6273 /* Free dynamically allocated space used by PREG. */
6274
6275 #ifdef __APPLE__
6276 __private_extern__
6277 #endif
6278 void
regfree(preg)6279 regfree (preg)
6280 regex_t *preg;
6281 {
6282 if (preg->buffer != NULL)
6283 free (preg->buffer);
6284 preg->buffer = NULL;
6285
6286 preg->allocated = 0;
6287 preg->used = 0;
6288
6289 if (preg->fastmap != NULL)
6290 free (preg->fastmap);
6291 preg->fastmap = NULL;
6292 preg->fastmap_accurate = 0;
6293
6294 if (preg->translate != NULL)
6295 free (preg->translate);
6296 preg->translate = NULL;
6297 }
6298
6299 #endif /* not emacs */
6300