1 /* Add nmz prefix by Satoru Takabayashi */
2 /* Extended regular expression matching and search library.
3 Copyright (C) 1993, 94, 95, 96, 97, 98 Free Software Foundation, Inc.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19 /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto)
20 Last change: May 21, 1993 by t^2 */
21 /* removed gapped buffer support, multiple syntax support by matz <matz@nts.co.jp> */
22 /* Perl5 extension added by matz <matz@caelum.co.jp> */
23 /* UTF-8 extension added Jan 16 1999 by Yoshida Masato <yoshidam@tau.bekkoame.ne.jp> */
24
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
28 #ifdef HAVE_SUPPORT_H
29 # include "support.h"
30 #endif
31
32 #ifdef HAVE_STRING_H
33 # include <string.h>
34 #else
35 # include <strings.h>
36 #endif
37
38 /* We write fatal error messages on standard error. */
39 #include <stdio.h>
40
41 /* isalpha(3) etc. are used for the character classes. */
42 #include <ctype.h>
43 #include <sys/types.h>
44
45 #ifndef PARAMS
46 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
47 # define PARAMS(args) args
48 # else
49 # define PARAMS(args) ()
50 # endif /* GCC. */
51 #endif /* Not PARAMS. */
52
53 #if defined(STDC_HEADERS)
54 # include <stddef.h>
55 #else
56 /* We need this for `regex.h', and perhaps for the Emacs include files. */
57 # include <sys/types.h>
58 #endif
59
60 #ifndef __STDC__
61 # define volatile
62 #endif
63
64 #ifdef HAVE_PROTOTYPES
65 # define _(args) args
66 #else
67 # define _(args) ()
68 #endif
69
70 #ifdef RUBY_PLATFORM
71 #include "defines.h"
72
73 # define RUBY
74 extern int rb_prohibit_interrupt;
75 extern int rb_trap_pending;
76 void rb_trap_exec _((void));
77
78 # define CHECK_INTS if (!rb_prohibit_interrupt) {\
79 if (rb_trap_pending) rb_trap_exec();\
80 }
81
82 #define nmz_xmalloc ruby_xmalloc
83 #define xcalloc ruby_xcalloc
84 #define nmz_xrealloc ruby_xrealloc
85 #define xfree ruby_xfree
86
87 void *nmz_xmalloc _((size_t));
88 void *xcalloc _((size_t,size_t));
89 void *nmz_xrealloc _((void*,size_t));
90 void xfree _((void*));
91 #endif
92
93 /* for using libnamazu */
94 #ifdef HAVE_STDLIB_H
95 # include <stdlib.h>
96 #endif
97 #include "util.h"
98 #define xfree free
99
100 #ifndef NO_ALLOCA
101 /* Make alloca work the best possible way. */
102 #ifdef __GNUC__
103 # ifndef atarist
104 # ifndef alloca
105 # define alloca __builtin_alloca
106 # endif
107 # endif /* atarist */
108 #else
109 # ifdef _MSC_VER
110 # include <malloc.h>
111 # define alloca _alloca
112 # else
113 # if HAVE_ALLOCA_H
114 # include <alloca.h>
115 # else
116 # ifdef _AIX
117 #pragma alloca
118 # else
119 # ifndef alloca /* predefined by HP cc +Olibcalls */
120 char *alloca ();
121 # endif
122 # endif
123 # endif
124 # endif
125 #endif
126 #endif /* NO_ALLOCA */
127
128 #ifdef HAVE_STRING_H
129 # include <string.h>
130 #else
131 # include <strings.h>
132 #endif
133
134 #ifndef NO_ALLOCA
135 #ifdef C_ALLOCA
136 #define FREE_VARIABLES() alloca(0)
137 #else
138 #define FREE_VARIABLES()
139 #endif
140 #else /* NO_ALLOCA */
141 #define FREE_VARIABLES()
142 #endif /* NO_ALLOCA */
143
144 #define FREE_AND_RETURN_VOID(stackb) do { \
145 FREE_VARIABLES(); \
146 if (stackb != stacka) xfree(stackb); \
147 return; \
148 } while(0)
149
150 #define FREE_AND_RETURN(stackb,val) do { \
151 FREE_VARIABLES(); \
152 if (stackb != stacka) xfree(stackb); \
153 return(val); \
154 } while(0)
155
156 #define DOUBLE_STACK(type) do { \
157 type *stackx; \
158 unsigned int xlen = stacke - stackb; \
159 if (stackb == stacka) { \
160 stackx = (type*)nmz_xmalloc(2 * xlen * sizeof(type)); \
161 memcpy(stackx, stackb, xlen * sizeof (type)); \
162 } \
163 else { \
164 stackx = (type*)nmz_xrealloc(stackb, 2 * xlen * sizeof(type)); \
165 } \
166 /* Rearrange the pointers. */ \
167 stackp = stackx + (stackp - stackb); \
168 stackb = stackx; \
169 stacke = stackb + 2 * xlen; \
170 } while (0)
171
172 #define MALLOC_STACK(type, xlen) do { \
173 type *stackx; \
174 stackx = (type*)nmz_xmalloc((unsigned int)xlen * sizeof(type)); \
175 memset(stackx, 0, (unsigned int)xlen * sizeof (type)); \
176 stackb = stackx; \
177 stackp = stackb; \
178 stacke = stackb + (unsigned int)xlen; \
179 } while (0)
180
181 #define RE_TALLOC(n,t) ((t*)nmz_xmalloc((n)*sizeof(t)))
182 #define TMALLOC(n,t) ((t*)nmz_xmalloc((n)*sizeof(t)))
183 #define TREALLOC(s,n,t) (s=((t*)nmz_xrealloc(s,(n)*sizeof(t))))
184
185 #define EXPAND_FAIL_STACK() DOUBLE_STACK(unsigned char*)
186 #define ENSURE_FAIL_STACK(n) \
187 do { \
188 if (stacke - stackp <= (n)) { \
189 /* if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
190 { \
191 FREE_AND_RETURN(stackb,(-2)); \
192 }*/ \
193 \
194 /* Roughly double the size of the stack. */ \
195 EXPAND_FAIL_STACK(); \
196 } \
197 } while (0)
198
199 /* Get the interface, including the syntax bits. */
200 #include "regex.h"
201
202 /* Subroutines for nmz_re_compile_pattern. */
203 static void store_jump _((char*, int, char*));
204 static void insert_jump _((int, char*, char*, char*));
205 static void store_jump_n _((char*, int, char*, unsigned));
206 static void insert_jump_n _((int, char*, char*, char*, unsigned));
207 #if 0
208 static void insert_op _((int, char*, char*));
209 #endif
210 static void insert_op_2 _((int, char*, char*, int, int));
211 static int memcmp_translate _((unsigned char*, unsigned char*, int));
212
213 /* Define the syntax stuff, so we can do the \<, \>, etc. */
214
215 /* This must be nonzero for the wordchar and notwordchar pattern
216 commands in nmz_re_match. */
217 #define Sword 1
218 #define Sword2 2
219
220 #define SYNTAX(c) re_syntax_table[c]
221
222 static char re_syntax_table[256];
223 static void init_syntax_once _((void));
224 static const unsigned char *translate = 0;
225 static void init_regs _((struct re_registers*, unsigned int));
226 static void bm_init_skip _((int *, unsigned char*, int, const unsigned char*));
227 static int current_mbctype = MBCTYPE_ASCII;
228
229 #undef P
230
231 #ifdef RUBY
232 #include "util.h"
233 #endif
234
235 static void
init_syntax_once()236 init_syntax_once()
237 {
238 register int c;
239 static int done = 0;
240
241 if (done)
242 return;
243
244 memset(re_syntax_table, 0, sizeof re_syntax_table);
245
246 for (c=0; c<=0x7f; c++)
247 if (isalnum(c))
248 re_syntax_table[c] = Sword;
249 re_syntax_table['_'] = Sword;
250
251 for (c=0x80; c<=0xff; c++)
252 if (isalnum(c))
253 re_syntax_table[c] = Sword2;
254 done = 1;
255 }
256
257 void
nmz_re_set_casetable(table)258 nmz_re_set_casetable(table)
259 const char *table;
260 {
261 translate = (const unsigned char*)table;
262 }
263
264 /* Jim Meyering writes:
265
266 "... Some ctype macros are valid only for character codes that
267 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
268 using /bin/cc or gcc but without giving an ansi option). So, all
269 ctype uses should be through macros like ISPRINT... If
270 STDC_HEADERS is defined, then autoconf has verified that the ctype
271 macros don't need to be guarded with references to isascii. ...
272 Defining isascii to 1 should let any compiler worth its salt
273 eliminate the && through constant folding."
274 Solaris defines some of these symbols so we must undefine them first. */
275
276 #undef ISASCII
277 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
278 # define ISASCII(c) 1
279 #else
280 # define ISASCII(c) isascii(c)
281 #endif
282
283 #ifdef isblank
284 # define ISBLANK(c) (ISASCII(c) && isblank(c))
285 #else
286 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
287 #endif
288 #ifdef isgraph
289 # define ISGRAPH(c) (ISASCII(c) && isgraph(c))
290 #else
291 # define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
292 #endif
293
294 #undef ISPRINT
295 #define ISPRINT(c) (ISASCII(c) && isprint(c))
296 #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
297 #define ISALNUM(c) (ISASCII(c) && isalnum(c))
298 #define ISALPHA(c) (ISASCII(c) && isalpha(c))
299 #define ISCNTRL(c) (ISASCII(c) && iscntrl(c))
300 #define ISLOWER(c) (ISASCII(c) && islower(c))
301 #define ISPUNCT(c) (ISASCII(c) && ispunct(c))
302 #define ISSPACE(c) (ISASCII(c) && isspace(c))
303 #define ISUPPER(c) (ISASCII(c) && isupper(c))
304 #define ISXDIGIT(c) (ISASCII(c) && isxdigit(c))
305
306 #ifndef NULL
307 # define NULL (void *)0
308 #endif
309
310 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
311 since ours (we hope) works properly with all combinations of
312 machines, compilers, `char' and `unsigned char' argument types.
313 (Per Bothner suggested the basic approach.) */
314 #undef SIGN_EXTEND_CHAR
315 #if __STDC__
316 # define SIGN_EXTEND_CHAR(c) ((signed char)(c))
317 #else /* not __STDC__ */
318 /* As in Harbison and Steele. */
319 # define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
320 #endif
321
322 /* These are the command codes that appear in compiled regular
323 expressions, one per byte. Some command codes are followed by
324 argument bytes. A command code can specify any interpretation
325 whatsoever for its arguments. Zero-bytes may appear in the compiled
326 regular expression.
327
328 The value of `exactn' is needed in search.c (search_buffer) in emacs.
329 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
330 `exactn' we use here must also be 1. */
331
332 enum regexpcode
333 {
334 unused=0,
335 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
336 begline, /* Fail unless at beginning of line. */
337 endline, /* Fail unless at end of line. */
338 begbuf, /* Succeeds if at beginning of buffer (if emacs) or at beginning
339 of string to be matched (if not). */
340 endbuf, /* Analogously, for end of buffer/string. */
341 endbuf2, /* End of buffer/string, or newline just before it. */
342 begpos, /* Matches where last scan//gsub left off. */
343 jump, /* Followed by two bytes giving relative address to jump to. */
344 jump_past_alt,/* Same as jump, but marks the end of an alternative. */
345 on_failure_jump, /* Followed by two bytes giving relative address of
346 place to resume at in case of failure. */
347 finalize_jump, /* Throw away latest failure point and then jump to
348 address. */
349 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
350 This is used to jump back to the beginning
351 of a repeat. If the command that follows
352 this jump is clearly incompatible with the
353 one at the beginning of the repeat, such that
354 we can be sure that there is no use backtracking
355 out of repetitions already completed,
356 then we finalize. */
357 dummy_failure_jump, /* Jump, and push a dummy failure point. This
358 failure point will be thrown away if an attempt
359 is made to use it for a failure. A + construct
360 makes this before the first repeat. Also
361 use it as an intermediary kind of jump when
362 compiling an or construct. */
363 push_dummy_failure, /* Push a dummy failure point and continue. Used at the end of
364 alternatives. */
365 succeed_n, /* Used like on_failure_jump except has to succeed n times;
366 then gets turned into an on_failure_jump. The relative
367 address following it is useless until then. The
368 address is followed by two bytes containing n. */
369 jump_n, /* Similar to jump, but jump n times only; also the relative
370 address following is in turn followed by yet two more bytes
371 containing n. */
372 try_next, /* Jump to next pattern for the first time,
373 leaving this pattern on the failure stack. */
374 finalize_push, /* Finalize stack and push the beginning of the pattern
375 on the stack to retry (used for non-greedy match) */
376 finalize_push_n, /* Similar to finalize_push, buf finalize n time only */
377 set_number_at, /* Set the following relative location to the
378 subsequent number. */
379 anychar, /* Matches any (more or less) one character excluding newlines. */
380 anychar_repeat, /* Matches sequence of characters excluding newlines. */
381 charset, /* Matches any one char belonging to specified set.
382 First following byte is number of bitmap bytes.
383 Then come bytes for a bitmap saying which chars are in.
384 Bits in each byte are ordered low-bit-first.
385 A character is in the set if its bit is 1.
386 A character too large to have a bit in the map
387 is automatically not in the set. */
388 charset_not, /* Same parameters as charset, but match any character
389 that is not one of those specified. */
390 start_memory, /* Start remembering the text that is matched, for
391 storing in a memory register. Followed by one
392 byte containing the register number. Register numbers
393 must be in the range 0 through RE_NREGS. */
394 stop_memory, /* Stop remembering the text that is matched
395 and store it in a memory register. Followed by
396 one byte containing the register number. Register
397 numbers must be in the range 0 through RE_NREGS. */
398 start_paren, /* Place holder at the start of (?:..). */
399 stop_paren, /* Place holder at the end of (?:..). */
400 casefold_on, /* Turn on casefold flag. */
401 casefold_off, /* Turn off casefold flag. */
402 option_set, /* Turn on multi line match (match with newlines). */
403 start_nowidth, /* Save string point to the stack. */
404 stop_nowidth, /* Restore string place at the point start_nowidth. */
405 pop_and_fail, /* Fail after popping nowidth entry from stack. */
406 stop_backtrack, /* Restore backtrack stack at the point start_nowidth. */
407 duplicate, /* Match a duplicate of something remembered.
408 Followed by one byte containing the index of the memory
409 register. */
410 #if 0
411 fail, /* always fails. */
412 #endif
413 wordchar, /* Matches any word-constituent character. */
414 notwordchar, /* Matches any char that is not a word-constituent. */
415 wordbeg, /* Succeeds if at word beginning. */
416 wordend, /* Succeeds if at word end. */
417 wordbound, /* Succeeds if at a word boundary. */
418 notwordbound /* Succeeds if not at a word boundary. */
419 };
420
421
422 /* Number of failure points to allocate space for initially,
423 when matching. If this number is exceeded, more space is allocated,
424 so it is not a hard limit. */
425
426 #ifndef NFAILURES
427 #define NFAILURES 160
428 #endif
429
430 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
431 #define STORE_NUMBER(destination, number) \
432 do { (destination)[0] = (number) & 0377; \
433 (destination)[1] = (number) >> 8; } while (0)
434
435 /* Same as STORE_NUMBER, except increment the destination pointer to
436 the byte after where the number is stored. Watch out that values for
437 DESTINATION such as p + 1 won't work, whereas p will. */
438 #define STORE_NUMBER_AND_INCR(destination, number) \
439 do { STORE_NUMBER(destination, number); \
440 (destination) += 2; } while (0)
441
442
443 /* Put into DESTINATION a number stored in two contingous bytes starting
444 at SOURCE. */
445 #define EXTRACT_NUMBER(destination, source) \
446 do { (destination) = *(source) & 0377; \
447 (destination) += SIGN_EXTEND_CHAR(*(char*)((source) + 1)) << 8; } while (0)
448
449 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
450 point to second byte of SOURCE. Note that SOURCE has to be a value
451 such as p, not, e.g., p + 1. */
452 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
453 do { EXTRACT_NUMBER(destination, source); \
454 (source) += 2; } while (0)
455
456
457 /* Specify the precise syntax of regexps for compilation. This provides
458 for compatibility for various utilities which historically have
459 different, incompatible syntaxes.
460
461 The argument SYNTAX is a bit-mask comprised of the various bits
462 defined in regex.h. */
463
464 long
nmz_re_set_syntax(syntax)465 nmz_re_set_syntax(syntax)
466 long syntax;
467 {
468 /* obsolete */
469 return 0;
470 }
471
472
473 /* Macros for nmz_re_compile_pattern, which is found below these definitions. */
474
475 #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate)
476 #define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate)
477 /* Fetch the next character in the uncompiled pattern---translating it
478 if necessary. Also cast from a signed character in the constant
479 string passed to us by the user to an unsigned char that we can use
480 as an array index (in, e.g., `translate'). */
481 #define PATFETCH(c) \
482 do {if (p == pend) goto end_of_pattern; \
483 c = (unsigned char) *p++; \
484 if (TRANSLATE_P()) c = (unsigned char)translate[c]; \
485 } while (0)
486
487 /* Fetch the next character in the uncompiled pattern, with no
488 translation. */
489 #define PATFETCH_RAW(c) \
490 do {if (p == pend) goto end_of_pattern; \
491 c = (unsigned char)*p++; \
492 } while (0)
493
494 /* Go backwards one character in the pattern. */
495 #define PATUNFETCH p--
496
497 #define MBC2WC(c, p) \
498 do { \
499 if (current_mbctype == MBCTYPE_UTF8) { \
500 int n = mbclen(c) - 1; \
501 c &= (1<<(BYTEWIDTH-2-n)) - 1; \
502 while (n--) { \
503 c = c << 6 | (*p++ & ((1<<6)-1)); \
504 } \
505 } \
506 else { \
507 c <<= 8; \
508 c |= (unsigned char)*(p)++; \
509 } \
510 } while (0)
511
512 #define PATFETCH_MBC(c) \
513 do { \
514 if (p + mbclen(c) - 1 >= pend) goto end_of_pattern; \
515 MBC2WC(c, p); \
516 } while(0)
517
518 #define WC2MBC1ST(c) \
519 ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c))
520
521 static unsigned int
utf8_firstbyte(c)522 utf8_firstbyte(c)
523 unsigned long c;
524 {
525 if (c < 0x80) return c;
526 if (c <= 0x7ff) return ((c>>6)&0xff)|0xc0;
527 if (c <= 0xffff) return ((c>>12)&0xff)|0xe0;
528 if (c <= 0x1fffff) return ((c>>18)&0xff)|0xf0;
529 if (c <= 0x3ffffff) return ((c>>24)&0xff)|0xf8;
530 if (c <= 0x7fffffff) return ((c>>30)&0xff)|0xfc;
531 #if SIZEOF_INT > 4
532 if (c <= 0xfffffffff) return 0xfe;
533 #else
534 return 0xfe;
535 #endif
536 }
537
538 #if 0
539 static void
540 print_mbc(c)
541 unsigned int c;
542 {
543 if (current_mbctype == MBCTYPE_UTF8) {
544 if (c < 0x80)
545 printf("%c", (int)c);
546 else if (c <= 0x7ff)
547 printf("%c%c", (int)utf8_firstbyte(c), (int)(c & 0x3f));
548 else if (c <= 0xffff)
549 printf("%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 6) & 0x3f),
550 (int)(c & 0x3f));
551 else if (c <= 0x1fffff)
552 printf("%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 12) & 0x3f),
553 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
554 else if (c <= 0x3ffffff)
555 printf("%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 18) & 0x3f),
556 (int)((c >> 12) & 0x3f), (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
557 else if (c <= 0x7fffffff)
558 printf("%c%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 24) & 0x3f),
559 (int)((c >> 18) & 0x3f), (int)((c >> 12) & 0x3f),
560 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
561 }
562 else if (c < 0xff) {
563 printf("\\%o", (int)c);
564 }
565 else {
566 printf("%c%c", (int)(c >> BYTEWIDTH), (int)(c &0xff));
567 }
568 }
569 #endif
570
571 /* If the buffer isn't allocated when it comes in, use this. */
572 #define INIT_BUF_SIZE 28
573
574 /* Make sure we have at least N more bytes of space in buffer. */
575 #define GET_BUFFER_SPACE(n) \
576 do { \
577 while (b - bufp->buffer + (n) >= bufp->allocated) \
578 EXTEND_BUFFER; \
579 } while (0)
580
581 /* Make sure we have one more byte of buffer space and then add CH to it. */
582 #define BUFPUSH(ch) \
583 do { \
584 GET_BUFFER_SPACE(1); \
585 *b++ = (char)(ch); \
586 } while (0)
587
588 /* Extend the buffer by twice its current size via reallociation and
589 reset the pointers that pointed into the old allocation to point to
590 the correct places in the new allocation. If extending the buffer
591 results in it being larger than 1 << 16, then flag memory exhausted. */
592 #define EXTEND_BUFFER \
593 do { char *old_buffer = bufp->buffer; \
594 if (bufp->allocated == (1L<<16)) goto too_big; \
595 bufp->allocated *= 2; \
596 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
597 bufp->buffer = (char*)nmz_xrealloc(bufp->buffer, bufp->allocated); \
598 if (bufp->buffer == 0) \
599 goto memory_exhausted; \
600 b = (b - old_buffer) + bufp->buffer; \
601 if (fixup_alt_jump) \
602 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
603 if (laststart) \
604 laststart = (laststart - old_buffer) + bufp->buffer; \
605 begalt = (begalt - old_buffer) + bufp->buffer; \
606 if (pending_exact) \
607 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
608 } while (0)
609
610
611 /* Set the bit for character C in a character set list. */
612 #define SET_LIST_BIT(c) \
613 (b[(unsigned char)(c) / BYTEWIDTH] \
614 |= 1 << ((unsigned char)(c) % BYTEWIDTH))
615
616 /* Get the next unsigned number in the uncompiled pattern. */
617 #define GET_UNSIGNED_NUMBER(num) \
618 do { if (p != pend) { \
619 PATFETCH(c); \
620 while (ISDIGIT(c)) { \
621 if (num < 0) \
622 num = 0; \
623 num = num * 10 + c - '0'; \
624 if (p == pend) \
625 break; \
626 PATFETCH(c); \
627 } \
628 } \
629 } while (0)
630
631 #define STREQ(s1, s2) ((strcmp(s1, s2) == 0))
632
633 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
634
635 #define IS_CHAR_CLASS(string) \
636 (STREQ(string, "alpha") || STREQ(string, "upper") \
637 || STREQ(string, "lower") || STREQ(string, "digit") \
638 || STREQ(string, "alnum") || STREQ(string, "xdigit") \
639 || STREQ(string, "space") || STREQ(string, "print") \
640 || STREQ(string, "punct") || STREQ(string, "graph") \
641 || STREQ(string, "cntrl") || STREQ(string, "blank"))
642
643 #define STORE_MBC(p, c) \
644 do { \
645 (p)[0] = (unsigned char)(((c) >>24) & 0xff); \
646 (p)[1] = (unsigned char)(((c) >>16) & 0xff); \
647 (p)[2] = (unsigned char)(((c) >> 8) & 0xff); \
648 (p)[3] = (unsigned char)(((c) >> 0) & 0xff); \
649 } while (0)
650
651 #define STORE_MBC_AND_INCR(p, c) \
652 do { \
653 *(p)++ = (unsigned char)(((c) >>24) & 0xff); \
654 *(p)++ = (unsigned char)(((c) >>16) & 0xff); \
655 *(p)++ = (unsigned char)(((c) >> 8) & 0xff); \
656 *(p)++ = (unsigned char)(((c) >> 0) & 0xff); \
657 } while (0)
658
659 #define EXTRACT_MBC(p) \
660 ((unsigned int)((unsigned char)(p)[0] << 24 | \
661 (unsigned char)(p)[1] << 16 | \
662 (unsigned char)(p)[2] << 8 | \
663 (unsigned char)(p)[3]))
664
665 #define EXTRACT_MBC_AND_INCR(p) \
666 ((unsigned int)((p) += 4, \
667 (unsigned char)(p)[-4] << 24 | \
668 (unsigned char)(p)[-3] << 16 | \
669 (unsigned char)(p)[-2] << 8 | \
670 (unsigned char)(p)[-1]))
671
672 #define EXTRACT_UNSIGNED(p) \
673 ((unsigned char)(p)[0] | (unsigned char)(p)[1] << 8)
674 #define EXTRACT_UNSIGNED_AND_INCR(p) \
675 ((p) += 2, (unsigned char)(p)[-2] | (unsigned char)(p)[-1] << 8)
676
677 /* Handle (mb)?charset(_not)?.
678
679 Structure of mbcharset(_not)? in compiled pattern.
680
681 struct {
682 unsinged char id; mbcharset(_not)?
683 unsigned char sbc_size;
684 unsigned char sbc_map[sbc_size]; same as charset(_not)? up to here.
685 unsigned short mbc_size; number of intervals.
686 struct {
687 unsigned long beg; beginning of interval.
688 unsigned long end; end of interval.
689 } intervals[mbc_size];
690 }; */
691
692 static void
set_list_bits(c1,c2,b)693 set_list_bits(c1, c2, b)
694 unsigned long c1, c2;
695 unsigned char *b;
696 {
697 unsigned char sbc_size = b[-1];
698 unsigned short mbc_size = EXTRACT_UNSIGNED(&b[sbc_size]);
699 unsigned short beg, end, upb;
700
701 if (c1 > c2)
702 return;
703 b = &b[sbc_size + 2];
704
705 for (beg = 0, upb = mbc_size; beg < upb; ) {
706 unsigned short mid = (unsigned short)(beg + upb) >> 1;
707
708 if ((int)c1 - 1 > (int)EXTRACT_MBC(&b[mid*8+4]))
709 beg = mid + 1;
710 else
711 upb = mid;
712 }
713
714 for (end = beg, upb = mbc_size; end < upb; ) {
715 unsigned short mid = (unsigned short)(end + upb) >> 1;
716
717 if ((int)c2 >= (int)EXTRACT_MBC(&b[mid*8]) - 1)
718 end = mid + 1;
719 else
720 upb = mid;
721 }
722
723 if (beg != end) {
724 if (c1 > EXTRACT_MBC(&b[beg*8]))
725 c1 = EXTRACT_MBC(&b[beg*8]);
726 if (c2 < EXTRACT_MBC(&b[(end - 1)*8+4]))
727 c2 = EXTRACT_MBC(&b[(end - 1)*8+4]);
728 }
729 if (end < mbc_size && end != beg + 1)
730 /* NOTE: memcpy() would not work here. */
731 memmove(&b[(beg + 1)*8], &b[end*8], (mbc_size - end)*8);
732 STORE_MBC(&b[beg*8 + 0], c1);
733 STORE_MBC(&b[beg*8 + 4], c2);
734 mbc_size += beg - end + 1;
735 STORE_NUMBER(&b[-2], mbc_size);
736 }
737
738 static int
is_in_list(c,b)739 is_in_list(c, b)
740 unsigned long c;
741 const unsigned char *b;
742 {
743 unsigned short size;
744 unsigned short i, j;
745
746 size = *b++;
747 if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) {
748 return 1;
749 }
750 b += size + 2;
751 size = EXTRACT_UNSIGNED(&b[-2]);
752 if (size == 0) return 0;
753
754 for (i = 0, j = size; i < j; ) {
755 unsigned short k = (unsigned short)(i + j) >> 1;
756
757 if (c > EXTRACT_MBC(&b[k*8+4]))
758 i = k + 1;
759 else
760 j = k;
761 }
762 if (i < size && EXTRACT_MBC(&b[i*8]) <= c
763 && ((unsigned char)c != '\n' && (unsigned char)c != '\0'))
764 return 1;
765 return 0;
766 }
767
768 #if 0
769 static void
770 print_partial_compiled_pattern(start, end)
771 unsigned char *start;
772 unsigned char *end;
773 {
774 int mcnt, mcnt2;
775 unsigned char *p = start;
776 unsigned char *pend = end;
777
778 if (start == NULL) {
779 printf("(null)\n");
780 return;
781 }
782
783 /* Loop over pattern commands. */
784 while (p < pend) {
785 switch ((enum regexpcode)*p++) {
786 case unused:
787 printf("/unused");
788 break;
789
790 case exactn:
791 mcnt = *p++;
792 printf("/exactn/%d", mcnt);
793 do {
794 putchar('/');
795 printf("%c", *p++);
796 }
797 while (--mcnt);
798 break;
799
800 case start_memory:
801 mcnt = *p++;
802 printf("/start_memory/%d/%d", mcnt, *p++);
803 break;
804
805 case stop_memory:
806 mcnt = *p++;
807 printf("/stop_memory/%d/%d", mcnt, *p++);
808 break;
809
810 case start_paren:
811 printf("/start_paren");
812 break;
813
814 case stop_paren:
815 printf("/stop_paren");
816 break;
817
818 case casefold_on:
819 printf("/casefold_on");
820 break;
821
822 case casefold_off:
823 printf("/casefold_off");
824 break;
825
826 case option_set:
827 printf("/option_set/%d", *p++);
828 break;
829
830 case start_nowidth:
831 EXTRACT_NUMBER_AND_INCR(mcnt, p);
832 printf("/start_nowidth//%d", mcnt);
833 break;
834
835 case stop_nowidth:
836 printf("/stop_nowidth//");
837 p += 2;
838 break;
839
840 case pop_and_fail:
841 printf("/pop_and_fail");
842 break;
843
844 case stop_backtrack:
845 printf("/stop_backtrack//");
846 p += 2;
847 break;
848
849 case duplicate:
850 printf("/duplicate/%d", *p++);
851 break;
852
853 case anychar:
854 printf("/anychar");
855 break;
856
857 case anychar_repeat:
858 printf("/anychar_repeat");
859 break;
860
861 case charset:
862 case charset_not:
863 {
864 register int c;
865
866 printf("/charset%s",
867 (enum regexpcode)*(p - 1) == charset_not ? "_not" : "");
868
869 mcnt = *p++;
870 printf("/%d", mcnt);
871 for (c = 0; c < mcnt; c++) {
872 unsigned bit;
873 unsigned char map_byte = p[c];
874
875 putchar ('/');
876
877 for (bit = 0; bit < BYTEWIDTH; bit++)
878 if (map_byte & (1 << bit))
879 printf("%c", c * BYTEWIDTH + bit);
880 }
881 p += mcnt;
882 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
883 printf("/");
884 while (mcnt--) {
885 print_mbc(EXTRACT_MBC_AND_INCR(p));
886 printf("-");
887 print_mbc(EXTRACT_MBC_AND_INCR(p));
888 }
889 break;
890 }
891
892 case begline:
893 printf("/begline");
894 break;
895
896 case endline:
897 printf("/endline");
898 break;
899
900 case on_failure_jump:
901 EXTRACT_NUMBER_AND_INCR(mcnt, p);
902 printf("/on_failure_jump//%d", mcnt);
903 break;
904
905 case dummy_failure_jump:
906 EXTRACT_NUMBER_AND_INCR(mcnt, p);
907 printf("/dummy_failure_jump//%d", mcnt);
908 break;
909
910 case push_dummy_failure:
911 printf("/push_dummy_failure");
912 break;
913
914 case finalize_jump:
915 EXTRACT_NUMBER_AND_INCR(mcnt, p);
916 printf("/finalize_jump//%d", mcnt);
917 break;
918
919 case maybe_finalize_jump:
920 EXTRACT_NUMBER_AND_INCR(mcnt, p);
921 printf("/maybe_finalize_jump//%d", mcnt);
922 break;
923
924 case jump_past_alt:
925 EXTRACT_NUMBER_AND_INCR(mcnt, p);
926 printf("/jump_past_alt//%d", mcnt);
927 break;
928
929 case jump:
930 EXTRACT_NUMBER_AND_INCR(mcnt, p);
931 printf("/jump//%d", mcnt);
932 break;
933
934 case succeed_n:
935 EXTRACT_NUMBER_AND_INCR(mcnt, p);
936 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
937 printf("/succeed_n//%d//%d", mcnt, mcnt2);
938 break;
939
940 case jump_n:
941 EXTRACT_NUMBER_AND_INCR(mcnt, p);
942 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
943 printf("/jump_n//%d//%d", mcnt, mcnt2);
944 break;
945
946 case set_number_at:
947 EXTRACT_NUMBER_AND_INCR(mcnt, p);
948 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
949 printf("/set_number_at//%d//%d", mcnt, mcnt2);
950 break;
951
952 case try_next:
953 EXTRACT_NUMBER_AND_INCR(mcnt, p);
954 printf("/try_next//%d", mcnt);
955 break;
956
957 case finalize_push:
958 EXTRACT_NUMBER_AND_INCR(mcnt, p);
959 printf("/finalize_push//%d", mcnt);
960 break;
961
962 case finalize_push_n:
963 EXTRACT_NUMBER_AND_INCR(mcnt, p);
964 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
965 printf("/finalize_push_n//%d//%d", mcnt, mcnt2);
966 break;
967
968 case wordbound:
969 printf("/wordbound");
970 break;
971
972 case notwordbound:
973 printf("/notwordbound");
974 break;
975
976 case wordbeg:
977 printf("/wordbeg");
978 break;
979
980 case wordend:
981 printf("/wordend");
982
983 case wordchar:
984 printf("/wordchar");
985 break;
986
987 case notwordchar:
988 printf("/notwordchar");
989 break;
990
991 case begbuf:
992 printf("/begbuf");
993 break;
994
995 case endbuf:
996 printf("/endbuf");
997 break;
998
999 case endbuf2:
1000 printf("/endbuf2");
1001 break;
1002
1003 case begpos:
1004 printf("/begpos");
1005 break;
1006
1007 default:
1008 printf("?%d", *(p-1));
1009 }
1010 }
1011 printf("/\n");
1012 }
1013
1014
1015 static void
1016 print_compiled_pattern(bufp)
1017 struct re_pattern_buffer *bufp;
1018 {
1019 unsigned char *buffer = (unsigned char*)bufp->buffer;
1020
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 }
1023 #endif
1024
1025 static char*
calculate_must_string(start,end)1026 calculate_must_string(start, end)
1027 char *start;
1028 char *end;
1029 {
1030 int mcnt;
1031 int max = 0;
1032 char *p = start;
1033 char *pend = end;
1034 char *must = 0;
1035
1036 if (start == NULL) return 0;
1037
1038 /* Loop over pattern commands. */
1039 while (p < pend) {
1040 switch ((enum regexpcode)*p++) {
1041 case unused:
1042 break;
1043
1044 case exactn:
1045 mcnt = *p;
1046 if (mcnt > max) {
1047 must = p;
1048 max = mcnt;
1049 }
1050 p += mcnt+1;
1051 break;
1052
1053 case start_memory:
1054 case stop_memory:
1055 p += 2;
1056 break;
1057
1058 case duplicate:
1059 p++;
1060 break;
1061
1062 case casefold_on:
1063 case casefold_off:
1064 return 0; /* should not check must_string */
1065
1066 case pop_and_fail:
1067 case anychar:
1068 case anychar_repeat:
1069 case begline:
1070 case endline:
1071 case wordbound:
1072 case notwordbound:
1073 case wordbeg:
1074 case wordend:
1075 case wordchar:
1076 case notwordchar:
1077 case begbuf:
1078 case endbuf:
1079 case endbuf2:
1080 case begpos:
1081 case push_dummy_failure:
1082 case start_paren:
1083 case stop_paren:
1084 case option_set:
1085 break;
1086
1087 case charset:
1088 case charset_not:
1089 mcnt = *p++;
1090 p += mcnt;
1091 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
1092 while (mcnt--) {
1093 p += 4;
1094 }
1095 break;
1096
1097 case on_failure_jump:
1098 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1099 if (mcnt > 0) p += mcnt;
1100 if ((enum regexpcode)p[-3] == jump) {
1101 p -= 2;
1102 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1103 if (mcnt > 0) p += mcnt;
1104 }
1105 break;
1106
1107 case dummy_failure_jump:
1108 case succeed_n:
1109 case try_next:
1110 case jump:
1111 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1112 if (mcnt > 0) p += mcnt;
1113 break;
1114
1115 case start_nowidth:
1116 case stop_nowidth:
1117 case stop_backtrack:
1118 case finalize_jump:
1119 case maybe_finalize_jump:
1120 case finalize_push:
1121 p += 2;
1122 break;
1123
1124 case jump_n:
1125 case set_number_at:
1126 case finalize_push_n:
1127 p += 4;
1128 break;
1129
1130 default:
1131 break;
1132 }
1133 }
1134 return must;
1135 }
1136
1137 static unsigned int
read_backslash(c)1138 read_backslash(c)
1139 int c;
1140 {
1141 switch (c) {
1142 case 'n':
1143 return '\n';
1144
1145 case 't':
1146 return '\t';
1147
1148 case 'r':
1149 return '\r';
1150
1151 case 'f':
1152 return '\f';
1153
1154 case 'v':
1155 return '\v';
1156
1157 case 'a':
1158 return '\007';
1159
1160 case 'b':
1161 return '\010';
1162
1163 case 'e':
1164 return '\033';
1165 }
1166 return c;
1167 }
1168
1169 static unsigned int
read_special(p,pend,pp)1170 read_special(p, pend, pp)
1171 const char *p, *pend, **pp;
1172 {
1173 int c;
1174
1175 PATFETCH_RAW(c);
1176 switch (c) {
1177 case 'M':
1178 PATFETCH_RAW(c);
1179 if (c != '-') return -1;
1180 PATFETCH_RAW(c);
1181 *pp = p;
1182 if (c == '\\') {
1183 return read_special(p, pend, pp) | 0x80;
1184 }
1185 else if (c == -1) return ~0;
1186 else {
1187 return ((c & 0xff) | 0x80);
1188 }
1189
1190 case 'C':
1191 PATFETCH_RAW(c);
1192 if (c != '-') return ~0;
1193 case 'c':
1194 PATFETCH_RAW(c);
1195 *pp = p;
1196 if (c == '\\') {
1197 c = read_special(p, pend, pp);
1198 }
1199 else if (c == '?') return 0177;
1200 else if (c == -1) return ~0;
1201 return c & 0x9f;
1202 default:
1203 return read_backslash(c);
1204 }
1205
1206 end_of_pattern:
1207 return ~0;
1208 }
1209
1210 /* nmz_re_compile_pattern takes a regular-expression string
1211 and converts it into a buffer full of byte commands for matching.
1212
1213 PATTERN is the address of the pattern string
1214 SIZE is the length of it.
1215 BUFP is a struct re_pattern_buffer * which points to the info
1216 on where to store the byte commands.
1217 This structure contains a char * which points to the
1218 actual space, which should have been obtained with malloc.
1219 nmz_re_compile_pattern may use realloc to grow the buffer space.
1220
1221 The number of bytes of commands can be found out by looking in
1222 the `struct re_pattern_buffer' that bufp pointed to, after
1223 nmz_re_compile_pattern returns. */
1224
1225 char *
nmz_re_compile_pattern(pattern,size,bufp)1226 nmz_re_compile_pattern(pattern, size, bufp)
1227 const char *pattern;
1228 int size;
1229 struct re_pattern_buffer *bufp;
1230 {
1231 register char *b = bufp->buffer;
1232 register const char *p = pattern;
1233 const char *nextp;
1234 const char *pend = pattern + size;
1235 register unsigned int c, c1 = 0;
1236 const char *p0;
1237 int numlen;
1238 #define ERROR_MSG_MAX_SIZE 200
1239 static char error_msg[ERROR_MSG_MAX_SIZE+1];
1240
1241 /* Address of the count-byte of the most recently inserted `exactn'
1242 command. This makes it possible to tell whether a new exact-match
1243 character can be added to that command or requires a new `exactn'
1244 command. */
1245
1246 char *pending_exact = 0;
1247
1248 /* Address of the place where a forward-jump should go to the end of
1249 the containing expression. Each alternative of an `or', except the
1250 last, ends with a forward-jump of this sort. */
1251
1252 char *fixup_alt_jump = 0;
1253
1254 /* Address of start of the most recently finished expression.
1255 This tells postfix * where to find the start of its operand. */
1256
1257 char *laststart = 0;
1258
1259 /* In processing a repeat, 1 means zero matches is allowed. */
1260
1261 char zero_times_ok;
1262
1263 /* In processing a repeat, 1 means many matches is allowed. */
1264
1265 char many_times_ok;
1266
1267 /* In processing a repeat, 1 means non-greedy matches. */
1268
1269 char greedy;
1270
1271 /* Address of beginning of regexp, or inside of last (. */
1272
1273 char *begalt = b;
1274
1275 /* Place in the uncompiled pattern (i.e., the {) to
1276 which to go back if the interval is invalid. */
1277 const char *beg_interval;
1278
1279 /* In processing an interval, at least this many matches must be made. */
1280 int lower_bound;
1281
1282 /* In processing an interval, at most this many matches can be made. */
1283 int upper_bound;
1284
1285 /* Stack of information saved by ( and restored by ).
1286 Five stack elements are pushed by each (:
1287 First, the value of b.
1288 Second, the value of fixup_alt_jump.
1289 Third, the value of begalt.
1290 Fourth, the value of regnum.
1291 Fifth, the type of the paren. */
1292
1293 int stacka[40];
1294 int *stackb = stacka;
1295 int *stackp = stackb;
1296 int *stacke = stackb + 40;
1297 #if 0
1298 int *stackt;
1299 #endif
1300
1301 /* Counts ('s as they are encountered. Remembered for the matching ),
1302 where it becomes the register number to put in the stop_memory
1303 command. */
1304
1305 int regnum = 1;
1306
1307 int range = 0;
1308 int had_mbchar = 0;
1309 int had_num_literal = 0;
1310 int had_char_class = 0;
1311
1312 int options = bufp->options;
1313
1314 bufp->fastmap_accurate = 0;
1315 bufp->must = 0;
1316 bufp->must_skip = 0;
1317 bufp->stclass = 0;
1318
1319 /* Initialize the syntax table. */
1320 init_syntax_once();
1321
1322 if (bufp->allocated == 0) {
1323 bufp->allocated = INIT_BUF_SIZE;
1324 if (bufp->buffer)
1325 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
1326 bufp->buffer = (char*)nmz_xrealloc(bufp->buffer, INIT_BUF_SIZE);
1327 else
1328 /* Caller did not allocate a buffer. Do it for them. */
1329 bufp->buffer = (char*)nmz_xmalloc(INIT_BUF_SIZE);
1330 if (!bufp->buffer) goto memory_exhausted;
1331 begalt = b = bufp->buffer;
1332 }
1333
1334 while (p != pend) {
1335 PATFETCH(c);
1336
1337 switch (c) {
1338 case '$':
1339 if (bufp->options & RE_OPTION_SINGLELINE) {
1340 BUFPUSH(endbuf);
1341 }
1342 else {
1343 p0 = p;
1344 /* When testing what follows the $,
1345 look past the \-constructs that don't consume anything. */
1346
1347 while (p0 != pend) {
1348 if (*p0 == '\\' && p0 + 1 != pend
1349 && (p0[1] == 'b' || p0[1] == 'B'))
1350 p0 += 2;
1351 else
1352 break;
1353 }
1354 BUFPUSH(endline);
1355 }
1356 break;
1357
1358 case '^':
1359 if (bufp->options & RE_OPTION_SINGLELINE)
1360 BUFPUSH(begbuf);
1361 else
1362 BUFPUSH(begline);
1363 break;
1364
1365 case '+':
1366 case '?':
1367 case '*':
1368 /* If there is no previous pattern, char not special. */
1369 if (!laststart) {
1370 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
1371 "invalid regular expression; there's no previous pattern, to which '%c' would define cardinality at %d",
1372 c, p-pattern);
1373 FREE_AND_RETURN(stackb, error_msg);
1374 }
1375 /* If there is a sequence of repetition chars,
1376 collapse it down to just one. */
1377 zero_times_ok = c != '+';
1378 many_times_ok = c != '?';
1379 greedy = 1;
1380 if (p != pend) {
1381 PATFETCH(c);
1382 switch (c) {
1383 case '?':
1384 greedy = 0;
1385 break;
1386 case '*':
1387 case '+':
1388 goto nested_meta;
1389 default:
1390 PATUNFETCH;
1391 break;
1392 }
1393 }
1394
1395 repeat:
1396 /* Star, etc. applied to an empty pattern is equivalent
1397 to an empty pattern. */
1398 if (!laststart)
1399 break;
1400
1401 if (greedy && many_times_ok && *laststart == anychar && b - laststart <= 2) {
1402 if (b[-1] == stop_paren)
1403 b--;
1404 if (zero_times_ok)
1405 *laststart = anychar_repeat;
1406 else {
1407 BUFPUSH(anychar_repeat);
1408 }
1409 break;
1410 }
1411 /* Now we know whether or not zero matches is allowed
1412 and also whether or not two or more matches is allowed. */
1413 if (many_times_ok) {
1414 /* If more than one repetition is allowed, put in at the
1415 end a backward relative jump from b to before the next
1416 jump we're going to put in below (which jumps from
1417 laststart to after this jump). */
1418 GET_BUFFER_SPACE(3);
1419 store_jump(b,greedy?maybe_finalize_jump:finalize_push,laststart-3);
1420 b += 3; /* Because store_jump put stuff here. */
1421 }
1422
1423 /* On failure, jump from laststart to next pattern, which will be the
1424 end of the buffer after this jump is inserted. */
1425 GET_BUFFER_SPACE(3);
1426 insert_jump(on_failure_jump, laststart, b + 3, b);
1427 b += 3;
1428
1429 if (zero_times_ok) {
1430 if (greedy == 0) {
1431 GET_BUFFER_SPACE(3);
1432 insert_jump(try_next, laststart, b + 3, b);
1433 b += 3;
1434 }
1435 }
1436 else {
1437 /* At least one repetition is required, so insert a
1438 `dummy_failure_jump' before the initial
1439 `on_failure_jump' instruction of the loop. This
1440 effects a skip over that instruction the first time
1441 we hit that loop. */
1442 GET_BUFFER_SPACE(3);
1443 insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
1444 b += 3;
1445 }
1446 break;
1447
1448 case '.':
1449 laststart = b;
1450 BUFPUSH(anychar);
1451 break;
1452
1453 case '[':
1454 if (p == pend)
1455 FREE_AND_RETURN(stackb, "invalid regular expression; '[' can't be the last character ie. can't start range at the end of pattern");
1456 while ((b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH)
1457 > bufp->allocated)
1458 EXTEND_BUFFER;
1459
1460 laststart = b;
1461 if (*p == '^') {
1462 BUFPUSH(charset_not);
1463 p++;
1464 }
1465 else
1466 BUFPUSH(charset);
1467 p0 = p;
1468
1469 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1470 /* Clear the whole map */
1471 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
1472
1473 had_mbchar = 0;
1474 had_num_literal = 0;
1475 had_char_class = 0;
1476
1477 /* Read in characters and ranges, setting map bits. */
1478 for (;;) {
1479 int size;
1480 unsigned last = (unsigned)-1;
1481
1482 if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH]))
1483 || current_mbctype) {
1484 /* Ensure the space is enough to hold another interval
1485 of multi-byte chars in charset(_not)?. */
1486 size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8;
1487 while (b + size + 1 > bufp->buffer + bufp->allocated)
1488 EXTEND_BUFFER;
1489 }
1490 range_retry:
1491 if (range && had_char_class) {
1492 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as an end value of range");
1493 }
1494 PATFETCH(c);
1495
1496 if (c == ']') {
1497 if (p == p0 + 1) {
1498 if (p == pend)
1499 FREE_AND_RETURN(stackb, "invalid regular expression; empty character class");
1500 }
1501 else
1502 /* Stop if this isn't merely a ] inside a bracket
1503 expression, but rather the end of a bracket
1504 expression. */
1505 break;
1506 }
1507 /* Look ahead to see if it's a range when the last thing
1508 was a character class. */
1509 if (had_char_class && c == '-' && *p != ']')
1510 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as a start value of range");
1511 if (ismbchar(c)) {
1512 PATFETCH_MBC(c);
1513 had_mbchar++;
1514 }
1515 had_char_class = 0;
1516
1517 /* \ escapes characters when inside [...]. */
1518 if (c == '\\') {
1519 PATFETCH_RAW(c);
1520 switch (c) {
1521 case 'w':
1522 for (c = 0; c < (1 << BYTEWIDTH); c++) {
1523 if (SYNTAX(c) == Sword ||
1524 (!current_mbctype && SYNTAX(c) == Sword2))
1525 SET_LIST_BIT(c);
1526 }
1527 if (current_mbctype) {
1528 set_list_bits(0x80, 0xffffffff, b);
1529 }
1530 had_char_class = 1;
1531 last = -1;
1532 continue;
1533
1534 case 'W':
1535 for (c = 0; c < (1 << BYTEWIDTH); c++) {
1536 if (SYNTAX(c) != Sword &&
1537 ((current_mbctype && !re_mbctab[c]) ||
1538 (!current_mbctype && SYNTAX(c) != Sword2)))
1539 SET_LIST_BIT(c);
1540 }
1541 had_char_class = 1;
1542 last = -1;
1543 continue;
1544
1545 case 's':
1546 for (c = 0; c < 256; c++)
1547 if (ISSPACE(c))
1548 SET_LIST_BIT(c);
1549 had_char_class = 1;
1550 last = -1;
1551 continue;
1552
1553 case 'S':
1554 for (c = 0; c < 256; c++)
1555 if (!ISSPACE(c))
1556 SET_LIST_BIT(c);
1557 if (current_mbctype)
1558 set_list_bits(0x80, 0xffffffff, b);
1559 had_char_class = 1;
1560 last = -1;
1561 continue;
1562
1563 case 'd':
1564 for (c = '0'; c <= '9'; c++)
1565 SET_LIST_BIT(c);
1566 had_char_class = 1;
1567 last = -1;
1568 continue;
1569
1570 case 'D':
1571 for (c = 0; c < 256; c++)
1572 if (!ISDIGIT(c))
1573 SET_LIST_BIT(c);
1574 if (current_mbctype)
1575 set_list_bits(0x80, 0xffffffff, b);
1576 had_char_class = 1;
1577 last = -1;
1578 continue;
1579
1580 case 'x':
1581 c = nmz_scan_hex(p, 2, &numlen);
1582 p += numlen;
1583 had_num_literal = 1;
1584 break;
1585
1586 case '0': case '1': case '2': case '3': case '4':
1587 case '5': case '6': case '7': case '8': case '9':
1588 PATUNFETCH;
1589 c = nmz_scan_oct(p, 3, &numlen);
1590 p += numlen;
1591 had_num_literal = 1;
1592 break;
1593
1594 case 'M':
1595 case 'C':
1596 case 'c':
1597 {
1598 char *pp;
1599
1600 --p;
1601 c = read_special(p, pend, &pp);
1602 if (c > 255) goto invalid_escape;
1603 p = pp;
1604 had_num_literal = 1;
1605 }
1606 break;
1607
1608 default:
1609 c = read_backslash(c);
1610 if (ismbchar(c)) {
1611 PATFETCH_MBC(c);
1612 had_mbchar++;
1613 }
1614 break;
1615 }
1616 }
1617
1618 /* Get a range. */
1619 if (range) {
1620 if (last > c)
1621 goto invalid_pattern;
1622
1623 range = 0;
1624 if (had_mbchar == 0) {
1625 for (;last<=c;last++)
1626 SET_LIST_BIT(last);
1627 }
1628 else if (had_mbchar == 2) {
1629 set_list_bits(last, c, b);
1630 }
1631 else {
1632 /* restriction: range between sbc and mbc */
1633 goto invalid_pattern;
1634 }
1635 }
1636 else if (p[0] == '-' && p[1] != ']') {
1637 last = c;
1638 PATFETCH(c1);
1639 range = 1;
1640 goto range_retry;
1641 }
1642 else if (c == '[' && *p == ':') {
1643 /* Leave room for the null. */
1644 char str[CHAR_CLASS_MAX_LENGTH + 1];
1645
1646 PATFETCH_RAW(c);
1647 c1 = 0;
1648
1649 /* If pattern is `[[:'. */
1650 if (p == pend)
1651 FREE_AND_RETURN(stackb, "invalid regular expression; re can't end '[[:'");
1652
1653 for (;;) {
1654 PATFETCH (c);
1655 if (c == ':' || c == ']' || p == pend
1656 || c1 == CHAR_CLASS_MAX_LENGTH)
1657 break;
1658 str[c1++] = c;
1659 }
1660 str[c1] = '\0';
1661
1662 /* If isn't a word bracketed by `[:' and:`]':
1663 undo the ending character, the letters, and leave
1664 the leading `:' and `[' (but set bits for them). */
1665 if (c == ':' && *p == ']') {
1666 int ch;
1667 char is_alnum = STREQ(str, "alnum");
1668 char is_alpha = STREQ(str, "alpha");
1669 char is_blank = STREQ(str, "blank");
1670 char is_cntrl = STREQ(str, "cntrl");
1671 char is_digit = STREQ(str, "digit");
1672 char is_graph = STREQ(str, "graph");
1673 char is_lower = STREQ(str, "lower");
1674 char is_print = STREQ(str, "print");
1675 char is_punct = STREQ(str, "punct");
1676 char is_space = STREQ(str, "space");
1677 char is_upper = STREQ(str, "upper");
1678 char is_xdigit = STREQ(str, "xdigit");
1679
1680 if (!IS_CHAR_CLASS(str)){
1681 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
1682 "invalid regular expression; [:%s:] is not a character class", str);
1683 FREE_AND_RETURN(stackb, error_msg);
1684 }
1685
1686 /* Throw away the ] at the end of the character class. */
1687 PATFETCH(c);
1688
1689 if (p == pend)
1690 FREE_AND_RETURN(stackb, "invalid regular expression; range doesn't have ending ']' after a character class");
1691
1692 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1693 if ( (is_alnum && ISALNUM(ch))
1694 || (is_alpha && ISALPHA(ch))
1695 || (is_blank && ISBLANK(ch))
1696 || (is_cntrl && ISCNTRL(ch))
1697 || (is_digit && ISDIGIT(ch))
1698 || (is_graph && ISGRAPH(ch))
1699 || (is_lower && ISLOWER(ch))
1700 || (is_print && ISPRINT(ch))
1701 || (is_punct && ISPUNCT(ch))
1702 || (is_space && ISSPACE(ch))
1703 || (is_upper && ISUPPER(ch))
1704 || (is_xdigit && ISXDIGIT(ch)))
1705 SET_LIST_BIT(ch);
1706 }
1707 had_char_class = 1;
1708 }
1709 else {
1710 c1++;
1711 while (c1--)
1712 PATUNFETCH;
1713 SET_LIST_BIT(TRANSLATE_P()?translate['[']:'[');
1714 SET_LIST_BIT(TRANSLATE_P()?translate[':']:':');
1715 had_char_class = 0;
1716 last = ':';
1717 }
1718 }
1719 else if (had_mbchar == 0 && (!current_mbctype || !had_num_literal)) {
1720 SET_LIST_BIT(c);
1721 had_num_literal = 0;
1722 }
1723 else
1724 set_list_bits(c, c, b);
1725 had_mbchar = 0;
1726 }
1727
1728 /* Discard any character set/class bitmap bytes that are all
1729 0 at the end of the map. Decrement the map-length byte too. */
1730 while ((int)b[-1] > 0 && b[b[-1] - 1] == 0)
1731 b[-1]--;
1732 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
1733 memmove(&b[(int)b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
1734 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8);
1735 b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(int)b[-1]])*8;
1736 break;
1737
1738 case '(':
1739 {
1740 int old_options = options;
1741 int push_option = 0;
1742 int casefold = 0;
1743
1744 PATFETCH(c);
1745 if (c == '?') {
1746 int negative = 0;
1747
1748 PATFETCH_RAW(c);
1749 switch (c) {
1750 case 'x': case 'p': case 'm': case 'i': case '-':
1751 for (;;) {
1752 switch (c) {
1753 case '-':
1754 negative = 1;
1755 break;
1756
1757 case ':':
1758 case ')':
1759 break;
1760
1761 case 'x':
1762 if (negative)
1763 options &= ~RE_OPTION_EXTENDED;
1764 else
1765 options |= RE_OPTION_EXTENDED;
1766 break;
1767
1768 case 'p':
1769 if (negative) {
1770 if ((options&RE_OPTION_POSIXLINE) == RE_OPTION_POSIXLINE) {
1771 options &= ~RE_OPTION_POSIXLINE;
1772 }
1773 }
1774 else if ((options&RE_OPTION_POSIXLINE) != RE_OPTION_POSIXLINE) {
1775 options |= RE_OPTION_POSIXLINE;
1776 }
1777 push_option = 1;
1778 break;
1779
1780 case 'm':
1781 if (negative) {
1782 if (options&RE_OPTION_MULTILINE) {
1783 options &= ~RE_OPTION_MULTILINE;
1784 }
1785 }
1786 else if (!(options&RE_OPTION_MULTILINE)) {
1787 options |= RE_OPTION_MULTILINE;
1788 }
1789 push_option = 1;
1790 break;
1791
1792 case 'i':
1793 if (negative) {
1794 if (options&RE_OPTION_IGNORECASE) {
1795 options &= ~RE_OPTION_IGNORECASE;
1796 }
1797 }
1798 else if (!(options&RE_OPTION_IGNORECASE)) {
1799 options |= RE_OPTION_IGNORECASE;
1800 }
1801 casefold = 1;
1802 break;
1803
1804 default:
1805 FREE_AND_RETURN(stackb, "undefined (?...) inline option");
1806 }
1807 if (c == ')') {
1808 c = '#'; /* read whole in-line options */
1809 break;
1810 }
1811 if (c == ':') break;
1812 PATFETCH_RAW(c);
1813 }
1814 break;
1815
1816 case '#':
1817 for (;;) {
1818 PATFETCH(c);
1819 if (c == ')') break;
1820 }
1821 c = '#';
1822 break;
1823
1824 case ':':
1825 case '=':
1826 case '!':
1827 case '>':
1828 break;
1829
1830 default:
1831 FREE_AND_RETURN(stackb, "undefined (?...) sequence");
1832 }
1833 }
1834 else {
1835 PATUNFETCH;
1836 c = '(';
1837 }
1838 if (c == '#') {
1839 if (push_option) {
1840 BUFPUSH(option_set);
1841 BUFPUSH(options);
1842 }
1843 if (casefold) {
1844 if (options & RE_OPTION_IGNORECASE)
1845 BUFPUSH(casefold_on);
1846 else
1847 BUFPUSH(casefold_off);
1848 }
1849 break;
1850 }
1851 if (stackp+8 >= stacke) {
1852 DOUBLE_STACK(int);
1853 }
1854
1855 /* Laststart should point to the start_memory that we are about
1856 to push (unless the pattern has RE_NREGS or more ('s). */
1857 /* obsolete: now RE_NREGS is just a default register size. */
1858 *stackp++ = b - bufp->buffer;
1859 *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1860 *stackp++ = begalt - bufp->buffer;
1861 switch (c) {
1862 case '(':
1863 BUFPUSH(start_memory);
1864 BUFPUSH(regnum);
1865 *stackp++ = regnum++;
1866 *stackp++ = b - bufp->buffer;
1867 BUFPUSH(0);
1868 /* too many ()'s to fit in a byte. (max 254) */
1869 if (regnum >= RE_REG_MAX) goto too_big;
1870 break;
1871
1872 case '=':
1873 case '!':
1874 case '>':
1875 BUFPUSH(start_nowidth);
1876 *stackp++ = b - bufp->buffer;
1877 BUFPUSH(0); /* temporary value */
1878 BUFPUSH(0);
1879 if (c != '!') break;
1880
1881 BUFPUSH(on_failure_jump);
1882 *stackp++ = b - bufp->buffer;
1883 BUFPUSH(0); /* temporary value */
1884 BUFPUSH(0);
1885 break;
1886
1887 case ':':
1888 BUFPUSH(start_paren);
1889 pending_exact = 0;
1890 default:
1891 break;
1892 }
1893 if (push_option) {
1894 BUFPUSH(option_set);
1895 BUFPUSH(options);
1896 }
1897 if (casefold) {
1898 if (options & RE_OPTION_IGNORECASE)
1899 BUFPUSH(casefold_on);
1900 else
1901 BUFPUSH(casefold_off);
1902 }
1903 *stackp++ = c;
1904 *stackp++ = old_options;
1905 fixup_alt_jump = 0;
1906 laststart = 0;
1907 begalt = b;
1908 }
1909 break;
1910
1911 case ')':
1912 if (stackp == stackb)
1913 FREE_AND_RETURN(stackb, "unmatched )");
1914
1915 pending_exact = 0;
1916 if (fixup_alt_jump) {
1917 /* Push a dummy failure point at the end of the
1918 alternative for a possible future
1919 `finalize_jump' to pop. See comments at
1920 `push_dummy_failure' in `nmz_re_match'. */
1921 BUFPUSH(push_dummy_failure);
1922
1923 /* We allocated space for this jump when we assigned
1924 to `fixup_alt_jump', in the `handle_alt' case below. */
1925 store_jump(fixup_alt_jump, jump, b);
1926 }
1927 if (options != stackp[-1]) {
1928 if ((options ^ stackp[-1]) & RE_OPTION_IGNORECASE) {
1929 BUFPUSH((options&RE_OPTION_IGNORECASE)?casefold_off:casefold_on);
1930 }
1931 if ((options ^ stackp[-1]) != RE_OPTION_IGNORECASE) {
1932 BUFPUSH(option_set);
1933 BUFPUSH(stackp[-1]);
1934 }
1935 }
1936 p0 = b;
1937 options = *--stackp;
1938 switch (c = *--stackp) {
1939 case '(':
1940 {
1941 char *loc = bufp->buffer + *--stackp;
1942 *loc = regnum - stackp[-1];
1943 BUFPUSH(stop_memory);
1944 BUFPUSH(stackp[-1]);
1945 BUFPUSH(regnum - stackp[-1]);
1946 stackp--;
1947 }
1948 break;
1949
1950 case '!':
1951 BUFPUSH(pop_and_fail);
1952 /* back patch */
1953 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1954 stackp--;
1955 /* fall through */
1956 case '=':
1957 BUFPUSH(stop_nowidth);
1958 /* tell stack-pos place to start_nowidth */
1959 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1960 BUFPUSH(0); /* space to hold stack pos */
1961 BUFPUSH(0);
1962 stackp--;
1963 break;
1964
1965 case '>':
1966 BUFPUSH(stop_backtrack);
1967 /* tell stack-pos place to start_nowidth */
1968 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1969 BUFPUSH(0); /* space to hold stack pos */
1970 BUFPUSH(0);
1971 stackp--;
1972 break;
1973
1974 case ':':
1975 BUFPUSH(stop_paren);
1976 break;
1977
1978 default:
1979 break;
1980 }
1981 begalt = *--stackp + bufp->buffer;
1982 stackp--;
1983 fixup_alt_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
1984 laststart = *--stackp + bufp->buffer;
1985 if (c == '!' || c == '=') laststart = b;
1986 break;
1987
1988 case '|':
1989 /* Insert before the previous alternative a jump which
1990 jumps to this alternative if the former fails. */
1991 GET_BUFFER_SPACE(3);
1992 insert_jump(on_failure_jump, begalt, b + 6, b);
1993 pending_exact = 0;
1994 b += 3;
1995 /* The alternative before this one has a jump after it
1996 which gets executed if it gets matched. Adjust that
1997 jump so it will jump to this alternative's analogous
1998 jump (put in below, which in turn will jump to the next
1999 (if any) alternative's such jump, etc.). The last such
2000 jump jumps to the correct final destination. A picture:
2001 _____ _____
2002 | | | |
2003 | v | v
2004 a | b | c
2005
2006 If we are at `b', then fixup_alt_jump right now points to a
2007 three-byte space after `a'. We'll put in the jump, set
2008 fixup_alt_jump to right after `b', and leave behind three
2009 bytes which we'll fill in when we get to after `c'. */
2010
2011 if (fixup_alt_jump)
2012 store_jump(fixup_alt_jump, jump_past_alt, b);
2013
2014 /* Mark and leave space for a jump after this alternative,
2015 to be filled in later either by next alternative or
2016 when know we're at the end of a series of alternatives. */
2017 fixup_alt_jump = b;
2018 GET_BUFFER_SPACE(3);
2019 b += 3;
2020
2021 laststart = 0;
2022 begalt = b;
2023 break;
2024
2025 case '{':
2026 /* If there is no previous pattern, this is an invalid pattern. */
2027 if (!laststart) {
2028 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
2029 "invalid regular expression; there's no previous pattern, to which '{' would define cardinality at %d",
2030 p-pattern);
2031 FREE_AND_RETURN(stackb, error_msg);
2032 }
2033 if( p == pend)
2034 FREE_AND_RETURN(stackb, "invalid regular expression; '{' can't be last character" );
2035
2036 beg_interval = p - 1;
2037
2038 lower_bound = -1; /* So can see if are set. */
2039 upper_bound = -1;
2040 GET_UNSIGNED_NUMBER(lower_bound);
2041 if (c == ',') {
2042 GET_UNSIGNED_NUMBER(upper_bound);
2043 }
2044 else
2045 /* Interval such as `{1}' => match exactly once. */
2046 upper_bound = lower_bound;
2047
2048 if (lower_bound < 0 || c != '}')
2049 goto unfetch_interval;
2050
2051 if (lower_bound >= RE_DUP_MAX || upper_bound >= RE_DUP_MAX)
2052 FREE_AND_RETURN(stackb, "too big quantifier in {,}");
2053 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2054 if (lower_bound > upper_bound)
2055 FREE_AND_RETURN(stackb, "can't do {n,m} with n > m");
2056
2057 beg_interval = 0;
2058 pending_exact = 0;
2059
2060 greedy = 1;
2061 if (p != pend) {
2062 PATFETCH(c);
2063 if (c == '?') greedy = 0;
2064 else PATUNFETCH;
2065 }
2066
2067 if (lower_bound == 0) {
2068 zero_times_ok = 1;
2069 if (upper_bound == RE_DUP_MAX) {
2070 many_times_ok = 1;
2071 goto repeat;
2072 }
2073 if (upper_bound == 1) {
2074 many_times_ok = 0;
2075 goto repeat;
2076 }
2077 }
2078 if (lower_bound == 1) {
2079 if (upper_bound == 1) {
2080 /* No need to repeat */
2081 break;
2082 }
2083 if (upper_bound == RE_DUP_MAX) {
2084 many_times_ok = 1;
2085 zero_times_ok = 0;
2086 goto repeat;
2087 }
2088 }
2089
2090 /* If upper_bound is zero, don't want to succeed at all;
2091 jump from laststart to b + 3, which will be the end of
2092 the buffer after this jump is inserted. */
2093
2094 if (upper_bound == 0) {
2095 GET_BUFFER_SPACE(3);
2096 insert_jump(jump, laststart, b + 3, b);
2097 b += 3;
2098 break;
2099 }
2100
2101 /* If lower_bound == upper_bound, repeat count can be removed */
2102 if (lower_bound == upper_bound) {
2103 int mcnt;
2104 int skip_stop_paren = 0;
2105
2106 if (b[-1] == stop_paren) {
2107 skip_stop_paren = 1;
2108 b--;
2109 }
2110
2111 if (*laststart == exactn && laststart[1]+2 == b - laststart
2112 && laststart[1]*lower_bound < 256) {
2113 mcnt = laststart[1];
2114 GET_BUFFER_SPACE((lower_bound-1)*mcnt);
2115 laststart[1] = lower_bound*mcnt;
2116 while (--lower_bound) {
2117 memcpy(b, laststart+2, mcnt);
2118 b += mcnt;
2119 }
2120 if (skip_stop_paren) BUFPUSH(stop_paren);
2121 break;
2122 }
2123
2124 if (lower_bound < 5 && b - laststart < 10) {
2125 /* 5 and 10 are the magic numbers */
2126
2127 mcnt = b - laststart;
2128 GET_BUFFER_SPACE((lower_bound-1)*mcnt);
2129 while (--lower_bound) {
2130 memcpy(b, laststart, mcnt);
2131 b += mcnt;
2132 }
2133 if (skip_stop_paren) BUFPUSH(stop_paren);
2134 break;
2135 }
2136 if (skip_stop_paren) b++; /* push back stop_paren */
2137 }
2138
2139 /* Otherwise, we have a nontrivial interval. When
2140 we're all done, the pattern will look like:
2141 set_number_at <jump count> <upper bound>
2142 set_number_at <succeed_n count> <lower bound>
2143 succeed_n <after jump addr> <succed_n count>
2144 <body of loop>
2145 jump_n <succeed_n addr> <jump count>
2146 (The upper bound and `jump_n' are omitted if
2147 `upper_bound' is 1, though.) */
2148 { /* If the upper bound is > 1, we need to insert
2149 more at the end of the loop. */
2150 unsigned nbytes = upper_bound == 1 ? 10 : 20;
2151
2152 GET_BUFFER_SPACE(nbytes);
2153 /* Initialize lower bound of the `succeed_n', even
2154 though it will be set during matching by its
2155 attendant `set_number_at' (inserted next),
2156 because `nmz_re_compile_fastmap' needs to know.
2157 Jump to the `jump_n' we might insert below. */
2158 insert_jump_n(succeed_n, laststart, b + (nbytes/2),
2159 b, lower_bound);
2160 b += 5; /* Just increment for the succeed_n here. */
2161
2162 /* Code to initialize the lower bound. Insert
2163 before the `succeed_n'. The `5' is the last two
2164 bytes of this `set_number_at', plus 3 bytes of
2165 the following `succeed_n'. */
2166 insert_op_2(set_number_at, laststart, b, 5, lower_bound);
2167 b += 5;
2168
2169 if (upper_bound > 1) {
2170 /* More than one repetition is allowed, so
2171 append a backward jump to the `succeed_n'
2172 that starts this interval.
2173
2174 When we've reached this during matching,
2175 we'll have matched the interval once, so
2176 jump back only `upper_bound - 1' times. */
2177 GET_BUFFER_SPACE(5);
2178 store_jump_n(b, greedy?jump_n:finalize_push_n, laststart + 5,
2179 upper_bound - 1);
2180 b += 5;
2181
2182 /* The location we want to set is the second
2183 parameter of the `jump_n'; that is `b-2' as
2184 an absolute address. `laststart' will be
2185 the `set_number_at' we're about to insert;
2186 `laststart+3' the number to set, the source
2187 for the relative address. But we are
2188 inserting into the middle of the pattern --
2189 so everything is getting moved up by 5.
2190 Conclusion: (b - 2) - (laststart + 3) + 5,
2191 i.e., b - laststart.
2192
2193 We insert this at the beginning of the loop
2194 so that if we fail during matching, we'll
2195 reinitialize the bounds. */
2196 insert_op_2(set_number_at, laststart, b, b - laststart,
2197 upper_bound - 1);
2198 b += 5;
2199 }
2200 }
2201 break;
2202
2203 unfetch_interval:
2204 /* If an invalid interval, match the characters as literals. */
2205 p = beg_interval;
2206 beg_interval = 0;
2207
2208 /* normal_char and normal_backslash need `c'. */
2209 PATFETCH(c);
2210 goto normal_char;
2211
2212 case '\\':
2213 if (p == pend)
2214 FREE_AND_RETURN(stackb, "invalid regular expression; '\\' can't be last character");
2215 /* Do not translate the character after the \, so that we can
2216 distinguish, e.g., \B from \b, even if we normally would
2217 translate, e.g., B to b. */
2218 PATFETCH_RAW(c);
2219 switch (c) {
2220 case 's':
2221 case 'S':
2222 case 'd':
2223 case 'D':
2224 while (b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH
2225 > bufp->allocated)
2226 EXTEND_BUFFER;
2227
2228 laststart = b;
2229 if (c == 's' || c == 'd') {
2230 BUFPUSH(charset);
2231 }
2232 else {
2233 BUFPUSH(charset_not);
2234 }
2235
2236 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2237 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
2238 if (c == 's' || c == 'S') {
2239 SET_LIST_BIT(' ');
2240 SET_LIST_BIT('\t');
2241 SET_LIST_BIT('\n');
2242 SET_LIST_BIT('\r');
2243 SET_LIST_BIT('\f');
2244 }
2245 else {
2246 char cc;
2247
2248 for (cc = '0'; cc <= '9'; cc++) {
2249 SET_LIST_BIT(cc);
2250 }
2251 }
2252
2253 while ((int)b[-1] > 0 && b[b[-1] - 1] == 0)
2254 b[-1]--;
2255 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
2256 memmove(&b[(int)b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
2257 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8);
2258 b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(int)b[-1]])*8;
2259 break;
2260
2261 case 'w':
2262 laststart = b;
2263 BUFPUSH(wordchar);
2264 break;
2265
2266 case 'W':
2267 laststart = b;
2268 BUFPUSH(notwordchar);
2269 break;
2270
2271 #ifndef RUBY
2272 case '<':
2273 BUFPUSH(wordbeg);
2274 break;
2275
2276 case '>':
2277 BUFPUSH(wordend);
2278 break;
2279 #endif
2280
2281 case 'b':
2282 BUFPUSH(wordbound);
2283 break;
2284
2285 case 'B':
2286 BUFPUSH(notwordbound);
2287 break;
2288
2289 case 'A':
2290 BUFPUSH(begbuf);
2291 break;
2292
2293 case 'Z':
2294 if ((bufp->options & RE_OPTION_SINGLELINE) == 0) {
2295 BUFPUSH(endbuf2);
2296 break;
2297 }
2298 /* fall through */
2299 case 'z':
2300 BUFPUSH(endbuf);
2301 break;
2302
2303 case 'G':
2304 BUFPUSH(begpos);
2305 break;
2306
2307 /* hex */
2308 case 'x':
2309 had_mbchar = 0;
2310 c = nmz_scan_hex(p, 2, &numlen);
2311 p += numlen;
2312 had_num_literal = 1;
2313 goto numeric_char;
2314
2315 /* octal */
2316 case '0':
2317 had_mbchar = 0;
2318 c = nmz_scan_oct(p, 3, &numlen);
2319 p += numlen;
2320 had_num_literal = 1;
2321 goto numeric_char;
2322
2323 /* back-ref or octal */
2324 case '1': case '2': case '3':
2325 case '4': case '5': case '6':
2326 case '7': case '8': case '9':
2327 PATUNFETCH;
2328 p0 = p;
2329
2330 had_mbchar = 0;
2331 c1 = 0;
2332 GET_UNSIGNED_NUMBER(c1);
2333 if (!ISDIGIT(c)) PATUNFETCH;
2334
2335 if (9 < c1 && c1 >= regnum) {
2336 /* need to get octal */
2337 c = nmz_scan_oct(p0, 3, &numlen) & 0xff;
2338 p = p0 + numlen;
2339 c1 = 0;
2340 had_num_literal = 1;
2341 goto numeric_char;
2342 }
2343
2344 laststart = b;
2345 BUFPUSH(duplicate);
2346 BUFPUSH(c1);
2347 break;
2348
2349 case 'M':
2350 case 'C':
2351 case 'c':
2352 p0 = --p;
2353 c = read_special(p, pend, &p0);
2354 if (c > 255) goto invalid_escape;
2355 p = p0;
2356 had_num_literal = 1;
2357 goto numeric_char;
2358
2359 default:
2360 c = read_backslash(c);
2361 goto normal_char;
2362 }
2363 break;
2364
2365 case '#':
2366 if (options & RE_OPTION_EXTENDED) {
2367 while (p != pend) {
2368 PATFETCH(c);
2369 if (c == '\n') break;
2370 }
2371 break;
2372 }
2373 goto normal_char;
2374
2375 case ' ':
2376 case '\t':
2377 case '\f':
2378 case '\r':
2379 case '\n':
2380 if (options & RE_OPTION_EXTENDED)
2381 break;
2382
2383 default:
2384 normal_char: /* Expects the character in `c'. */
2385 had_mbchar = 0;
2386 if (ismbchar(c)) {
2387 had_mbchar = 1;
2388 c1 = p - pattern;
2389 }
2390 numeric_char:
2391 nextp = p + mbclen(c) - 1;
2392 if (!pending_exact || pending_exact + *pending_exact + 1 != b
2393 || *pending_exact >= (c1 ? 0176 : 0177)
2394 || *nextp == '+' || *nextp == '?'
2395 || *nextp == '*' || *nextp == '^'
2396 || *nextp == '{') {
2397 laststart = b;
2398 BUFPUSH(exactn);
2399 pending_exact = b;
2400 BUFPUSH(0);
2401 }
2402 if (had_num_literal || c == 0xff) {
2403 BUFPUSH(0xff);
2404 (*pending_exact)++;
2405 had_num_literal = 0;
2406 }
2407 BUFPUSH(c);
2408 (*pending_exact)++;
2409 if (had_mbchar) {
2410 int len = mbclen(c) - 1;
2411 while (len--) {
2412 PATFETCH_RAW(c);
2413 BUFPUSH(c);
2414 (*pending_exact)++;
2415 }
2416 }
2417 }
2418 }
2419
2420 if (fixup_alt_jump)
2421 store_jump(fixup_alt_jump, jump, b);
2422
2423 if (stackp != stackb)
2424 FREE_AND_RETURN(stackb, "unmatched (");
2425
2426 /* set optimize flags */
2427 laststart = bufp->buffer;
2428 if (laststart != b) {
2429 if (*laststart == start_memory) laststart += 3;
2430 if (*laststart == dummy_failure_jump) laststart += 3;
2431 else if (*laststart == try_next) laststart += 3;
2432 if (*laststart == anychar_repeat) {
2433 bufp->options |= RE_OPTIMIZE_ANCHOR;
2434 }
2435 else if (*laststart == on_failure_jump) {
2436 int mcnt;
2437
2438 laststart++;
2439 EXTRACT_NUMBER_AND_INCR(mcnt, laststart);
2440 if (*laststart == charset || *laststart == charset_not) {
2441 p0 = laststart;
2442 mcnt = *++p0;
2443 p0 += mcnt+1;
2444 mcnt = EXTRACT_UNSIGNED_AND_INCR(p0);
2445 p0 += 8*mcnt;
2446 if (*p0 == maybe_finalize_jump) {
2447 bufp->stclass = laststart;
2448 }
2449 }
2450 }
2451 }
2452
2453 bufp->used = b - bufp->buffer;
2454 bufp->re_nsub = regnum;
2455 laststart = bufp->buffer;
2456 if (laststart != b) {
2457 if (*laststart == start_memory) laststart += 3;
2458 if (*laststart == exactn) {
2459 bufp->options |= RE_OPTIMIZE_EXACTN;
2460 bufp->must = laststart+1;
2461 }
2462 }
2463 if (!bufp->must) {
2464 bufp->must = calculate_must_string(bufp->buffer, b);
2465 }
2466 if (current_mbctype == MBCTYPE_SJIS) bufp->options |= RE_OPTIMIZE_NO_BM;
2467 else if (bufp->must) {
2468 int i;
2469 int len = (unsigned char)bufp->must[0];
2470
2471 for (i=1; i<len; i++) {
2472 if ((unsigned char)bufp->must[i] == 0xff ||
2473 (current_mbctype && ismbchar(bufp->must[i]))) {
2474 bufp->options |= RE_OPTIMIZE_NO_BM;
2475 break;
2476 }
2477 }
2478 if (!(bufp->options & RE_OPTIMIZE_NO_BM)) {
2479 bufp->must_skip = (int *) nmz_xmalloc((1 << BYTEWIDTH)*sizeof(int));
2480 bm_init_skip(bufp->must_skip, (unsigned char*)bufp->must+1,
2481 (unsigned char)bufp->must[0],
2482 (unsigned char*)(MAY_TRANSLATE()?translate:0));
2483 }
2484 }
2485
2486 bufp->regstart = TMALLOC(regnum, unsigned char*);
2487 bufp->regend = TMALLOC(regnum, unsigned char*);
2488 bufp->old_regstart = TMALLOC(regnum, unsigned char*);
2489 bufp->old_regend = TMALLOC(regnum, unsigned char*);
2490 bufp->reg_info = TMALLOC(regnum, register_info_type);
2491 bufp->best_regstart = TMALLOC(regnum, unsigned char*);
2492 bufp->best_regend = TMALLOC(regnum, unsigned char*);
2493 FREE_AND_RETURN(stackb, 0);
2494
2495 invalid_pattern:
2496 FREE_AND_RETURN(stackb, "invalid regular expression");
2497
2498 end_of_pattern:
2499 FREE_AND_RETURN(stackb, "premature end of regular expression");
2500
2501 too_big:
2502 FREE_AND_RETURN(stackb, "regular expression too big");
2503
2504 memory_exhausted:
2505 FREE_AND_RETURN(stackb, "memory exhausted");
2506
2507 nested_meta:
2508 FREE_AND_RETURN(stackb, "nested *?+ in regexp");
2509
2510 invalid_escape:
2511 FREE_AND_RETURN(stackb, "Invalid escape character syntax");
2512 }
2513
2514 void
nmz_re_free_pattern(bufp)2515 nmz_re_free_pattern(bufp)
2516 struct re_pattern_buffer *bufp;
2517 {
2518 xfree(bufp->buffer);
2519 xfree(bufp->fastmap);
2520 if (bufp->must_skip) xfree(bufp->must_skip);
2521
2522 xfree(bufp->regstart);
2523 xfree(bufp->regend);
2524 xfree(bufp->old_regstart);
2525 xfree(bufp->old_regend);
2526 xfree(bufp->best_regstart);
2527 xfree(bufp->best_regend);
2528 xfree(bufp->reg_info);
2529 xfree(bufp);
2530 }
2531
2532 /* Store a jump of the form <OPCODE> <relative address>.
2533 Store in the location FROM a jump operation to jump to relative
2534 address FROM - TO. OPCODE is the opcode to store. */
2535
2536 static void
store_jump(from,opcode,to)2537 store_jump(from, opcode, to)
2538 char *from, *to;
2539 int opcode;
2540 {
2541 from[0] = (char)opcode;
2542 STORE_NUMBER(from + 1, to - (from + 3));
2543 }
2544
2545
2546 /* Open up space before char FROM, and insert there a jump to TO.
2547 CURRENT_END gives the end of the storage not in use, so we know
2548 how much data to copy up. OP is the opcode of the jump to insert.
2549
2550 If you call this function, you must zero out pending_exact. */
2551
2552 static void
insert_jump(op,from,to,current_end)2553 insert_jump(op, from, to, current_end)
2554 int op;
2555 char *from, *to, *current_end;
2556 {
2557 register char *pfrom = current_end; /* Copy from here... */
2558 register char *pto = current_end + 3; /* ...to here. */
2559
2560 while (pfrom != from)
2561 *--pto = *--pfrom;
2562 store_jump(from, op, to);
2563 }
2564
2565
2566 /* Store a jump of the form <opcode> <relative address> <n> .
2567
2568 Store in the location FROM a jump operation to jump to relative
2569 address FROM - TO. OPCODE is the opcode to store, N is a number the
2570 jump uses, say, to decide how many times to jump.
2571
2572 If you call this function, you must zero out pending_exact. */
2573
2574 static void
store_jump_n(from,opcode,to,n)2575 store_jump_n(from, opcode, to, n)
2576 char *from, *to;
2577 int opcode;
2578 unsigned n;
2579 {
2580 from[0] = (char)opcode;
2581 STORE_NUMBER(from + 1, to - (from + 3));
2582 STORE_NUMBER(from + 3, n);
2583 }
2584
2585
2586 /* Similar to insert_jump, but handles a jump which needs an extra
2587 number to handle minimum and maximum cases. Open up space at
2588 location FROM, and insert there a jump to TO. CURRENT_END gives the
2589 end of the storage in use, so we know how much data to copy up. OP is
2590 the opcode of the jump to insert.
2591
2592 If you call this function, you must zero out pending_exact. */
2593
2594 static void
insert_jump_n(op,from,to,current_end,n)2595 insert_jump_n(op, from, to, current_end, n)
2596 int op;
2597 char *from, *to, *current_end;
2598 unsigned n;
2599 {
2600 register char *pfrom = current_end; /* Copy from here... */
2601 register char *pto = current_end + 5; /* ...to here. */
2602
2603 while (pfrom != from)
2604 *--pto = *--pfrom;
2605 store_jump_n(from, op, to, n);
2606 }
2607
2608
2609 /* Open up space at location THERE, and insert operation OP.
2610 CURRENT_END gives the end of the storage in use, so
2611 we know how much data to copy up.
2612
2613 If you call this function, you must zero out pending_exact. */
2614
2615 #if 0
2616 static void
2617 insert_op(op, there, current_end)
2618 int op;
2619 char *there, *current_end;
2620 {
2621 register char *pfrom = current_end; /* Copy from here... */
2622 register char *pto = current_end + 1; /* ...to here. */
2623
2624 while (pfrom != there)
2625 *--pto = *--pfrom;
2626
2627 there[0] = (char)op;
2628 }
2629 #endif
2630
2631
2632 /* Open up space at location THERE, and insert operation OP followed by
2633 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
2634 we know how much data to copy up.
2635
2636 If you call this function, you must zero out pending_exact. */
2637
2638 static void
insert_op_2(op,there,current_end,num_1,num_2)2639 insert_op_2(op, there, current_end, num_1, num_2)
2640 int op;
2641 char *there, *current_end;
2642 int num_1, num_2;
2643 {
2644 register char *pfrom = current_end; /* Copy from here... */
2645 register char *pto = current_end + 5; /* ...to here. */
2646
2647 while (pfrom != there)
2648 *--pto = *--pfrom;
2649
2650 there[0] = (char)op;
2651 STORE_NUMBER(there + 1, num_1);
2652 STORE_NUMBER(there + 3, num_2);
2653 }
2654
2655
2656 #define trans_eq(c1, c2, translate) (translate?(translate[c1]==translate[c2]):((c1)==(c2)))
2657 static int
slow_match(little,lend,big,bend,translate)2658 slow_match(little, lend, big, bend, translate)
2659 unsigned char *little, *lend;
2660 unsigned char *big, *bend;
2661 unsigned char *translate;
2662 {
2663 int c;
2664
2665 while (little < lend && big < bend) {
2666 c = *little++;
2667 if (c == 0xff)
2668 c = *little++;
2669 if (!trans_eq(*big++, c, translate)) break;
2670 }
2671 if (little == lend) return 1;
2672 return 0;
2673 }
2674
2675 static int
slow_search(little,llen,big,blen,translate)2676 slow_search(little, llen, big, blen, translate)
2677 unsigned char *little;
2678 int llen;
2679 unsigned char *big;
2680 int blen;
2681 char *translate;
2682 {
2683 unsigned char *bsave = big;
2684 unsigned char *bend = big + blen;
2685 register int c;
2686 int fescape = 0;
2687
2688 c = *little;
2689 if (c == 0xff) {
2690 c = little[1];
2691 fescape = 1;
2692 }
2693 else if (translate && !ismbchar(c)) {
2694 c = translate[c];
2695 }
2696
2697 while (big < bend) {
2698 /* look for first character */
2699 if (fescape) {
2700 while (big < bend) {
2701 if (*big == c) break;
2702 big++;
2703 }
2704 }
2705 else if (translate && !ismbchar(c)) {
2706 while (big < bend) {
2707 if (ismbchar(*big)) big+=mbclen(*big)-1;
2708 else if (translate[*big] == c) break;
2709 big++;
2710 }
2711 }
2712 else {
2713 while (big < bend) {
2714 if (*big == c) break;
2715 if (ismbchar(*big)) big+=mbclen(*big)-1;
2716 big++;
2717 }
2718 }
2719
2720 if (slow_match(little, little+llen, big, bend, translate))
2721 return big - bsave;
2722
2723 big+=mbclen(*big);
2724 }
2725 return -1;
2726 }
2727
2728 static void
bm_init_skip(skip,pat,m,translate)2729 bm_init_skip(skip, pat, m, translate)
2730 int *skip;
2731 unsigned char *pat;
2732 int m;
2733 const unsigned char *translate;
2734 {
2735 int j, c;
2736
2737 for (c=0; c<256; c++) {
2738 skip[c] = m;
2739 }
2740 if (translate) {
2741 for (j=0; j<m-1; j++) {
2742 skip[translate[pat[j]]] = m-1-j;
2743 }
2744 }
2745 else {
2746 for (j=0; j<m-1; j++) {
2747 skip[pat[j]] = m-1-j;
2748 }
2749 }
2750 }
2751
2752 static int
bm_search(little,llen,big,blen,skip,translate)2753 bm_search(little, llen, big, blen, skip, translate)
2754 unsigned char *little;
2755 int llen;
2756 unsigned char *big;
2757 int blen;
2758 int *skip;
2759 unsigned char *translate;
2760 {
2761 int i, j, k;
2762
2763 i = llen-1;
2764 if (translate) {
2765 while (i < blen) {
2766 k = i;
2767 j = llen-1;
2768 while (j >= 0 && translate[big[k]] == translate[little[j]]) {
2769 k--;
2770 j--;
2771 }
2772 if (j < 0) return k+1;
2773
2774 i += skip[translate[big[i]]];
2775 }
2776 return -1;
2777 }
2778 while (i < blen) {
2779 k = i;
2780 j = llen-1;
2781 while (j >= 0 && big[k] == little[j]) {
2782 k--;
2783 j--;
2784 }
2785 if (j < 0) return k+1;
2786
2787 i += skip[big[i]];
2788 }
2789 return -1;
2790 }
2791
2792 /* Given a pattern, compute a fastmap from it. The fastmap records
2793 which of the (1 << BYTEWIDTH) possible characters can start a string
2794 that matches the pattern. This fastmap is used by nmz_re_search to skip
2795 quickly over totally implausible text.
2796
2797 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2798 area as bufp->fastmap.
2799 The other components of bufp describe the pattern to be used. */
2800 void
nmz_re_compile_fastmap(bufp)2801 nmz_re_compile_fastmap(bufp)
2802 struct re_pattern_buffer *bufp;
2803 {
2804 unsigned char *pattern = (unsigned char*)bufp->buffer;
2805 int size = bufp->used;
2806 register char *fastmap = bufp->fastmap;
2807 register unsigned char *p = pattern;
2808 register unsigned char *pend = pattern + size;
2809 register int j, k;
2810 unsigned is_a_succeed_n;
2811
2812
2813 unsigned char *stacka[NFAILURES];
2814 unsigned char **stackb = stacka;
2815 unsigned char **stackp = stackb;
2816 unsigned char **stacke = stackb + NFAILURES;
2817 int options = bufp->options;
2818
2819 memset(fastmap, 0, (1 << BYTEWIDTH));
2820 bufp->fastmap_accurate = 1;
2821 bufp->can_be_null = 0;
2822
2823 while (p) {
2824 is_a_succeed_n = 0;
2825 if (p == pend) {
2826 bufp->can_be_null = 1;
2827 break;
2828 }
2829 #ifdef SWITCH_ENUM_BUG
2830 switch ((int)((enum regexpcode)*p++))
2831 #else
2832 switch ((enum regexpcode)*p++)
2833 #endif
2834 {
2835 case exactn:
2836 if (p[1] == 0xff) {
2837 if (TRANSLATE_P())
2838 fastmap[translate[p[2]]] = 2;
2839 else
2840 fastmap[p[2]] = 2;
2841 bufp->options |= RE_OPTIMIZE_BMATCH;
2842 }
2843 else if (TRANSLATE_P())
2844 fastmap[translate[p[1]]] = 1;
2845 else
2846 fastmap[p[1]] = 1;
2847 break;
2848
2849 case begline:
2850 case begbuf:
2851 case endbuf:
2852 case endbuf2:
2853 case wordbound:
2854 case notwordbound:
2855 case wordbeg:
2856 case wordend:
2857 case pop_and_fail:
2858 case push_dummy_failure:
2859 case start_paren:
2860 case stop_paren:
2861 continue;
2862
2863 case casefold_on:
2864 bufp->options |= RE_MAY_IGNORECASE;
2865 case casefold_off:
2866 options ^= RE_OPTION_IGNORECASE;
2867 continue;
2868
2869 case option_set:
2870 options = *p++;
2871 continue;
2872
2873 case endline:
2874 if (TRANSLATE_P())
2875 fastmap[translate['\n']] = 1;
2876 else
2877 fastmap['\n'] = 1;
2878 if ((options & RE_OPTION_SINGLELINE) == 0 && bufp->can_be_null == 0)
2879 bufp->can_be_null = 2;
2880 break;
2881
2882 case jump_n:
2883 case finalize_jump:
2884 case maybe_finalize_jump:
2885 case jump:
2886 case jump_past_alt:
2887 case dummy_failure_jump:
2888 case finalize_push:
2889 case finalize_push_n:
2890 EXTRACT_NUMBER_AND_INCR(j, p);
2891 p += j;
2892 if (j > 0)
2893 continue;
2894 /* Jump backward reached implies we just went through
2895 the body of a loop and matched nothing.
2896 Opcode jumped to should be an on_failure_jump.
2897 Just treat it like an ordinary jump.
2898 For a * loop, it has pushed its failure point already;
2899 If so, discard that as redundant. */
2900
2901 if ((enum regexpcode)*p != on_failure_jump
2902 && (enum regexpcode)*p != try_next
2903 && (enum regexpcode)*p != succeed_n)
2904 continue;
2905 p++;
2906 EXTRACT_NUMBER_AND_INCR(j, p);
2907 p += j;
2908 if (stackp != stackb && *stackp == p)
2909 stackp--; /* pop */
2910 continue;
2911
2912 case try_next:
2913 case start_nowidth:
2914 case stop_nowidth:
2915 case stop_backtrack:
2916 p += 2;
2917 continue;
2918
2919 case succeed_n:
2920 is_a_succeed_n = 1;
2921 /* Get to the number of times to succeed. */
2922 EXTRACT_NUMBER(k, p + 2);
2923 /* Increment p past the n for when k != 0. */
2924 if (k != 0) {
2925 p += 4;
2926 continue;
2927 }
2928 /* fall through */
2929
2930 case on_failure_jump:
2931 EXTRACT_NUMBER_AND_INCR(j, p);
2932 if (p + j < pend) {
2933 if (stackp == stacke) {
2934 EXPAND_FAIL_STACK();
2935 }
2936 *++stackp = p + j; /* push */
2937 }
2938 else {
2939 bufp->can_be_null = 1;
2940 }
2941 if (is_a_succeed_n)
2942 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
2943 continue;
2944
2945 case set_number_at:
2946 p += 4;
2947 continue;
2948
2949 case start_memory:
2950 case stop_memory:
2951 p += 2;
2952 continue;
2953
2954 case duplicate:
2955 bufp->can_be_null = 1;
2956 fastmap['\n'] = 1;
2957 case anychar_repeat:
2958 case anychar:
2959 for (j = 0; j < (1 << BYTEWIDTH); j++) {
2960 if (j != '\n' || (options & RE_OPTION_MULTILINE))
2961 fastmap[j] = 1;
2962 }
2963 if (bufp->can_be_null) {
2964 FREE_AND_RETURN_VOID(stackb);
2965 }
2966 /* Don't return; check the alternative paths
2967 so we can set can_be_null if appropriate. */
2968 if ((enum regexpcode)p[-1] == anychar_repeat) {
2969 continue;
2970 }
2971 break;
2972
2973 case wordchar:
2974 for (j = 0; j < 0x80; j++) {
2975 if (SYNTAX(j) == Sword)
2976 fastmap[j] = 1;
2977 }
2978 switch (current_mbctype) {
2979 case MBCTYPE_ASCII:
2980 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2981 if (SYNTAX(j) == Sword2)
2982 fastmap[j] = 1;
2983 }
2984 break;
2985 case MBCTYPE_EUC:
2986 case MBCTYPE_SJIS:
2987 case MBCTYPE_UTF8:
2988 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2989 if (re_mbctab[j])
2990 fastmap[j] = 1;
2991 }
2992 break;
2993 }
2994 break;
2995
2996 case notwordchar:
2997 for (j = 0; j < 0x80; j++)
2998 if (SYNTAX(j) != Sword)
2999 fastmap[j] = 1;
3000 switch (current_mbctype) {
3001 case MBCTYPE_ASCII:
3002 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
3003 if (SYNTAX(j) != Sword2)
3004 fastmap[j] = 1;
3005 }
3006 break;
3007 case MBCTYPE_EUC:
3008 case MBCTYPE_SJIS:
3009 case MBCTYPE_UTF8:
3010 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
3011 if (!re_mbctab[j])
3012 fastmap[j] = 1;
3013 }
3014 break;
3015 }
3016 break;
3017
3018 case charset:
3019 /* NOTE: Charset for single-byte chars never contain
3020 multi-byte char. See set_list_bits(). */
3021 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3022 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) {
3023 int tmp = TRANSLATE_P()?translate[j]:j;
3024 fastmap[tmp] = 1;
3025 }
3026 {
3027 unsigned short size;
3028 unsigned long c, beg, end;
3029
3030 p += p[-1] + 2;
3031 size = EXTRACT_UNSIGNED(&p[-2]);
3032 for (j = 0; j < (int)size; j++) {
3033 c = EXTRACT_MBC(&p[j*8]);
3034 beg = WC2MBC1ST(c);
3035 c = EXTRACT_MBC(&p[j*8+4]);
3036 end = WC2MBC1ST(c);
3037 /* set bits for 1st bytes of multi-byte chars. */
3038 while (beg <= end) {
3039 /* NOTE: Charset for multi-byte chars might contain
3040 single-byte chars. We must reject them. */
3041 if (c < 0x100) {
3042 fastmap[beg] = 2;
3043 bufp->options |= RE_OPTIMIZE_BMATCH;
3044 }
3045 else if (ismbchar(beg))
3046 fastmap[beg] = 1;
3047 beg++;
3048 }
3049 }
3050 }
3051 break;
3052
3053 case charset_not:
3054 /* S: set of all single-byte chars.
3055 M: set of all first bytes that can start multi-byte chars.
3056 s: any set of single-byte chars.
3057 m: any set of first bytes that can start multi-byte chars.
3058
3059 We assume S+M = U.
3060 ___ _ _
3061 s+m = (S*s+M*m). */
3062 /* Chars beyond end of map must be allowed */
3063 /* NOTE: Charset_not for single-byte chars might contain
3064 multi-byte chars. See set_list_bits(). */
3065 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3066 if (!ismbchar(j))
3067 fastmap[j] = 1;
3068
3069 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3070 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) {
3071 if (!ismbchar(j))
3072 fastmap[j] = 1;
3073 }
3074 {
3075 unsigned short size;
3076 unsigned long c, beg;
3077 int num_literal = 0;
3078
3079 p += p[-1] + 2;
3080 size = EXTRACT_UNSIGNED(&p[-2]);
3081 if (size == 0) {
3082 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3083 if (ismbchar(j))
3084 fastmap[j] = 1;
3085 break;
3086 }
3087 for (j = 0,c = 0;j < (int)size; j++) {
3088 unsigned int cc = EXTRACT_MBC(&p[j*8]);
3089 beg = WC2MBC1ST(cc);
3090 while (c <= beg) {
3091 if (ismbchar(c))
3092 fastmap[c] = 1;
3093 c++;
3094 }
3095
3096 cc = EXTRACT_MBC(&p[j*8+4]);
3097 if (cc < 0xff) {
3098 num_literal = 1;
3099 while (c <= cc) {
3100 if (ismbchar(c))
3101 fastmap[c] = 1;
3102 c++;
3103 }
3104 }
3105 c = WC2MBC1ST(cc);
3106 }
3107
3108 for (j = c; j < (1 << BYTEWIDTH); j++) {
3109 if (num_literal)
3110 fastmap[j] = 1;
3111 if (ismbchar(j))
3112 fastmap[j] = 1;
3113 }
3114 }
3115 break;
3116
3117 case begpos:
3118 case unused: /* pacify gcc -Wall */
3119 break;
3120 }
3121
3122 /* Get here means we have successfully found the possible starting
3123 characters of one path of the pattern. We need not follow this
3124 path any farther. Instead, look at the next alternative
3125 remembered in the stack. */
3126 if (stackp != stackb)
3127 p = *stackp--; /* pop */
3128 else
3129 break;
3130 }
3131 FREE_AND_RETURN_VOID(stackb);
3132 }
3133
3134 /* adjust startpos value to the position between characters. */
3135 int
nmz_re_adjust_startpos(bufp,string,size,startpos,range)3136 nmz_re_adjust_startpos(bufp, string, size, startpos, range)
3137 struct re_pattern_buffer *bufp;
3138 const char *string;
3139 int size, startpos, range;
3140 {
3141 /* Update the fastmap now if not correct already. */
3142 if (!bufp->fastmap_accurate) {
3143 nmz_re_compile_fastmap(bufp);
3144 }
3145
3146 /* Adjust startpos for mbc string */
3147 if (current_mbctype && startpos>0 && !(bufp->options&RE_OPTIMIZE_BMATCH)) {
3148 int i = 0;
3149
3150 if (range > 0) {
3151 while (i<size) {
3152 i += mbclen(string[i]);
3153 if (startpos <= i) {
3154 startpos = i;
3155 break;
3156 }
3157 }
3158 }
3159 else {
3160 int w;
3161
3162 while (i<size) {
3163 w = mbclen(string[i]);
3164 if (startpos < i + w) {
3165 startpos = i;
3166 break;
3167 }
3168 i += w;
3169 }
3170 }
3171 }
3172 return startpos;
3173 }
3174
3175
3176 /* Using the compiled pattern in BUFP->buffer, first tries to match
3177 STRING, starting first at index STARTPOS, then at STARTPOS + 1, and
3178 so on. RANGE is the number of places to try before giving up. If
3179 RANGE is negative, it searches backwards, i.e., the starting
3180 positions tried are STARTPOS, STARTPOS - 1, etc. STRING is of SIZE.
3181 In REGS, return the indices of STRING that matched the entire
3182 BUFP->buffer and its contained subexpressions.
3183
3184 The value returned is the position in the strings at which the match
3185 was found, or -1 if no match was found, or -2 if error (such as
3186 failure stack overflow). */
3187
3188 int
nmz_re_search(bufp,string,size,startpos,range,regs)3189 nmz_re_search(bufp, string, size, startpos, range, regs)
3190 struct re_pattern_buffer *bufp;
3191 const char *string;
3192 int size, startpos, range;
3193 struct re_registers *regs;
3194 {
3195 register char *fastmap = bufp->fastmap;
3196 int val, anchor = 0;
3197
3198 /* Check for out-of-range starting position. */
3199 if (startpos < 0 || startpos > size)
3200 return -1;
3201
3202 /* Update the fastmap now if not correct already. */
3203 if (fastmap && !bufp->fastmap_accurate) {
3204 nmz_re_compile_fastmap(bufp);
3205 }
3206
3207
3208 /* If the search isn't to be a backwards one, don't waste time in a
3209 search for a pattern that must be anchored. */
3210 if (bufp->used > 0) {
3211 switch ((enum regexpcode)bufp->buffer[0]) {
3212 case begbuf:
3213 begbuf_match:
3214 if (range > 0) {
3215 if (startpos > 0) return -1;
3216 else {
3217 val = nmz_re_match(bufp, string, size, 0, regs);
3218 if (val >= 0) return 0;
3219 return val;
3220 }
3221 }
3222 break;
3223
3224 case begline:
3225 anchor = 1;
3226 break;
3227
3228 case begpos:
3229 val = nmz_re_match(bufp, string, size, startpos, regs);
3230 if (val >= 0) return startpos;
3231 return val;
3232
3233 default:
3234 break;
3235 }
3236 }
3237 if (bufp->options & RE_OPTIMIZE_ANCHOR) {
3238 if (bufp->options&RE_OPTION_SINGLELINE) {
3239 goto begbuf_match;
3240 }
3241 anchor = 1;
3242 }
3243
3244 if (bufp->must) {
3245 int len = ((unsigned char*)bufp->must)[0];
3246 int pos, pbeg, pend;
3247
3248 pbeg = startpos;
3249 pend = startpos + range;
3250 if (pbeg > pend) { /* swap pbeg,pend */
3251 pos = pend; pend = pbeg; pbeg = pos;
3252 }
3253 pend = size;
3254 if (bufp->options & RE_OPTIMIZE_NO_BM) {
3255 pos = slow_search(bufp->must+1, len,
3256 string+pbeg, pend-pbeg,
3257 MAY_TRANSLATE()?translate:0);
3258 }
3259 else {
3260 pos = bm_search(bufp->must+1, len,
3261 string+pbeg, pend-pbeg,
3262 bufp->must_skip,
3263 MAY_TRANSLATE()?translate:0);
3264 }
3265 if (pos == -1) return -1;
3266 if (range > 0 && (bufp->options & RE_OPTIMIZE_EXACTN)) {
3267 startpos += pos;
3268 range -= pos;
3269 if (range < 0) return -1;
3270 }
3271 }
3272
3273 for (;;) {
3274 /* If a fastmap is supplied, skip quickly over characters that
3275 cannot possibly be the start of a match. Note, however, that
3276 if the pattern can possibly match the null string, we must
3277 test it at each starting point so that we take the first null
3278 string we get. */
3279
3280 if (fastmap && startpos < size
3281 && bufp->can_be_null != 1 && !(anchor && startpos == 0)) {
3282 if (range > 0) { /* Searching forwards. */
3283 register unsigned char *p, c;
3284 int irange = range;
3285
3286 p = (unsigned char*)string+startpos;
3287
3288 while (range > 0) {
3289 c = *p++;
3290 if (ismbchar(c)) {
3291 int len;
3292
3293 if (fastmap[c])
3294 break;
3295 len = mbclen(c) - 1;
3296 while (len--) {
3297 c = *p++;
3298 range--;
3299 if (fastmap[c] == 2)
3300 goto startpos_adjust;
3301 }
3302 }
3303 else {
3304 if (fastmap[MAY_TRANSLATE() ? translate[c] : c])
3305 break;
3306 }
3307 range--;
3308 }
3309 startpos_adjust:
3310 startpos += irange - range;
3311 }
3312 else { /* Searching backwards. */
3313 register unsigned char c;
3314
3315 c = string[startpos];
3316 c &= 0xff;
3317 if (MAY_TRANSLATE() ? !fastmap[translate[c]] : !fastmap[c])
3318 goto advance;
3319 }
3320 }
3321
3322 if (startpos > size) return -1;
3323 if ((anchor || !bufp->can_be_null) && range > 0 && size > 0 && startpos == size)
3324 return -1;
3325 val = nmz_re_match(bufp, string, size, startpos, regs);
3326 if (val >= 0) return startpos;
3327 if (val == -2) return -2;
3328
3329 #ifndef NO_ALLOCA
3330 #ifdef C_ALLOCA
3331 alloca(0);
3332 #endif /* C_ALLOCA */
3333 #endif /* NO_ALLOCA */
3334
3335 if (range > 0) {
3336 if (anchor && startpos < size &&
3337 (startpos < 1 || string[startpos-1] != '\n')) {
3338 while (range > 0 && string[startpos] != '\n') {
3339 range--;
3340 startpos++;
3341 }
3342 }
3343 else if (fastmap && (bufp->stclass)) {
3344 register unsigned char *p;
3345 unsigned long c;
3346 int irange = range;
3347
3348 p = (unsigned char*)string+startpos;
3349 while (range > 0) {
3350 c = *p++;
3351 if (ismbchar(c) && fastmap[c] != 2) {
3352 MBC2WC(c, p);
3353 }
3354 else if (MAY_TRANSLATE())
3355 c = translate[c];
3356 if (*bufp->stclass == charset) {
3357 if (!is_in_list(c, bufp->stclass+1)) break;
3358 }
3359 else {
3360 if (is_in_list(c, bufp->stclass+1)) break;
3361 }
3362 range--;
3363 if (c > 256) range--;
3364 }
3365 startpos += irange - range;
3366 }
3367 }
3368
3369 advance:
3370 if (!range)
3371 break;
3372 else if (range > 0) {
3373 const char *d = string + startpos;
3374
3375 if (ismbchar(*d)) {
3376 int len = mbclen(*d) - 1;
3377 range-=len, startpos+=len;
3378 if (!range)
3379 break;
3380 }
3381 range--, startpos++;
3382 }
3383 else {
3384 range++, startpos--;
3385 {
3386 const char *s, *d, *p;
3387
3388 s = string; d = string + startpos;
3389 for (p = d; p-- > s && ismbchar(*p); )
3390 /* --p >= s would not work on 80[12]?86.
3391 (when the offset of s equals 0 other than huge model.) */
3392 ;
3393 if (!((d - p) & 1)) {
3394 if (!range)
3395 break;
3396 range++, startpos--;
3397 }
3398 }
3399 }
3400 }
3401 return -1;
3402 }
3403
3404
3405
3406
3407 /* The following are used for nmz_re_match, defined below: */
3408
3409 /* Accessing macros used in nmz_re_match: */
3410
3411 #define IS_ACTIVE(R) ((R).bits.is_active)
3412 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3413
3414
3415 /* Macros used by nmz_re_match: */
3416
3417 /* I.e., regstart, regend, and reg_info. */
3418 #define NUM_REG_ITEMS 3
3419
3420 /* I.e., ptr and count. */
3421 #define NUM_COUNT_ITEMS 2
3422
3423 /* Individual items aside from the registers. */
3424 #define NUM_NONREG_ITEMS 4
3425
3426 /* We push at most this many things on the stack whenever we
3427 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
3428 arguments to the PUSH_FAILURE_POINT macro. */
3429 #define MAX_NUM_FAILURE_ITEMS (num_regs * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
3430
3431 /* We push this many things on the stack whenever we fail. */
3432 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + NUM_NONREG_ITEMS + 1)
3433
3434 /* This pushes counter information for succeed_n and jump_n */
3435 #define PUSH_FAILURE_COUNT(ptr) \
3436 do { \
3437 int c; \
3438 EXTRACT_NUMBER(c, ptr); \
3439 ENSURE_FAIL_STACK(NUM_COUNT_ITEMS); \
3440 *stackp++ = (unsigned char*)(long)c; \
3441 *stackp++ = (ptr); \
3442 num_failure_counts++; \
3443 } while (0)
3444
3445 /* This pushes most of the information about the current state we will want
3446 if we ever fail back to it. */
3447
3448 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
3449 do { \
3450 long last_used_reg, this_reg; \
3451 \
3452 /* Find out how many registers are active or have been matched. \
3453 (Aside from register zero, which is only set at the end.) */ \
3454 for (last_used_reg = num_regs-1; last_used_reg > 0; last_used_reg--)\
3455 if (!REG_UNSET(regstart[last_used_reg])) \
3456 break; \
3457 \
3458 ENSURE_FAIL_STACK(NUM_FAILURE_ITEMS); \
3459 *stackp++ = (unsigned char*)(long)num_failure_counts; \
3460 num_failure_counts = 0; \
3461 \
3462 /* Now push the info for each of those registers. */ \
3463 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) { \
3464 *stackp++ = regstart[this_reg]; \
3465 *stackp++ = regend[this_reg]; \
3466 *stackp++ = reg_info[this_reg].word; \
3467 } \
3468 \
3469 /* Push how many registers we saved. */ \
3470 *stackp++ = (unsigned char*)last_used_reg; \
3471 \
3472 *stackp++ = pattern_place; \
3473 *stackp++ = string_place; \
3474 *stackp++ = (unsigned char*)(long)options; /* current option status */ \
3475 *stackp++ = (unsigned char*)0; /* non-greedy flag */ \
3476 } while(0)
3477
3478 #define NON_GREEDY ((unsigned char*)1)
3479
3480 #define POP_FAILURE_COUNT() \
3481 do { \
3482 unsigned char *ptr = *--stackp; \
3483 int count = (long)*--stackp; \
3484 STORE_NUMBER(ptr, count); \
3485 } while (0)
3486
3487 /* This pops what PUSH_FAILURE_POINT pushes. */
3488
3489 #define POP_FAILURE_POINT() \
3490 do { \
3491 long temp; \
3492 stackp -= NUM_NONREG_ITEMS; /* Remove failure points (and flag). */ \
3493 temp = (long)*--stackp; /* How many regs pushed. */ \
3494 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
3495 stackp -= temp; /* Remove the register info. */ \
3496 temp = (long)*--stackp; /* How many counters pushed. */ \
3497 while (temp--) { \
3498 POP_FAILURE_COUNT(); /* Remove the counter info. */ \
3499 } \
3500 num_failure_counts = 0; /* Reset num_failure_counts. */ \
3501 } while(0)
3502
3503 /* Registers are set to a sentinel when they haven't yet matched. */
3504 #define REG_UNSET_VALUE ((unsigned char*)-1)
3505 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3506
3507 #define PREFETCH if (d == dend) goto fail
3508
3509 /* Call this when have matched something; it sets `matched' flags for the
3510 registers corresponding to the subexpressions of which we currently
3511 are inside. */
3512 #define SET_REGS_MATCHED \
3513 do { unsigned this_reg; \
3514 for (this_reg = 0; this_reg < num_regs; this_reg++) { \
3515 if (IS_ACTIVE(reg_info[this_reg])) \
3516 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
3517 else \
3518 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
3519 } \
3520 } while(0)
3521
3522 #define AT_STRINGS_BEG(d) ((d) == string)
3523 #define AT_STRINGS_END(d) ((d) == dend)
3524
3525 #define IS_A_LETTER(d) (SYNTAX(*(d)) == Sword || \
3526 (current_mbctype ? \
3527 (re_mbctab[*(d)] && ((d)+mbclen(*(d)))<=dend): \
3528 SYNTAX(*(d)) == Sword2))
3529
3530 #define PREV_IS_A_LETTER(d) ((current_mbctype == MBCTYPE_SJIS)? \
3531 IS_A_LETTER((d)-(!AT_STRINGS_BEG((d)-1)&& \
3532 ismbchar((d)[-2])?2:1)): \
3533 ((current_mbctype && ((d)[-1] >= 0x80)) || \
3534 IS_A_LETTER((d)-1)))
3535
3536 static void
init_regs(regs,num_regs)3537 init_regs(regs, num_regs)
3538 struct re_registers *regs;
3539 unsigned int num_regs;
3540 {
3541 int i;
3542
3543 regs->num_regs = num_regs;
3544 if (num_regs < RE_NREGS)
3545 num_regs = RE_NREGS;
3546
3547 if (regs->allocated == 0) {
3548 regs->beg = TMALLOC(num_regs, int);
3549 regs->end = TMALLOC(num_regs, int);
3550 regs->allocated = num_regs;
3551 }
3552 else if (regs->allocated < num_regs) {
3553 TREALLOC(regs->beg, num_regs, int);
3554 TREALLOC(regs->end, num_regs, int);
3555 regs->allocated = num_regs;
3556 }
3557 for (i=0; i<num_regs; i++) {
3558 regs->beg[i] = regs->end[i] = -1;
3559 }
3560 }
3561
3562 /* Match the pattern described by BUFP against STRING, which is of
3563 SIZE. Start the match at index POS in STRING. In REGS, return the
3564 indices of STRING that matched the entire BUFP->buffer and its
3565 contained subexpressions.
3566
3567 If bufp->fastmap is nonzero, then it had better be up to date.
3568
3569 The reason that the data to match are specified as two components
3570 which are to be regarded as concatenated is so this function can be
3571 used directly on the contents of an Emacs buffer.
3572
3573 -1 is returned if there is no match. -2 is returned if there is an
3574 error (such as match stack overflow). Otherwise the value is the
3575 length of the substring which was matched. */
3576
3577 int
nmz_re_match(bufp,string_arg,size,pos,regs)3578 nmz_re_match(bufp, string_arg, size, pos, regs)
3579 struct re_pattern_buffer *bufp;
3580 const char *string_arg;
3581 int size, pos;
3582 struct re_registers *regs;
3583 {
3584 register unsigned char *p = (unsigned char*)bufp->buffer;
3585 unsigned char *p1;
3586
3587 /* Pointer to beyond end of buffer. */
3588 register unsigned char *pend = p + bufp->used;
3589
3590 unsigned num_regs = bufp->re_nsub;
3591
3592 unsigned char *string = (unsigned char*)string_arg;
3593
3594 register unsigned char *d, *dend;
3595 register int mcnt; /* Multipurpose. */
3596 int options = bufp->options;
3597
3598 /* Failure point stack. Each place that can handle a failure further
3599 down the line pushes a failure point on this stack. It consists of
3600 restart, regend, and reg_info for all registers corresponding to the
3601 subexpressions we're currently inside, plus the number of such
3602 registers, and, finally, two char *'s. The first char * is where to
3603 resume scanning the pattern; the second one is where to resume
3604 scanning the strings. If the latter is zero, the failure point is a
3605 ``dummy''; if a failure happens and the failure point is a dummy, it
3606 gets discarded and the next next one is tried. */
3607
3608 unsigned char *stacka[1];
3609 unsigned char **stackb = NULL;
3610 unsigned char **stackp = NULL;
3611 unsigned char **stacke = NULL;
3612
3613 /* Information on the contents of registers. These are pointers into
3614 the input strings; they record just what was matched (on this
3615 attempt) by a subexpression part of the pattern, that is, the
3616 regnum-th regstart pointer points to where in the pattern we began
3617 matching and the regnum-th regend points to right after where we
3618 stopped matching the regnum-th subexpression. (The zeroth register
3619 keeps track of what the whole pattern matches.) */
3620
3621 unsigned char **regstart = bufp->regstart;
3622 unsigned char **regend = bufp->regend;
3623
3624 /* If a group that's operated upon by a repetition operator fails to
3625 match anything, then the register for its start will need to be
3626 restored because it will have been set to wherever in the string we
3627 are when we last see its open-group operator. Similarly for a
3628 register's end. */
3629 unsigned char **old_regstart = bufp->old_regstart;
3630 unsigned char **old_regend = bufp->old_regend;
3631
3632 /* The is_active field of reg_info helps us keep track of which (possibly
3633 nested) subexpressions we are currently in. The matched_something
3634 field of reg_info[reg_num] helps us tell whether or not we have
3635 matched any of the pattern so far this time through the reg_num-th
3636 subexpression. These two fields get reset each time through any
3637 loop their register is in. */
3638
3639 register_info_type *reg_info = bufp->reg_info;
3640
3641 /* The following record the register info as found in the above
3642 variables when we find a match better than any we've seen before.
3643 This happens as we backtrack through the failure points, which in
3644 turn happens only if we have not yet matched the entire string. */
3645
3646 unsigned best_regs_set = 0;
3647 unsigned char **best_regstart = bufp->best_regstart;
3648 unsigned char **best_regend = bufp->best_regend;
3649
3650 int num_failure_counts = 0;
3651
3652 if (regs) {
3653 init_regs(regs, num_regs);
3654 }
3655
3656 /* Initialize the stack. */
3657 MALLOC_STACK(unsigned char *, MAX_NUM_FAILURE_ITEMS * NFAILURES);
3658
3659 #ifdef DEBUG_REGEX
3660 fprintf(stderr, "Entering nmz_re_match(%s)\n", string_arg);
3661 #endif
3662
3663 /* Initialize subexpression text positions to -1 to mark ones that no
3664 ( or ( and ) or ) has been seen for. Also set all registers to
3665 inactive and mark them as not having matched anything or ever
3666 failed. */
3667 for (mcnt = 0; mcnt < num_regs; mcnt++) {
3668 regstart[mcnt] = regend[mcnt]
3669 = old_regstart[mcnt] = old_regend[mcnt]
3670 = best_regstart[mcnt] = best_regend[mcnt] = REG_UNSET_VALUE;
3671 #ifdef __CHECKER__
3672 reg_info[mcnt].word = 0;
3673 #endif
3674 IS_ACTIVE (reg_info[mcnt]) = 0;
3675 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3676 }
3677
3678 /* Set up pointers to ends of strings.
3679 Don't allow the second string to be empty unless both are empty. */
3680
3681
3682 /* `p' scans through the pattern as `d' scans through the data. `dend'
3683 is the end of the input string that `d' points within. `d' is
3684 advanced into the following input string whenever necessary, but
3685 this happens before fetching; therefore, at the beginning of the
3686 loop, `d' can be pointing at the end of a string, but it cannot
3687 equal string2. */
3688
3689 d = string + pos, dend = string + size;
3690
3691 /* This loops over pattern commands. It exits by returning from the
3692 function if match is complete, or it drops through if match fails
3693 at this starting point in the input data. */
3694
3695 for (;;) {
3696 #ifdef DEBUG_REGEX
3697 fprintf(stderr,
3698 "regex loop(%d): matching 0x%02d\n",
3699 p - (unsigned char*)bufp->buffer,
3700 *p);
3701 #endif
3702 /* End of pattern means we might have succeeded. */
3703 if (p == pend) {
3704 /* If not end of string, try backtracking. Otherwise done. */
3705 if ((bufp->options & RE_OPTION_LONGEST) && d != dend) {
3706 if (best_regs_set) /* non-greedy, no need to backtrack */
3707 goto restore_best_regs;
3708 while (stackp != stackb && stackp[-1] == NON_GREEDY) {
3709 if (best_regs_set) /* non-greedy, no need to backtrack */
3710 goto restore_best_regs;
3711 POP_FAILURE_POINT();
3712 }
3713 if (stackp != stackb) {
3714 /* More failure points to try. */
3715
3716 /* If exceeds best match so far, save it. */
3717 if (! best_regs_set || (d > best_regend[0])) {
3718 best_regs_set = 1;
3719 best_regend[0] = d; /* Never use regstart[0]. */
3720
3721 for (mcnt = 1; mcnt < num_regs; mcnt++) {
3722 best_regstart[mcnt] = regstart[mcnt];
3723 best_regend[mcnt] = regend[mcnt];
3724 }
3725 }
3726 goto fail;
3727 }
3728 /* If no failure points, don't restore garbage. */
3729 else if (best_regs_set) {
3730 restore_best_regs:
3731 /* Restore best match. */
3732 d = best_regend[0];
3733
3734 for (mcnt = 0; mcnt < num_regs; mcnt++) {
3735 regstart[mcnt] = best_regstart[mcnt];
3736 regend[mcnt] = best_regend[mcnt];
3737 }
3738 }
3739 }
3740
3741 /* If caller wants register contents data back, convert it
3742 to indices. */
3743 if (regs) {
3744 regs->beg[0] = pos;
3745 regs->end[0] = d - string;
3746 for (mcnt = 1; mcnt < num_regs; mcnt++) {
3747 if (REG_UNSET(regend[mcnt])) {
3748 regs->beg[mcnt] = -1;
3749 regs->end[mcnt] = -1;
3750 continue;
3751 }
3752 regs->beg[mcnt] = regstart[mcnt] - string;
3753 regs->end[mcnt] = regend[mcnt] - string;
3754 }
3755 }
3756 FREE_AND_RETURN(stackb, (d - pos - string));
3757 }
3758
3759 /* Otherwise match next pattern command. */
3760 #ifdef SWITCH_ENUM_BUG
3761 switch ((int)((enum regexpcode)*p++))
3762 #else
3763 switch ((enum regexpcode)*p++)
3764 #endif
3765 {
3766 /* ( [or `(', as appropriate] is represented by start_memory,
3767 ) by stop_memory. Both of those commands are followed by
3768 a register number in the next byte. The text matched
3769 within the ( and ) is recorded under that number. */
3770 case start_memory:
3771 old_regstart[*p] = regstart[*p];
3772 regstart[*p] = d;
3773 IS_ACTIVE(reg_info[*p]) = 1;
3774 MATCHED_SOMETHING(reg_info[*p]) = 0;
3775 p += 2;
3776 continue;
3777
3778 case stop_memory:
3779 old_regend[*p] = regend[*p];
3780 regend[*p] = d;
3781 IS_ACTIVE(reg_info[*p]) = 0;
3782 p += 2;
3783 continue;
3784
3785 case start_paren:
3786 case stop_paren:
3787 break;
3788
3789 /* \<digit> has been turned into a `duplicate' command which is
3790 followed by the numeric value of <digit> as the register number. */
3791 case duplicate:
3792 {
3793 int regno = *p++; /* Get which register to match against */
3794 register unsigned char *d2, *dend2;
3795
3796 /* Check if there's corresponding group */
3797 if (regno >= num_regs) goto fail;
3798 /* Check if corresponding group is still open */
3799 if (IS_ACTIVE(reg_info[regno])) goto fail;
3800
3801 /* Where in input to try to start matching. */
3802 d2 = regstart[regno];
3803 if (REG_UNSET(d2)) goto fail;
3804
3805 /* Where to stop matching; if both the place to start and
3806 the place to stop matching are in the same string, then
3807 set to the place to stop, otherwise, for now have to use
3808 the end of the first string. */
3809
3810 dend2 = regend[regno];
3811 if (REG_UNSET(dend2)) goto fail;
3812 for (;;) {
3813 /* At end of register contents => success */
3814 if (d2 == dend2) break;
3815
3816 /* If necessary, advance to next segment in data. */
3817 PREFETCH;
3818
3819 /* How many characters left in this segment to match. */
3820 mcnt = dend - d;
3821
3822 /* Want how many consecutive characters we can match in
3823 one shot, so, if necessary, adjust the count. */
3824 if (mcnt > dend2 - d2)
3825 mcnt = dend2 - d2;
3826
3827 /* Compare that many; failure if mismatch, else move
3828 past them. */
3829 if ((options & RE_OPTION_IGNORECASE)
3830 ? memcmp_translate(d, d2, mcnt)
3831 : memcmp((char*)d, (char*)d2, mcnt))
3832 goto fail;
3833 d += mcnt, d2 += mcnt;
3834 }
3835 }
3836 break;
3837
3838 case start_nowidth:
3839 PUSH_FAILURE_POINT(0, d);
3840 if (stackp - stackb > RE_DUP_MAX) {
3841 FREE_AND_RETURN(stackb,(-2));
3842 }
3843 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3844 STORE_NUMBER(p+mcnt, stackp - stackb);
3845 continue;
3846
3847 case stop_nowidth:
3848 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3849 stackp = stackb + mcnt;
3850 d = stackp[-3];
3851 POP_FAILURE_POINT();
3852 continue;
3853
3854 case stop_backtrack:
3855 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3856 stackp = stackb + mcnt;
3857 POP_FAILURE_POINT();
3858 continue;
3859
3860 case pop_and_fail:
3861 EXTRACT_NUMBER(mcnt, p+1);
3862 stackp = stackb + mcnt;
3863 POP_FAILURE_POINT();
3864 goto fail;
3865
3866 case anychar:
3867 PREFETCH;
3868 if (ismbchar(*d)) {
3869 if (d + mbclen(*d) > dend)
3870 goto fail;
3871 SET_REGS_MATCHED;
3872 d += mbclen(*d);
3873 break;
3874 }
3875 if (!(options&RE_OPTION_MULTILINE)
3876 && (TRANSLATE_P() ? translate[*d] : *d) == '\n')
3877 goto fail;
3878 SET_REGS_MATCHED;
3879 d++;
3880 break;
3881
3882 case anychar_repeat:
3883 for (;;) {
3884 PUSH_FAILURE_POINT(p, d);
3885 PREFETCH;
3886 if (ismbchar(*d)) {
3887 if (d + mbclen(*d) > dend)
3888 goto fail;
3889 SET_REGS_MATCHED;
3890 d += mbclen(*d);
3891 continue;
3892 }
3893 if (!(options&RE_OPTION_MULTILINE) &&
3894 (TRANSLATE_P() ? translate[*d] : *d) == '\n')
3895 goto fail;
3896 SET_REGS_MATCHED;
3897 d++;
3898 }
3899 break;
3900
3901 case charset:
3902 case charset_not:
3903 {
3904 int not; /* Nonzero for charset_not. */
3905 int part = 0; /* true if matched part of mbc */
3906 unsigned char *dsave = d + 1;
3907 int cc, c;
3908
3909 PREFETCH;
3910 cc = c = (unsigned char)*d++;
3911 if (ismbchar(c)) {
3912 if (d + mbclen(c) - 1 <= dend) {
3913 MBC2WC(c, d);
3914 }
3915 }
3916 else if (TRANSLATE_P())
3917 cc = c = (unsigned char)translate[c];
3918
3919 not = is_in_list(c, p);
3920 if (!not && cc != c) {
3921 part = not = is_in_list(cc, p);
3922 }
3923 if (*(p - 1) == (unsigned char)charset_not) {
3924 not = !not;
3925 }
3926 if (!not) goto fail;
3927
3928 p += 1 + *p + 2 + EXTRACT_UNSIGNED(&p[1 + *p])*8;
3929 SET_REGS_MATCHED;
3930
3931 if (part) d = dsave;
3932 break;
3933 }
3934
3935 case begline:
3936 if (size == 0 || AT_STRINGS_BEG(d))
3937 break;
3938 if (d[-1] == '\n' && !AT_STRINGS_END(d))
3939 break;
3940 goto fail;
3941
3942 case endline:
3943 if (AT_STRINGS_END(d)) {
3944 if (size == 0 || d[-1] != '\n')
3945 break;
3946 }
3947 else if (*d == '\n')
3948 break;
3949 goto fail;
3950
3951 /* Match at the very beginning of the string. */
3952 case begbuf:
3953 if (AT_STRINGS_BEG(d))
3954 break;
3955 goto fail;
3956
3957 /* Match at the very end of the data. */
3958 case endbuf:
3959 if (AT_STRINGS_END(d))
3960 break;
3961 goto fail;
3962
3963 /* Match at the very end of the data. */
3964 case endbuf2:
3965 if (AT_STRINGS_END(d)) {
3966 if (size == 0 || d[-1] != '\n')
3967 break;
3968 }
3969 /* .. or newline just before the end of the data. */
3970 if (*d == '\n' && AT_STRINGS_END(d+1))
3971 break;
3972 goto fail;
3973
3974 /* `or' constructs are handled by starting each alternative with
3975 an on_failure_jump that points to the start of the next
3976 alternative. Each alternative except the last ends with a
3977 jump to the joining point. (Actually, each jump except for
3978 the last one really jumps to the following jump, because
3979 tensioning the jumps is a hassle.) */
3980
3981 /* The start of a stupid repeat has an on_failure_jump that points
3982 past the end of the repeat text. This makes a failure point so
3983 that on failure to match a repetition, matching restarts past
3984 as many repetitions have been found with no way to fail and
3985 look for another one. */
3986
3987 /* A smart repeat is similar but loops back to the on_failure_jump
3988 so that each repetition makes another failure point. */
3989
3990 /* Match at the starting position. */
3991 case begpos:
3992 if (d - string == pos)
3993 break;
3994 goto fail;
3995
3996 case on_failure_jump:
3997 on_failure:
3998 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3999 PUSH_FAILURE_POINT(p + mcnt, d);
4000 continue;
4001
4002 /* The end of a smart repeat has a maybe_finalize_jump back.
4003 Change it either to a finalize_jump or an ordinary jump. */
4004 case maybe_finalize_jump:
4005 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4006 p1 = p;
4007
4008 /* Compare the beginning of the repeat with what in the
4009 pattern follows its end. If we can establish that there
4010 is nothing that they would both match, i.e., that we
4011 would have to backtrack because of (as in, e.g., `a*a')
4012 then we can change to finalize_jump, because we'll
4013 never have to backtrack.
4014
4015 This is not true in the case of alternatives: in
4016 `(a|ab)*' we do need to backtrack to the `ab' alternative
4017 (e.g., if the string was `ab'). But instead of trying to
4018 detect that here, the alternative has put on a dummy
4019 failure point which is what we will end up popping. */
4020
4021 /* Skip over open/close-group commands. */
4022 while (p1 + 2 < pend) {
4023 if ((enum regexpcode)*p1 == stop_memory ||
4024 (enum regexpcode)*p1 == start_memory)
4025 p1 += 3; /* Skip over args, too. */
4026 else if (/*(enum regexpcode)*p1 == start_paren ||*/
4027 (enum regexpcode)*p1 == stop_paren)
4028 p1 += 1;
4029 else
4030 break;
4031 }
4032
4033 if (p1 == pend)
4034 p[-3] = (unsigned char)finalize_jump;
4035 else if (*p1 == (unsigned char)exactn ||
4036 *p1 == (unsigned char)endline) {
4037 register int c = *p1 == (unsigned char)endline ? '\n' : p1[2];
4038 register unsigned char *p2 = p + mcnt;
4039 /* p2[0] ... p2[2] are an on_failure_jump.
4040 Examine what follows that. */
4041 if (p2[3] == (unsigned char)exactn && p2[5] != c)
4042 p[-3] = (unsigned char)finalize_jump;
4043 else if (p2[3] == (unsigned char)charset ||
4044 p2[3] == (unsigned char)charset_not) {
4045 int not;
4046 if (ismbchar(c)) {
4047 unsigned char *pp = p1+3;
4048 MBC2WC(c, pp);
4049 }
4050 /* `is_in_list()' is TRUE if c would match */
4051 /* That means it is not safe to finalize. */
4052 not = is_in_list(c, p2 + 4);
4053 if (p2[3] == (unsigned char)charset_not)
4054 not = !not;
4055 if (!not)
4056 p[-3] = (unsigned char)finalize_jump;
4057 }
4058 }
4059 p -= 2; /* Point at relative address again. */
4060 if (p[-1] != (unsigned char)finalize_jump) {
4061 p[-1] = (unsigned char)jump;
4062 goto nofinalize;
4063 }
4064 /* Note fall through. */
4065
4066 /* The end of a stupid repeat has a finalize_jump back to the
4067 start, where another failure point will be made which will
4068 point to after all the repetitions found so far. */
4069
4070 /* Take off failure points put on by matching on_failure_jump
4071 because didn't fail. Also remove the register information
4072 put on by the on_failure_jump. */
4073 case finalize_jump:
4074 if (stackp > stackb && stackp[-3] == d) {
4075 p = stackp[-4];
4076 POP_FAILURE_POINT();
4077 continue;
4078 }
4079 POP_FAILURE_POINT();
4080 /* Note fall through. */
4081
4082 /* We need this opcode so we can detect where alternatives end
4083 in `group_match_null_string_p' et al. */
4084 case jump_past_alt:
4085 /* fall through */
4086
4087 /* Jump without taking off any failure points. */
4088 case jump:
4089 nofinalize:
4090 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4091 if (mcnt < 0 && stackp > stackb && stackp[-3] == d) /* avoid infinite loop */
4092 goto fail;
4093 p += mcnt;
4094 continue;
4095
4096 case dummy_failure_jump:
4097 /* Normally, the on_failure_jump pushes a failure point, which
4098 then gets popped at finalize_jump. We will end up at
4099 finalize_jump, also, and with a pattern of, say, `a+', we
4100 are skipping over the on_failure_jump, so we have to push
4101 something meaningless for finalize_jump to pop. */
4102 PUSH_FAILURE_POINT(0, 0);
4103 goto nofinalize;
4104
4105 /* At the end of an alternative, we need to push a dummy failure
4106 point in case we are followed by a `finalize_jump', because
4107 we don't want the failure point for the alternative to be
4108 popped. For example, matching `(a|ab)*' against `aab'
4109 requires that we match the `ab' alternative. */
4110 case push_dummy_failure:
4111 /* See comments just above at `dummy_failure_jump' about the
4112 two zeroes. */
4113 p1 = p;
4114 /* Skip over open/close-group commands. */
4115 while (p1 + 2 < pend) {
4116 if ((enum regexpcode)*p1 == stop_memory ||
4117 (enum regexpcode)*p1 == start_memory)
4118 p1 += 3; /* Skip over args, too. */
4119 else if (/*(enum regexpcode)*p1 == start_paren ||*/
4120 (enum regexpcode)*p1 == stop_paren)
4121 p1 += 1;
4122 else
4123 break;
4124 }
4125 if ((enum regexpcode)*p1 == jump)
4126 p[-1] = unused;
4127 else
4128 PUSH_FAILURE_POINT(0, 0);
4129 break;
4130
4131 /* Have to succeed matching what follows at least n times. Then
4132 just handle like an on_failure_jump. */
4133 case succeed_n:
4134 EXTRACT_NUMBER(mcnt, p + 2);
4135 /* Originally, this is how many times we HAVE to succeed. */
4136 if (mcnt != 0) {
4137 mcnt--;
4138 p += 2;
4139 PUSH_FAILURE_COUNT(p);
4140 STORE_NUMBER_AND_INCR(p, mcnt);
4141 PUSH_FAILURE_POINT(0, 0);
4142 }
4143 else {
4144 goto on_failure;
4145 }
4146 continue;
4147
4148 case jump_n:
4149 EXTRACT_NUMBER(mcnt, p + 2);
4150 /* Originally, this is how many times we CAN jump. */
4151 if (mcnt) {
4152 mcnt--;
4153 PUSH_FAILURE_COUNT(p + 2);
4154 STORE_NUMBER(p + 2, mcnt);
4155 goto nofinalize; /* Do the jump without taking off
4156 any failure points. */
4157 }
4158 /* If don't have to jump any more, skip over the rest of command. */
4159 else
4160 p += 4;
4161 continue;
4162
4163 case set_number_at:
4164 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4165 p1 = p + mcnt;
4166 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4167 STORE_NUMBER(p1, mcnt);
4168 continue;
4169
4170 case try_next:
4171 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4172 if (p + mcnt < pend) {
4173 PUSH_FAILURE_POINT(p, d);
4174 stackp[-1] = NON_GREEDY;
4175 }
4176 p += mcnt;
4177 continue;
4178
4179 case finalize_push:
4180 POP_FAILURE_POINT();
4181 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4182 if (mcnt < 0 && stackp > stackb && stackp[-3] == d) /* avoid infinite loop */
4183 goto fail;
4184 PUSH_FAILURE_POINT(p + mcnt, d);
4185 stackp[-1] = NON_GREEDY;
4186 continue;
4187
4188 case finalize_push_n:
4189 EXTRACT_NUMBER(mcnt, p + 2);
4190 /* Originally, this is how many times we CAN jump. */
4191 if (mcnt) {
4192 int pos, i;
4193
4194 mcnt--;
4195 STORE_NUMBER(p + 2, mcnt);
4196 EXTRACT_NUMBER(pos, p);
4197 EXTRACT_NUMBER(i, p+pos+5);
4198 if (i > 0) goto nofinalize;
4199 POP_FAILURE_POINT();
4200 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4201 PUSH_FAILURE_POINT(p + mcnt, d);
4202 stackp[-1] = NON_GREEDY;
4203 p += 2; /* skip n */
4204 }
4205 /* If don't have to push any more, skip over the rest of command. */
4206 else
4207 p += 4;
4208 continue;
4209
4210 /* Ignore these. Used to ignore the n of succeed_n's which
4211 currently have n == 0. */
4212 case unused:
4213 continue;
4214
4215 case casefold_on:
4216 options |= RE_OPTION_IGNORECASE;
4217 continue;
4218
4219 case casefold_off:
4220 options &= ~RE_OPTION_IGNORECASE;
4221 continue;
4222
4223 case option_set:
4224 options = *p++;
4225 continue;
4226
4227 case wordbound:
4228 if (AT_STRINGS_BEG(d)) {
4229 if (IS_A_LETTER(d)) break;
4230 else goto fail;
4231 }
4232 if (AT_STRINGS_END(d)) {
4233 if (PREV_IS_A_LETTER(d)) break;
4234 else goto fail;
4235 }
4236 if (PREV_IS_A_LETTER(d) != IS_A_LETTER(d))
4237 break;
4238 goto fail;
4239
4240 case notwordbound:
4241 if (AT_STRINGS_BEG(d)) {
4242 if (IS_A_LETTER(d)) goto fail;
4243 else break;
4244 }
4245 if (AT_STRINGS_END(d)) {
4246 if (PREV_IS_A_LETTER(d)) goto fail;
4247 else break;
4248 }
4249 if (PREV_IS_A_LETTER(d) != IS_A_LETTER(d))
4250 goto fail;
4251 break;
4252
4253 case wordbeg:
4254 if (IS_A_LETTER(d) && (AT_STRINGS_BEG(d) || !PREV_IS_A_LETTER(d)))
4255 break;
4256 goto fail;
4257
4258 case wordend:
4259 if (!AT_STRINGS_BEG(d) && PREV_IS_A_LETTER(d)
4260 && (!IS_A_LETTER(d) || AT_STRINGS_END(d)))
4261 break;
4262 goto fail;
4263
4264 case wordchar:
4265 PREFETCH;
4266 if (!IS_A_LETTER(d))
4267 goto fail;
4268 if (ismbchar(*d) && d + mbclen(*d) - 1 < dend)
4269 d += mbclen(*d) - 1;
4270 d++;
4271 SET_REGS_MATCHED;
4272 break;
4273
4274 case notwordchar:
4275 PREFETCH;
4276 if (IS_A_LETTER(d))
4277 goto fail;
4278 if (ismbchar(*d) && d + mbclen(*d) - 1 < dend)
4279 d += mbclen(*d) - 1;
4280 d++;
4281 SET_REGS_MATCHED;
4282 break;
4283
4284 case exactn:
4285 /* Match the next few pattern characters exactly.
4286 mcnt is how many characters to match. */
4287 mcnt = *p++;
4288 /* This is written out as an if-else so we don't waste time
4289 testing `translate' inside the loop. */
4290 if (TRANSLATE_P()) {
4291 do {
4292 unsigned char c;
4293
4294 PREFETCH;
4295 if (*p == 0xff) {
4296 p++;
4297 if (!--mcnt
4298 || AT_STRINGS_END(d)
4299 || (unsigned char)*d++ != (unsigned char)*p++)
4300 goto fail;
4301 continue;
4302 }
4303 c = *d++;
4304 if (ismbchar(c)) {
4305 int n;
4306
4307 if (c != (unsigned char)*p++)
4308 goto fail;
4309 for (n = mbclen(c) - 1; n > 0; n--)
4310 if (!--mcnt /* redundant check if pattern was
4311 compiled properly. */
4312 || AT_STRINGS_END(d)
4313 || (unsigned char)*d++ != (unsigned char)*p++)
4314 goto fail;
4315 continue;
4316 }
4317 /* compiled code translation needed for ruby */
4318 if ((unsigned char)translate[c] != (unsigned char)translate[*p++])
4319 goto fail;
4320 }
4321 while (--mcnt);
4322 }
4323 else {
4324 do {
4325 PREFETCH;
4326 if (*p == 0xff) {p++; mcnt--;}
4327 if (*d++ != *p++) goto fail;
4328 }
4329 while (--mcnt);
4330 }
4331 SET_REGS_MATCHED;
4332 break;
4333 }
4334 #ifdef RUBY
4335 CHECK_INTS;
4336 #endif
4337 continue; /* Successfully executed one pattern command; keep going. */
4338
4339 /* Jump here if any matching operation fails. */
4340 fail:
4341 if (stackp != stackb) {
4342 /* A restart point is known. Restart there and pop it. */
4343 short last_used_reg, this_reg;
4344
4345 /* If this failure point is from a dummy_failure_point, just
4346 skip it. */
4347 if (stackp[-4] == 0 || (best_regs_set && stackp[-1] == NON_GREEDY)) {
4348 POP_FAILURE_POINT();
4349 goto fail;
4350 }
4351 stackp--; /* discard greedy flag */
4352 options = (long)*--stackp;
4353 d = *--stackp;
4354 p = *--stackp;
4355 /* Restore register info. */
4356 last_used_reg = (long)*--stackp;
4357
4358 /* Make the ones that weren't saved -1 or 0 again. */
4359 for (this_reg = num_regs - 1; this_reg > last_used_reg; this_reg--) {
4360 regend[this_reg] = REG_UNSET_VALUE;
4361 regstart[this_reg] = REG_UNSET_VALUE;
4362 IS_ACTIVE(reg_info[this_reg]) = 0;
4363 MATCHED_SOMETHING(reg_info[this_reg]) = 0;
4364 }
4365
4366 /* And restore the rest from the stack. */
4367 for ( ; this_reg > 0; this_reg--) {
4368 reg_info[this_reg].word = *--stackp;
4369 regend[this_reg] = *--stackp;
4370 regstart[this_reg] = *--stackp;
4371 }
4372 mcnt = (long)*--stackp;
4373 while (mcnt--) {
4374 POP_FAILURE_COUNT();
4375 }
4376 if (p < pend) {
4377 int is_a_jump_n = 0;
4378 int failed_paren = 0;
4379
4380 p1 = p;
4381 /* If failed to a backwards jump that's part of a repetition
4382 loop, need to pop this failure point and use the next one. */
4383 switch ((enum regexpcode)*p1) {
4384 case jump_n:
4385 case finalize_push_n:
4386 is_a_jump_n = 1;
4387 case maybe_finalize_jump:
4388 case finalize_jump:
4389 case finalize_push:
4390 case jump:
4391 p1++;
4392 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
4393
4394 if (mcnt >= 0) break; /* should be backward jump */
4395 p1 += mcnt;
4396
4397 if (( is_a_jump_n && (enum regexpcode)*p1 == succeed_n) ||
4398 (!is_a_jump_n && (enum regexpcode)*p1 == on_failure_jump)) {
4399 if (failed_paren) {
4400 p1++;
4401 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
4402 PUSH_FAILURE_POINT(p1 + mcnt, d);
4403 }
4404 goto fail;
4405 }
4406 break;
4407 default:
4408 /* do nothing */;
4409 }
4410 }
4411 }
4412 else
4413 break; /* Matching at this starting point really fails. */
4414 }
4415
4416 if (best_regs_set)
4417 goto restore_best_regs;
4418
4419 FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
4420 }
4421
4422
4423 static int
memcmp_translate(s1,s2,len)4424 memcmp_translate(s1, s2, len)
4425 unsigned char *s1, *s2;
4426 register int len;
4427 {
4428 register unsigned char *p1 = s1, *p2 = s2, c;
4429 while (len) {
4430 c = *p1++;
4431 if (ismbchar(c)) {
4432 int n;
4433
4434 if (c != *p2++) return 1;
4435 for (n = mbclen(c) - 1; n > 0; n--)
4436 if (!--len || *p1++ != *p2++)
4437 return 1;
4438 }
4439 else
4440 if (translate[c] != translate[*p2++])
4441 return 1;
4442 len--;
4443 }
4444 return 0;
4445 }
4446
4447 void
nmz_re_copy_registers(regs1,regs2)4448 nmz_re_copy_registers(regs1, regs2)
4449 struct re_registers *regs1, *regs2;
4450 {
4451 int i;
4452
4453 if (regs1 == regs2) return;
4454 if (regs1->allocated == 0) {
4455 regs1->beg = TMALLOC(regs2->num_regs, int);
4456 regs1->end = TMALLOC(regs2->num_regs, int);
4457 regs1->allocated = regs2->num_regs;
4458 }
4459 else if (regs1->allocated < regs2->num_regs) {
4460 TREALLOC(regs1->beg, regs2->num_regs, int);
4461 TREALLOC(regs1->end, regs2->num_regs, int);
4462 regs1->allocated = regs2->num_regs;
4463 }
4464 for (i=0; i<regs2->num_regs; i++) {
4465 regs1->beg[i] = regs2->beg[i];
4466 regs1->end[i] = regs2->end[i];
4467 }
4468 regs1->num_regs = regs2->num_regs;
4469 }
4470
4471 void
nmz_re_free_registers(regs)4472 nmz_re_free_registers(regs)
4473 struct re_registers *regs;
4474 {
4475 if (regs->allocated == 0) return;
4476 if (regs->beg) xfree(regs->beg);
4477 if (regs->end) xfree(regs->end);
4478 }
4479
4480 /* Functions for multi-byte support.
4481 Created for grep multi-byte extension Jul., 1993 by t^2 (Takahiro Tanimoto)
4482 Last change: Jul. 9, 1993 by t^2 */
4483 static const unsigned char mbctab_ascii[] = {
4484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4494 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4495 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4497 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4498 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4500 };
4501
4502 static const unsigned char mbctab_euc[] = { /* 0xA1-0xFE */
4503 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4504 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4505 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4506 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4507 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4508 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4509 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4510 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4511 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
4512 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4513 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4514 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4515 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4516 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4517 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4518 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
4519 };
4520
4521 static const unsigned char mbctab_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
4522 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4526 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4527 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4530 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4531 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4533 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4534 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
4538 };
4539
4540 static const unsigned char mbctab_utf8[] = {
4541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4542 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4544 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4549 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4550 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4552 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4554 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4555 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
4556 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0
4557 };
4558
4559 const unsigned char *re_mbctab = mbctab_ascii;
4560
4561 void
nmz_re_mbcinit(mbctype)4562 nmz_re_mbcinit(mbctype)
4563 int mbctype;
4564 {
4565 switch (mbctype) {
4566 case MBCTYPE_ASCII:
4567 re_mbctab = mbctab_ascii;
4568 current_mbctype = MBCTYPE_ASCII;
4569 break;
4570 case MBCTYPE_EUC:
4571 re_mbctab = mbctab_euc;
4572 current_mbctype = MBCTYPE_EUC;
4573 break;
4574 case MBCTYPE_SJIS:
4575 re_mbctab = mbctab_sjis;
4576 current_mbctype = MBCTYPE_SJIS;
4577 break;
4578 case MBCTYPE_UTF8:
4579 re_mbctab = mbctab_utf8;
4580 current_mbctype = MBCTYPE_UTF8;
4581 break;
4582 }
4583 }
4584