1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 1, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
17 /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto)
18 Last change: May 21, 1993 by t^2 */
19
20
21 /* To test, compile with -Dtest. This Dtestable feature turns this into
22 a self-contained program which reads a pattern, describes how it
23 compiles, then reads a string and searches for it.
24
25 On the other hand, if you compile with both -Dtest and -Dcanned you
26 can run some tests we've already thought of. */
27
28 /*
29 Some small modifications for Namazu
30 by ccsatoru@vega.aichi-u.ac.jp [10/06/1998]
31 */
32
33 /* We write fatal error messages on standard error. */
34 #include <stdio.h>
35
36 /* isalpha(3) etc. are used for the character classes. */
37 #include <ctype.h>
38 #include <sys/types.h>
39
40 #ifdef __STDC__
41 #define P(s) s
42 #define MALLOC_ARG_T size_t
43 #else
44 #define P(s) ()
45 #define MALLOC_ARG_T unsigned
46 #define volatile
47 #define const
48 #endif
49
50 /* #include "config.h" */
51
52 void *xmalloc P((unsigned long));
53 void *xcalloc P((unsigned long,unsigned long));
54 void *xrealloc P((void*,unsigned long));
55 void free P((void*));
56
57 /* #define NO_ALLOCA * try it out for now */
58 #ifndef NO_ALLOCA
59 /* Make alloca work the best possible way. */
60 #ifdef __GNUC__
61 # ifndef atarist
62 # ifndef alloca
63 # define alloca __builtin_alloca
64 # endif
65 # endif /* atarist */
66 #else
67 # if defined(HAVE_ALLOCA_H)
68 # include <alloca.h>
69 # else
70 char *alloca();
71 # endif
72 #endif /* __GNUC__ */
73
74 #ifdef _AIX
75 #pragma alloca
76 #endif
77
78 #ifdef HAVE_STRING_H
79 # include <string.h>
80 #else
81 # include <strings.h>
82 #endif
83
84 #define RE_ALLOCATE alloca
85 #ifdef C_ALLOCA
86 #define FREE_VARIABLES() alloca(0)
87 #else
88 #define FREE_VARIABLES()
89 #endif
90
91 #define FREE_AND_RETURN_VOID(stackb) return
92 #define FREE_AND_RETURN(stackb,val) return(val)
93 #define DOUBLE_STACK(stackx,stackb,len,type) \
94 (stackx = (type*) alloca(2 * len * sizeof(type)), \
95 /* Only copy what is in use. */ \
96 (type*) memcpy(stackx, stackb, len * sizeof (type)))
97 #else /* NO_ALLOCA defined */
98
99 #define RE_ALLOCATE xmalloc
100
101 #define FREE_VAR(var) if (var) free(var); var = NULL
102 #define FREE_VARIABLES() \
103 do { \
104 FREE_VAR(regstart); \
105 FREE_VAR(regend); \
106 FREE_VAR(old_regstart) \
107 FREE_VAR(old_regend); \
108 FREE_VAR(best_regstart); \
109 FREE_VAR(best_regend); \
110 FREE_VAR(reg_info); \
111 } while (0)
112
113 #define FREE_AND_RETURN_VOID(stackb) free(stackb);return
114 #define FREE_AND_RETURN(stackb,val) free(stackb);return(val)
115 #define DOUBLE_STACK(stackx,stackb,len,type) \
116 (type*)xrealloc(stackb, 2 * len * sizeof(type))
117 #endif /* NO_ALLOCA */
118
119 #define RE_TALLOC(n,t) ((t*)RE_ALLOCATE((n)*sizeof(t)))
120 #define TMALLOC(n,t) ((t*)xmalloc((n)*sizeof(t)))
121 #define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t))))
122
123 #define EXPAND_FAIL_STACK(stackx,stackb,len) \
124 do {\
125 /* Roughly double the size of the stack. */ \
126 stackx = DOUBLE_STACK(stackx,stackb,len,unsigned char*); \
127 /* Rearrange the pointers. */ \
128 stackp = stackx + (stackp - stackb); \
129 stackb = stackx; \
130 stacke = stackb + 2 * len; \
131 } while (0)
132
133 /* Get the interface, including the syntax bits. */
134 #include "regex.h"
135
136 /* Subroutines for re_compile_pattern. */
137 static void store_jump P((char *, int, char *));
138 static void insert_jump P((int, char *, char *, char *));
139 static void store_jump_n P((char *, int, char *, unsigned));
140 static void insert_jump_n P((int, char *, char *, char *, unsigned));
141 /* static void insert_op P((int, char *, char *)); */
142 static void insert_op_2 P((int, char *, char *, int, int));
143 static int memcmp_translate P((unsigned char *, unsigned char *, int));
144 static int alt_match_null_string_p ();
145 static int common_op_match_null_string_p ();
146 static int group_match_null_string_p ();
147
148 /* Define the syntax stuff, so we can do the \<, \>, etc. */
149
150 /* This must be nonzero for the wordchar and notwordchar pattern
151 commands in re_match. */
152 #ifndef Sword
153 #define Sword 1
154 #endif
155
156 #define SYNTAX(c) re_syntax_table[c]
157
158 static char re_syntax_table[256];
159 static void init_syntax_once P((void));
160 static unsigned char *translate = 0;
161 static void init_regs P((struct re_registers*, unsigned int));
162 static void bm_init_skip P((int *, unsigned char*, int, char *));
163
164 #undef P
165
166 #include "util.h"
167
168 static void
init_syntax_once()169 init_syntax_once()
170 {
171 register int c;
172 static int done = 0;
173
174 if (done)
175 return;
176
177 memset(re_syntax_table, 0, sizeof re_syntax_table);
178
179 for (c = 'a'; c <= 'z'; c++)
180 re_syntax_table[c] = Sword;
181
182 for (c = 'A'; c <= 'Z'; c++)
183 re_syntax_table[c] = Sword;
184
185 for (c = '0'; c <= '9'; c++)
186 re_syntax_table[c] = Sword;
187
188 re_syntax_table['_'] = Sword;
189
190 /* Add specific syntax for ISO Latin-1. */
191 for (c = 0300; c <= 0377; c++)
192 re_syntax_table[c] = Sword;
193 re_syntax_table[0327] = 0;
194 re_syntax_table[0367] = 0;
195
196 done = 1;
197 }
198
199 void
re_set_casetable(table)200 re_set_casetable(table)
201 char *table;
202 {
203 translate = (unsigned char*)table;
204 }
205
206 /* Jim Meyering writes:
207
208 "... Some ctype macros are valid only for character codes that
209 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
210 using /bin/cc or gcc but without giving an ansi option). So, all
211 ctype uses should be through macros like ISPRINT... If
212 STDC_HEADERS is defined, then autoconf has verified that the ctype
213 macros don't need to be guarded with references to isascii. ...
214 Defining isascii to 1 should let any compiler worth its salt
215 eliminate the && through constant folding." */
216 #ifdef isblank
217 #define ISBLANK(c) isblank ((unsigned char)c)
218 #else
219 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
220 #endif
221 #ifdef isgraph
222 #define ISGRAPH(c) isgraph ((unsigned char)c)
223 #else
224 #define ISGRAPH(c) (isprint ((unsigned char)c) && !isspace ((unsigned char)c))
225 #endif
226
227 #define ISPRINT(c) isprint ((unsigned char)c)
228 #define ISDIGIT(c) isdigit ((unsigned char)c)
229 #define ISALNUM(c) isalnum ((unsigned char)c)
230 #define ISALPHA(c) isalpha ((unsigned char)c)
231 #define ISCNTRL(c) iscntrl ((unsigned char)c)
232 #define ISLOWER(c) islower ((unsigned char)c)
233 #define ISPUNCT(c) ispunct ((unsigned char)c)
234 #define ISSPACE(c) isspace ((unsigned char)c)
235 #define ISUPPER(c) isupper ((unsigned char)c)
236 #define ISXDIGIT(c) isxdigit ((unsigned char)c)
237
238 /* These are the command codes that appear in compiled regular
239 expressions, one per byte. Some command codes are followed by
240 argument bytes. A command code can specify any interpretation
241 whatsoever for its arguments. Zero-bytes may appear in the compiled
242 regular expression.
243
244 The value of `exactn' is needed in search.c (search_buffer) in emacs.
245 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
246 `exactn' we use here must also be 1. */
247
248 enum regexpcode
249 {
250 unused=0,
251 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
252 begline, /* Fail unless at beginning of line. */
253 endline, /* Fail unless at end of line. */
254 begbuf, /* Succeeds if at beginning of buffer (if emacs) or at beginning
255 of string to be matched (if not). */
256 endbuf, /* Analogously, for end of buffer/string. */
257 endbuf2, /* End of buffer/string, or newline just before it. */
258 jump, /* Followed by two bytes giving relative address to jump to. */
259 jump_past_alt,/* Same as jump, but marks the end of an alternative. */
260 on_failure_jump, /* Followed by two bytes giving relative address of
261 place to resume at in case of failure. */
262 finalize_jump, /* Throw away latest failure point and then jump to
263 address. */
264 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
265 This is used to jump back to the beginning
266 of a repeat. If the command that follows
267 this jump is clearly incompatible with the
268 one at the beginning of the repeat, such that
269 we can be sure that there is no use backtracking
270 out of repetitions already completed,
271 then we finalize. */
272 dummy_failure_jump, /* Jump, and push a dummy failure point. This
273 failure point will be thrown away if an attempt
274 is made to use it for a failure. A + construct
275 makes this before the first repeat. Also
276 use it as an intermediary kind of jump when
277 compiling an or construct. */
278 push_dummy_failure, /* Push a dummy failure point and continue. Used at the end of
279 alternatives. */
280 succeed_n, /* Used like on_failure_jump except has to succeed n times;
281 then gets turned into an on_failure_jump. The relative
282 address following it is useless until then. The
283 address is followed by two bytes containing n. */
284 jump_n, /* Similar to jump, but jump n times only; also the relative
285 address following is in turn followed by yet two more bytes
286 containing n. */
287 try_next, /* Jump to next pattern for the first time,
288 leaving this pattern on the failure stack. */
289 finalize_push, /* Finalize stack and push the beginning of the pattern
290 on the stack to retry (used for non-greedy match) */
291 finalize_push_n, /* Similar to finalize_push, buf finalize n time only */
292 set_number_at, /* Set the following relative location to the
293 subsequent number. */
294 anychar, /* Matches any (more or less) one character. */
295 charset, /* Matches any one char belonging to specified set.
296 First following byte is number of bitmap bytes.
297 Then come bytes for a bitmap saying which chars are in.
298 Bits in each byte are ordered low-bit-first.
299 A character is in the set if its bit is 1.
300 A character too large to have a bit in the map
301 is automatically not in the set. */
302 charset_not, /* Same parameters as charset, but match any character
303 that is not one of those specified. */
304 start_memory, /* Start remembering the text that is matched, for
305 storing in a memory register. Followed by one
306 byte containing the register number. Register numbers
307 must be in the range 0 through RE_NREGS. */
308 stop_memory, /* Stop remembering the text that is matched
309 and store it in a memory register. Followed by
310 one byte containing the register number. Register
311 numbers must be in the range 0 through RE_NREGS. */
312 casefold_on, /* Turn on casefold flag. */
313 casefold_off, /* Turn off casefold flag. */
314 start_nowidth, /* Save string point to the stack. */
315 stop_nowidth, /* Restore string place at the point start_nowidth. */
316 pop_and_fail, /* Fail after popping nowidth entry from stack. */
317 duplicate, /* Match a duplicate of something remembered.
318 Followed by one byte containing the index of the memory
319 register. */
320 wordchar, /* Matches any word-constituent character. */
321 notwordchar, /* Matches any char that is not a word-constituent. */
322 wordbeg, /* Succeeds if at word beginning. */
323 wordend, /* Succeeds if at word end. */
324 wordbound, /* Succeeds if at a word boundary. */
325 notwordbound,/* Succeeds if not at a word boundary. */
326 };
327
328
329 /* Number of failure points to allocate space for initially,
330 when matching. If this number is exceeded, more space is allocated,
331 so it is not a hard limit. */
332
333 #ifndef NFAILURES
334 #define NFAILURES 80
335 #endif
336
337 #if defined(CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
338 #define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
339 #endif
340 #ifndef SIGN_EXTEND_CHAR
341 #define SIGN_EXTEND_CHAR(x) (x)
342 #endif
343
344
345 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
346 #define STORE_NUMBER(destination, number) \
347 { (destination)[0] = (number) & 0377; \
348 (destination)[1] = (number) >> 8; }
349
350 /* Same as STORE_NUMBER, except increment the destination pointer to
351 the byte after where the number is stored. Watch out that values for
352 DESTINATION such as p + 1 won't work, whereas p will. */
353 #define STORE_NUMBER_AND_INCR(destination, number) \
354 { STORE_NUMBER(destination, number); \
355 (destination) += 2; }
356
357
358 /* Put into DESTINATION a number stored in two contingous bytes starting
359 at SOURCE. */
360 #define EXTRACT_NUMBER(destination, source) \
361 { (destination) = *(source) & 0377; \
362 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
363
364 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
365 point to second byte of SOURCE. Note that SOURCE has to be a value
366 such as p, not, e.g., p + 1. */
367 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
368 { EXTRACT_NUMBER(destination, source); \
369 (source) += 2; }
370
371
372 /* Specify the precise syntax of regexps for compilation. This provides
373 for compatibility for various utilities which historically have
374 different, incompatible syntaxes.
375
376 The argument SYNTAX is a bit-mask comprised of the various bits
377 defined in regex.h. */
378
379 long
re_set_syntax(syntax)380 re_set_syntax(syntax)
381 long syntax;
382 {
383 long ret;
384
385 ret = re_syntax_options;
386 re_syntax_options = syntax;
387 return ret;
388 }
389
390 /* Set by re_set_syntax to the current regexp syntax to recognize. */
391 long re_syntax_options = 0;
392
393
394 /* Macros for re_compile_pattern, which is found below these definitions. */
395
396 #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate)
397 #define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate)
398 /* Fetch the next character in the uncompiled pattern---translating it
399 if necessary. Also cast from a signed character in the constant
400 string passed to us by the user to an unsigned char that we can use
401 as an array index (in, e.g., `translate'). */
402 #define PATFETCH(c) \
403 do {if (p == pend) goto end_of_pattern; \
404 c = (unsigned char) *p++; \
405 if (TRANSLATE_P()) c = (unsigned char)translate[c]; \
406 } while (0)
407
408 /* Fetch the next character in the uncompiled pattern, with no
409 translation. */
410 #define PATFETCH_RAW(c) \
411 do {if (p == pend) goto end_of_pattern; \
412 c = (unsigned char) *p++; \
413 } while (0)
414
415 /* Go backwards one character in the pattern. */
416 #define PATUNFETCH p--
417
418
419 /* If the buffer isn't allocated when it comes in, use this. */
420 #define INIT_BUF_SIZE 28
421
422 /* Make sure we have at least N more bytes of space in buffer. */
423 #define GET_BUFFER_SPACE(n) \
424 { \
425 while (b - bufp->buffer + (n) >= bufp->allocated) \
426 EXTEND_BUFFER; \
427 }
428
429 /* Make sure we have one more byte of buffer space and then add CH to it. */
430 #define BUFPUSH(ch) \
431 { \
432 GET_BUFFER_SPACE(1); \
433 *b++ = (char)(ch); \
434 }
435
436 /* Extend the buffer by twice its current size via reallociation and
437 reset the pointers that pointed into the old allocation to point to
438 the correct places in the new allocation. If extending the buffer
439 results in it being larger than 1 << 16, then flag memory exhausted. */
440 #define EXTEND_BUFFER \
441 { char *old_buffer = bufp->buffer; \
442 if (bufp->allocated == (1L<<16)) goto too_big; \
443 bufp->allocated *= 2; \
444 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
445 bufp->buffer = (char *) xrealloc (bufp->buffer, bufp->allocated); \
446 if (bufp->buffer == 0) \
447 goto memory_exhausted; \
448 b = (b - old_buffer) + bufp->buffer; \
449 if (fixup_alt_jump) \
450 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
451 if (laststart) \
452 laststart = (laststart - old_buffer) + bufp->buffer; \
453 begalt = (begalt - old_buffer) + bufp->buffer; \
454 if (pending_exact) \
455 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
456 }
457
458
459 /* Set the bit for character C in a character set list. */
460 #define SET_LIST_BIT(c) \
461 (b[(unsigned char)(c) / BYTEWIDTH] \
462 |= 1 << ((unsigned char)(c) % BYTEWIDTH))
463
464 /* Get the next unsigned number in the uncompiled pattern. */
465 #define GET_UNSIGNED_NUMBER(num) \
466 { if (p != pend) \
467 { \
468 PATFETCH(c); \
469 while (ISDIGIT(c)) \
470 { \
471 if (num < 0) \
472 num = 0; \
473 num = num * 10 + c - '0'; \
474 if (p == pend) \
475 break; \
476 PATFETCH(c); \
477 } \
478 } \
479 }
480
481 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
482
483 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
484
485 #define IS_CHAR_CLASS(string) \
486 (STREQ (string, "alpha") || STREQ (string, "upper") \
487 || STREQ (string, "lower") || STREQ (string, "digit") \
488 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
489 || STREQ (string, "space") || STREQ (string, "print") \
490 || STREQ (string, "punct") || STREQ (string, "graph") \
491 || STREQ (string, "cntrl") || STREQ (string, "blank"))
492
493 #define STORE_MBC(p, c) \
494 ((p)[0] = (unsigned char)(c >> 8), (p)[1] = (unsigned char)(c))
495 #define STORE_MBC_AND_INCR(p, c) \
496 (*(p)++ = (unsigned char)(c >> 8), *(p)++ = (unsigned char)(c))
497
498 #define EXTRACT_MBC(p) \
499 ((unsigned short)((unsigned char)(p)[0] << 8 | (unsigned char)(p)[1]))
500 #define EXTRACT_MBC_AND_INCR(p) \
501 ((unsigned short)((p) += 2, (unsigned char)(p)[-2] << 8 | (unsigned char)(p)[-1]))
502
503 #define EXTRACT_UNSIGNED(p) \
504 ((unsigned char)(p)[0] | (unsigned char)(p)[1] << 8)
505 #define EXTRACT_UNSIGNED_AND_INCR(p) \
506 ((p) += 2, (unsigned char)(p)[-2] | (unsigned char)(p)[-1] << 8)
507
508 /* Handle (mb)?charset(_not)?.
509
510 Structure of mbcharset(_not)? in compiled pattern.
511
512 struct {
513 unsinged char id; mbcharset(_not)?
514 unsigned char sbc_size;
515 unsigned char sbc_map[sbc_size]; same as charset(_not)? up to here.
516 unsigned short mbc_size; number of intervals.
517 struct {
518 unsigned short beg; beginning of interval.
519 unsigned short end; end of interval.
520 } intervals[mbc_size];
521 }; */
522
523 static void
set_list_bits(c1,c2,b)524 set_list_bits(c1, c2, b)
525 unsigned short c1, c2;
526 unsigned char *b;
527 {
528 unsigned char sbc_size = b[-1];
529 unsigned short mbc_size = EXTRACT_UNSIGNED(&b[sbc_size]);
530 unsigned short beg, end, upb;
531
532 if (c1 > c2)
533 return;
534 if ((int)c1 < 1 << BYTEWIDTH) {
535 upb = c2;
536 if (1 << BYTEWIDTH <= (int)upb)
537 upb = (1 << BYTEWIDTH) - 1; /* The last single-byte char */
538 if (sbc_size <= (unsigned short)(upb / BYTEWIDTH)) {
539 /* Allocate maximum size so it never happens again. */
540 /* NOTE: memcpy() would not work here. */
541 memmove(&b[(1 << BYTEWIDTH) / BYTEWIDTH], &b[sbc_size], 2 + mbc_size*4);
542 memset(&b[sbc_size], 0, (1 << BYTEWIDTH) / BYTEWIDTH - sbc_size);
543 b[-1] = sbc_size = (1 << BYTEWIDTH) / BYTEWIDTH;
544 }
545 for (; c1 <= upb; c1++)
546 if (!ismbchar(c1))
547 SET_LIST_BIT(c1);
548 if ((int)c2 < 1 << BYTEWIDTH)
549 return;
550 c1 = 0x8000; /* The first wide char */
551 }
552 b = &b[sbc_size + 2];
553
554 for (beg = 0, upb = mbc_size; beg < upb; ) {
555 unsigned short mid = (unsigned short)(beg + upb) >> 1;
556
557 if ((int)c1 - 1 > (int)EXTRACT_MBC(&b[mid*4 + 2]))
558 beg = mid + 1;
559 else
560 upb = mid;
561 }
562
563 for (end = beg, upb = mbc_size; end < upb; ) {
564 unsigned short mid = (unsigned short)(end + upb) >> 1;
565
566 if ((int)c2 >= (int)EXTRACT_MBC(&b[mid*4]) - 1)
567 end = mid + 1;
568 else
569 upb = mid;
570 }
571
572 if (beg != end) {
573 if (c1 > EXTRACT_MBC(&b[beg*4]))
574 c1 = EXTRACT_MBC(&b[beg*4]);
575 if (c2 < EXTRACT_MBC(&b[(end - 1)*4]))
576 c2 = EXTRACT_MBC(&b[(end - 1)*4]);
577 }
578 if (end < mbc_size && end != beg + 1)
579 /* NOTE: memcpy() would not work here. */
580 memmove(&b[(beg + 1)*4], &b[end*4], (mbc_size - end)*4);
581 STORE_MBC(&b[beg*4 + 0], c1);
582 STORE_MBC(&b[beg*4 + 2], c2);
583 mbc_size += beg - end + 1;
584 STORE_NUMBER(&b[-2], mbc_size);
585 }
586
587 static int
is_in_list(c,b)588 is_in_list(c, b)
589 unsigned short c;
590 const unsigned char *b;
591 {
592 unsigned short size;
593 unsigned short i, j;
594 int result = 0;
595
596 size = *b++;
597 if ((int)c < 1<<BYTEWIDTH) {
598 if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) {
599 return 1;
600 }
601 }
602 b += size + 2;
603 size = EXTRACT_UNSIGNED(&b[-2]);
604 if (size == 0) return 0;
605
606 if (b[(size-1)*4] == 0xff) {
607 i = c;
608 if ((int)c >= 1<<BYTEWIDTH) {
609 i = i>>BYTEWIDTH;
610 }
611 while (size>0 && b[size*4-2] == 0xff) {
612 size--;
613 if (b[size*4+1] <= i && i <= b[size*4+3]) {
614 result = 2;
615 break;
616 }
617 }
618 }
619 for (i = 0, j = size; i < j; ) {
620 unsigned short k = (unsigned short)(i + j) >> 1;
621
622 if (c > EXTRACT_MBC(&b[k*4+2]))
623 i = k + 1;
624 else
625 j = k;
626 }
627 if (i < size && EXTRACT_MBC(&b[i*4]) <= c
628 && ((unsigned char)c != '\n' && (unsigned char)c != '\0'))
629 return 1;
630 return result;
631 }
632
633 #if 0
634 static void
635 print_partial_compiled_pattern(start, end)
636 unsigned char *start;
637 unsigned char *end;
638 {
639 int mcnt, mcnt2;
640 unsigned char *p = start;
641 unsigned char *pend = end;
642
643 if (start == NULL)
644 {
645 printf ("(null)\n");
646 return;
647 }
648
649 /* Loop over pattern commands. */
650 while (p < pend)
651 {
652 switch ((enum regexpcode) *p++)
653 {
654 case unused:
655 printf ("/unused");
656 break;
657
658 case exactn:
659 mcnt = *p++;
660 printf ("/exactn/%d", mcnt);
661 do
662 {
663 putchar('/');
664 printf("%c", *p++);
665 }
666 while (--mcnt);
667 break;
668
669 case start_memory:
670 mcnt = *p++;
671 printf ("/start_memory/%d/%d", mcnt, *p++);
672 break;
673
674 case stop_memory:
675 mcnt = *p++;
676 printf ("/stop_memory/%d/%d", mcnt, *p++);
677 break;
678
679 case casefold_on:
680 printf ("/casefold_on");
681 break;
682
683 case casefold_off:
684 printf ("/casefold_off");
685 break;
686
687 case start_nowidth:
688 EXTRACT_NUMBER_AND_INCR (mcnt, p);
689 printf ("/start_nowidth//%d", mcnt);
690 break;
691
692 case stop_nowidth:
693 printf ("/stop_nowidth//");
694 p += 2;
695 break;
696
697 case pop_and_fail:
698 printf ("/pop_and_fail");
699 break;
700
701 case duplicate:
702 printf ("/duplicate/%d", *p++);
703 break;
704
705 case anychar:
706 printf ("/anychar");
707 break;
708
709 case charset:
710 case charset_not:
711 {
712 register int c;
713
714 printf ("/charset%s",
715 (enum regexpcode) *(p - 1) == charset_not ? "_not" : "");
716
717 mcnt = *p++;
718 printf("/%d", mcnt);
719 for (c = 0; c < mcnt; c++)
720 {
721 unsigned bit;
722 unsigned char map_byte = p[c];
723
724 putchar ('/');
725
726 for (bit = 0; bit < BYTEWIDTH; bit++)
727 if (map_byte & (1 << bit))
728 printf("%c", c * BYTEWIDTH + bit);
729 }
730 p += mcnt;
731 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
732 while (mcnt--) {
733 int beg, end;
734 beg = EXTRACT_MBC_AND_INCR(p);
735 end = EXTRACT_MBC_AND_INCR(p);
736 printf("/%c%c-%c%c", beg>>BYTEWIDTH, beg&0xff, end>>BYTEWIDTH, end&0xff);
737 }
738 break;
739 }
740
741 case begline:
742 printf ("/begline");
743 break;
744
745 case endline:
746 printf ("/endline");
747 break;
748
749 case on_failure_jump:
750 EXTRACT_NUMBER_AND_INCR (mcnt, p);
751 printf ("/on_failure_jump//%d", mcnt);
752 break;
753
754 case dummy_failure_jump:
755 EXTRACT_NUMBER_AND_INCR (mcnt, p);
756 printf ("/dummy_failure_jump//%d", mcnt);
757 break;
758
759 case push_dummy_failure:
760 printf ("/push_dummy_failure");
761 break;
762
763 case finalize_jump:
764 EXTRACT_NUMBER_AND_INCR (mcnt, p);
765 printf ("/finalize_jump//%d", mcnt);
766 break;
767
768 case maybe_finalize_jump:
769 EXTRACT_NUMBER_AND_INCR (mcnt, p);
770 printf ("/maybe_finalize_jump//%d", mcnt);
771 break;
772
773 case jump_past_alt:
774 EXTRACT_NUMBER_AND_INCR (mcnt, p);
775 printf ("/jump_past_alt//%d", mcnt);
776 break;
777
778 case jump:
779 EXTRACT_NUMBER_AND_INCR (mcnt, p);
780 printf ("/jump//%d", mcnt);
781 break;
782
783 case succeed_n:
784 EXTRACT_NUMBER_AND_INCR (mcnt, p);
785 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
786 printf ("/succeed_n//%d//%d", mcnt, mcnt2);
787 break;
788
789 case jump_n:
790 EXTRACT_NUMBER_AND_INCR (mcnt, p);
791 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
792 printf ("/jump_n//%d//%d", mcnt, mcnt2);
793 break;
794
795 case set_number_at:
796 EXTRACT_NUMBER_AND_INCR (mcnt, p);
797 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
798 printf ("/set_number_at//%d//%d", mcnt, mcnt2);
799 break;
800
801 case try_next:
802 EXTRACT_NUMBER_AND_INCR (mcnt, p);
803 printf ("/try_next//%d", mcnt);
804 break;
805
806 case finalize_push:
807 EXTRACT_NUMBER_AND_INCR (mcnt, p);
808 printf ("/finalize_push//%d", mcnt);
809 break;
810
811 case finalize_push_n:
812 EXTRACT_NUMBER_AND_INCR (mcnt, p);
813 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
814 printf ("/finalize_push_n//%d//%d", mcnt, mcnt2);
815 break;
816
817 case wordbound:
818 printf ("/wordbound");
819 break;
820
821 case notwordbound:
822 printf ("/notwordbound");
823 break;
824
825 case wordbeg:
826 printf ("/wordbeg");
827 break;
828
829 case wordend:
830 printf ("/wordend");
831
832 case wordchar:
833 printf ("/wordchar");
834 break;
835
836 case notwordchar:
837 printf ("/notwordchar");
838 break;
839
840 case begbuf:
841 printf ("/begbuf");
842 break;
843
844 case endbuf:
845 printf ("/endbuf");
846 break;
847
848 case endbuf2:
849 printf ("/endbuf2");
850 break;
851
852 default:
853 printf ("?%d", *(p-1));
854 }
855 }
856 printf ("/\n");
857 }
858 #endif /* 0 */
859
860 #if 0
861 static void
862 print_compiled_pattern(bufp)
863 struct re_pattern_buffer *bufp;
864 {
865 unsigned char *buffer = (unsigned char*)bufp->buffer;
866
867 print_partial_compiled_pattern (buffer, buffer + bufp->used);
868 }
869 #endif /* 0 */
870
871 static char*
calculate_must_string(start,end)872 calculate_must_string(start, end)
873 char *start;
874 char *end;
875 {
876 int mcnt;
877 int max = 0;
878 char *p = start;
879 char *pend = end;
880 char *must = 0;
881
882 if (start == NULL) return 0;
883
884 /* Loop over pattern commands. */
885 while (p < pend)
886 {
887 switch ((enum regexpcode) *p++)
888 {
889 case unused:
890 break;
891
892 case exactn:
893 mcnt = *p;
894 if (mcnt > max) {
895 must = p;
896 max = mcnt;
897 }
898 p += mcnt+1;
899 break;
900
901 case start_memory:
902 case stop_memory:
903 p += 2;
904 break;
905
906 case duplicate:
907 p++;
908 break;
909
910 case casefold_on:
911 case casefold_off:
912 return 0; /* should not check must_string */
913
914 case pop_and_fail:
915 case anychar:
916 case begline:
917 case endline:
918 case wordbound:
919 case notwordbound:
920 case wordbeg:
921 case wordend:
922 case wordchar:
923 case notwordchar:
924 case begbuf:
925 case endbuf:
926 case endbuf2:
927 case push_dummy_failure:
928 break;
929
930 case charset:
931 case charset_not:
932 mcnt = *p++;
933 p += mcnt;
934 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
935 while (mcnt--) {
936 /* commented out by ccsatoru
937 EXTRACT_MBC_AND_INCR(p);
938 EXTRACT_MBC_AND_INCR(p);
939 */
940 }
941 break;
942
943 case on_failure_jump:
944 EXTRACT_NUMBER_AND_INCR (mcnt, p);
945 if (mcnt > 0) p += mcnt;
946 if ((enum regexpcode)p[-3] == jump) {
947 p -= 3;
948 EXTRACT_NUMBER_AND_INCR (mcnt, p);
949 if (mcnt > 0) p += mcnt;
950 }
951 break;
952
953 case dummy_failure_jump:
954 case succeed_n:
955 case try_next:
956 case jump:
957 EXTRACT_NUMBER_AND_INCR (mcnt, p);
958 if (mcnt > 0) p += mcnt;
959 break;
960
961 case start_nowidth:
962 case stop_nowidth:
963 case finalize_jump:
964 case maybe_finalize_jump:
965 case finalize_push:
966 p += 2;
967 break;
968
969 case jump_n:
970 case set_number_at:
971 case finalize_push_n:
972 p += 4;
973 break;
974
975 default:
976 break;
977 }
978 }
979 return must;
980 }
981
982
983 /* re_compile_pattern takes a regular-expression string
984 and converts it into a buffer full of byte commands for matching.
985
986 PATTERN is the address of the pattern string
987 SIZE is the length of it.
988 BUFP is a struct re_pattern_buffer * which points to the info
989 on where to store the byte commands.
990 This structure contains a char * which points to the
991 actual space, which should have been obtained with malloc.
992 re_compile_pattern may use realloc to grow the buffer space.
993
994 The number of bytes of commands can be found out by looking in
995 the `struct re_pattern_buffer' that bufp pointed to, after
996 re_compile_pattern returns. */
997
998 char *
re_compile_pattern(pattern,size,bufp)999 re_compile_pattern(pattern, size, bufp)
1000 char *pattern;
1001 size_t size;
1002 struct re_pattern_buffer *bufp;
1003 {
1004 register char *b = bufp->buffer;
1005 register char *p = pattern;
1006 char *pend = pattern + size;
1007 register unsigned c, c1;
1008 char *p0;
1009 int numlen;
1010
1011 /* Address of the count-byte of the most recently inserted `exactn'
1012 command. This makes it possible to tell whether a new exact-match
1013 character can be added to that command or requires a new `exactn'
1014 command. */
1015
1016 char *pending_exact = 0;
1017
1018 /* Address of the place where a forward-jump should go to the end of
1019 the containing expression. Each alternative of an `or', except the
1020 last, ends with a forward-jump of this sort. */
1021
1022 char *fixup_alt_jump = 0;
1023
1024 /* Address of start of the most recently finished expression.
1025 This tells postfix * where to find the start of its operand. */
1026
1027 char *laststart = 0;
1028
1029 /* In processing a repeat, 1 means zero matches is allowed. */
1030
1031 char zero_times_ok;
1032
1033 /* In processing a repeat, 1 means many matches is allowed. */
1034
1035 char many_times_ok;
1036
1037 /* In processing a repeat, 1 means non-greedy matches. */
1038
1039 char greedy;
1040
1041 /* Address of beginning of regexp, or inside of last \(. */
1042
1043 char *begalt = b;
1044
1045 /* Place in the uncompiled pattern (i.e., the {) to
1046 which to go back if the interval is invalid. */
1047 char *beg_interval;
1048
1049 /* In processing an interval, at least this many matches must be made. */
1050 int lower_bound;
1051
1052 /* In processing an interval, at most this many matches can be made. */
1053 int upper_bound;
1054
1055 /* Stack of information saved by \( and restored by \).
1056 Five stack elements are pushed by each \(:
1057 First, the value of b.
1058 Second, the value of fixup_alt_jump.
1059 Third, the value of begalt.
1060 Fourth, the value of regnum.
1061 Fifth, the type of the paren. */
1062
1063 int *stackb = RE_TALLOC(40, int);
1064 int *stackp = stackb;
1065 int *stacke = stackb + 40;
1066 int *stackt;
1067
1068 /* Counts \('s as they are encountered. Remembered for the matching \),
1069 where it becomes the register number to put in the stop_memory
1070 command. */
1071
1072 int regnum = 1;
1073
1074 int range = 0;
1075 int had_char_class = 0;
1076
1077 int options = bufp->options;
1078
1079 bufp->fastmap_accurate = 0;
1080 bufp->must = 0;
1081 bufp->must_skip = 0;
1082 bufp->stclass = 0;
1083
1084 /* Initialize the syntax table. */
1085 init_syntax_once();
1086
1087 if (bufp->allocated == 0) {
1088 bufp->allocated = INIT_BUF_SIZE;
1089 if (bufp->buffer)
1090 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
1091 bufp->buffer = (char *) xrealloc (bufp->buffer, INIT_BUF_SIZE);
1092 else
1093 /* Caller did not allocate a buffer. Do it for them. */
1094 bufp->buffer = (char *) xmalloc(INIT_BUF_SIZE);
1095 if (!bufp->buffer) goto memory_exhausted;
1096 begalt = b = bufp->buffer;
1097 }
1098
1099 while (p != pend) {
1100 PATFETCH(c);
1101
1102 switch (c)
1103 {
1104 case '$':
1105 {
1106 p0 = p;
1107 /* When testing what follows the $,
1108 look past the \-constructs that don't consume anything. */
1109
1110 while (p0 != pend)
1111 {
1112 if (*p0 == '\\' && p0 + 1 != pend
1113 && (p0[1] == 'b' || p0[1] == 'B'))
1114 p0 += 2;
1115 else
1116 break;
1117 }
1118 /* $ means succeed if at end of line, but only in special contexts.
1119 If validly in the middle of a pattern, it is a normal character. */
1120
1121 if (p0 == pend || *p0 == '\n'
1122 || *p0 == ')'
1123 || *p0 == '|')
1124 {
1125 BUFPUSH(endline);
1126 break;
1127 }
1128 goto normal_char;
1129 }
1130 case '^':
1131 /* ^ means succeed if at beg of line, but only if no preceding
1132 pattern. */
1133
1134 if (laststart)
1135 goto invalid_pattern;
1136 if (laststart && p - 2 >= pattern && p[-2] != '\n')
1137 goto normal_char;
1138 BUFPUSH(begline);
1139 break;
1140
1141 case '+':
1142 case '?':
1143 case '*':
1144 /* If there is no previous pattern, char not special. */
1145 if (!laststart) {
1146 goto invalid_pattern;
1147 }
1148 /* If there is a sequence of repetition chars,
1149 collapse it down to just one. */
1150 zero_times_ok = c != '+';
1151 many_times_ok = c != '?';
1152 greedy = 1;
1153 if (p != pend) {
1154 PATFETCH(c);
1155 switch (c) {
1156 case '?':
1157 greedy = 0;
1158 break;
1159 case '*':
1160 case '+':
1161 goto nested_meta;
1162 default:
1163 PATUNFETCH;
1164 break;
1165 }
1166 }
1167
1168 repeat:
1169 /* Star, etc. applied to an empty pattern is equivalent
1170 to an empty pattern. */
1171 if (!laststart)
1172 break;
1173
1174 /* Now we know whether or not zero matches is allowed
1175 and also whether or not two or more matches is allowed. */
1176 if (many_times_ok) {
1177 /* If more than one repetition is allowed, put in at the
1178 end a backward relative jump from b to before the next
1179 jump we're going to put in below (which jumps from
1180 laststart to after this jump). */
1181 GET_BUFFER_SPACE(3);
1182 store_jump(b,greedy?maybe_finalize_jump:finalize_push,laststart-3);
1183 b += 3; /* Because store_jump put stuff here. */
1184 }
1185
1186 /* On failure, jump from laststart to next pattern, which will be the
1187 end of the buffer after this jump is inserted. */
1188 GET_BUFFER_SPACE(3);
1189 insert_jump(on_failure_jump, laststart, b + 3, b);
1190 b += 3;
1191
1192 if (zero_times_ok) {
1193 if (greedy == 0) {
1194 GET_BUFFER_SPACE(3);
1195 insert_jump(try_next, laststart, b + 3, b);
1196 b += 3;
1197 }
1198 }
1199 else {
1200 /* At least one repetition is required, so insert a
1201 `dummy_failure_jump' before the initial
1202 `on_failure_jump' instruction of the loop. This
1203 effects a skip over that instruction the first time
1204 we hit that loop. */
1205 GET_BUFFER_SPACE(3);
1206 insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
1207 b += 3;
1208 }
1209 break;
1210
1211 case '.':
1212 laststart = b;
1213 BUFPUSH(anychar);
1214 break;
1215
1216 case '[':
1217 if (p == pend)
1218 goto invalid_pattern;
1219 while (b - bufp->buffer
1220 > bufp->allocated - 9 - (1 << BYTEWIDTH) / BYTEWIDTH)
1221 EXTEND_BUFFER;
1222
1223 laststart = b;
1224 if (*p == '^')
1225 {
1226 BUFPUSH(charset_not);
1227 p++;
1228 }
1229 else
1230 BUFPUSH(charset);
1231 p0 = p;
1232
1233 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1234 /* Clear the whole map */
1235 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
1236
1237 if ((re_syntax_options & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
1238 SET_LIST_BIT('\n');
1239
1240 had_char_class = 0;
1241 /* Read in characters and ranges, setting map bits. */
1242 for (;;)
1243 {
1244 int size;
1245 unsigned last = (unsigned)-1;
1246
1247 if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH]))) {
1248 /* Ensure the space is enough to hold another interval
1249 of multi-byte chars in charset(_not)?. */
1250 size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*4 + 4;
1251 while (b + size + 1 > bufp->buffer + bufp->allocated)
1252 EXTEND_BUFFER;
1253 }
1254 range_retry:
1255 PATFETCH(c);
1256
1257 if (c == ']') {
1258 if (p == p0 + 1) {
1259 if (p == pend)
1260 goto invalid_pattern;
1261 }
1262 else
1263 /* Stop if this isn't merely a ] inside a bracket
1264 expression, but rather the end of a bracket
1265 expression. */
1266 break;
1267 }
1268 /* Look ahead to see if it's a range when the last thing
1269 was a character class. */
1270 if (had_char_class && c == '-' && *p != ']')
1271 goto invalid_pattern;
1272 if (ismbchar(c)) {
1273 PATFETCH(c1);
1274 c = c << BYTEWIDTH | c1;
1275 }
1276
1277 /* \ escapes characters when inside [...]. */
1278 if (c == '\\') {
1279 PATFETCH(c);
1280 switch (c) {
1281 case 'w':
1282 for (c = 0; c < (1 << BYTEWIDTH); c++)
1283 if (SYNTAX(c) == Sword)
1284 SET_LIST_BIT(c);
1285 last = -1;
1286 continue;
1287
1288 case 'W':
1289 for (c = 0; c < (1 << BYTEWIDTH); c++)
1290 if (SYNTAX(c) != Sword)
1291 SET_LIST_BIT(c);
1292 if (current_mbctype) {
1293 set_list_bits(0x8000, 0xffff, (unsigned char*)b);
1294 }
1295 last = -1;
1296 continue;
1297
1298 case 's':
1299 for (c = 0; c < 256; c++)
1300 if (ISSPACE(c))
1301 SET_LIST_BIT(c);
1302 last = -1;
1303 continue;
1304
1305 case 'S':
1306 for (c = 0; c < 256; c++)
1307 if (!ISSPACE(c))
1308 SET_LIST_BIT(c);
1309 if (current_mbctype) {
1310 set_list_bits(0x8000, 0xffff, (unsigned char*)b);
1311 }
1312 last = -1;
1313 continue;
1314
1315 case 'd':
1316 for (c = '0'; c <= '9'; c++)
1317 SET_LIST_BIT(c);
1318 last = -1;
1319 continue;
1320
1321 case 'D':
1322 for (c = 0; c < 256; c++)
1323 if (!ISDIGIT(c))
1324 SET_LIST_BIT(c);
1325 if (current_mbctype) {
1326 set_list_bits(0x8000, 0xffff, (unsigned char*)b);
1327 }
1328 last = -1;
1329 continue;
1330
1331 case 'x':
1332 c = scan_hex(p, 2, &numlen);
1333 if (current_mbctype && c > 0x7f)
1334 c = 0xff00 | c;
1335 p += numlen;
1336 break;
1337
1338 case '0': case '1': case '2': case '3': case '4':
1339 case '5': case '6': case '7': case '8': case '9':
1340 PATUNFETCH;
1341 c = scan_oct(p, 3, &numlen);
1342 if (ismbchar(c))
1343 c |= 0xff00;
1344 p += numlen;
1345 break;
1346
1347 default:
1348 if (ismbchar(c)) {
1349 PATFETCH(c1);
1350 c = c << 8 | c1;
1351 }
1352 break;
1353 }
1354 }
1355
1356 /* Get a range. */
1357 if (range) {
1358 if (last > c)
1359 goto invalid_pattern;
1360
1361 range = 0;
1362 if (last < 1 << BYTEWIDTH && c < 1 << BYTEWIDTH) {
1363 for (;last<=c;last++)
1364 SET_LIST_BIT(last);
1365 }
1366 else {
1367 set_list_bits(last, c, (unsigned char*)b);
1368 }
1369 }
1370 else if (p[0] == '-' && p[1] != ']') {
1371 last = c;
1372 PATFETCH(c1);
1373 range = 1;
1374 goto range_retry;
1375 }
1376 else if ((re_syntax_options & RE_CHAR_CLASSES)
1377 && c == '[' && *p == ':') {
1378 /* Leave room for the null. */
1379 char str[CHAR_CLASS_MAX_LENGTH + 1];
1380
1381 PATFETCH_RAW (c);
1382 c1 = 0;
1383
1384 /* If pattern is `[[:'. */
1385 if (p == pend)
1386 goto invalid_pattern;
1387
1388 for (;;) {
1389 PATFETCH (c);
1390 if (c == ':' || c == ']' || p == pend
1391 || c1 == CHAR_CLASS_MAX_LENGTH)
1392 break;
1393 str[c1++] = c;
1394 }
1395 str[c1] = '\0';
1396
1397 /* If isn't a word bracketed by `[:' and:`]':
1398 undo the ending character, the letters, and leave
1399 the leading `:' and `[' (but set bits for them). */
1400 if (c == ':' && *p == ']') {
1401 int ch;
1402 char is_alnum = STREQ (str, "alnum");
1403 char is_alpha = STREQ (str, "alpha");
1404 char is_blank = STREQ (str, "blank");
1405 char is_cntrl = STREQ (str, "cntrl");
1406 char is_digit = STREQ (str, "digit");
1407 char is_graph = STREQ (str, "graph");
1408 char is_lower = STREQ (str, "lower");
1409 char is_print = STREQ (str, "print");
1410 char is_punct = STREQ (str, "punct");
1411 char is_space = STREQ (str, "space");
1412 char is_upper = STREQ (str, "upper");
1413 char is_xdigit = STREQ (str, "xdigit");
1414
1415 if (!IS_CHAR_CLASS (str))
1416 goto invalid_pattern;
1417
1418 /* Throw away the ] at the end of the character
1419 class. */
1420 PATFETCH (c);
1421
1422 if (p == pend)
1423 goto invalid_pattern;
1424
1425
1426 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1427 if ( (is_alnum && ISALNUM (ch))
1428 || (is_alpha && ISALPHA (ch))
1429 || (is_blank && ISBLANK (ch))
1430 || (is_cntrl && ISCNTRL (ch))
1431 || (is_digit && ISDIGIT (ch))
1432 || (is_graph && ISGRAPH (ch))
1433 || (is_lower && ISLOWER (ch))
1434 || (is_print && ISPRINT (ch))
1435 || (is_punct && ISPUNCT (ch))
1436 || (is_space && ISSPACE (ch))
1437 || (is_upper && ISUPPER (ch))
1438 || (is_xdigit && ISXDIGIT (ch)))
1439 SET_LIST_BIT (ch);
1440 }
1441 had_char_class = 1;
1442 }
1443 else {
1444 c1++;
1445 while (c1--)
1446 PATUNFETCH;
1447 SET_LIST_BIT(translate?translate['[']:'[');
1448 SET_LIST_BIT(translate?translate[':']:':');
1449 had_char_class = 0;
1450 last = ':';
1451 }
1452 }
1453 else if (c < 1 << BYTEWIDTH)
1454 SET_LIST_BIT(c);
1455 else
1456 set_list_bits(c, c, (unsigned char*)b);
1457 }
1458
1459 /* Discard any character set/class bitmap bytes that are all
1460 0 at the end of the map. Decrement the map-length byte too. */
1461 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1462 b[-1]--;
1463 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
1464 memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
1465 2 + EXTRACT_UNSIGNED (&b[(1 << BYTEWIDTH) / BYTEWIDTH])*4);
1466 b += b[-1] + 2 + EXTRACT_UNSIGNED (&b[b[-1]])*4;
1467 break;
1468
1469 case '(':
1470 PATFETCH(c);
1471 if (c == '?') {
1472 int negative = 0;
1473 PATFETCH_RAW(c);
1474 switch (c) {
1475 case 'x': case 'i': case '-':
1476 for (;;) {
1477 switch (c) {
1478 case '-':
1479 negative = 1;
1480 break;
1481
1482 case ')':
1483 case ':':
1484 break;
1485
1486 case 'x':
1487 if (negative)
1488 options &= ~RE_OPTION_EXTENDED;
1489 else
1490 options |= RE_OPTION_EXTENDED;
1491 break;
1492 case 'i':
1493 if (negative) {
1494 if (options&RE_OPTION_IGNORECASE) {
1495 options &= ~RE_OPTION_IGNORECASE;
1496 BUFPUSH(casefold_off);
1497 }
1498 }
1499 else if (!(options&RE_OPTION_IGNORECASE)) {
1500 options |= RE_OPTION_IGNORECASE;
1501 BUFPUSH(casefold_on);
1502 }
1503 break;
1504
1505 default:
1506 FREE_AND_RETURN(stackb, "undefined (?...) inline option");
1507 }
1508 if (c == ')') {
1509 c = '#'; /* read whole in-line options */
1510 break;
1511 }
1512 if (c == ':') break;
1513 PATFETCH_RAW(c);
1514 }
1515 break;
1516
1517 case '#':
1518 for (;;) {
1519 PATFETCH(c);
1520 if (c == ')') break;
1521 }
1522 c = '#';
1523 break;
1524
1525 case ':':
1526 case '=':
1527 case '!':
1528 break;
1529
1530 default:
1531 FREE_AND_RETURN(stackb, "undefined (?...) sequence");
1532 }
1533 }
1534 else {
1535 PATUNFETCH;
1536 c = '(';
1537 }
1538 if (c == '#') break;
1539 if (stackp+8 >= stacke) {
1540 int *stackx;
1541 unsigned int len = stacke - stackb;
1542
1543 stackx = DOUBLE_STACK(stackx,stackb,len,int);
1544 /* Rearrange the pointers. */
1545 stackp = stackx + (stackp - stackb);
1546 stackb = stackx;
1547 stacke = stackb + 2 * len;
1548 }
1549
1550 /* Laststart should point to the start_memory that we are about
1551 to push (unless the pattern has RE_NREGS or more ('s). */
1552 /* obsolete: now RE_NREGS is just a default register size. */
1553 *stackp++ = b - bufp->buffer;
1554 *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1555 *stackp++ = begalt - bufp->buffer;
1556 switch (c) {
1557 case '(':
1558 BUFPUSH(start_memory);
1559 BUFPUSH(regnum);
1560 *stackp++ = regnum++;
1561 *stackp++ = b - bufp->buffer;
1562 BUFPUSH(0);
1563 /* too many ()'s to fit in a byte. (max 254) */
1564 if (regnum >= RE_REG_MAX) goto too_big;
1565 break;
1566
1567 case '=':
1568 case '!':
1569 BUFPUSH(start_nowidth);
1570 *stackp++ = b - bufp->buffer;
1571 BUFPUSH(0); /* temporary value */
1572 BUFPUSH(0);
1573 if (c == '=') break;
1574
1575 BUFPUSH(on_failure_jump);
1576 *stackp++ = b - bufp->buffer;
1577 BUFPUSH(0); /* temporary value */
1578 BUFPUSH(0);
1579 break;
1580
1581 case ':':
1582 pending_exact = 0;
1583 default:
1584 break;
1585 }
1586 *stackp++ = c;
1587 *stackp++ = options;
1588 fixup_alt_jump = 0;
1589 laststart = 0;
1590 begalt = b;
1591 break;
1592
1593 case ')':
1594 if (stackp == stackb)
1595 FREE_AND_RETURN(stackb, "unmatched )");
1596 if ((options ^ stackp[-1]) & RE_OPTION_IGNORECASE) {
1597 BUFPUSH((options&RE_OPTION_IGNORECASE)?casefold_off:casefold_on);
1598 }
1599 pending_exact = 0;
1600 if (fixup_alt_jump)
1601 { /* Push a dummy failure point at the end of the
1602 alternative for a possible future
1603 `finalize_jump' to pop. See comments at
1604 `push_dummy_failure' in `re_match'. */
1605 BUFPUSH(push_dummy_failure);
1606
1607 /* We allocated space for this jump when we assigned
1608 to `fixup_alt_jump', in the `handle_alt' case below. */
1609 store_jump(fixup_alt_jump, jump, b);
1610 }
1611 options = *--stackp;
1612 switch (c = *--stackp) {
1613 case '(':
1614 {
1615 char *loc = bufp->buffer + *--stackp;
1616 *loc = regnum - stackp[-1];
1617 BUFPUSH(stop_memory);
1618 BUFPUSH(stackp[-1]);
1619 BUFPUSH(regnum - stackp[-1]);
1620 stackp--;
1621 }
1622 break;
1623
1624 case '!':
1625 BUFPUSH(pop_and_fail);
1626 /* back patch */
1627 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1628 stackp--;
1629 /* fall through */
1630 case '=':
1631 BUFPUSH(stop_nowidth);
1632 /* tell stack-pos place to start_nowidth */
1633 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1634 BUFPUSH(0); /* space to hold stack pos */
1635 BUFPUSH(0);
1636 stackp--;
1637 break;
1638
1639 default:
1640 break;
1641 }
1642 begalt = *--stackp + bufp->buffer;
1643 stackp--;
1644 fixup_alt_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
1645 laststart = *--stackp + bufp->buffer;
1646 if (c == '!' || c == '=') laststart = b;
1647 break;
1648
1649 case '|':
1650 /* Insert before the previous alternative a jump which
1651 jumps to this alternative if the former fails. */
1652 GET_BUFFER_SPACE(3);
1653 insert_jump(on_failure_jump, begalt, b + 6, b);
1654 pending_exact = 0;
1655 b += 3;
1656 /* The alternative before this one has a jump after it
1657 which gets executed if it gets matched. Adjust that
1658 jump so it will jump to this alternative's analogous
1659 jump (put in below, which in turn will jump to the next
1660 (if any) alternative's such jump, etc.). The last such
1661 jump jumps to the correct final destination. A picture:
1662 _____ _____
1663 | | | |
1664 | v | v
1665 a | b | c
1666
1667 If we are at `b', then fixup_alt_jump right now points to a
1668 three-byte space after `a'. We'll put in the jump, set
1669 fixup_alt_jump to right after `b', and leave behind three
1670 bytes which we'll fill in when we get to after `c'. */
1671
1672 if (fixup_alt_jump)
1673 store_jump(fixup_alt_jump, jump_past_alt, b);
1674
1675 /* Mark and leave space for a jump after this alternative,
1676 to be filled in later either by next alternative or
1677 when know we're at the end of a series of alternatives. */
1678 fixup_alt_jump = b;
1679 GET_BUFFER_SPACE(3);
1680 b += 3;
1681
1682 laststart = 0;
1683 begalt = b;
1684 break;
1685
1686 case '{':
1687 /* If there is no previous pattern, this isn't an interval. */
1688 if (!laststart || p == pend)
1689 {
1690 goto normal_backsl;
1691 }
1692
1693 beg_interval = p - 1;
1694
1695 lower_bound = -1; /* So can see if are set. */
1696 upper_bound = -1;
1697 GET_UNSIGNED_NUMBER(lower_bound);
1698 if (c == ',') {
1699 GET_UNSIGNED_NUMBER(upper_bound);
1700 }
1701 else
1702 /* Interval such as `{1}' => match exactly once. */
1703 upper_bound = lower_bound;
1704
1705 if (lower_bound < 0 || c != '}')
1706 goto unfetch_interval;
1707
1708 if (lower_bound >= RE_DUP_MAX || upper_bound >= RE_DUP_MAX)
1709 FREE_AND_RETURN(stackb, "too big quantifier in {,}");
1710 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1711 if (lower_bound > upper_bound)
1712 FREE_AND_RETURN(stackb, "can't do {n,m} with n > m");
1713
1714 beg_interval = 0;
1715 pending_exact = 0;
1716
1717 greedy = 1;
1718 if (p != pend) {
1719 PATFETCH(c);
1720 if (c == '?') greedy = 0;
1721 else PATUNFETCH;
1722 }
1723
1724 if (lower_bound == 0) {
1725 zero_times_ok = 1;
1726 if (upper_bound == RE_DUP_MAX) {
1727 many_times_ok = 1;
1728 goto repeat;
1729 }
1730 if (upper_bound == 1) {
1731 many_times_ok = 0;
1732 goto repeat;
1733 }
1734 }
1735 if (lower_bound == 1) {
1736 if (upper_bound == 1) {
1737 /* No need to repeat */
1738 break;
1739 }
1740 if (upper_bound == RE_DUP_MAX) {
1741 many_times_ok = 1;
1742 zero_times_ok = 0;
1743 goto repeat;
1744 }
1745 }
1746
1747 /* If upper_bound is zero, don't want to succeed at all;
1748 jump from laststart to b + 3, which will be the end of
1749 the buffer after this jump is inserted. */
1750
1751 if (upper_bound == 0) {
1752 GET_BUFFER_SPACE(3);
1753 insert_jump(jump, laststart, b + 3, b);
1754 b += 3;
1755 break;
1756 }
1757
1758 /* Otherwise, we have a nontrivial interval. When
1759 we're all done, the pattern will look like:
1760 set_number_at <jump count> <upper bound>
1761 set_number_at <succeed_n count> <lower bound>
1762 succeed_n <after jump addr> <succed_n count>
1763 <body of loop>
1764 jump_n <succeed_n addr> <jump count>
1765 (The upper bound and `jump_n' are omitted if
1766 `upper_bound' is 1, though.) */
1767 { /* If the upper bound is > 1, we need to insert
1768 more at the end of the loop. */
1769 unsigned nbytes = upper_bound == 1 ? 10 : 20;
1770
1771 GET_BUFFER_SPACE(nbytes);
1772 /* Initialize lower bound of the `succeed_n', even
1773 though it will be set during matching by its
1774 attendant `set_number_at' (inserted next),
1775 because `re_compile_fastmap' needs to know.
1776 Jump to the `jump_n' we might insert below. */
1777 insert_jump_n(succeed_n, laststart, b + (nbytes/2),
1778 b, lower_bound);
1779 b += 5; /* Just increment for the succeed_n here. */
1780
1781 /* Code to initialize the lower bound. Insert
1782 before the `succeed_n'. The `5' is the last two
1783 bytes of this `set_number_at', plus 3 bytes of
1784 the following `succeed_n'. */
1785 insert_op_2(set_number_at, laststart, b, 5, lower_bound);
1786 b += 5;
1787
1788 if (upper_bound > 1)
1789 { /* More than one repetition is allowed, so
1790 append a backward jump to the `succeed_n'
1791 that starts this interval.
1792
1793 When we've reached this during matching,
1794 we'll have matched the interval once, so
1795 jump back only `upper_bound - 1' times. */
1796 GET_BUFFER_SPACE(5);
1797 store_jump_n(b, greedy?jump_n:finalize_push_n, laststart + 5,
1798 upper_bound - 1);
1799 b += 5;
1800
1801 /* The location we want to set is the second
1802 parameter of the `jump_n'; that is `b-2' as
1803 an absolute address. `laststart' will be
1804 the `set_number_at' we're about to insert;
1805 `laststart+3' the number to set, the source
1806 for the relative address. But we are
1807 inserting into the middle of the pattern --
1808 so everything is getting moved up by 5.
1809 Conclusion: (b - 2) - (laststart + 3) + 5,
1810 i.e., b - laststart.
1811
1812 We insert this at the beginning of the loop
1813 so that if we fail during matching, we'll
1814 reinitialize the bounds. */
1815 insert_op_2(set_number_at, laststart, b, b - laststart,
1816 upper_bound - 1);
1817 b += 5;
1818 }
1819 }
1820 break;
1821
1822 unfetch_interval:
1823 /* If an invalid interval, match the characters as literals. */
1824 p = beg_interval;
1825 beg_interval = 0;
1826
1827 /* normal_char and normal_backslash need `c'. */
1828 PATFETCH (c);
1829 goto normal_char;
1830
1831 case '\\':
1832 if (p == pend) goto invalid_pattern;
1833 /* Do not translate the character after the \, so that we can
1834 distinguish, e.g., \B from \b, even if we normally would
1835 translate, e.g., B to b. */
1836 PATFETCH_RAW(c);
1837 switch (c)
1838 {
1839 case 's':
1840 case 'S':
1841 case 'd':
1842 case 'D':
1843 while (b - bufp->buffer
1844 > bufp->allocated - 9 - (1 << BYTEWIDTH) / BYTEWIDTH)
1845 EXTEND_BUFFER;
1846
1847 laststart = b;
1848 if (c == 's' || c == 'd') {
1849 BUFPUSH(charset);
1850 }
1851 else {
1852 BUFPUSH(charset_not);
1853 }
1854
1855 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1856 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
1857 if (c == 's' || c == 'S') {
1858 SET_LIST_BIT(' ');
1859 SET_LIST_BIT('\t');
1860 SET_LIST_BIT('\n');
1861 SET_LIST_BIT('\r');
1862 SET_LIST_BIT('\f');
1863 }
1864 else {
1865 char cc;
1866
1867 for (cc = '0'; cc <= '9'; cc++) {
1868 SET_LIST_BIT(cc);
1869 }
1870 }
1871
1872 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1873 b[-1]--;
1874 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
1875 memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
1876 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*4);
1877 b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*4;
1878 break;
1879
1880 case 'w':
1881 laststart = b;
1882 BUFPUSH(wordchar);
1883 break;
1884
1885 case 'W':
1886 laststart = b;
1887 BUFPUSH(notwordchar);
1888 break;
1889
1890 case '<':
1891 BUFPUSH(wordbeg);
1892 break;
1893
1894 case '>':
1895 BUFPUSH(wordend);
1896 break;
1897
1898 case 'b':
1899 BUFPUSH(wordbound);
1900 break;
1901
1902 case 'B':
1903 BUFPUSH(notwordbound);
1904 break;
1905
1906 case 'A':
1907 BUFPUSH(begbuf);
1908 break;
1909
1910 case 'Z':
1911 BUFPUSH(endbuf2);
1912 break;
1913
1914 case 'z':
1915 BUFPUSH(endbuf);
1916 break;
1917
1918 /* hex */
1919 case 'x':
1920 c1 = 0;
1921 c = scan_hex(p, 2, &numlen);
1922 p += numlen;
1923 if (c > 0x7f)
1924 c1 = 0xff;
1925 goto numeric_char;
1926
1927 /* octal */
1928 case '0':
1929 c1 = 0;
1930 c = scan_oct(p, 3, &numlen);
1931 p += numlen;
1932 if (c > 0x7f)
1933 c1 = 0xff;
1934 goto numeric_char;
1935
1936 /* back-ref or octal */
1937 case '1': case '2': case '3':
1938 case '4': case '5': case '6':
1939 case '7': case '8': case '9':
1940 {
1941 char *p_save;
1942
1943 PATUNFETCH;
1944 p_save = p;
1945
1946 c1 = 0;
1947 GET_UNSIGNED_NUMBER(c1);
1948 if (!ISDIGIT(c)) PATUNFETCH;
1949
1950 if (c1 >= regnum) {
1951 /* need to get octal */
1952 p = p_save;
1953 c = scan_oct(p_save, 3, &numlen) & 0xff;
1954 p = p_save + numlen;
1955 c1 = 0;
1956 if (c > 0x7f)
1957 c1 = 0xff;
1958 goto numeric_char;
1959 }
1960 }
1961
1962 /* Can't back reference to a subexpression if inside of it. */
1963 for (stackt = stackp - 2; stackt > stackb; stackt -= 5)
1964 if (*stackt == c1)
1965 goto normal_char;
1966 laststart = b;
1967 BUFPUSH(duplicate);
1968 BUFPUSH(c1);
1969 break;
1970
1971 default:
1972 normal_backsl:
1973 goto normal_char;
1974 }
1975 break;
1976
1977 case '#':
1978 if (options & RE_OPTION_EXTENDED)
1979 {
1980 while (p != pend) {
1981 PATFETCH(c);
1982 if (c == '\n') break;
1983 }
1984 break;
1985 }
1986 goto normal_char;
1987
1988 case ' ':
1989 case '\t':
1990 case '\f':
1991 case '\r':
1992 case '\n':
1993 if (options & RE_OPTION_EXTENDED)
1994 break;
1995
1996 default:
1997 normal_char: /* Expects the character in `c'. */
1998 c1 = 0;
1999 if (ismbchar(c)) {
2000 c1 = c;
2001 PATFETCH_RAW(c);
2002 }
2003 else if (c > 0x7f) {
2004 c1 = 0xff;
2005 }
2006 numeric_char:
2007 if (!pending_exact || pending_exact + *pending_exact + 1 != b
2008 || *pending_exact >= (c1 ? 0176 : 0177)
2009 || *p == '+' || *p == '?'
2010 || *p == '*' || *p == '^'
2011 || *p == '{')
2012 {
2013 laststart = b;
2014 BUFPUSH(exactn);
2015 pending_exact = b;
2016 BUFPUSH(0);
2017 }
2018 if (c1) {
2019 BUFPUSH(c1);
2020 (*pending_exact)++;
2021 }
2022 BUFPUSH(c);
2023 (*pending_exact)++;
2024 }
2025 }
2026
2027 if (fixup_alt_jump)
2028 store_jump(fixup_alt_jump, jump, b);
2029
2030 if (stackp != stackb)
2031 FREE_AND_RETURN(stackb, "unmatched (");
2032
2033 /* set optimize flags */
2034 laststart = bufp->buffer;
2035 if (*laststart == start_memory) laststart += 3;
2036 if (*laststart == dummy_failure_jump) laststart += 3;
2037 else if (*laststart == try_next) laststart += 3;
2038 if (*laststart == on_failure_jump) {
2039 int mcnt;
2040
2041 laststart++;
2042 EXTRACT_NUMBER_AND_INCR(mcnt, laststart);
2043 if (mcnt == 4 && *laststart == anychar) {
2044 bufp->options |= RE_OPTIMIZE_ANCHOR;
2045 }
2046 else if (*laststart == charset || *laststart == charset_not) {
2047 p0 = laststart;
2048 mcnt = *++p0 ;
2049 p0 += mcnt+1;
2050 mcnt = EXTRACT_UNSIGNED_AND_INCR(p0);
2051 p0 += 4*mcnt;
2052 if (*p0 == maybe_finalize_jump) {
2053 bufp->stclass = laststart;
2054 }
2055 }
2056 }
2057
2058 bufp->used = b - bufp->buffer;
2059 bufp->re_nsub = regnum;
2060 laststart = bufp->buffer;
2061 if (*laststart == start_memory) laststart += 3;
2062 if (*laststart == exactn) {
2063 bufp->options |= RE_OPTIMIZE_EXACTN;
2064 bufp->must = laststart+1;
2065 }
2066 else {
2067 bufp->must = calculate_must_string(bufp->buffer, b);
2068 }
2069 if (current_mbctype == MBCTYPE_SJIS) bufp->options |= RE_OPTIMIZE_NO_BM;
2070 else if (bufp->must) {
2071 int i;
2072 int len = (unsigned char)bufp->must[0];
2073
2074 for (i=1; i<len; i++) {
2075 if ((unsigned char)bufp->must[i] == 0xff ||
2076 (current_mbctype == MBCTYPE_EUC && ismbchar(bufp->must[i]))) {
2077 bufp->options |= RE_OPTIMIZE_NO_BM;
2078 break;
2079 }
2080 }
2081 if (!(bufp->options & RE_OPTIMIZE_NO_BM)) {
2082 bufp->must_skip = (int *) xmalloc((1 << BYTEWIDTH)*sizeof(int));
2083 bm_init_skip(bufp->must_skip, bufp->must+1,
2084 (unsigned char)bufp->must[0],
2085 MAY_TRANSLATE()?translate:0);
2086 }
2087 }
2088
2089 FREE_AND_RETURN(stackb, 0);
2090
2091 invalid_pattern:
2092 FREE_AND_RETURN(stackb, "invalid regular expression");
2093
2094 end_of_pattern:
2095 FREE_AND_RETURN(stackb, "premature end of regular expression");
2096
2097 too_big:
2098 FREE_AND_RETURN(stackb, "regular expression too big");
2099
2100 memory_exhausted:
2101 FREE_AND_RETURN(stackb, "memory exhausted");
2102
2103 nested_meta:
2104 FREE_AND_RETURN(stackb, "nested *?+ in regexp");
2105 }
2106
2107 void
re_free_pattern(bufp)2108 re_free_pattern(bufp)
2109 struct re_pattern_buffer *bufp;
2110 {
2111 free(bufp->buffer);
2112 free(bufp->fastmap);
2113 if (bufp->must_skip) free(bufp->must_skip);
2114 free(bufp);
2115 }
2116
2117 /* Store a jump of the form <OPCODE> <relative address>.
2118 Store in the location FROM a jump operation to jump to relative
2119 address FROM - TO. OPCODE is the opcode to store. */
2120
2121 static void
store_jump(from,opcode,to)2122 store_jump(from, opcode, to)
2123 char *from, *to;
2124 int opcode;
2125 {
2126 from[0] = (char)opcode;
2127 STORE_NUMBER(from + 1, to - (from + 3));
2128 }
2129
2130
2131 /* Open up space before char FROM, and insert there a jump to TO.
2132 CURRENT_END gives the end of the storage not in use, so we know
2133 how much data to copy up. OP is the opcode of the jump to insert.
2134
2135 If you call this function, you must zero out pending_exact. */
2136
2137 static void
insert_jump(op,from,to,current_end)2138 insert_jump(op, from, to, current_end)
2139 int op;
2140 char *from, *to, *current_end;
2141 {
2142 register char *pfrom = current_end; /* Copy from here... */
2143 register char *pto = current_end + 3; /* ...to here. */
2144
2145 while (pfrom != from)
2146 *--pto = *--pfrom;
2147 store_jump(from, op, to);
2148 }
2149
2150
2151 /* Store a jump of the form <opcode> <relative address> <n> .
2152
2153 Store in the location FROM a jump operation to jump to relative
2154 address FROM - TO. OPCODE is the opcode to store, N is a number the
2155 jump uses, say, to decide how many times to jump.
2156
2157 If you call this function, you must zero out pending_exact. */
2158
2159 static void
store_jump_n(from,opcode,to,n)2160 store_jump_n(from, opcode, to, n)
2161 char *from, *to;
2162 int opcode;
2163 unsigned n;
2164 {
2165 from[0] = (char)opcode;
2166 STORE_NUMBER(from + 1, to - (from + 3));
2167 STORE_NUMBER(from + 3, n);
2168 }
2169
2170
2171 /* Similar to insert_jump, but handles a jump which needs an extra
2172 number to handle minimum and maximum cases. Open up space at
2173 location FROM, and insert there a jump to TO. CURRENT_END gives the
2174 end of the storage in use, so we know how much data to copy up. OP is
2175 the opcode of the jump to insert.
2176
2177 If you call this function, you must zero out pending_exact. */
2178
2179 static void
insert_jump_n(op,from,to,current_end,n)2180 insert_jump_n(op, from, to, current_end, n)
2181 int op;
2182 char *from, *to, *current_end;
2183 unsigned n;
2184 {
2185 register char *pfrom = current_end; /* Copy from here... */
2186 register char *pto = current_end + 5; /* ...to here. */
2187
2188 while (pfrom != from)
2189 *--pto = *--pfrom;
2190 store_jump_n(from, op, to, n);
2191 }
2192
2193
2194 /* Open up space at location THERE, and insert operation OP.
2195 CURRENT_END gives the end of the storage in use, so
2196 we know how much data to copy up.
2197
2198 If you call this function, you must zero out pending_exact. */
2199
2200 #if 0
2201 static void
2202 insert_op(op, there, current_end)
2203 int op;
2204 char *there, *current_end;
2205 {
2206 register char *pfrom = current_end; /* Copy from here... */
2207 register char *pto = current_end + 1; /* ...to here. */
2208
2209 while (pfrom != there)
2210 *--pto = *--pfrom;
2211
2212 there[0] = (char)op;
2213 }
2214 #endif /* 0 */
2215
2216
2217 /* Open up space at location THERE, and insert operation OP followed by
2218 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
2219 we know how much data to copy up.
2220
2221 If you call this function, you must zero out pending_exact. */
2222
2223 static void
insert_op_2(op,there,current_end,num_1,num_2)2224 insert_op_2(op, there, current_end, num_1, num_2)
2225 int op;
2226 char *there, *current_end;
2227 int num_1, num_2;
2228 {
2229 register char *pfrom = current_end; /* Copy from here... */
2230 register char *pto = current_end + 5; /* ...to here. */
2231
2232 while (pfrom != there)
2233 *--pto = *--pfrom;
2234
2235 there[0] = (char)op;
2236 STORE_NUMBER(there + 1, num_1);
2237 STORE_NUMBER(there + 3, num_2);
2238 }
2239
2240
2241 #define trans_eq(c1, c2, translate) (translate?(translate[c1]==translate[c2]):((c1)==(c2)))
2242 static int
slow_match(little,lend,big,bend,translate)2243 slow_match(little, lend, big, bend, translate)
2244 unsigned char *little, *lend;
2245 unsigned char *big, *bend;
2246 unsigned char *translate;
2247 {
2248 int c;
2249
2250 while (little < lend && big < bend) {
2251 c = *little++;
2252 if (c == 0xff)
2253 c = *little++;
2254 if (!trans_eq(*big++, c, translate)) break;
2255 }
2256 if (little == lend) return 1;
2257 return 0;
2258 }
2259
2260 static int
slow_search(little,llen,big,blen,translate)2261 slow_search(little, llen, big, blen, translate)
2262 unsigned char *little;
2263 int llen;
2264 unsigned char *big;
2265 int blen;
2266 char *translate;
2267 {
2268 unsigned char *bsave = big;
2269 unsigned char *bend = big + blen;
2270 register int c;
2271 int fescape = 0;
2272
2273 c = *little;
2274 if (c == 0xff) {
2275 c = little[1];
2276 fescape = 1;
2277 }
2278 else if (translate && !ismbchar(c)) {
2279 c = translate[c];
2280 }
2281
2282 while (big < bend) {
2283 /* look for first character */
2284 if (fescape) {
2285 while (big < bend) {
2286 if (*big == c) break;
2287 big++;
2288 }
2289 }
2290 else if (translate && !ismbchar(c)) {
2291 while (big < bend) {
2292 if (ismbchar(*big)) big++;
2293 else if (translate[*big] == c) break;
2294 big++;
2295 }
2296 }
2297 else {
2298 while (big < bend) {
2299 if (*big == c) break;
2300 if (ismbchar(*big)) big++;
2301 big++;
2302 }
2303 }
2304
2305 if (slow_match(little, little+llen, big, bend, translate))
2306 return big - bsave;
2307
2308 if (ismbchar(*big)) big++;
2309 big++;
2310 }
2311 return -1;
2312 }
2313
2314 static void
bm_init_skip(skip,pat,m,translate)2315 bm_init_skip(skip, pat, m, translate)
2316 int *skip;
2317 unsigned char *pat;
2318 int m;
2319 char *translate;
2320 {
2321 int j, c;
2322
2323 for (c=0; c<256; c++) {
2324 skip[c] = m;
2325 }
2326 if (translate) {
2327 for (j=0; j<m-1; j++) {
2328 skip[translate[pat[j]]] = m-1-j;
2329 }
2330 }
2331 else {
2332 for (j=0; j<m-1; j++) {
2333 skip[pat[j]] = m-1-j;
2334 }
2335 }
2336 }
2337
2338 static int
bm_search(little,llen,big,blen,skip,translate)2339 bm_search(little, llen, big, blen, skip, translate)
2340 unsigned char *little;
2341 int llen;
2342 unsigned char *big;
2343 int blen;
2344 int *skip;
2345 unsigned char *translate;
2346 {
2347 /* int next[256]; */
2348 int i, j, k;
2349 /* unsigned char c; */
2350
2351 i = llen-1;
2352 if (translate) {
2353 while (i < blen) {
2354 k = i;
2355 j = llen-1;
2356 while (j >= 0 && translate[big[k]] == translate[little[j]]) {
2357 k--;
2358 j--;
2359 }
2360 if (j < 0) return k+1;
2361
2362 i += skip[translate[big[i]]];
2363 }
2364 return -1;
2365 }
2366 while (i < blen) {
2367 k = i;
2368 j = llen-1;
2369 while (j >= 0 && big[k] == little[j]) {
2370 k--;
2371 j--;
2372 }
2373 if (j < 0) return k+1;
2374
2375 i += skip[big[i]];
2376 }
2377 return -1;
2378 }
2379
2380 /* Given a pattern, compute a fastmap from it. The fastmap records
2381 which of the (1 << BYTEWIDTH) possible characters can start a string
2382 that matches the pattern. This fastmap is used by re_search to skip
2383 quickly over totally implausible text.
2384
2385 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2386 area as bufp->fastmap.
2387 The other components of bufp describe the pattern to be used. */
2388 void
re_compile_fastmap(bufp)2389 re_compile_fastmap(bufp)
2390 struct re_pattern_buffer *bufp;
2391 {
2392 unsigned char *pattern = (unsigned char *) bufp->buffer;
2393 int size = bufp->used;
2394 register char *fastmap = bufp->fastmap;
2395 register unsigned char *p = pattern;
2396 register unsigned char *pend = pattern + size;
2397 register int j, k;
2398 unsigned is_a_succeed_n;
2399
2400 unsigned char **stackb = RE_TALLOC(NFAILURES, unsigned char*);
2401 unsigned char **stackp = stackb;
2402 unsigned char **stacke = stackb + NFAILURES;
2403 int options = bufp->options;
2404
2405 memset(fastmap, 0, (1 << BYTEWIDTH));
2406 bufp->fastmap_accurate = 1;
2407 bufp->can_be_null = 0;
2408
2409 while (p)
2410 {
2411 is_a_succeed_n = 0;
2412 if (p == pend)
2413 {
2414 bufp->can_be_null = 1;
2415 break;
2416 }
2417 #ifdef SWITCH_ENUM_BUG
2418 switch ((int) ((enum regexpcode)*p++))
2419 #else
2420 switch ((enum regexpcode)*p++)
2421 #endif
2422 {
2423 case exactn:
2424 if (p[1] == 0xff) {
2425 if (TRANSLATE_P())
2426 fastmap[translate[p[2]]] = 2;
2427 else
2428 fastmap[p[2]] = 2;
2429 }
2430 else if (TRANSLATE_P())
2431 fastmap[translate[p[1]]] = 1;
2432 else
2433 fastmap[p[1]] = 1;
2434 break;
2435
2436 case begline:
2437 case begbuf:
2438 case endbuf:
2439 case endbuf2:
2440 case wordbound:
2441 case notwordbound:
2442 case wordbeg:
2443 case wordend:
2444 case pop_and_fail:
2445 case push_dummy_failure:
2446 continue;
2447
2448 case casefold_on:
2449 bufp->options |= RE_MAY_IGNORECASE;
2450 case casefold_off:
2451 options ^= RE_OPTION_IGNORECASE;
2452 continue;
2453
2454 case endline:
2455 if (TRANSLATE_P())
2456 fastmap[translate['\n']] = 1;
2457 else
2458 fastmap['\n'] = 1;
2459
2460 if (bufp->can_be_null == 0)
2461 bufp->can_be_null = 2;
2462 break;
2463
2464 case jump_n:
2465 case finalize_jump:
2466 case maybe_finalize_jump:
2467 case jump:
2468 case jump_past_alt:
2469 case dummy_failure_jump:
2470 EXTRACT_NUMBER_AND_INCR(j, p);
2471 p += j;
2472 if (j > 0)
2473 continue;
2474 /* Jump backward reached implies we just went through
2475 the body of a loop and matched nothing.
2476 Opcode jumped to should be an on_failure_jump.
2477 Just treat it like an ordinary jump.
2478 For a * loop, it has pushed its failure point already;
2479 If so, discard that as redundant. */
2480
2481 if ((enum regexpcode) *p != on_failure_jump
2482 && (enum regexpcode) *p != try_next
2483 && (enum regexpcode) *p != succeed_n
2484 && (enum regexpcode) *p != finalize_push
2485 && (enum regexpcode) *p != finalize_push_n)
2486 continue;
2487 p++;
2488 EXTRACT_NUMBER_AND_INCR(j, p);
2489 p += j;
2490 if (stackp != stackb && *stackp == p)
2491 stackp--; /* pop */
2492 continue;
2493
2494 case start_nowidth:
2495 case stop_nowidth:
2496 case finalize_push:
2497 p += 2;
2498 continue;
2499
2500 case finalize_push_n:
2501 p += 4;
2502 continue;
2503
2504 case try_next:
2505 case on_failure_jump:
2506 handle_on_failure_jump:
2507 EXTRACT_NUMBER_AND_INCR(j, p);
2508 if (p + j < pend) {
2509 if (stackp == stacke) {
2510 unsigned char **stackx;
2511 unsigned int len = stacke - stackb;
2512
2513 EXPAND_FAIL_STACK(stackx, stackb, len);
2514 }
2515 *++stackp = p + j; /* push */
2516 }
2517 else {
2518 bufp->can_be_null = 1;
2519 }
2520 if (is_a_succeed_n)
2521 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
2522 continue;
2523
2524 case succeed_n:
2525 is_a_succeed_n = 1;
2526 /* Get to the number of times to succeed. */
2527 EXTRACT_NUMBER(k, p + 2);
2528 /* Increment p past the n for when k != 0. */
2529 if (k == 0) {
2530 p += 4;
2531 }
2532 else {
2533 goto handle_on_failure_jump;
2534 }
2535 continue;
2536
2537 case set_number_at:
2538 p += 4;
2539 continue;
2540
2541 case start_memory:
2542 case stop_memory:
2543 p += 2;
2544 continue;
2545
2546 case duplicate:
2547 bufp->can_be_null = 1;
2548 fastmap['\n'] = 1;
2549 case anychar:
2550 for (j = 0; j < (1 << BYTEWIDTH); j++)
2551 if (j != '\n')
2552 fastmap[j] = 1;
2553 if (bufp->can_be_null)
2554 {
2555 FREE_AND_RETURN_VOID(stackb);
2556 }
2557 /* Don't return; check the alternative paths
2558 so we can set can_be_null if appropriate. */
2559 break;
2560
2561 case wordchar:
2562 for (j = 0; j < (1 << BYTEWIDTH); j++)
2563 if (SYNTAX(j) == Sword)
2564 fastmap[j] = 1;
2565 break;
2566
2567 case notwordchar:
2568 for (j = 0; j < 0x80; j++)
2569 if (SYNTAX(j) != Sword)
2570 fastmap[j] = 1;
2571 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
2572 fastmap[j] = 1;
2573 break;
2574
2575 case charset:
2576 /* NOTE: Charset for single-byte chars never contain
2577 multi-byte char. See set_list_bits(). */
2578 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2579 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2580 {
2581 if (TRANSLATE_P())
2582 fastmap[translate[j]] = 1;
2583 else
2584 fastmap[j] = 1;
2585 }
2586 {
2587 unsigned short size;
2588 unsigned c, end;
2589
2590 p += p[-1] + 2;
2591 size = EXTRACT_UNSIGNED(&p[-2]);
2592 for (j = 0; j < (int)size; j++) {
2593 if ((unsigned char)p[j*4] == 0xff) {
2594 for (c = (unsigned char)p[j*4+1],
2595 end = (unsigned char)p[j*4+3];
2596 c <= end; c++) {
2597 fastmap[c] = 2;
2598 }
2599 }
2600 else {
2601 /* set bits for 1st bytes of multi-byte chars. */
2602 for (c = (unsigned char)p[j*4],
2603 end = (unsigned char)p[j*4 + 2];
2604 c <= end; c++) {
2605 /* NOTE: Charset for multi-byte chars might contain
2606 single-byte chars. We must reject them. */
2607 if (ismbchar(c))
2608 fastmap[c] = 1;
2609 }
2610 }
2611 }
2612 }
2613 break;
2614
2615 case charset_not:
2616 /* S: set of all single-byte chars.
2617 M: set of all first bytes that can start multi-byte chars.
2618 s: any set of single-byte chars.
2619 m: any set of first bytes that can start multi-byte chars.
2620
2621 We assume S+M = U.
2622 ___ _ _
2623 s+m = (S*s+M*m). */
2624 /* Chars beyond end of map must be allowed */
2625 /* NOTE: Charset_not for single-byte chars might contain
2626 multi-byte chars. See set_list_bits(). */
2627 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2628 if (!ismbchar(j))
2629 fastmap[j] = 1;
2630
2631 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2632 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2633 {
2634 if (!ismbchar(j))
2635 fastmap[j] = 1;
2636 }
2637 {
2638 unsigned short size;
2639 unsigned char c, beg;
2640 int byte_match = 0;
2641
2642 p += p[-1] + 2;
2643 size = EXTRACT_UNSIGNED(&p[-2]);
2644 if (size == 0) {
2645 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
2646 if (ismbchar(j))
2647 fastmap[j] = 1;
2648 }
2649 for (j = 0,c = 0x80;j < (int)size; j++) {
2650 if ((unsigned char)p[j*4] == 0xff) {
2651 byte_match = 1;
2652 for (beg = (unsigned char)p[j*4+1]; c < beg; c++)
2653 fastmap[c] = 1;
2654 c = (unsigned char)p[j*4+3] + 1;
2655 }
2656 else {
2657 for (beg = (unsigned char)p[j*4 + 0]; c < beg; c++)
2658 if (ismbchar(c))
2659 fastmap[c] = 1;
2660 c = (unsigned char)p[j*4 + 2] + 1;
2661 }
2662 }
2663 if (byte_match) {
2664 for (j = c; j < (1 << BYTEWIDTH); j++)
2665 fastmap[j] = 1;
2666 for (j = 0; j < (1 << BYTEWIDTH); j++)
2667 if (fastmap[j])
2668 fastmap[j] = 2;
2669 }
2670 else {
2671 for (j = c; j < (1 << BYTEWIDTH); j++)
2672 if (ismbchar(j))
2673 fastmap[j] = 1;
2674 }
2675 }
2676 break;
2677
2678 case unused: /* pacify gcc -Wall */
2679 break;
2680 }
2681
2682 /* Get here means we have successfully found the possible starting
2683 characters of one path of the pattern. We need not follow this
2684 path any farther. Instead, look at the next alternative
2685 remembered in the stack. */
2686 if (stackp != stackb)
2687 p = *stackp--; /* pop */
2688 else
2689 break;
2690 }
2691 FREE_AND_RETURN_VOID(stackb);
2692 }
2693
2694
2695 /* Using the compiled pattern in BUFP->buffer, first tries to match
2696 STRING, starting first at index STARTPOS, then at STARTPOS + 1, and
2697 so on. RANGE is the number of places to try before giving up. If
2698 RANGE is negative, it searches backwards, i.e., the starting
2699 positions tried are STARTPOS, STARTPOS - 1, etc. STRING is of SIZE.
2700 In REGS, return the indices of STRING that matched the entire
2701 BUFP->buffer and its contained subexpressions.
2702
2703 The value returned is the position in the strings at which the match
2704 was found, or -1 if no match was found, or -2 if error (such as
2705 failure stack overflow). */
2706
2707 int
re_search(bufp,string,size,startpos,range,regs)2708 re_search(bufp, string, size, startpos, range, regs)
2709 struct re_pattern_buffer *bufp;
2710 char *string;
2711 int size, startpos, range;
2712 struct re_registers *regs;
2713 {
2714 register char *fastmap = bufp->fastmap;
2715 int val, anchor = 0;
2716
2717 /* Check for out-of-range starting position. */
2718 if (startpos < 0 || startpos > size)
2719 return -1;
2720
2721 /* Update the fastmap now if not correct already. */
2722 if (fastmap && !bufp->fastmap_accurate) {
2723 re_compile_fastmap(bufp);
2724 }
2725
2726 /* If the search isn't to be a backwards one, don't waste time in a
2727 search for a pattern that must be anchored. */
2728 if (bufp->used>0) {
2729 switch ((enum regexpcode)bufp->buffer[0]) {
2730 case begbuf:
2731 if (range > 0) {
2732 if (startpos > 0)
2733 return -1;
2734 else if (re_match(bufp, string, size, 0, regs) >= 0)
2735 return 0;
2736 return -1;
2737 }
2738 break;
2739
2740 case begline:
2741 anchor = 1;
2742 break;
2743
2744 default:
2745 break;
2746 }
2747 }
2748 if (bufp->options & RE_OPTIMIZE_ANCHOR) {
2749 anchor = 1;
2750 }
2751
2752 if (bufp->must) {
2753 int len = ((unsigned char*)bufp->must)[0];
2754 int pos, pbeg, pend;
2755
2756 pbeg = startpos;
2757 pend = startpos + range;
2758 if (pbeg > pend) { /* swap pbeg,pend */
2759 pos = pend; pend = pbeg; pbeg = pos;
2760 }
2761 if (pend > size) pend = size;
2762 if (bufp->options & RE_OPTIMIZE_NO_BM) {
2763 pos = slow_search(bufp->must+1, len,
2764 string+pbeg, pend-pbeg,
2765 MAY_TRANSLATE()?translate:0);
2766 }
2767 else {
2768 pos = bm_search(bufp->must+1, len,
2769 string+pbeg, pend-pbeg,
2770 bufp->must_skip,
2771 MAY_TRANSLATE()?translate:0);
2772 }
2773 if (pos == -1) return -1;
2774 if (range > 0 && (bufp->options & RE_OPTIMIZE_EXACTN)) {
2775 startpos += pos;
2776 range -= pos;
2777 }
2778 }
2779
2780 for (;;)
2781 {
2782 /* If a fastmap is supplied, skip quickly over characters that
2783 cannot possibly be the start of a match. Note, however, that
2784 if the pattern can possibly match the null string, we must
2785 test it at each starting point so that we take the first null
2786 string we get. */
2787
2788 if (fastmap && startpos < size
2789 && bufp->can_be_null != 1 && !(anchor && startpos == 0))
2790 {
2791 if (range > 0) /* Searching forwards. */
2792 {
2793 register unsigned char *p, c;
2794 int irange = range;
2795
2796 p = (unsigned char *)string+startpos;
2797
2798 while (range > 0) {
2799 c = *p++;
2800 if (ismbchar(c)) {
2801 if (fastmap[c])
2802 break;
2803 c = *p++;
2804 range--;
2805 if (fastmap[c] == 2)
2806 break;
2807 }
2808 else
2809 if (fastmap[MAY_TRANSLATE() ? translate[c] : c])
2810 break;
2811 range--;
2812 }
2813 startpos += irange - range;
2814 }
2815 else /* Searching backwards. */
2816 {
2817 register unsigned char c;
2818
2819 c = string[startpos];
2820 c &= 0xff;
2821 if (MAY_TRANSLATE() ? !fastmap[translate[c]] : !fastmap[c])
2822 goto advance;
2823 }
2824 }
2825
2826 if (fastmap && startpos == size && range >= 0
2827 && (bufp->can_be_null == 0 ||
2828 (bufp->can_be_null && size > 0
2829 && string[startpos-1] == '\n')))
2830 return -1;
2831
2832 val = re_match(bufp, string, size, startpos, regs);
2833 if (val >= 0)
2834 return startpos;
2835 if (val == -2)
2836 return -2;
2837
2838 #ifndef NO_ALLOCA
2839 #ifdef C_ALLOCA
2840 alloca(0);
2841 #endif /* C_ALLOCA */
2842 #endif /* NO_ALLOCA */
2843
2844 if (range > 0) {
2845 if (anchor && startpos < size && startpos > 0 && string[startpos-1] != '\n') {
2846 while (range > 0 && string[startpos] != '\n') {
2847 range--;
2848 startpos++;
2849 }
2850 }
2851 else if (fastmap && (bufp->stclass)) {
2852 register unsigned char *p;
2853 register unsigned short c;
2854 int irange = range;
2855
2856 p = (unsigned char *)string+startpos;
2857 while (range > 0) {
2858 c = *p++;
2859 if (ismbchar(c) && fastmap[c] != 2) {
2860 c = c << 8 | *p++;
2861 }
2862 else if (MAY_TRANSLATE())
2863 c = translate[c];
2864 if (*bufp->stclass == charset) {
2865 if (!is_in_list(c, bufp->stclass+1)) break;
2866 }
2867 else {
2868 if (is_in_list(c, bufp->stclass+1)) break;
2869 }
2870 range--;
2871 if (c > 256) range--;
2872 }
2873 startpos += irange - range;
2874 }
2875 }
2876
2877 advance:
2878 if (!range)
2879 break;
2880 else if (range > 0) {
2881 const char *d = string + startpos;
2882
2883 if (ismbchar(*d)) {
2884 range--, startpos++;
2885 if (!range)
2886 break;
2887 }
2888 range--, startpos++;
2889 }
2890 else {
2891 range++, startpos--;
2892 {
2893 const char *s, *d, *p;
2894
2895 s = string; d = string + startpos;
2896 for (p = d; p-- > s && ismbchar(*p); )
2897 /* --p >= s would not work on 80[12]?86.
2898 (when the offset of s equals 0 other than huge model.) */
2899 ;
2900 if (!((d - p) & 1)) {
2901 if (!range)
2902 break;
2903 range++, startpos--;
2904 }
2905 }
2906 }
2907 }
2908 return -1;
2909 }
2910
2911
2912
2913
2914 /* The following are used for re_match, defined below: */
2915
2916 /* Roughly the maximum number of failure points on the stack. Would be
2917 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
2918
2919 int re_max_failures = 2000;
2920
2921 /* Routine used by re_match. */
2922 /* static int memcmp_translate(); *//* already declared */
2923
2924
2925 /* Structure and accessing macros used in re_match: */
2926
2927 typedef union
2928 {
2929 unsigned char *word;
2930 struct
2931 {
2932 /* This field is one if this group can match the empty string,
2933 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
2934 #define MATCH_NULL_UNSET_VALUE 3
2935 unsigned match_null_string_p : 2;
2936 unsigned is_active : 1;
2937 unsigned matched_something : 1;
2938 unsigned ever_matched_something : 1;
2939 } bits;
2940 } register_info_type;
2941
2942 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
2943 #define IS_ACTIVE(R) ((R).bits.is_active)
2944 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
2945 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
2946
2947
2948 /* Macros used by re_match: */
2949
2950 /* I.e., regstart, regend, and reg_info. */
2951 #define NUM_REG_ITEMS 3
2952
2953 /* Individual items aside from the registers. */
2954 #define NUM_NONREG_ITEMS 3
2955
2956 /* We push at most this many things on the stack whenever we
2957 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
2958 arguments to the PUSH_FAILURE_POINT macro. */
2959 #define MAX_NUM_FAILURE_ITEMS (num_regs * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2960
2961 /* We push this many things on the stack whenever we fail. */
2962 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + NUM_REG_ITEMS)
2963
2964
2965 /* This pushes most of the information about the current state we will want
2966 if we ever fail back to it. */
2967
2968 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
2969 do { \
2970 long last_used_reg, this_reg; \
2971 \
2972 /* Find out how many registers are active or have been matched. \
2973 (Aside from register zero, which is only set at the end.) */ \
2974 for (last_used_reg = num_regs - 1; last_used_reg > 0; last_used_reg--)\
2975 if (!REG_UNSET(regstart[last_used_reg])) \
2976 break; \
2977 \
2978 if (stacke - stackp <= NUM_FAILURE_ITEMS) \
2979 { \
2980 unsigned char **stackx; \
2981 unsigned int len = stacke - stackb; \
2982 /* if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
2983 { \
2984 FREE_VARIABLES(); \
2985 FREE_AND_RETURN(stackb,(-2)); \
2986 }*/ \
2987 \
2988 /* Roughly double the size of the stack. */ \
2989 EXPAND_FAIL_STACK(stackx, stackb, len); \
2990 } \
2991 \
2992 /* Now push the info for each of those registers. */ \
2993 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
2994 { \
2995 *stackp++ = regstart[this_reg]; \
2996 *stackp++ = regend[this_reg]; \
2997 *stackp++ = reg_info[this_reg].word; \
2998 } \
2999 \
3000 /* Push how many registers we saved. */ \
3001 *stackp++ = (unsigned char *)last_used_reg; \
3002 \
3003 *stackp++ = pattern_place; \
3004 *stackp++ = string_place; \
3005 *stackp++ = (unsigned char *)0; /* non-greedy flag */ \
3006 } while(0)
3007
3008
3009 /* This pops what PUSH_FAILURE_POINT pushes. */
3010
3011 #define POP_FAILURE_POINT() \
3012 do { \
3013 int temp; \
3014 stackp -= NUM_NONREG_ITEMS; /* Remove failure points (and flag). */ \
3015 temp = (int) *--stackp; /* How many regs pushed. */ \
3016 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
3017 stackp -= temp; /* Remove the register info. */ \
3018 } while(0)
3019
3020 /* Registers are set to a sentinel when they haven't yet matched. */
3021 #define REG_UNSET_VALUE ((unsigned char *) -1)
3022 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3023
3024 #define PREFETCH if (d == dend) goto fail
3025
3026 /* Call this when have matched something; it sets `matched' flags for the
3027 registers corresponding to the subexpressions of which we currently
3028 are inside. */
3029 #define SET_REGS_MATCHED \
3030 do { unsigned this_reg; \
3031 for (this_reg = 0; this_reg < num_regs; this_reg++) \
3032 { \
3033 if (IS_ACTIVE(reg_info[this_reg])) \
3034 MATCHED_SOMETHING(reg_info[this_reg]) \
3035 = EVER_MATCHED_SOMETHING (reg_info[this_reg]) \
3036 = 1; \
3037 else \
3038 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
3039 } \
3040 } while(0)
3041
3042 #define AT_STRINGS_BEG(d) (d == string)
3043 #define AT_STRINGS_END(d) (d == dend)
3044
3045 /* We have two special cases to check for:
3046 1) if we're past the end of string1, we have to look at the first
3047 character in string2;
3048 2) if we're before the beginning of string2, we have to look at the
3049 last character in string1; we assume there is a string1, so use
3050 this in conjunction with AT_STRINGS_BEG. */
3051 #define IS_A_LETTER(d) (SYNTAX(*(d)) == Sword)
3052
3053 static void
init_regs(regs,num_regs)3054 init_regs(regs, num_regs)
3055 struct re_registers *regs;
3056 unsigned int num_regs;
3057 {
3058 int i;
3059
3060 regs->num_regs = num_regs;
3061 if (num_regs < RE_NREGS)
3062 num_regs = RE_NREGS;
3063
3064 if (regs->allocated == 0) {
3065 regs->beg = TMALLOC(num_regs, int);
3066 regs->end = TMALLOC(num_regs, int);
3067 regs->allocated = num_regs;
3068 }
3069 else if (regs->allocated < num_regs) {
3070 TREALLOC(regs->beg, num_regs, int);
3071 TREALLOC(regs->end, num_regs, int);
3072 }
3073 for (i=0; i<num_regs; i++) {
3074 regs->beg[i] = regs->end[i] = -1;
3075 }
3076 }
3077
3078 /* Match the pattern described by BUFP against STRING, which is of
3079 SIZE. Start the match at index POS in STRING. In REGS, return the
3080 indices of STRING that matched the entire BUFP->buffer and its
3081 contained subexpressions.
3082
3083 If bufp->fastmap is nonzero, then it had better be up to date.
3084
3085 The reason that the data to match are specified as two components
3086 which are to be regarded as concatenated is so this function can be
3087 used directly on the contents of an Emacs buffer.
3088
3089 -1 is returned if there is no match. -2 is returned if there is an
3090 error (such as match stack overflow). Otherwise the value is the
3091 length of the substring which was matched. */
3092
3093 int
re_match(bufp,string_arg,size,pos,regs)3094 re_match(bufp, string_arg, size, pos, regs)
3095 struct re_pattern_buffer *bufp;
3096 char *string_arg;
3097 int size, pos;
3098 struct re_registers *regs;
3099 {
3100 register unsigned char *p = (unsigned char *) bufp->buffer;
3101 unsigned char *p1;
3102
3103 /* Pointer to beyond end of buffer. */
3104 register unsigned char *pend = p + bufp->used;
3105
3106 unsigned num_regs = bufp->re_nsub;
3107
3108 unsigned char *string = (unsigned char *) string_arg;
3109
3110 register unsigned char *d, *dend;
3111 register int mcnt; /* Multipurpose. */
3112 int options = bufp->options;
3113
3114 /* Failure point stack. Each place that can handle a failure further
3115 down the line pushes a failure point on this stack. It consists of
3116 restart, regend, and reg_info for all registers corresponding to the
3117 subexpressions we're currently inside, plus the number of such
3118 registers, and, finally, two char *'s. The first char * is where to
3119 resume scanning the pattern; the second one is where to resume
3120 scanning the strings. If the latter is zero, the failure point is a
3121 ``dummy''; if a failure happens and the failure point is a dummy, it
3122 gets discarded and the next next one is tried. */
3123
3124 unsigned char **stackb;
3125 unsigned char **stackp;
3126 unsigned char **stacke;
3127
3128
3129 /* Information on the contents of registers. These are pointers into
3130 the input strings; they record just what was matched (on this
3131 attempt) by a subexpression part of the pattern, that is, the
3132 regnum-th regstart pointer points to where in the pattern we began
3133 matching and the regnum-th regend points to right after where we
3134 stopped matching the regnum-th subexpression. (The zeroth register
3135 keeps track of what the whole pattern matches.) */
3136
3137 unsigned char **regstart = RE_TALLOC(num_regs, unsigned char*);
3138 unsigned char **regend = RE_TALLOC(num_regs, unsigned char*);
3139
3140 /* If a group that's operated upon by a repetition operator fails to
3141 match anything, then the register for its start will need to be
3142 restored because it will have been set to wherever in the string we
3143 are when we last see its open-group operator. Similarly for a
3144 register's end. */
3145 unsigned char **old_regstart = RE_TALLOC(num_regs, unsigned char*);
3146 unsigned char **old_regend = RE_TALLOC(num_regs, unsigned char*);
3147
3148 /* The is_active field of reg_info helps us keep track of which (possibly
3149 nested) subexpressions we are currently in. The matched_something
3150 field of reg_info[reg_num] helps us tell whether or not we have
3151 matched any of the pattern so far this time through the reg_num-th
3152 subexpression. These two fields get reset each time through any
3153 loop their register is in. */
3154
3155 register_info_type *reg_info = RE_TALLOC(num_regs, register_info_type);
3156
3157 /* The following record the register info as found in the above
3158 variables when we find a match better than any we've seen before.
3159 This happens as we backtrack through the failure points, which in
3160 turn happens only if we have not yet matched the entire string. */
3161
3162 unsigned best_regs_set = 0;
3163 unsigned char **best_regstart = RE_TALLOC(num_regs, unsigned char*);
3164 unsigned char **best_regend = RE_TALLOC(num_regs, unsigned char*);
3165
3166 if (regs) {
3167 init_regs(regs, num_regs);
3168 }
3169
3170 /* Initialize the stack. */
3171 stackb = RE_TALLOC(MAX_NUM_FAILURE_ITEMS * NFAILURES, unsigned char*);
3172 stackp = stackb;
3173 stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
3174
3175 #ifdef DEBUG_REGEX
3176 fprintf (stderr, "Entering re_match(%s%s)\n", string1_arg, string2_arg);
3177 #endif
3178
3179 /* Initialize subexpression text positions to -1 to mark ones that no
3180 \( or ( and \) or ) has been seen for. Also set all registers to
3181 inactive and mark them as not having matched anything or ever
3182 failed. */
3183 for (mcnt = 0; mcnt < num_regs; mcnt++) {
3184 regstart[mcnt] = regend[mcnt]
3185 = old_regstart[mcnt] = old_regend[mcnt]
3186 = best_regstart[mcnt] = best_regend[mcnt] = REG_UNSET_VALUE;
3187 #ifdef __CHECKER__
3188 reg_info[mcnt].word = 0;
3189 #endif
3190 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3191 IS_ACTIVE (reg_info[mcnt]) = 0;
3192 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3193 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3194 }
3195
3196 /* Set up pointers to ends of strings.
3197 Don't allow the second string to be empty unless both are empty. */
3198
3199
3200 /* `p' scans through the pattern as `d' scans through the data. `dend'
3201 is the end of the input string that `d' points within. `d' is
3202 advanced into the following input string whenever necessary, but
3203 this happens before fetching; therefore, at the beginning of the
3204 loop, `d' can be pointing at the end of a string, but it cannot
3205 equal string2. */
3206
3207 d = string + pos, dend = string + size;
3208
3209
3210 /* This loops over pattern commands. It exits by returning from the
3211 function if match is complete, or it drops through if match fails
3212 at this starting point in the input data. */
3213
3214 for (;;)
3215 {
3216 #ifdef DEBUG_REGEX
3217 fprintf(stderr,
3218 "regex loop(%d): matching 0x%02d\n",
3219 p - (unsigned char *) bufp->buffer,
3220 *p);
3221 #endif
3222 /* End of pattern means we might have succeeded. */
3223 if (p == pend)
3224 {
3225 /* If not end of string, try backtracking. Otherwise done. */
3226 if (d != dend)
3227 {
3228 while (stackp != stackb && (int)stackp[-1] == 1)
3229 POP_FAILURE_POINT();
3230 if (stackp != stackb)
3231 {
3232 /* More failure points to try. */
3233
3234 /* If exceeds best match so far, save it. */
3235 if (! best_regs_set || (d > best_regend[0]))
3236 {
3237 best_regs_set = 1;
3238 best_regend[0] = d; /* Never use regstart[0]. */
3239
3240 for (mcnt = 1; mcnt < num_regs; mcnt++)
3241 {
3242 best_regstart[mcnt] = regstart[mcnt];
3243 best_regend[mcnt] = regend[mcnt];
3244 }
3245 }
3246 goto fail;
3247 }
3248 /* If no failure points, don't restore garbage. */
3249 else if (best_regs_set)
3250 {
3251 restore_best_regs:
3252 /* Restore best match. */
3253 d = best_regend[0];
3254
3255 for (mcnt = 0; mcnt < num_regs; mcnt++)
3256 {
3257 regstart[mcnt] = best_regstart[mcnt];
3258 regend[mcnt] = best_regend[mcnt];
3259 }
3260 }
3261 }
3262
3263 /* If caller wants register contents data back, convert it
3264 to indices. */
3265 if (regs)
3266 {
3267 regs->beg[0] = pos;
3268 regs->end[0] = d - string;
3269 for (mcnt = 1; mcnt < num_regs; mcnt++)
3270 {
3271 if (REG_UNSET(regend[mcnt]))
3272 {
3273 regs->beg[mcnt] = -1;
3274 regs->end[mcnt] = -1;
3275 continue;
3276 }
3277 regs->beg[mcnt] = regstart[mcnt] - string;
3278 regs->end[mcnt] = regend[mcnt] - string;
3279 }
3280 }
3281 FREE_VARIABLES();
3282 FREE_AND_RETURN(stackb, (d - pos - string));
3283 }
3284
3285 /* Otherwise match next pattern command. */
3286 #ifdef SWITCH_ENUM_BUG
3287 switch ((int)((enum regexpcode)*p++))
3288 #else
3289 switch ((enum regexpcode)*p++)
3290 #endif
3291 {
3292
3293 /* \( [or `(', as appropriate] is represented by start_memory,
3294 \) by stop_memory. Both of those commands are followed by
3295 a register number in the next byte. The text matched
3296 within the \( and \) is recorded under that number. */
3297 case start_memory:
3298 /* Find out if this group can match the empty string. */
3299 p1 = p; /* To send to group_match_null_string_p. */
3300 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3301 REG_MATCH_NULL_STRING_P (reg_info[*p])
3302 = group_match_null_string_p (&p1, pend, reg_info);
3303
3304 /* Save the position in the string where we were the last time
3305 we were at this open-group operator in case the group is
3306 operated upon by a repetition operator, e.g., with `(a*)*b'
3307 against `ab'; then we want to ignore where we are now in
3308 the string in case this attempt to match fails. */
3309 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3310 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3311 : regstart[*p];
3312 regstart[*p] = d;
3313 IS_ACTIVE(reg_info[*p]) = 1;
3314 MATCHED_SOMETHING(reg_info[*p]) = 0;
3315 p += 2;
3316 continue;
3317
3318 case stop_memory:
3319 /* We need to save the string position the last time we were at
3320 this close-group operator in case the group is operated
3321 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3322 against `aba'; then we want to ignore where we are now in
3323 the string in case this attempt to match fails. */
3324 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3325 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3326 : regend[*p];
3327
3328 regend[*p] = d;
3329 IS_ACTIVE(reg_info[*p]) = 0;
3330
3331 /* If just failed to match something this time around with a sub-
3332 expression that's in a loop, try to force exit from the loop. */
3333 if ((! MATCHED_SOMETHING(reg_info[*p])
3334 || (enum regexpcode) p[-3] == start_memory)
3335 && (p + 1) != pend)
3336 {
3337 int is_a_jump_n = 0;
3338 register unsigned char *p2 = p + 2;
3339 mcnt = 0;
3340 switch (*p2++)
3341 {
3342 case jump_n:
3343 case finalize_push_n:
3344 is_a_jump_n = 1;
3345 case finalize_jump:
3346 case maybe_finalize_jump:
3347 case jump:
3348 case dummy_failure_jump:
3349 EXTRACT_NUMBER_AND_INCR(mcnt, p2);
3350 if (is_a_jump_n)
3351 p2 += 2;
3352 break;
3353 }
3354 p2 += mcnt;
3355
3356 /* If the next operation is a jump backwards in the pattern
3357 to an on_failure_jump, exit from the loop by forcing a
3358 failure after pushing on the stack the on_failure_jump's
3359 jump in the pattern, and d. */
3360 if (mcnt < 0 && (enum regexpcode) *p2 == on_failure_jump
3361 && (enum regexpcode) p2[3] == start_memory && p2[4] == *p)
3362 {
3363 /* If this group ever matched anything, then restore
3364 what its registers were before trying this last
3365 failed match, e.g., with `(a*)*b' against `ab' for
3366 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3367 against `aba' for regend[3].
3368
3369 Also restore the registers for inner groups for,
3370 e.g., `((a*)(b*))*' against `aba' (register 3 would
3371 otherwise get trashed). */
3372
3373 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3374 {
3375 unsigned r;
3376
3377 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3378
3379 /* Restore this and inner groups' (if any) registers. */
3380 for (r = *p; r < *p + *(p + 1); r++)
3381 {
3382 regstart[r] = old_regstart[r];
3383
3384 /* xx why this test? */
3385 if ((int) old_regend[r] >= (int) regstart[r])
3386 regend[r] = old_regend[r];
3387 }
3388 }
3389 p2++;
3390 EXTRACT_NUMBER_AND_INCR(mcnt, p2);
3391 PUSH_FAILURE_POINT(p2 + mcnt, d);
3392 goto fail;
3393 }
3394 }
3395 p += 2;
3396 continue;
3397
3398 /* \<digit> has been turned into a `duplicate' command which is
3399 followed by the numeric value of <digit> as the register number. */
3400 case duplicate:
3401 {
3402 int regno = *p++; /* Get which register to match against */
3403 register unsigned char *d2, *dend2;
3404
3405 if (IS_ACTIVE(reg_info[regno])) break;
3406
3407 /* Where in input to try to start matching. */
3408 d2 = regstart[regno];
3409 if (REG_UNSET(d2)) break;
3410
3411 /* Where to stop matching; if both the place to start and
3412 the place to stop matching are in the same string, then
3413 set to the place to stop, otherwise, for now have to use
3414 the end of the first string. */
3415
3416 dend2 = regend[regno];
3417 if (REG_UNSET(dend2)) break;
3418 for (;;)
3419 {
3420 /* At end of register contents => success */
3421 if (d2 == dend2) break;
3422
3423 /* If necessary, advance to next segment in data. */
3424 PREFETCH;
3425
3426 /* How many characters left in this segment to match. */
3427 mcnt = dend - d;
3428
3429 /* Want how many consecutive characters we can match in
3430 one shot, so, if necessary, adjust the count. */
3431 if (mcnt > dend2 - d2)
3432 mcnt = dend2 - d2;
3433
3434 /* Compare that many; failure if mismatch, else move
3435 past them. */
3436 if ((options & RE_OPTION_IGNORECASE)
3437 ? memcmp_translate(d, d2, mcnt)
3438 : memcmp((char *)d, (char *)d2, mcnt))
3439 goto fail;
3440 d += mcnt, d2 += mcnt;
3441 }
3442 }
3443 break;
3444
3445 case start_nowidth:
3446 PUSH_FAILURE_POINT(0, d);
3447 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3448 STORE_NUMBER(p+mcnt, stackp - stackb);
3449 continue;
3450
3451 case stop_nowidth:
3452 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3453 stackp = stackb + mcnt;
3454 d = stackp[-2];
3455 POP_FAILURE_POINT();
3456 continue;
3457
3458 case pop_and_fail:
3459 EXTRACT_NUMBER(mcnt, p+1);
3460 stackp = stackb + mcnt;
3461 POP_FAILURE_POINT();
3462 goto fail;
3463
3464 case anychar:
3465 PREFETCH;
3466 /* Match anything but a newline, maybe even a null. */
3467 if (ismbchar(*d)) {
3468 if (d + 1 == dend || d[1] == '\n' || d[1] == '\0')
3469 goto fail;
3470 SET_REGS_MATCHED;
3471 d += 2;
3472 break;
3473 }
3474 if (((TRANSLATE_P()) ? translate[*d] : *d) == '\n')
3475 goto fail;
3476 SET_REGS_MATCHED;
3477 d++;
3478 break;
3479
3480 case charset:
3481 case charset_not:
3482 {
3483 int not; /* Nonzero for charset_not. */
3484 int half; /* 2 if need to match latter half of mbc */
3485 int c;
3486
3487 PREFETCH;
3488 c = (unsigned char)*d;
3489 if (ismbchar(c)) {
3490 if (d + 1 != dend) {
3491 c <<= 8;
3492 c |= (unsigned char)d[1];
3493 }
3494 }
3495 else if (TRANSLATE_P())
3496 c = (unsigned char)translate[c];
3497
3498 half = not = is_in_list(c, p);
3499 if (*(p - 1) == (unsigned char)charset_not) {
3500 not = !not;
3501 }
3502
3503 p += 1 + *p + 2 + EXTRACT_UNSIGNED(&p[1 + *p])*4;
3504
3505 if (!not) goto fail;
3506 SET_REGS_MATCHED;
3507
3508 d++;
3509 if (half != 2 && d != dend && c >= 1 << BYTEWIDTH)
3510 d++;
3511 break;
3512 }
3513
3514 case begline:
3515 if (size == 0
3516 || AT_STRINGS_BEG(d)
3517 || (d && d[-1] == '\n'))
3518 break;
3519 else
3520 goto fail;
3521
3522 case endline:
3523 if (AT_STRINGS_END(d) || *d == '\n')
3524 break;
3525 goto fail;
3526
3527 /* Match at the very beginning of the string. */
3528 case begbuf:
3529 if (AT_STRINGS_BEG(d))
3530 break;
3531 goto fail;
3532
3533 /* Match at the very end of the data. */
3534 case endbuf:
3535 if (AT_STRINGS_END(d))
3536 break;
3537 goto fail;
3538
3539 /* Match at the very end of the data. */
3540 case endbuf2:
3541 if (AT_STRINGS_END(d))
3542 break;
3543 /* .. or newline just before the end of the data. */
3544 if (*d == '\n' && AT_STRINGS_END(d+1))
3545 break;
3546 goto fail;
3547
3548 /* `or' constructs are handled by starting each alternative with
3549 an on_failure_jump that points to the start of the next
3550 alternative. Each alternative except the last ends with a
3551 jump to the joining point. (Actually, each jump except for
3552 the last one really jumps to the following jump, because
3553 tensioning the jumps is a hassle.) */
3554
3555 /* The start of a stupid repeat has an on_failure_jump that points
3556 past the end of the repeat text. This makes a failure point so
3557 that on failure to match a repetition, matching restarts past
3558 as many repetitions have been found with no way to fail and
3559 look for another one. */
3560
3561 /* A smart repeat is similar but loops back to the on_failure_jump
3562 so that each repetition makes another failure point. */
3563
3564 case on_failure_jump:
3565 on_failure:
3566 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3567 PUSH_FAILURE_POINT(p + mcnt, d);
3568 continue;
3569
3570 /* The end of a smart repeat has a maybe_finalize_jump back.
3571 Change it either to a finalize_jump or an ordinary jump. */
3572 case maybe_finalize_jump:
3573 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3574 {
3575 register unsigned char *p2 = p;
3576
3577 /* Compare the beginning of the repeat with what in the
3578 pattern follows its end. If we can establish that there
3579 is nothing that they would both match, i.e., that we
3580 would have to backtrack because of (as in, e.g., `a*a')
3581 then we can change to pop_failure_jump, because we'll
3582 never have to backtrack.
3583
3584 This is not true in the case of alternatives: in
3585 `(a|ab)*' we do need to backtrack to the `ab' alternative
3586 (e.g., if the string was `ab'). But instead of trying to
3587 detect that here, the alternative has put on a dummy
3588 failure point which is what we will end up popping. */
3589
3590 /* Skip over open/close-group commands. */
3591 while (p2 + 2 < pend
3592 && ((enum regexpcode) *p2 == stop_memory
3593 || (enum regexpcode) *p2 == start_memory))
3594 p2 += 3; /* Skip over args, too. */
3595
3596 if (p2 == pend)
3597 p[-3] = (unsigned char)finalize_jump;
3598 else if (*p2 == (unsigned char)exactn
3599 || *p2 == (unsigned char)endline)
3600 {
3601 register int c = *p2 == (unsigned char)endline ? '\n' : p2[2];
3602 register unsigned char *p1 = p + mcnt;
3603 /* p1[0] ... p1[2] are an on_failure_jump.
3604 Examine what follows that. */
3605 if (p1[3] == (unsigned char)exactn && p1[5] != c)
3606 p[-3] = (unsigned char)finalize_jump;
3607 else if (p1[3] == (unsigned char)charset
3608 || p1[3] == (unsigned char)charset_not) {
3609 int not;
3610 if (ismbchar(c))
3611 c = c << 8 | p2[3];
3612 /* `is_in_list()' is TRUE if c would match */
3613 /* That means it is not safe to finalize. */
3614 not = is_in_list(c, p1 + 4);
3615 if (p1[3] == (unsigned char)charset_not)
3616 not = !not;
3617 if (!not)
3618 p[-3] = (unsigned char)finalize_jump;
3619 }
3620 }
3621 }
3622 p -= 2; /* Point at relative address again. */
3623 if (p[-1] != (unsigned char)finalize_jump)
3624 {
3625 p[-1] = (unsigned char)jump;
3626 goto nofinalize;
3627 }
3628 /* Note fall through. */
3629
3630 /* The end of a stupid repeat has a finalize_jump back to the
3631 start, where another failure point will be made which will
3632 point to after all the repetitions found so far. */
3633
3634 /* Take off failure points put on by matching on_failure_jump
3635 because didn't fail. Also remove the register information
3636 put on by the on_failure_jump. */
3637 case finalize_jump:
3638 POP_FAILURE_POINT();
3639 /* Note fall through. */
3640
3641 /* Jump without taking off any failure points. */
3642 case jump:
3643 nofinalize:
3644 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3645 p += mcnt;
3646 continue;
3647
3648 /* We need this opcode so we can detect where alternatives end
3649 in `group_match_null_string_p' et al. */
3650 case jump_past_alt:
3651 goto nofinalize;
3652
3653 case dummy_failure_jump:
3654 /* Normally, the on_failure_jump pushes a failure point, which
3655 then gets popped at finalize_jump. We will end up at
3656 finalize_jump, also, and with a pattern of, say, `a+', we
3657 are skipping over the on_failure_jump, so we have to push
3658 something meaningless for finalize_jump to pop. */
3659 PUSH_FAILURE_POINT(0, 0);
3660 goto nofinalize;
3661
3662 /* At the end of an alternative, we need to push a dummy failure
3663 point in case we are followed by a `finalize_jump', because
3664 we don't want the failure point for the alternative to be
3665 popped. For example, matching `(a|ab)*' against `aab'
3666 requires that we match the `ab' alternative. */
3667 case push_dummy_failure:
3668 /* See comments just above at `dummy_failure_jump' about the
3669 two zeroes. */
3670 PUSH_FAILURE_POINT(0, 0);
3671 break;
3672
3673 /* Have to succeed matching what follows at least n times. Then
3674 just handle like an on_failure_jump. */
3675 case succeed_n:
3676 EXTRACT_NUMBER(mcnt, p + 2);
3677 /* Originally, this is how many times we HAVE to succeed. */
3678 if (mcnt > 0)
3679 {
3680 mcnt--;
3681 p += 2;
3682 STORE_NUMBER_AND_INCR(p, mcnt);
3683 PUSH_FAILURE_POINT(0, 0);
3684 }
3685 else if (mcnt == 0)
3686 {
3687 p[2] = unused;
3688 p[3] = unused;
3689 goto on_failure;
3690 }
3691 continue;
3692
3693 case jump_n:
3694 EXTRACT_NUMBER(mcnt, p + 2);
3695 /* Originally, this is how many times we CAN jump. */
3696 if (mcnt)
3697 {
3698 mcnt--;
3699 STORE_NUMBER(p + 2, mcnt);
3700 goto nofinalize; /* Do the jump without taking off
3701 any failure points. */
3702 }
3703 /* If don't have to jump any more, skip over the rest of command. */
3704 else
3705 p += 4;
3706 continue;
3707
3708 case set_number_at:
3709 {
3710 register unsigned char *p1;
3711
3712 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3713 p1 = p + mcnt;
3714 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3715 STORE_NUMBER(p1, mcnt);
3716 continue;
3717 }
3718
3719 case try_next:
3720 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3721 if (p + mcnt < pend) {
3722 PUSH_FAILURE_POINT(p, d);
3723 stackp[-1] = (unsigned char*)1;
3724 }
3725 p += mcnt;
3726 continue;
3727
3728 case finalize_push:
3729 POP_FAILURE_POINT();
3730 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3731 PUSH_FAILURE_POINT(p + mcnt, d);
3732 stackp[-1] = (unsigned char*)1;
3733 continue;
3734
3735 case finalize_push_n:
3736 EXTRACT_NUMBER(mcnt, p + 2);
3737 /* Originally, this is how many times we CAN jump. */
3738 if (mcnt) {
3739 int pos, i;
3740
3741 mcnt--;
3742 STORE_NUMBER(p + 2, mcnt);
3743 EXTRACT_NUMBER(pos, p);
3744 EXTRACT_NUMBER(i, p+pos+5);
3745 if (i > 0) goto nofinalize;
3746 POP_FAILURE_POINT();
3747 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3748 PUSH_FAILURE_POINT(p + mcnt, d);
3749 stackp[-1] = (unsigned char*)1;
3750 p += 2; /* skip n */
3751 }
3752 /* If don't have to push any more, skip over the rest of command. */
3753 else
3754 p += 4;
3755 continue;
3756
3757 /* Ignore these. Used to ignore the n of succeed_n's which
3758 currently have n == 0. */
3759 case unused:
3760 continue;
3761
3762 case casefold_on:
3763 options |= RE_OPTION_IGNORECASE;
3764 continue;
3765
3766 case casefold_off:
3767 options &= ~RE_OPTION_IGNORECASE;
3768 continue;
3769
3770 case wordbound:
3771 if (AT_STRINGS_BEG(d)) {
3772 if (IS_A_LETTER(d)) break;
3773 else goto fail;
3774 }
3775 if (AT_STRINGS_BEG(d)) {
3776 if (IS_A_LETTER(d-1)) break;
3777 else goto fail;
3778 }
3779 if (IS_A_LETTER(d - 1) != IS_A_LETTER(d))
3780 break;
3781 goto fail;
3782
3783 case notwordbound:
3784 if (AT_STRINGS_BEG(d)) {
3785 if (IS_A_LETTER(d)) goto fail;
3786 else break;
3787 }
3788 if (AT_STRINGS_END(d)) {
3789 if (IS_A_LETTER(d-1)) goto fail;
3790 else break;
3791 }
3792 if (IS_A_LETTER(d - 1) != IS_A_LETTER(d))
3793 goto fail;
3794 break;
3795
3796 case wordbeg:
3797 if (IS_A_LETTER(d) && (AT_STRINGS_BEG(d) || !IS_A_LETTER(d - 1)))
3798 break;
3799 goto fail;
3800
3801 case wordend:
3802 if (!AT_STRINGS_BEG(d) && IS_A_LETTER(d - 1)
3803 && (!IS_A_LETTER(d) || AT_STRINGS_END(d)))
3804 break;
3805 goto fail;
3806
3807 case wordchar:
3808 PREFETCH;
3809 if (!IS_A_LETTER(d))
3810 goto fail;
3811 d++;
3812 SET_REGS_MATCHED;
3813 break;
3814
3815 case notwordchar:
3816 PREFETCH;
3817 if (IS_A_LETTER(d))
3818 goto fail;
3819 if (ismbchar(*d) && d + 1 != dend)
3820 d++;
3821 d++;
3822 SET_REGS_MATCHED;
3823 break;
3824
3825 case exactn:
3826 /* Match the next few pattern characters exactly.
3827 mcnt is how many characters to match. */
3828 mcnt = *p++;
3829 /* This is written out as an if-else so we don't waste time
3830 testing `translate' inside the loop. */
3831 if (TRANSLATE_P())
3832 {
3833 do
3834 {
3835 unsigned char c;
3836
3837 PREFETCH;
3838 c = *d++;
3839 if (*p == 0xff) {
3840 p++;
3841 if (!--mcnt
3842 || AT_STRINGS_END(d)
3843 || (unsigned char)*d++ != (unsigned char)*p++)
3844 goto fail;
3845 continue;
3846 }
3847 if (ismbchar(c)) {
3848 if (c != (unsigned char)*p++
3849 || !--mcnt /* redundant check if pattern was
3850 compiled properly. */
3851 || AT_STRINGS_END(d)
3852 || (unsigned char)*d++ != (unsigned char)*p++)
3853 goto fail;
3854 continue;
3855 }
3856 /* compiled code translation needed for ruby */
3857 if ((unsigned char)translate[c]
3858 != (unsigned char)translate[*p++])
3859 goto fail;
3860 }
3861 while (--mcnt);
3862 }
3863 else
3864 {
3865 do
3866 {
3867 PREFETCH;
3868 if (*p == 0xff) {p++; mcnt--;}
3869 if (*d++ != *p++) goto fail;
3870 }
3871 while (--mcnt);
3872 }
3873 SET_REGS_MATCHED;
3874 break;
3875 }
3876 while (stackp != stackb && (int)stackp[-1] == 1)
3877 POP_FAILURE_POINT();
3878 continue; /* Successfully executed one pattern command; keep going. */
3879
3880 /* Jump here if any matching operation fails. */
3881 fail:
3882 if (stackp != stackb)
3883 /* A restart point is known. Restart there and pop it. */
3884 {
3885 short last_used_reg, this_reg;
3886
3887 /* If this failure point is from a dummy_failure_point, just
3888 skip it. */
3889 if (stackp[-3] == 0) {
3890 POP_FAILURE_POINT();
3891 goto fail;
3892 }
3893 stackp--; /* discard flag */
3894 d = *--stackp;
3895 p = *--stackp;
3896 /* Restore register info. */
3897 last_used_reg = (long) *--stackp;
3898
3899 /* Make the ones that weren't saved -1 or 0 again. */
3900 for (this_reg = num_regs - 1; this_reg > last_used_reg; this_reg--)
3901 {
3902 regend[this_reg] = REG_UNSET_VALUE;
3903 regstart[this_reg] = REG_UNSET_VALUE;
3904 IS_ACTIVE(reg_info[this_reg]) = 0;
3905 MATCHED_SOMETHING(reg_info[this_reg]) = 0;
3906 }
3907
3908 /* And restore the rest from the stack. */
3909 for ( ; this_reg > 0; this_reg--)
3910 {
3911 reg_info[this_reg].word = *--stackp;
3912 regend[this_reg] = *--stackp;
3913 regstart[this_reg] = *--stackp;
3914 }
3915 if (p < pend)
3916 {
3917 int is_a_jump_n = 0;
3918 unsigned char *p1 = p;
3919
3920 /* If failed to a backwards jump that's part of a repetition
3921 loop, need to pop this failure point and use the next one. */
3922 switch ((enum regexpcode) *p1)
3923 {
3924 case jump_n:
3925 case finalize_push_n:
3926 is_a_jump_n = 1;
3927 case maybe_finalize_jump:
3928 case finalize_jump:
3929 case finalize_push:
3930 case jump:
3931 p1++;
3932 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3933 p1 += mcnt;
3934
3935 if ((is_a_jump_n && (enum regexpcode) *p1 == succeed_n)
3936 || (!is_a_jump_n
3937 && (enum regexpcode) *p1 == on_failure_jump))
3938 goto fail;
3939 break;
3940 default:
3941 /* do nothing */ ;
3942 }
3943 }
3944 }
3945 else
3946 break; /* Matching at this starting point really fails. */
3947 }
3948
3949 if (best_regs_set)
3950 goto restore_best_regs;
3951
3952 FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
3953 }
3954
3955
3956 /* We are passed P pointing to a register number after a start_memory.
3957
3958 Return true if the pattern up to the corresponding stop_memory can
3959 match the empty string, and false otherwise.
3960
3961 If we find the matching stop_memory, sets P to point to one past its number.
3962 Otherwise, sets P to an undefined byte less than or equal to END.
3963
3964 We don't handle duplicates properly (yet). */
3965
3966 static int
group_match_null_string_p(p,end,reg_info)3967 group_match_null_string_p (p, end, reg_info)
3968 unsigned char **p, *end;
3969 register_info_type *reg_info;
3970 {
3971 int mcnt;
3972 /* Point to after the args to the start_memory. */
3973 unsigned char *p1 = *p + 2;
3974
3975 while (p1 < end)
3976 {
3977 /* Skip over opcodes that can match nothing, and return true or
3978 false, as appropriate, when we get to one that can't, or to the
3979 matching stop_memory. */
3980
3981 switch ((enum regexpcode) *p1)
3982 {
3983 /* Could be either a loop or a series of alternatives. */
3984 case on_failure_jump:
3985 p1++;
3986 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3987
3988 /* If the next operation is not a jump backwards in the
3989 pattern. */
3990
3991 if (mcnt >= 0)
3992 {
3993 /* Go through the on_failure_jumps of the alternatives,
3994 seeing if any of the alternatives cannot match nothing.
3995 The last alternative starts with only a jump,
3996 whereas the rest start with on_failure_jump and end
3997 with a jump, e.g., here is the pattern for `a|b|c':
3998
3999 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4000 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4001 /exactn/1/c
4002
4003 So, we have to first go through the first (n-1)
4004 alternatives and then deal with the last one separately. */
4005
4006
4007 /* Deal with the first (n-1) alternatives, which start
4008 with an on_failure_jump (see above) that jumps to right
4009 past a jump_past_alt. */
4010
4011 while ((enum regexpcode) p1[mcnt-3] == jump_past_alt)
4012 {
4013 /* `mcnt' holds how many bytes long the alternative
4014 is, including the ending `jump_past_alt' and
4015 its number. */
4016
4017 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4018 reg_info))
4019 return 0;
4020
4021 /* Move to right after this alternative, including the
4022 jump_past_alt. */
4023 p1 += mcnt;
4024
4025 /* Break if it's the beginning of an n-th alternative
4026 that doesn't begin with an on_failure_jump. */
4027 if ((enum regexpcode) *p1 != on_failure_jump)
4028 break;
4029
4030 /* Still have to check that it's not an n-th
4031 alternative that starts with an on_failure_jump. */
4032 p1++;
4033 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4034 if ((enum regexpcode) p1[mcnt-3] != jump_past_alt)
4035 {
4036 /* Get to the beginning of the n-th alternative. */
4037 p1 -= 3;
4038 break;
4039 }
4040 }
4041
4042 /* Deal with the last alternative: go back and get number
4043 of the `jump_past_alt' just before it. `mcnt' contains
4044 the length of the alternative. */
4045 EXTRACT_NUMBER (mcnt, p1 - 2);
4046 #if 0
4047 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4048 return 0;
4049 #endif
4050 p1 += mcnt; /* Get past the n-th alternative. */
4051 } /* if mcnt > 0 */
4052 break;
4053
4054
4055 case stop_memory:
4056 *p = p1 + 2;
4057 return 1;
4058
4059
4060 default:
4061 if (!common_op_match_null_string_p (&p1, end, reg_info))
4062 return 0;
4063 }
4064 } /* while p1 < end */
4065
4066 return 0;
4067 } /* group_match_null_string_p */
4068
4069
4070 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4071 It expects P to be the first byte of a single alternative and END one
4072 byte past the last. The alternative can contain groups. */
4073
4074 static int
alt_match_null_string_p(p,end,reg_info)4075 alt_match_null_string_p (p, end, reg_info)
4076 unsigned char *p, *end;
4077 register_info_type *reg_info;
4078 {
4079 int mcnt;
4080 unsigned char *p1 = p;
4081
4082 while (p1 < end)
4083 {
4084 /* Skip over opcodes that can match nothing, and break when we get
4085 to one that can't. */
4086
4087 switch ((enum regexpcode) *p1)
4088 {
4089 /* It's a loop. */
4090 case on_failure_jump:
4091 p1++;
4092 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4093 p1 += mcnt;
4094 break;
4095
4096 default:
4097 if (!common_op_match_null_string_p (&p1, end, reg_info))
4098 return 0;
4099 }
4100 } /* while p1 < end */
4101
4102 return 1;
4103 } /* alt_match_null_string_p */
4104
4105
4106 /* Deals with the ops common to group_match_null_string_p and
4107 alt_match_null_string_p.
4108
4109 Sets P to one after the op and its arguments, if any. */
4110
4111 static int
common_op_match_null_string_p(p,end,reg_info)4112 common_op_match_null_string_p (p, end, reg_info)
4113 unsigned char **p, *end;
4114 register_info_type *reg_info;
4115 {
4116 int mcnt;
4117 int ret;
4118 int reg_no;
4119 unsigned char *p1 = *p;
4120
4121 switch ((enum regexpcode) *p1++)
4122 {
4123 case unused:
4124 case begline:
4125 case endline:
4126 case begbuf:
4127 case endbuf:
4128 case endbuf2:
4129 case wordbeg:
4130 case wordend:
4131 case wordbound:
4132 case notwordbound:
4133 #ifdef emacs
4134 case before_dot:
4135 case at_dot:
4136 case after_dot:
4137 #endif
4138 break;
4139
4140 case start_memory:
4141 reg_no = *p1;
4142 ret = group_match_null_string_p (&p1, end, reg_info);
4143
4144 /* Have to set this here in case we're checking a group which
4145 contains a group and a back reference to it. */
4146
4147 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4148 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4149
4150 if (!ret)
4151 return 0;
4152 break;
4153
4154 /* If this is an optimized succeed_n for zero times, make the jump. */
4155 case jump:
4156 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4157 if (mcnt >= 0)
4158 p1 += mcnt;
4159 else
4160 return 0;
4161 break;
4162
4163 case succeed_n:
4164 /* Get to the number of times to succeed. */
4165 p1 += 2;
4166 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4167
4168 if (mcnt == 0)
4169 {
4170 p1 -= 4;
4171 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4172 p1 += mcnt;
4173 }
4174 else
4175 return 0;
4176 break;
4177
4178 case duplicate:
4179 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4180 return 0;
4181 break;
4182
4183 case set_number_at:
4184 p1 += 4;
4185
4186 default:
4187 /* All other opcodes mean we cannot match the empty string. */
4188 return 0;
4189 }
4190
4191 *p = p1;
4192 return 1;
4193 } /* common_op_match_null_string_p */
4194
4195
4196 static int
memcmp_translate(s1,s2,len)4197 memcmp_translate(s1, s2, len)
4198 unsigned char *s1, *s2;
4199 register int len;
4200 {
4201 register unsigned char *p1 = s1, *p2 = s2, c;
4202 while (len)
4203 {
4204 c = *p1++;
4205 if (ismbchar(c)) {
4206 if (c != *p2++ || !--len || *p1++ != *p2++)
4207 return 1;
4208 }
4209 else
4210 if (translate[c] != translate[*p2++])
4211 return 1;
4212 len--;
4213 }
4214 return 0;
4215 }
4216
4217 void
re_copy_registers(regs1,regs2)4218 re_copy_registers(regs1, regs2)
4219 struct re_registers *regs1, *regs2;
4220 {
4221 int i;
4222
4223 if (regs1 == regs2) return;
4224 if (regs1->allocated == 0) {
4225 regs1->beg = TMALLOC(regs2->num_regs, int);
4226 regs1->end = TMALLOC(regs2->num_regs, int);
4227 regs1->allocated = regs2->num_regs;
4228 }
4229 else if (regs1->allocated < regs2->num_regs) {
4230 TREALLOC(regs1->beg, regs2->num_regs, int);
4231 TREALLOC(regs1->end, regs2->num_regs, int);
4232 regs1->allocated = regs2->num_regs;
4233 }
4234 for (i=0; i<regs2->num_regs; i++) {
4235 regs1->beg[i] = regs2->beg[i];
4236 regs1->end[i] = regs2->end[i];
4237 }
4238 regs1->num_regs = regs2->num_regs;
4239 }
4240
4241 void
re_free_registers(regs)4242 re_free_registers(regs)
4243 struct re_registers *regs;
4244 {
4245 if (regs->allocated == 0) return;
4246 if (regs->beg) free(regs->beg);
4247 if (regs->end) free(regs->end);
4248 }
4249
4250 /* Functions for multi-byte support.
4251 Created for grep multi-byte extension Jul., 1993 by t^2 (Takahiro Tanimoto)
4252 Last change: Jul. 9, 1993 by t^2 */
4253 static const unsigned char mbctab_ascii[] = {
4254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4255 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4256 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4258 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4260 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4263 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4267 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4269 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4270 };
4271
4272 static const unsigned char mbctab_euc[] = { /* 0xA1-0xFE */
4273 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4274 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4275 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4276 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4277 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4278 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4280 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4281 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
4282 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4283 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4284 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4285 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4286 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4287 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4288 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
4289 };
4290
4291 static const unsigned char mbctab_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
4292 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4294 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4296 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4299 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4300 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4301 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4302 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4303 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4304 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4305 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4306 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4307 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
4308 };
4309
4310 const unsigned char *mbctab = mbctab_ascii;
4311 int current_mbctype = MBCTYPE_ASCII;
4312
4313 void
re_mbcinit(mbctype)4314 re_mbcinit(mbctype)
4315 int mbctype;
4316 {
4317 switch (mbctype) {
4318 case MBCTYPE_ASCII:
4319 mbctab = mbctab_ascii;
4320 current_mbctype = MBCTYPE_ASCII;
4321 break;
4322 case MBCTYPE_EUC:
4323 mbctab = mbctab_euc;
4324 current_mbctype = MBCTYPE_EUC;
4325 break;
4326 case MBCTYPE_SJIS:
4327 mbctab = mbctab_sjis;
4328 current_mbctype = MBCTYPE_SJIS;
4329 break;
4330 }
4331 }
4332