1 /* Emacs regular expression matching and search
2 
3    Copyright (C) 1993-2021 Free Software Foundation, Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 /* TODO:
19    - structure the opcode space into opcode+flag.
20    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21      need to modify the compiled regexp so that re_search can be reentrant.
22    - get rid of on_failure_jump_smart by doing the optimization in re_comp
23      rather than at run-time, so that re_search can be reentrant.
24 */
25 
26 #include <config.h>
27 
28 #include "regex-emacs.h"
29 
30 #include <stdlib.h>
31 
32 #include "character.h"
33 #include "buffer.h"
34 #include "syntax.h"
35 #include "category.h"
36 
37 /* Maximum number of duplicates an interval can allow.  Some systems
38    define this in other header files, but we want our value, so remove
39    any previous define.  Repeat counts are stored in opcodes as 2-byte
40    unsigned integers.  */
41 #ifdef RE_DUP_MAX
42 # undef RE_DUP_MAX
43 #endif
44 #define RE_DUP_MAX (0xffff)
45 
46 /* Make syntax table lookup grant data in gl_state.  */
47 #define SYNTAX(c) syntax_property (c, 1)
48 
49 /* Convert the pointer to the char to BEG-based offset from the start.  */
50 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51 /* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52    result to get the right base index.  */
53 #define POS_AS_IN_BUFFER(p)                                    \
54   ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55 
56 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
57 #define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
58 #define RE_STRING_CHAR(p, multibyte) \
59   (multibyte ? STRING_CHAR (p) : *(p))
60 #define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61   (multibyte ? string_char_and_length (p, &(len)) : ((len) = 1, *(p)))
62 
63 #define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64 
65 #define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66 
67 /* Set C a (possibly converted to multibyte) character before P.  P
68    points into a string which is the virtual concatenation of STR1
69    (which ends at END1) or STR2 (which ends at END2).  */
70 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			     \
71   do {									     \
72     if (target_multibyte)						     \
73       {									     \
74 	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
75 	re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1);   \
76 	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp))		     \
77 	  continue;							     \
78 	c = STRING_CHAR (dtemp);					     \
79       }									     \
80     else								     \
81       {									     \
82 	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
83 	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
84       }									     \
85   } while (false)
86 
87 /* Set C a (possibly converted to multibyte) character at P, and set
88    LEN to the byte length of that character.  */
89 #define GET_CHAR_AFTER(c, p, len)		\
90   do {						\
91     if (target_multibyte)			\
92       (c) = string_char_and_length (p, &(len));	\
93     else					\
94       {						\
95 	(c) = *p;				\
96 	len = 1;				\
97 	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
98       }						\
99    } while (false)
100 
101 /* 1 if C is an ASCII character.  */
102 #define IS_REAL_ASCII(c) ((c) < 0200)
103 
104 /* 1 if C is a unibyte character.  */
105 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106 
107 /* The Emacs definitions should not be directly affected by locales.  */
108 
109 /* In Emacs, these are only used for single-byte characters.  */
110 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
111 #define ISCNTRL(c) ((c) < ' ')
112 #define ISXDIGIT(c) (0 <= char_hexdigit (c))
113 
114 /* The rest must handle multibyte characters.  */
115 
116 #define ISBLANK(c) (IS_REAL_ASCII (c)			\
117                      ? ((c) == ' ' || (c) == '\t')      \
118                      : blankp (c))
119 
120 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
121 		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
122 		     : graphicp (c))
123 
124 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
125 		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
126 		     : printablep (c))
127 
128 #define ISALNUM(c) (IS_REAL_ASCII (c)			\
129 		    ? (((c) >= 'a' && (c) <= 'z')	\
130 		       || ((c) >= 'A' && (c) <= 'Z')	\
131 		       || ((c) >= '0' && (c) <= '9'))	\
132 		    : alphanumericp (c))
133 
134 #define ISALPHA(c) (IS_REAL_ASCII (c)			\
135 		    ? (((c) >= 'a' && (c) <= 'z')	\
136 		       || ((c) >= 'A' && (c) <= 'Z'))	\
137 		    : alphabeticp (c))
138 
139 #define ISLOWER(c) lowercasep (c)
140 
141 #define ISPUNCT(c) (IS_REAL_ASCII (c)				\
142 		    ? ((c) > ' ' && (c) < 0177			\
143 		       && !(((c) >= 'a' && (c) <= 'z')		\
144 		            || ((c) >= 'A' && (c) <= 'Z')	\
145 		            || ((c) >= '0' && (c) <= '9')))	\
146 		    : SYNTAX (c) != Sword)
147 
148 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149 
150 #define ISUPPER(c) uppercasep (c)
151 
152 #define ISWORD(c) (SYNTAX (c) == Sword)
153 
154 /* Use alloca instead of malloc.  This is because using malloc in
155    re_search* or re_match* could cause memory leaks when C-g is used
156    in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157    via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158    memory is freed even in case of non-local exits); also, malloc is
159    slower and causes storage fragmentation.  On the other hand, malloc
160    is more portable, and easier to debug.
161 
162    Because we sometimes use alloca, some routines have to be macros,
163    not functions -- 'alloca'-allocated space disappears at the end of the
164    function it is called in.  */
165 
166 /* This may be adjusted in main(), if the stack is successfully grown.  */
167 ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
168 /* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
169 #define REGEX_USE_SAFE_ALLOCA					       \
170   USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171 
172 /* Assumes a 'char *destination' variable.  */
173 #define REGEX_REALLOCATE(source, osize, nsize)				\
174   (destination = SAFE_ALLOCA (nsize),					\
175    memcpy (destination, source, osize))
176 
177 /* True if 'size1' is non-NULL and PTR is pointing anywhere inside
178    'string1' or just past its end.  This works if PTR is NULL, which is
179    a good thing.  */
180 #define FIRST_STRING_P(ptr)					\
181   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
182 
183 #define BYTEWIDTH 8 /* In bits.  */
184 
185 /* Type of source-pattern and string chars.  */
186 typedef const unsigned char re_char;
187 
188 static void re_compile_fastmap (struct re_pattern_buffer *);
189 static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 				     re_char *string1, ptrdiff_t size1,
191 				     re_char *string2, ptrdiff_t size2,
192 				     ptrdiff_t pos,
193 				     struct re_registers *regs,
194 				     ptrdiff_t stop);
195 
196 /* These are the command codes that appear in compiled regular
197    expressions.  Some opcodes are followed by argument bytes.  A
198    command code can specify any interpretation whatsoever for its
199    arguments.  Zero bytes may appear in the compiled regular expression.  */
200 
201 typedef enum
202 {
203   no_op = 0,
204 
205   /* Succeed right away--no more backtracking.  */
206   succeed,
207 
208 	/* Followed by one byte giving n, then by n literal bytes.  */
209   exactn,
210 
211 	/* Matches any (more or less) character.  */
212   anychar,
213 
214 	/* Matches any one char belonging to specified set.  First
215 	   following byte is number of bitmap bytes.  Then come bytes
216 	   for a bitmap saying which chars are in.  Bits in each byte
217 	   are ordered low-bit-first.  A character is in the set if its
218 	   bit is 1.  A character too large to have a bit in the map is
219 	   automatically not in the set.
220 
221 	   If the length byte has the 0x80 bit set, then that stuff
222 	   is followed by a range table:
223 	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
224 		   See RANGE_TABLE_WORK_BITS below.
225 	       2 bytes, the number of pairs that follow (upto 32767)
226 	       pairs, each 2 multibyte characters,
227 		   each multibyte character represented as 3 bytes.  */
228   charset,
229 
230 	/* Same parameters as charset, but match any character that is
231 	   not one of those specified.  */
232   charset_not,
233 
234 	/* Start remembering the text that is matched, for storing in a
235 	   register.  Followed by one byte with the register number, in
236 	   the range 0 to one less than the pattern buffer's re_nsub
237 	   field.  */
238   start_memory,
239 
240 	/* Stop remembering the text that is matched and store it in a
241 	   memory register.  Followed by one byte with the register
242 	   number, in the range 0 to one less than 're_nsub' in the
243 	   pattern buffer.  */
244   stop_memory,
245 
246 	/* Match a duplicate of something remembered. Followed by one
247 	   byte containing the register number.  */
248   duplicate,
249 
250 	/* Fail unless at beginning of line.  */
251   begline,
252 
253 	/* Fail unless at end of line.  */
254   endline,
255 
256 	/* Succeeds if at beginning of buffer.  */
257   begbuf,
258 
259 	/* Analogously, for end of buffer/string.  */
260   endbuf,
261 
262 	/* Followed by two byte relative address to which to jump.  */
263   jump,
264 
265 	/* Followed by two-byte relative address of place to resume at
266 	   in case of failure.  */
267   on_failure_jump,
268 
269 	/* Like on_failure_jump, but pushes a placeholder instead of the
270 	   current string position when executed.  */
271   on_failure_keep_string_jump,
272 
273 	/* Just like 'on_failure_jump', except that it checks that we
274 	   don't get stuck in an infinite loop (matching an empty string
275 	   indefinitely).  */
276   on_failure_jump_loop,
277 
278 	/* Just like 'on_failure_jump_loop', except that it checks for
279 	   a different kind of loop (the kind that shows up with non-greedy
280 	   operators).  This operation has to be immediately preceded
281 	   by a 'no_op'.  */
282   on_failure_jump_nastyloop,
283 
284 	/* A smart 'on_failure_jump' used for greedy * and + operators.
285 	   It analyzes the loop before which it is put and if the
286 	   loop does not require backtracking, it changes itself to
287 	   'on_failure_keep_string_jump' and short-circuits the loop,
288 	   else it just defaults to changing itself into 'on_failure_jump'.
289 	   It assumes that it is pointing to just past a 'jump'.  */
290   on_failure_jump_smart,
291 
292 	/* Followed by two-byte relative address and two-byte number n.
293 	   After matching N times, jump to the address upon failure.
294 	   Does not work if N starts at 0: use on_failure_jump_loop
295 	   instead.  */
296   succeed_n,
297 
298 	/* Followed by two-byte relative address, and two-byte number n.
299 	   Jump to the address N times, then fail.  */
300   jump_n,
301 
302 	/* Set the following two-byte relative address to the
303 	   subsequent two-byte number.  The address *includes* the two
304 	   bytes of number.  */
305   set_number_at,
306 
307   wordbeg,	/* Succeeds if at word beginning.  */
308   wordend,	/* Succeeds if at word end.  */
309 
310   wordbound,	/* Succeeds if at a word boundary.  */
311   notwordbound,	/* Succeeds if not at a word boundary.  */
312 
313   symbeg,       /* Succeeds if at symbol beginning.  */
314   symend,       /* Succeeds if at symbol end.  */
315 
316 	/* Matches any character whose syntax is specified.  Followed by
317 	   a byte which contains a syntax code, e.g., Sword.  */
318   syntaxspec,
319 
320 	/* Matches any character whose syntax is not that specified.  */
321   notsyntaxspec,
322 
323   at_dot,	/* Succeeds if at point.  */
324 
325   /* Matches any character whose category-set contains the specified
326      category.  The operator is followed by a byte which contains a
327      category code (mnemonic ASCII character).  */
328   categoryspec,
329 
330   /* Matches any character whose category-set does not contain the
331      specified category.  The operator is followed by a byte which
332      contains the category code (mnemonic ASCII character).  */
333   notcategoryspec
334 } re_opcode_t;
335 
336 /* Common operations on the compiled pattern.  */
337 
338 /* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
339 
340 #define STORE_NUMBER(destination, number)				\
341   do {									\
342     (destination)[0] = (number) & 0377;					\
343     (destination)[1] = (number) >> 8;					\
344   } while (false)
345 
346 /* Same as STORE_NUMBER, except increment DESTINATION to
347    the byte after where the number is stored.  Therefore, DESTINATION
348    must be an lvalue.  */
349 
350 #define STORE_NUMBER_AND_INCR(destination, number)			\
351   do {									\
352     STORE_NUMBER (destination, number);					\
353     (destination) += 2;							\
354   } while (false)
355 
356 /* Put into DESTINATION a number stored in two contiguous bytes starting
357    at SOURCE.  */
358 
359 #define EXTRACT_NUMBER(destination, source)				\
360   ((destination) = extract_number (source))
361 
362 static int
extract_number(re_char * source)363 extract_number (re_char *source)
364 {
365   signed char leading_byte = source[1];
366   return leading_byte * 256 + source[0];
367 }
368 
369 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
370    SOURCE must be an lvalue.  */
371 
372 #define EXTRACT_NUMBER_AND_INCR(destination, source)			\
373   ((destination) = extract_number_and_incr (&source))
374 
375 static int
extract_number_and_incr(re_char ** source)376 extract_number_and_incr (re_char **source)
377 {
378   int num = extract_number (*source);
379   *source += 2;
380   return num;
381 }
382 
383 /* Store a multibyte character in three contiguous bytes starting
384    DESTINATION, and increment DESTINATION to the byte after where the
385    character is stored.  Therefore, DESTINATION must be an lvalue.  */
386 
387 #define STORE_CHARACTER_AND_INCR(destination, character)	\
388   do {								\
389     (destination)[0] = (character) & 0377;			\
390     (destination)[1] = ((character) >> 8) & 0377;		\
391     (destination)[2] = (character) >> 16;			\
392     (destination) += 3;						\
393   } while (false)
394 
395 /* Put into DESTINATION a character stored in three contiguous bytes
396    starting at SOURCE.  */
397 
398 #define EXTRACT_CHARACTER(destination, source)	\
399   do {						\
400     (destination) = ((source)[0]		\
401 		     | ((source)[1] << 8)	\
402 		     | ((source)[2] << 16));	\
403   } while (false)
404 
405 
406 /* Macros for charset. */
407 
408 /* Size of bitmap of charset P in bytes.  P is a start of charset,
409    i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
410 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
411 
412 /* Nonzero if charset P has range table.  */
413 #define CHARSET_RANGE_TABLE_EXISTS_P(p)	 (((p)[1] & 0x80) != 0)
414 
415 /* Return the address of range table of charset P.  But not the start
416    of table itself, but the before where the number of ranges is
417    stored.  '2 +' means to skip re_opcode_t and size of bitmap,
418    and the 2 bytes of flags at the start of the range table.  */
419 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
420 
421 /* Extract the bit flags that start a range table.  */
422 #define CHARSET_RANGE_TABLE_BITS(p)		\
423   ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
424    + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425 
426 /* Return the address of end of RANGE_TABLE.  COUNT is number of
427    ranges (which is a pair of (start, end)) in the RANGE_TABLE.  '* 2'
428    is start of range and end of range.  '* 3' is size of each start
429    and end.  */
430 #define CHARSET_RANGE_TABLE_END(range_table, count)	\
431   ((range_table) + (count) * 2 * 3)
432 
433 /* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
434    (if the variable regex_emacs_debug is positive).  */
435 
436 #ifdef REGEX_EMACS_DEBUG
437 
438 /* Use standard I/O for debugging.  */
439 # include "sysstdio.h"
440 
441 static int regex_emacs_debug = -100000;
442 
443 # define DEBUG_STATEMENT(e) e
444 # define DEBUG_PRINT(...)                                       \
445   if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
446 # define DEBUG_COMPILES_ARGUMENTS
447 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
448   if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
449 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
450   if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
451 
452 static void
debug_putchar(int c)453 debug_putchar (int c)
454 {
455   if (c >= 32 && c <= 126)
456     putc (c, stderr);
457   else
458     {
459       unsigned int uc = c;
460       fprintf (stderr, "{%02x}", uc);
461     }
462 }
463 
464 /* Print the fastmap in human-readable form.  */
465 
466 static void
print_fastmap(char * fastmap)467 print_fastmap (char *fastmap)
468 {
469   bool was_a_range = false;
470   int i = 0;
471 
472   while (i < (1 << BYTEWIDTH))
473     {
474       if (fastmap[i++])
475 	{
476 	  was_a_range = false;
477 	  debug_putchar (i - 1);
478 	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
479 	    {
480 	      was_a_range = true;
481 	      i++;
482 	    }
483 	  if (was_a_range)
484 	    {
485 	      debug_putchar ('-');
486 	      debug_putchar (i - 1);
487 	    }
488 	}
489     }
490   putc ('\n', stderr);
491 }
492 
493 
494 /* Print a compiled pattern string in human-readable form, starting at
495    the START pointer into it and ending just before the pointer END.  */
496 
497 static void
print_partial_compiled_pattern(re_char * start,re_char * end)498 print_partial_compiled_pattern (re_char *start, re_char *end)
499 {
500   int mcnt, mcnt2;
501   re_char *p = start;
502   re_char *pend = end;
503 
504   if (start == NULL)
505     {
506       fputs ("(null)\n", stderr);
507       return;
508     }
509 
510   /* Loop over pattern commands.  */
511   while (p < pend)
512     {
513       fprintf (stderr, "%td:\t", p - start);
514 
515       switch ((re_opcode_t) *p++)
516 	{
517 	case no_op:
518 	  fputs ("/no_op", stderr);
519 	  break;
520 
521 	case succeed:
522 	  fputs ("/succeed", stderr);
523 	  break;
524 
525 	case exactn:
526 	  mcnt = *p++;
527 	  fprintf (stderr, "/exactn/%d", mcnt);
528 	  do
529 	    {
530 	      debug_putchar ('/');
531 	      debug_putchar (*p++);
532 	    }
533 	  while (--mcnt);
534 	  break;
535 
536 	case start_memory:
537 	  fprintf (stderr, "/start_memory/%d", *p++);
538 	  break;
539 
540 	case stop_memory:
541 	  fprintf (stderr, "/stop_memory/%d", *p++);
542 	  break;
543 
544 	case duplicate:
545 	  fprintf (stderr, "/duplicate/%d", *p++);
546 	  break;
547 
548 	case anychar:
549 	  fputs ("/anychar", stderr);
550 	  break;
551 
552 	case charset:
553 	case charset_not:
554 	  {
555 	    int c, last = -100;
556 	    bool in_range = false;
557 	    int length = CHARSET_BITMAP_SIZE (p - 1);
558 	    bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
559 
560 	    fprintf (stderr, "/charset [%s",
561 		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
562 
563 	    if (p + *p >= pend)
564 	      fputs (" !extends past end of pattern! ", stderr);
565 
566 	    for (c = 0; c < 256; c++)
567 	      if (c / 8 < length
568 		  && (p[1 + (c/8)] & (1 << (c % 8))))
569 		{
570 		  /* Are we starting a range?  */
571 		  if (last + 1 == c && ! in_range)
572 		    {
573 		      debug_putchar ('-');
574 		      in_range = true;
575 		    }
576 		  /* Have we broken a range?  */
577 		  else if (last + 1 != c && in_range)
578 		    {
579 		      debug_putchar (last);
580 		      in_range = false;
581 		    }
582 
583 		  if (! in_range)
584 		    debug_putchar (c);
585 
586 		  last = c;
587 	      }
588 
589 	    if (in_range)
590 	      debug_putchar (last);
591 
592 	    debug_putchar (']');
593 
594 	    p += 1 + length;
595 
596 	    if (has_range_table)
597 	      {
598 		int count;
599 		fputs ("has-range-table", stderr);
600 
601 		/* ??? Should print the range table; for now, just skip it.  */
602 		p += 2;		/* skip range table bits */
603 		EXTRACT_NUMBER_AND_INCR (count, p);
604 		p = CHARSET_RANGE_TABLE_END (p, count);
605 	      }
606 	  }
607 	  break;
608 
609 	case begline:
610 	  fputs ("/begline", stderr);
611 	  break;
612 
613 	case endline:
614 	  fputs ("/endline", stderr);
615 	  break;
616 
617 	case on_failure_jump:
618 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
619 	  fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
620 	  break;
621 
622 	case on_failure_keep_string_jump:
623 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
624 	  fprintf (stderr, "/on_failure_keep_string_jump to %td",
625 		   p + mcnt - start);
626 	  break;
627 
628 	case on_failure_jump_nastyloop:
629 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
630 	  fprintf (stderr, "/on_failure_jump_nastyloop to %td",
631 		   p + mcnt - start);
632 	  break;
633 
634 	case on_failure_jump_loop:
635 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
636 	  fprintf (stderr, "/on_failure_jump_loop to %td",
637 		   p + mcnt - start);
638 	  break;
639 
640 	case on_failure_jump_smart:
641 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
642 	  fprintf (stderr, "/on_failure_jump_smart to %td",
643 		   p + mcnt - start);
644 	  break;
645 
646 	case jump:
647 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
648 	  fprintf (stderr, "/jump to %td", p + mcnt - start);
649 	  break;
650 
651 	case succeed_n:
652 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
653 	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
654 	  fprintf (stderr, "/succeed_n to %td, %d times",
655 		   p - 2 + mcnt - start, mcnt2);
656 	  break;
657 
658 	case jump_n:
659 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
660 	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
661 	  fprintf (stderr, "/jump_n to %td, %d times",
662 		   p - 2 + mcnt - start, mcnt2);
663 	  break;
664 
665 	case set_number_at:
666 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
667 	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
668 	  fprintf (stderr, "/set_number_at location %td to %d",
669 		   p - 2 + mcnt - start, mcnt2);
670 	  break;
671 
672 	case wordbound:
673 	  fputs ("/wordbound", stderr);
674 	  break;
675 
676 	case notwordbound:
677 	  fputs ("/notwordbound", stderr);
678 	  break;
679 
680 	case wordbeg:
681 	  fputs ("/wordbeg", stderr);
682 	  break;
683 
684 	case wordend:
685 	  fputs ("/wordend", stderr);
686 	  break;
687 
688 	case symbeg:
689 	  fputs ("/symbeg", stderr);
690 	  break;
691 
692 	case symend:
693 	  fputs ("/symend", stderr);
694 	  break;
695 
696 	case syntaxspec:
697 	  fputs ("/syntaxspec", stderr);
698 	  mcnt = *p++;
699 	  fprintf (stderr, "/%d", mcnt);
700 	  break;
701 
702 	case notsyntaxspec:
703 	  fputs ("/notsyntaxspec", stderr);
704 	  mcnt = *p++;
705 	  fprintf (stderr, "/%d", mcnt);
706 	  break;
707 
708 	case at_dot:
709 	  fputs ("/at_dot", stderr);
710 	  break;
711 
712 	case categoryspec:
713 	  fputs ("/categoryspec", stderr);
714 	  mcnt = *p++;
715 	  fprintf (stderr, "/%d", mcnt);
716 	  break;
717 
718 	case notcategoryspec:
719 	  fputs ("/notcategoryspec", stderr);
720 	  mcnt = *p++;
721 	  fprintf (stderr, "/%d", mcnt);
722 	  break;
723 
724 	case begbuf:
725 	  fputs ("/begbuf", stderr);
726 	  break;
727 
728 	case endbuf:
729 	  fputs ("/endbuf", stderr);
730 	  break;
731 
732 	default:
733 	  fprintf (stderr, "?%d", *(p-1));
734 	}
735 
736       putc ('\n', stderr);
737     }
738 
739   fprintf (stderr, "%td:\tend of pattern.\n", p - start);
740 }
741 
742 
743 static void
print_compiled_pattern(struct re_pattern_buffer * bufp)744 print_compiled_pattern (struct re_pattern_buffer *bufp)
745 {
746   re_char *buffer = bufp->buffer;
747 
748   print_partial_compiled_pattern (buffer, buffer + bufp->used);
749   fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
750            bufp->used, bufp->allocated);
751 
752   if (bufp->fastmap_accurate && bufp->fastmap)
753     {
754       fputs ("fastmap: ", stderr);
755       print_fastmap (bufp->fastmap);
756     }
757 
758   fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
759   fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
760   fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
761   /* Perhaps we should print the translate table?  */
762 }
763 
764 
765 static void
print_double_string(re_char * where,re_char * string1,ptrdiff_t size1,re_char * string2,ptrdiff_t size2)766 print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
767 		     re_char *string2, ptrdiff_t size2)
768 {
769   if (where == NULL)
770     fputs ("(null)", stderr);
771   else
772     {
773       int i;
774       if (FIRST_STRING_P (where))
775 	{
776 	  for (i = 0; i < string1 + size1 - where; i++)
777 	    debug_putchar (where[i]);
778 	  where = string2;
779 	}
780 
781       for (i = 0; i < string2 + size2 - where; i++)
782         debug_putchar (where[i]);
783     }
784 }
785 
786 #else /* not REGEX_EMACS_DEBUG */
787 
788 # define DEBUG_STATEMENT(e)
789 # define DEBUG_PRINT(...)
790 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
791 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
792 
793 #endif /* not REGEX_EMACS_DEBUG */
794 
795 typedef enum
796 {
797   REG_NOERROR = 0,	/* Success.  */
798   REG_NOMATCH,		/* Didn't find a match (for regexec).  */
799 
800   /* POSIX regcomp return error codes.  (In the order listed in the
801      standard.)  An older version of this code supported the POSIX
802      API; this version continues to use these names internally.  */
803   REG_BADPAT,		/* Invalid pattern.  */
804   REG_ECOLLATE,		/* Not implemented.  */
805   REG_ECTYPE,		/* Invalid character class name.  */
806   REG_EESCAPE,		/* Trailing backslash.  */
807   REG_ESUBREG,		/* Invalid back reference.  */
808   REG_EBRACK,		/* Unmatched left bracket.  */
809   REG_EPAREN,		/* Parenthesis imbalance.  */
810   REG_EBRACE,		/* Unmatched \{.  */
811   REG_BADBR,		/* Invalid contents of \{\}.  */
812   REG_ERANGE,		/* Invalid range end.  */
813   REG_ESPACE,		/* Ran out of memory.  */
814   REG_BADRPT,		/* No preceding re for repetition op.  */
815 
816   /* Error codes we've added.  */
817   REG_EEND,		/* Premature end.  */
818   REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
819   REG_ERPAREN,		/* Unmatched ) or \); not returned from regcomp.  */
820   REG_ERANGEX,		/* Range striding over charsets.  */
821   REG_ESIZEBR           /* n or m too big in \{n,m\} */
822 } reg_errcode_t;
823 
824 static const char *re_error_msgid[] =
825   {
826    [REG_NOERROR] = "Success",
827    [REG_NOMATCH] = "No match",
828    [REG_BADPAT] = "Invalid regular expression",
829    [REG_ECOLLATE] = "Invalid collation character",
830    [REG_ECTYPE] = "Invalid character class name",
831    [REG_EESCAPE] = "Trailing backslash",
832    [REG_ESUBREG] = "Invalid back reference",
833    [REG_EBRACK] = "Unmatched [ or [^",
834    [REG_EPAREN] = "Unmatched ( or \\(",
835    [REG_EBRACE] = "Unmatched \\{",
836    [REG_BADBR] = "Invalid content of \\{\\}",
837    [REG_ERANGE] = "Invalid range end",
838    [REG_ESPACE] = "Memory exhausted",
839    [REG_BADRPT] = "Invalid preceding regular expression",
840    [REG_EEND] = "Premature end of regular expression",
841    [REG_ESIZE] = "Regular expression too big",
842    [REG_ERPAREN] = "Unmatched ) or \\)",
843    [REG_ERANGEX ] = "Range striding over charsets",
844    [REG_ESIZEBR ] = "Invalid content of \\{\\}",
845   };
846 
847 /* For 'regs_allocated'.  */
848 enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
849 
850 /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
851    're_match_2' returns information about at least this many registers
852    the first time a 'regs' structure is passed.  */
853 enum { RE_NREGS = 30 };
854 
855 /* The searching and matching functions allocate memory for the
856    failure stack and registers.  Otherwise searching and matching
857    routines would have much smaller memory resources at their
858    disposal, and therefore might fail to handle complex regexps.  */
859 
860 /* Failure stack declarations and macros; both re_compile_fastmap and
861    re_match_2 use a failure stack.  These have to be macros because of
862    SAFE_ALLOCA.  */
863 
864 
865 /* Approximate number of failure points for which to initially allocate space
866    when matching.  If this number is exceeded, we allocate more
867    space, so it is not a hard limit.  */
868 #define INIT_FAILURE_ALLOC 20
869 
870 /* Roughly the maximum number of failure points on the stack.  Would be
871    exactly that if failure always used TYPICAL_FAILURE_SIZE items.
872    This is a variable only so users of regex can assign to it; we never
873    change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
874    before using it, so it should probably be a byte-count instead.  */
875 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
876    whose default stack limit is 2mb.  In order for a larger
877    value to work reliably, you have to try to make it accord
878    with the process stack limit.  */
879 ptrdiff_t emacs_re_max_failures = 40000;
880 
881 union fail_stack_elt
882 {
883   re_char *pointer;
884   intptr_t integer;
885 };
886 
887 typedef union fail_stack_elt fail_stack_elt_t;
888 
889 typedef struct
890 {
891   fail_stack_elt_t *stack;
892   ptrdiff_t size;
893   ptrdiff_t avail;	/* Offset of next open position.  */
894   ptrdiff_t frame;	/* Offset of the cur constructed frame.  */
895 } fail_stack_type;
896 
897 #define FAIL_STACK_EMPTY()     (fail_stack.frame == 0)
898 
899 
900 /* Define macros to initialize and free the failure stack.  */
901 
902 #define INIT_FAIL_STACK()						\
903   do {									\
904     fail_stack.stack =							\
905       SAFE_ALLOCA (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE		\
906 		   * sizeof (fail_stack_elt_t));			\
907     fail_stack.size = INIT_FAILURE_ALLOC;				\
908     fail_stack.avail = 0;						\
909     fail_stack.frame = 0;						\
910   } while (false)
911 
912 
913 /* Double the size of FAIL_STACK, up to a limit
914    which allows approximately 'emacs_re_max_failures' items.
915 
916    Return 1 if succeeds, and 0 if either ran out of memory
917    allocating space for it or it was already too large.
918 
919    REGEX_REALLOCATE requires 'destination' be declared.   */
920 
921 /* Factor to increase the failure stack size by.
922    This used to be 2, but 2 was too wasteful
923    because the old discarded stacks added up to as much space
924    were as ultimate, maximum-size stack.  */
925 #define FAIL_STACK_GROWTH_FACTOR 4
926 
927 #define GROW_FAIL_STACK(fail_stack)					\
928   (((fail_stack).size >= emacs_re_max_failures * TYPICAL_FAILURE_SIZE)        \
929    ? 0									\
930    : ((fail_stack).stack						\
931       = REGEX_REALLOCATE ((fail_stack).stack,				\
932 	  (fail_stack).avail * sizeof (fail_stack_elt_t),		\
933           min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE,                  \
934                ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR))          \
935           * sizeof (fail_stack_elt_t)),                                 \
936       ((fail_stack).size						\
937        = (min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE,		\
938 	       ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)))),	\
939       1))
940 
941 
942 /* Push a pointer value onto the failure stack.
943    Assumes the variable 'fail_stack'.  Probably should only
944    be called from within 'PUSH_FAILURE_POINT'.  */
945 #define PUSH_FAILURE_POINTER(item)					\
946   fail_stack.stack[fail_stack.avail++].pointer = (item)
947 
948 /* This pushes an integer-valued item onto the failure stack.
949    Assumes the variable 'fail_stack'.  Probably should only
950    be called from within 'PUSH_FAILURE_POINT'.  */
951 #define PUSH_FAILURE_INT(item)					\
952   fail_stack.stack[fail_stack.avail++].integer = (item)
953 
954 /* These POP... operations complement the PUSH... operations.
955    All assume that 'fail_stack' is nonempty.  */
956 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
957 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
958 
959 /* Individual items aside from the registers.  */
960 #define NUM_NONREG_ITEMS 3
961 
962 /* Used to examine the stack (to detect infinite loops).  */
963 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
964 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
965 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
966 #define TOP_FAILURE_HANDLE() fail_stack.frame
967 
968 
969 #define ENSURE_FAIL_STACK(space)					\
970 while (REMAINING_AVAIL_SLOTS <= space) {				\
971   if (!GROW_FAIL_STACK (fail_stack))					\
972     {									\
973       unbind_to (count, Qnil);						\
974       SAFE_FREE ();							\
975       return -2;							\
976     }									\
977   DEBUG_PRINT ("\n  Doubled stack; size now: %td\n", fail_stack.size);	\
978   DEBUG_PRINT ("	 slots available: %td\n", REMAINING_AVAIL_SLOTS);\
979 }
980 
981 /* Push register NUM onto the stack.  */
982 #define PUSH_FAILURE_REG(num)						\
983 do {									\
984   char *destination;							\
985   intptr_t n = num;							\
986   eassert (0 < n && n < num_regs);					\
987   eassert (REG_UNSET (regstart[n]) <= REG_UNSET (regend[n]));		\
988   ENSURE_FAIL_STACK(3);							\
989   DEBUG_PRINT ("    Push reg %"PRIdPTR" (spanning %p -> %p)\n",		\
990 	       n, regstart[n], regend[n]);				\
991   PUSH_FAILURE_POINTER (regstart[n]);					\
992   PUSH_FAILURE_POINTER (regend[n]);					\
993   PUSH_FAILURE_INT (n);							\
994 } while (false)
995 
996 /* Change the counter's value to VAL, but make sure that it will
997    be reset when backtracking.  */
998 #define PUSH_NUMBER(ptr,val)						\
999 do {									\
1000   char *destination;							\
1001   int c;								\
1002   ENSURE_FAIL_STACK(3);							\
1003   EXTRACT_NUMBER (c, ptr);						\
1004   DEBUG_PRINT ("    Push number %p = %d -> %d\n", ptr, c, val);		\
1005   PUSH_FAILURE_INT (c);							\
1006   PUSH_FAILURE_POINTER (ptr);						\
1007   PUSH_FAILURE_INT (-1);						\
1008   STORE_NUMBER (ptr, val);						\
1009 } while (false)
1010 
1011 /* Pop a saved register off the stack.  */
1012 #define POP_FAILURE_REG_OR_COUNT()					\
1013 do {									\
1014   intptr_t pfreg = POP_FAILURE_INT ();					\
1015   if (pfreg == -1)							\
1016     {									\
1017       /* It's a counter.  */						\
1018       /* Discard 'const', making re_search non-reentrant.  */		\
1019       unsigned char *ptr = (unsigned char *) POP_FAILURE_POINTER ();	\
1020       pfreg = POP_FAILURE_INT ();					\
1021       STORE_NUMBER (ptr, pfreg);					\
1022       DEBUG_PRINT ("     Pop counter %p = %"PRIdPTR"\n", ptr, pfreg);	\
1023     }									\
1024   else									\
1025     {									\
1026       eassert (0 < pfreg && pfreg < num_regs);				\
1027       regend[pfreg] = POP_FAILURE_POINTER ();				\
1028       regstart[pfreg] = POP_FAILURE_POINTER ();				\
1029       eassert (REG_UNSET (regstart[pfreg]) <= REG_UNSET (regend[pfreg])); \
1030       DEBUG_PRINT ("     Pop reg %ld (spanning %p -> %p)\n",		\
1031 		   pfreg, regstart[pfreg], regend[pfreg]);		\
1032     }									\
1033 } while (false)
1034 
1035 /* Check that we are not stuck in an infinite loop.  */
1036 #define CHECK_INFINITE_LOOP(pat_cur, string_place)			\
1037 do {									\
1038   ptrdiff_t failure = TOP_FAILURE_HANDLE ();				\
1039   /* Check for infinite matching loops */				\
1040   while (failure > 0							\
1041 	 && (FAILURE_STR (failure) == string_place			\
1042 	     || FAILURE_STR (failure) == NULL))				\
1043     {									\
1044       eassert (FAILURE_PAT (failure) >= bufp->buffer			\
1045 	       && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);	\
1046       if (FAILURE_PAT (failure) == pat_cur)				\
1047 	{								\
1048 	  cycle = true;							\
1049 	  break;							\
1050 	}								\
1051       DEBUG_PRINT ("  Other pattern: %p\n", FAILURE_PAT (failure));	\
1052       failure = NEXT_FAILURE_HANDLE(failure);				\
1053     }									\
1054   DEBUG_PRINT ("  Other string: %p\n", FAILURE_STR (failure));		\
1055 } while (false)
1056 
1057 /* Push the information about the state we will need
1058    if we ever fail back to it.
1059 
1060    Requires variables fail_stack, regstart, regend and
1061    num_regs be declared.  GROW_FAIL_STACK requires 'destination' be
1062    declared.
1063 
1064    Does 'return FAILURE_CODE' if runs out of memory.  */
1065 
1066 #define PUSH_FAILURE_POINT(pattern, string_place)			\
1067 do {									\
1068   char *destination;							\
1069   DEBUG_STATEMENT (nfailure_points_pushed++);				\
1070   DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");				\
1071   DEBUG_PRINT ("  Before push, next avail: %td\n", fail_stack.avail);	\
1072   DEBUG_PRINT ("			size: %td\n", fail_stack.size);	\
1073 									\
1074   ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);					\
1075 									\
1076   DEBUG_PRINT ("\n");							\
1077 									\
1078   DEBUG_PRINT ("  Push frame index: %td\n", fail_stack.frame);		\
1079   PUSH_FAILURE_INT (fail_stack.frame);					\
1080 									\
1081   DEBUG_PRINT ("  Push string %p: \"", string_place);			\
1082   DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1083   DEBUG_PRINT ("\"\n");							\
1084   PUSH_FAILURE_POINTER (string_place);					\
1085 									\
1086   DEBUG_PRINT ("  Push pattern %p: ", pattern);				\
1087   DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);			\
1088   PUSH_FAILURE_POINTER (pattern);					\
1089 									\
1090   /* Close the frame by moving the frame pointer past it.  */		\
1091   fail_stack.frame = fail_stack.avail;					\
1092 } while (false)
1093 
1094 /* Estimate the size of data pushed by a typical failure stack entry.
1095    An estimate is all we need, because all we use this for
1096    is to choose a limit for how big to make the failure stack.  */
1097 /* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
1098 #define TYPICAL_FAILURE_SIZE 20
1099 
1100 /* How many items can still be added to the stack without overflowing it.  */
1101 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1102 
1103 
1104 /* Pop what PUSH_FAIL_STACK pushes.
1105 
1106    Restore into the parameters, all of which should be lvalues:
1107      STR -- the saved data position.
1108      PAT -- the saved pattern position.
1109      REGSTART, REGEND -- arrays of string positions.
1110 
1111    Also assume the variables FAIL_STACK and (if debugging) BUFP, PEND,
1112    STRING1, SIZE1, STRING2, and SIZE2.  */
1113 
1114 #define POP_FAILURE_POINT(str, pat)                                     \
1115 do {									\
1116   eassert (!FAIL_STACK_EMPTY ());					\
1117 									\
1118   /* Remove failure points and point to how many regs pushed.  */	\
1119   DEBUG_PRINT ("POP_FAILURE_POINT:\n");					\
1120   DEBUG_PRINT ("  Before pop, next avail: %td\n", fail_stack.avail);	\
1121   DEBUG_PRINT ("		     size: %td\n", fail_stack.size);	\
1122 									\
1123   /* Pop the saved registers.  */					\
1124   while (fail_stack.frame < fail_stack.avail)				\
1125     POP_FAILURE_REG_OR_COUNT ();					\
1126 									\
1127   pat = POP_FAILURE_POINTER ();						\
1128   DEBUG_PRINT ("  Popping pattern %p: ", pat);				\
1129   DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
1130 									\
1131   /* If the saved string location is NULL, it came from an		\
1132      on_failure_keep_string_jump opcode, and we want to throw away the	\
1133      saved NULL, thus retaining our current position in the string.  */	\
1134   str = POP_FAILURE_POINTER ();						\
1135   DEBUG_PRINT ("  Popping string %p: \"", str);				\
1136   DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
1137   DEBUG_PRINT ("\"\n");							\
1138 									\
1139   fail_stack.frame = POP_FAILURE_INT ();				\
1140   DEBUG_PRINT ("  Popping  frame index: %td\n", fail_stack.frame);	\
1141 									\
1142   eassert (fail_stack.avail >= 0);					\
1143   eassert (fail_stack.frame <= fail_stack.avail);			\
1144 									\
1145   DEBUG_STATEMENT (nfailure_points_popped++);				\
1146 } while (false) /* POP_FAILURE_POINT */
1147 
1148 
1149 
1150 /* Registers are set to a sentinel when they haven't yet matched.  */
1151 #define REG_UNSET(e) ((e) == NULL)
1152 
1153 /* Subroutine declarations and macros for regex_compile.  */
1154 
1155 static reg_errcode_t regex_compile (re_char *pattern, ptrdiff_t size,
1156 				    bool posix_backtracking,
1157 				    const char *whitespace_regexp,
1158 				    struct re_pattern_buffer *bufp);
1159 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1160 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1161 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1162 			int arg, unsigned char *end);
1163 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1164 			int arg1, int arg2, unsigned char *end);
1165 static bool at_begline_loc_p (re_char *pattern, re_char *p);
1166 static bool at_endline_loc_p (re_char *p, re_char *pend);
1167 static re_char *skip_one_char (re_char *p);
1168 static int analyze_first (re_char *p, re_char *pend,
1169 			  char *fastmap, bool multibyte);
1170 
1171 /* Fetch the next character in the uncompiled pattern, with no
1172    translation.  */
1173 #define PATFETCH(c)							\
1174   do {									\
1175     int len;								\
1176     if (p == pend) return REG_EEND;					\
1177     c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);			\
1178     p += len;								\
1179   } while (false)
1180 
1181 
1182 #define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
1183 #define TRANSLATE(d) (!NILP (translate) ? RE_TRANSLATE (translate, d) : (d))
1184 
1185 /* Macros for outputting the compiled pattern into 'buffer'.  */
1186 
1187 /* If the buffer isn't allocated when it comes in, use this.  */
1188 #define INIT_BUF_SIZE  32
1189 
1190 /* Ensure at least N more bytes of space in buffer.  */
1191 #define GET_BUFFER_SPACE(n)						\
1192     if (bufp->buffer + bufp->allocated - b < (n))			\
1193       EXTEND_BUFFER ((n) - (bufp->buffer + bufp->allocated - b))
1194 
1195 /* Ensure one more byte of buffer space and then add C to it.  */
1196 #define BUF_PUSH(c)							\
1197   do {									\
1198     GET_BUFFER_SPACE (1);						\
1199     *b++ = (unsigned char) (c);						\
1200   } while (false)
1201 
1202 
1203 /* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
1204 #define BUF_PUSH_2(c1, c2)						\
1205   do {									\
1206     GET_BUFFER_SPACE (2);						\
1207     *b++ = (unsigned char) (c1);					\
1208     *b++ = (unsigned char) (c2);					\
1209   } while (false)
1210 
1211 
1212 /* Store a jump with opcode OP at LOC to location TO.  Store a
1213    relative address offset by the three bytes the jump itself occupies.  */
1214 #define STORE_JUMP(op, loc, to) \
1215   store_op1 (op, loc, (to) - (loc) - 3)
1216 
1217 /* Likewise, for a two-argument jump.  */
1218 #define STORE_JUMP2(op, loc, to, arg) \
1219   store_op2 (op, loc, (to) - (loc) - 3, arg)
1220 
1221 /* Like 'STORE_JUMP', but for inserting.  Assume B is the buffer end.  */
1222 #define INSERT_JUMP(op, loc, to) \
1223   insert_op1 (op, loc, (to) - (loc) - 3, b)
1224 
1225 /* Like 'STORE_JUMP2', but for inserting.  Assume B is the buffer end.  */
1226 #define INSERT_JUMP2(op, loc, to, arg) \
1227   insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1228 
1229 
1230 /* This is not an arbitrary limit: the arguments which represent offsets
1231    into the pattern are two bytes long.  So if 2^15 bytes turns out to
1232    be too small, many things would have to change.  */
1233 # define MAX_BUF_SIZE (1 << 15)
1234 
1235 /* Extend the buffer by at least N bytes via realloc and
1236    reset the pointers that pointed into the old block to point to the
1237    correct places in the new one.  If extending the buffer results in it
1238    being larger than MAX_BUF_SIZE, then flag memory exhausted.  */
1239 #define EXTEND_BUFFER(n)						\
1240   do {									\
1241     ptrdiff_t requested_extension = n;					\
1242     unsigned char *old_buffer = bufp->buffer;				\
1243     if (MAX_BUF_SIZE - bufp->allocated < requested_extension)		\
1244       return REG_ESIZE;							\
1245     ptrdiff_t b_off = b - old_buffer;					\
1246     ptrdiff_t begalt_off = begalt - old_buffer;				\
1247     bool fixup_alt_jump_set = !!fixup_alt_jump;				\
1248     bool laststart_set = !!laststart;					\
1249     bool pending_exact_set = !!pending_exact;				\
1250     ptrdiff_t fixup_alt_jump_off, laststart_off, pending_exact_off;	\
1251     if (fixup_alt_jump_set) fixup_alt_jump_off = fixup_alt_jump - old_buffer; \
1252     if (laststart_set) laststart_off = laststart - old_buffer;		\
1253     if (pending_exact_set) pending_exact_off = pending_exact - old_buffer; \
1254     bufp->buffer = xpalloc (bufp->buffer, &bufp->allocated,		\
1255 			    requested_extension, MAX_BUF_SIZE, 1);	\
1256     unsigned char *new_buffer = bufp->buffer;				\
1257     b = new_buffer + b_off;						\
1258     begalt = new_buffer + begalt_off;					\
1259     if (fixup_alt_jump_set) fixup_alt_jump = new_buffer + fixup_alt_jump_off; \
1260     if (laststart_set) laststart = new_buffer + laststart_off;		\
1261     if (pending_exact_set) pending_exact = new_buffer + pending_exact_off; \
1262   } while (false)
1263 
1264 
1265 /* Since we have one byte reserved for the register number argument to
1266    {start,stop}_memory, the maximum number of groups we can report
1267    things about is what fits in that byte.  */
1268 #define MAX_REGNUM 255
1269 
1270 /* But patterns can have more than 'MAX_REGNUM' registers.  Just
1271    ignore the excess.  */
1272 typedef int regnum_t;
1273 
1274 
1275 /* Macros for the compile stack.  */
1276 
1277 typedef long pattern_offset_t;
1278 verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
1279 
1280 typedef struct
1281 {
1282   pattern_offset_t begalt_offset;
1283   pattern_offset_t fixup_alt_jump;
1284   pattern_offset_t laststart_offset;
1285   regnum_t regnum;
1286 } compile_stack_elt_t;
1287 
1288 
1289 typedef struct
1290 {
1291   compile_stack_elt_t *stack;
1292   ptrdiff_t size;
1293   ptrdiff_t avail;		/* Offset of next open position.  */
1294 } compile_stack_type;
1295 
1296 
1297 #define INIT_COMPILE_STACK_SIZE 32
1298 
1299 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
1300 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
1301 
1302 /* The next available element.  */
1303 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1304 
1305 /* Structure to manage work area for range table.  */
1306 struct range_table_work_area
1307 {
1308   int *table;			/* actual work area.  */
1309   int allocated;		/* allocated size for work area in bytes.  */
1310   int used;			/* actually used size in words.  */
1311   int bits;			/* flag to record character classes */
1312 };
1313 
1314 /* Make sure that WORK_AREA can hold N more multibyte characters.
1315    If it can't get the space, it returns from the surrounding function.  */
1316 
1317 #define EXTEND_RANGE_TABLE(work_area, n)				\
1318   do {									\
1319     if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1320       {									\
1321         extend_range_table_work_area (&work_area);			\
1322         if ((work_area).table == 0)					\
1323           return (REG_ESPACE);						\
1324       }									\
1325   } while (false)
1326 
1327 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)		\
1328   (work_area).bits |= (bit)
1329 
1330 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
1331 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
1332   do {									\
1333     EXTEND_RANGE_TABLE ((work_area), 2);				\
1334     (work_area).table[(work_area).used++] = (range_start);		\
1335     (work_area).table[(work_area).used++] = (range_end);		\
1336   } while (false)
1337 
1338 /* Free allocated memory for WORK_AREA.  */
1339 #define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
1340   do {						\
1341     if ((work_area).table)			\
1342       xfree ((work_area).table);			\
1343   } while (false)
1344 
1345 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
1346   ((work_area).used = 0, (work_area).bits = 0)
1347 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1348 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1349 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1350 
1351 /* Bits used to implement the multibyte-part of the various character classes
1352    such as [:alnum:] in a charset's range table.  The code currently assumes
1353    that only the low 16 bits are used.  */
1354 #define BIT_WORD	0x1
1355 #define BIT_LOWER	0x2
1356 #define BIT_PUNCT	0x4
1357 #define BIT_SPACE	0x8
1358 #define BIT_UPPER	0x10
1359 #define BIT_MULTIBYTE	0x20
1360 #define BIT_ALPHA	0x40
1361 #define BIT_ALNUM	0x80
1362 #define BIT_GRAPH	0x100
1363 #define BIT_PRINT	0x200
1364 #define BIT_BLANK       0x400
1365 
1366 
1367 /* Set the bit for character C in a list.  */
1368 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1369 
1370 
1371 /* Store characters in the range FROM to TO in the bitmap at B (for
1372    ASCII and unibyte characters) and WORK_AREA (for multibyte
1373    characters) while translating them and paying attention to the
1374    continuity of translated characters.
1375 
1376    Implementation note: It is better to implement these fairly big
1377    macros by a function, but it's not that easy because macros called
1378    in this macro assume various local variables already declared.  */
1379 
1380 /* Both FROM and TO are ASCII characters.  */
1381 
1382 #define SETUP_ASCII_RANGE(work_area, FROM, TO)			\
1383   do {								\
1384     int C0, C1;							\
1385 								\
1386     for (C0 = (FROM); C0 <= (TO); C0++)				\
1387       {								\
1388 	C1 = TRANSLATE (C0);					\
1389 	if (! ASCII_CHAR_P (C1))				\
1390 	  {							\
1391 	    SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);	\
1392 	    if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)		\
1393 	      C1 = C0;						\
1394 	  }							\
1395 	SET_LIST_BIT (C1);					\
1396       }								\
1397   } while (false)
1398 
1399 
1400 /* Both FROM and TO are unibyte characters (0x80..0xFF).  */
1401 
1402 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)			       \
1403   do {									       \
1404     int C0, C1, C2, I;							       \
1405     int USED = RANGE_TABLE_WORK_USED (work_area);			       \
1406 									       \
1407     for (C0 = (FROM); C0 <= (TO); C0++)					       \
1408       {									       \
1409 	C1 = RE_CHAR_TO_MULTIBYTE (C0);					       \
1410 	if (CHAR_BYTE8_P (C1))						       \
1411 	  SET_LIST_BIT (C0);						       \
1412 	else								       \
1413 	  {								       \
1414 	    C2 = TRANSLATE (C1);					       \
1415 	    if (C2 == C1						       \
1416 		|| (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)			       \
1417 	      C1 = C0;							       \
1418 	    SET_LIST_BIT (C1);						       \
1419 	    for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1420 	      {								       \
1421 		int from = RANGE_TABLE_WORK_ELT (work_area, I);		       \
1422 		int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);	       \
1423 									       \
1424 		if (C2 >= from - 1 && C2 <= to + 1)			       \
1425 		  {							       \
1426 		    if (C2 == from - 1)					       \
1427 		      RANGE_TABLE_WORK_ELT (work_area, I)--;		       \
1428 		    else if (C2 == to + 1)				       \
1429 		      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;	       \
1430 		    break;						       \
1431 		  }							       \
1432 	      }								       \
1433 	    if (I < USED)						       \
1434 	      SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);		       \
1435 	  }								       \
1436       }									       \
1437   } while (false)
1438 
1439 
1440 /* Both FROM and TO are multibyte characters.  */
1441 
1442 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)			   \
1443   do {									   \
1444     int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);	   \
1445 									   \
1446     SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));		   \
1447     for (C0 = (FROM); C0 <= (TO); C0++)					   \
1448       {									   \
1449 	C1 = TRANSLATE (C0);						   \
1450 	if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0				   \
1451 	    || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))	   \
1452 	  SET_LIST_BIT (C2);						   \
1453 	if (C1 >= (FROM) && C1 <= (TO))					   \
1454 	  continue;							   \
1455 	for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1456 	  {								   \
1457 	    int from = RANGE_TABLE_WORK_ELT (work_area, I);		   \
1458 	    int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);		   \
1459 									   \
1460 	    if (C1 >= from - 1 && C1 <= to + 1)				   \
1461 	      {								   \
1462 		if (C1 == from - 1)					   \
1463 		  RANGE_TABLE_WORK_ELT (work_area, I)--;		   \
1464 		else if (C1 == to + 1)					   \
1465 		  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;		   \
1466 		break;							   \
1467 	      }								   \
1468 	  }								   \
1469 	if (I < USED)							   \
1470 	  SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);		   \
1471       }									   \
1472   } while (false)
1473 
1474 /* Get the next unsigned number in the uncompiled pattern.  */
1475 #define GET_INTERVAL_COUNT(num)					\
1476   do {									\
1477     if (p == pend)							\
1478       FREE_STACK_RETURN (REG_EBRACE);					\
1479     else								\
1480       {									\
1481 	PATFETCH (c);							\
1482 	while ('0' <= c && c <= '9')					\
1483 	  {								\
1484 	    if (num < 0)						\
1485 	      num = 0;							\
1486 	    if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num)	\
1487 	      FREE_STACK_RETURN (REG_ESIZEBR);				\
1488 	    num = num * 10 + c - '0';					\
1489 	    if (p == pend)						\
1490 	      FREE_STACK_RETURN (REG_EBRACE);				\
1491 	    PATFETCH (c);						\
1492 	  }								\
1493       }									\
1494   } while (false)
1495 
1496 /* Parse a character class, i.e. string such as "[:name:]".  *strp
1497    points to the string to be parsed and limit is length, in bytes, of
1498    that string.
1499 
1500    If *strp point to a string that begins with "[:name:]", where name is
1501    a non-empty sequence of lower case letters, *strp will be advanced past the
1502    closing square bracket and RECC_* constant which maps to the name will be
1503    returned.  If name is not a valid character class name zero, or RECC_ERROR,
1504    is returned.
1505 
1506    Otherwise, if *strp doesn't begin with "[:name:]", -1 is returned.
1507 
1508    The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
1509  */
1510 re_wctype_t
re_wctype_parse(const unsigned char ** strp,ptrdiff_t limit)1511 re_wctype_parse (const unsigned char **strp, ptrdiff_t limit)
1512 {
1513   const char *beg = (const char *)*strp, *it;
1514 
1515   if (limit < 4 || beg[0] != '[' || beg[1] != ':')
1516     return -1;
1517 
1518   beg += 2;  /* skip opening "[:" */
1519   limit -= 3;  /* opening "[:" and half of closing ":]"; --limit handles rest */
1520   for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
1521     if (!--limit)
1522       return -1;
1523 
1524   *strp = (const unsigned char *)(it + 2);
1525 
1526   /* Sort tests in the length=five case by frequency the classes to minimize
1527      number of times we fail the comparison.  The frequencies of character class
1528      names used in Emacs sources as of 2016-07-27:
1529 
1530      $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
1531            sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
1532          213 [:alnum:]
1533          104 [:alpha:]
1534           62 [:space:]
1535           39 [:digit:]
1536           36 [:blank:]
1537           26 [:word:]
1538           26 [:upper:]
1539           21 [:lower:]
1540           10 [:xdigit:]
1541           10 [:punct:]
1542           10 [:ascii:]
1543            4 [:nonascii:]
1544            4 [:graph:]
1545            2 [:print:]
1546            2 [:cntrl:]
1547            1 [:ff:]
1548 
1549      If you update this list, consider also updating chain of or'ed conditions
1550      in execute_charset function.
1551    */
1552 
1553   switch (it - beg) {
1554   case 4:
1555     if (!memcmp (beg, "word", 4))      return RECC_WORD;
1556     break;
1557   case 5:
1558     if (!memcmp (beg, "alnum", 5))     return RECC_ALNUM;
1559     if (!memcmp (beg, "alpha", 5))     return RECC_ALPHA;
1560     if (!memcmp (beg, "space", 5))     return RECC_SPACE;
1561     if (!memcmp (beg, "digit", 5))     return RECC_DIGIT;
1562     if (!memcmp (beg, "blank", 5))     return RECC_BLANK;
1563     if (!memcmp (beg, "upper", 5))     return RECC_UPPER;
1564     if (!memcmp (beg, "lower", 5))     return RECC_LOWER;
1565     if (!memcmp (beg, "punct", 5))     return RECC_PUNCT;
1566     if (!memcmp (beg, "ascii", 5))     return RECC_ASCII;
1567     if (!memcmp (beg, "graph", 5))     return RECC_GRAPH;
1568     if (!memcmp (beg, "print", 5))     return RECC_PRINT;
1569     if (!memcmp (beg, "cntrl", 5))     return RECC_CNTRL;
1570     break;
1571   case 6:
1572     if (!memcmp (beg, "xdigit", 6))    return RECC_XDIGIT;
1573     break;
1574   case 7:
1575     if (!memcmp (beg, "unibyte", 7))   return RECC_UNIBYTE;
1576     break;
1577   case 8:
1578     if (!memcmp (beg, "nonascii", 8))  return RECC_NONASCII;
1579     break;
1580   case 9:
1581     if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
1582     break;
1583   }
1584 
1585   return RECC_ERROR;
1586 }
1587 
1588 /* True if CH is in the char class CC.  */
1589 bool
re_iswctype(int ch,re_wctype_t cc)1590 re_iswctype (int ch, re_wctype_t cc)
1591 {
1592   switch (cc)
1593     {
1594     case RECC_ALNUM: return ISALNUM (ch) != 0;
1595     case RECC_ALPHA: return ISALPHA (ch) != 0;
1596     case RECC_BLANK: return ISBLANK (ch) != 0;
1597     case RECC_CNTRL: return ISCNTRL (ch) != 0;
1598     case RECC_DIGIT: return ISDIGIT (ch) != 0;
1599     case RECC_GRAPH: return ISGRAPH (ch) != 0;
1600     case RECC_LOWER: return ISLOWER (ch) != 0;
1601     case RECC_PRINT: return ISPRINT (ch) != 0;
1602     case RECC_PUNCT: return ISPUNCT (ch) != 0;
1603     case RECC_SPACE: return ISSPACE (ch) != 0;
1604     case RECC_UPPER: return ISUPPER (ch) != 0;
1605     case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
1606     case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
1607     case RECC_NONASCII: return !IS_REAL_ASCII (ch);
1608     case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
1609     case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
1610     case RECC_WORD: return ISWORD (ch) != 0;
1611     case RECC_ERROR: return false;
1612     default:
1613       abort ();
1614     }
1615 }
1616 
1617 /* Return a bit-pattern to use in the range-table bits to match multibyte
1618    chars of class CC.  */
1619 static int
re_wctype_to_bit(re_wctype_t cc)1620 re_wctype_to_bit (re_wctype_t cc)
1621 {
1622   switch (cc)
1623     {
1624     case RECC_NONASCII:
1625     case RECC_MULTIBYTE: return BIT_MULTIBYTE;
1626     case RECC_ALPHA: return BIT_ALPHA;
1627     case RECC_ALNUM: return BIT_ALNUM;
1628     case RECC_WORD: return BIT_WORD;
1629     case RECC_LOWER: return BIT_LOWER;
1630     case RECC_UPPER: return BIT_UPPER;
1631     case RECC_PUNCT: return BIT_PUNCT;
1632     case RECC_SPACE: return BIT_SPACE;
1633     case RECC_GRAPH: return BIT_GRAPH;
1634     case RECC_PRINT: return BIT_PRINT;
1635     case RECC_BLANK: return BIT_BLANK;
1636     case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
1637     case RECC_UNIBYTE: case RECC_ERROR: return 0;
1638     default:
1639       abort ();
1640     }
1641 }
1642 
1643 /* Filling in the work area of a range.  */
1644 
1645 /* Actually extend the space in WORK_AREA.  */
1646 
1647 static void
extend_range_table_work_area(struct range_table_work_area * work_area)1648 extend_range_table_work_area (struct range_table_work_area *work_area)
1649 {
1650   work_area->allocated += 16 * sizeof (int);
1651   work_area->table = xrealloc (work_area->table, work_area->allocated);
1652 }
1653 
1654 /* regex_compile and helpers.  */
1655 
1656 static bool group_in_compile_stack (compile_stack_type, regnum_t);
1657 
1658 /* Insert the 'jump' from the end of last alternative to "here".
1659    The space for the jump has already been allocated. */
1660 #define FIXUP_ALT_JUMP()						\
1661 do {									\
1662   if (fixup_alt_jump)							\
1663     STORE_JUMP (jump, fixup_alt_jump, b);				\
1664 } while (false)
1665 
1666 
1667 /* Return, freeing storage we allocated.  */
1668 #define FREE_STACK_RETURN(value)		\
1669   do {							\
1670     FREE_RANGE_TABLE_WORK_AREA (range_table_work);	\
1671     xfree (compile_stack.stack);			\
1672     return value;					\
1673   } while (false)
1674 
1675 /* Compile PATTERN (of length SIZE) according to SYNTAX.
1676    Return a nonzero error code on failure, or zero for success.
1677 
1678    If WHITESPACE_REGEXP is given, use it instead of a space
1679    character in PATTERN.
1680 
1681    Assume the 'allocated' (and perhaps 'buffer') and 'translate'
1682    fields are set in BUFP on entry.
1683 
1684    If successful, put results in *BUFP (otherwise the
1685    contents of *BUFP are undefined):
1686      'buffer' is the compiled pattern;
1687      'syntax' is set to SYNTAX;
1688      'used' is set to the length of the compiled pattern;
1689      'fastmap_accurate' is false;
1690      're_nsub' is the number of subexpressions in PATTERN;
1691 
1692    The 'fastmap' field is neither examined nor set.  */
1693 
1694 static reg_errcode_t
regex_compile(re_char * pattern,ptrdiff_t size,bool posix_backtracking,const char * whitespace_regexp,struct re_pattern_buffer * bufp)1695 regex_compile (re_char *pattern, ptrdiff_t size,
1696 	       bool posix_backtracking,
1697 	       const char *whitespace_regexp,
1698 	       struct re_pattern_buffer *bufp)
1699 {
1700   /* Fetch characters from PATTERN here.  */
1701   int c, c1;
1702 
1703   /* Points to the end of the buffer, where we should append.  */
1704   unsigned char *b;
1705 
1706   /* Keeps track of unclosed groups.  */
1707   compile_stack_type compile_stack;
1708 
1709   /* Points to the current (ending) position in the pattern.  */
1710   re_char *p = pattern;
1711   re_char *pend = pattern + size;
1712 
1713   /* How to translate the characters in the pattern.  */
1714   Lisp_Object translate = bufp->translate;
1715 
1716   /* Address of the count-byte of the most recently inserted 'exactn'
1717      command.  This makes it possible to tell if a new exact-match
1718      character can be added to that command or if the character requires
1719      a new 'exactn' command.  */
1720   unsigned char *pending_exact = 0;
1721 
1722   /* Address of start of the most recently finished expression.
1723      This tells, e.g., postfix * where to find the start of its
1724      operand.  Reset at the beginning of groups and alternatives.  */
1725   unsigned char *laststart = 0;
1726 
1727   /* Address of beginning of regexp, or inside of last group.  */
1728   unsigned char *begalt;
1729 
1730   /* Place in the uncompiled pattern (i.e., the {) to
1731      which to go back if the interval is invalid.  */
1732   re_char *beg_interval;
1733 
1734   /* Address of the place where a forward jump should go to the end of
1735      the containing expression.  Each alternative of an 'or' -- except the
1736      last -- ends with a forward jump of this sort.  */
1737   unsigned char *fixup_alt_jump = 0;
1738 
1739   /* Work area for range table of charset.  */
1740   struct range_table_work_area range_table_work;
1741 
1742   /* If the regular expression is multibyte.  */
1743   bool multibyte = RE_MULTIBYTE_P (bufp);
1744 
1745   /* Nonzero if we have pushed down into a subpattern.  */
1746   int in_subpattern = 0;
1747 
1748   /* These hold the values of p, pattern, and pend from the main
1749      pattern when we have pushed into a subpattern.  */
1750   re_char *main_p;
1751   re_char *main_pattern;
1752   re_char *main_pend;
1753 
1754 #ifdef REGEX_EMACS_DEBUG
1755   regex_emacs_debug++;
1756   DEBUG_PRINT ("\nCompiling pattern: ");
1757   if (regex_emacs_debug > 0)
1758     {
1759       for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
1760 	debug_putchar (pattern[debug_count]);
1761       putc ('\n', stderr);
1762     }
1763 #endif
1764 
1765   /* Initialize the compile stack.  */
1766   compile_stack.stack = xmalloc (INIT_COMPILE_STACK_SIZE
1767 				 * sizeof *compile_stack.stack);
1768   __lsan_ignore_object (compile_stack.stack);
1769   compile_stack.size = INIT_COMPILE_STACK_SIZE;
1770   compile_stack.avail = 0;
1771 
1772   range_table_work.table = 0;
1773   range_table_work.allocated = 0;
1774 
1775   /* Initialize the pattern buffer.  */
1776   bufp->fastmap_accurate = false;
1777   bufp->used_syntax = false;
1778 
1779   /* Set 'used' to zero, so that if we return an error, the pattern
1780      printer (for debugging) will think there's no pattern.  We reset it
1781      at the end.  */
1782   bufp->used = 0;
1783 
1784   bufp->re_nsub = 0;
1785 
1786   if (bufp->allocated == 0)
1787     {
1788       /* This loses if BUFP->buffer is bogus, but that is the user's
1789 	 responsibility.  */
1790       bufp->buffer = xrealloc (bufp->buffer, INIT_BUF_SIZE);
1791       bufp->allocated = INIT_BUF_SIZE;
1792     }
1793 
1794   begalt = b = bufp->buffer;
1795 
1796   /* Loop through the uncompiled pattern until we're at the end.  */
1797   while (1)
1798     {
1799       if (p == pend)
1800 	{
1801 	  /* If this is the end of an included regexp,
1802 	     pop back to the main regexp and try again.  */
1803 	  if (in_subpattern)
1804 	    {
1805 	      in_subpattern = 0;
1806 	      pattern = main_pattern;
1807 	      p = main_p;
1808 	      pend = main_pend;
1809 	      continue;
1810 	    }
1811 	  /* If this is the end of the main regexp, we are done.  */
1812 	  break;
1813 	}
1814 
1815       PATFETCH (c);
1816 
1817       switch (c)
1818 	{
1819 	case ' ':
1820 	  {
1821 	    re_char *p1 = p;
1822 
1823 	    /* If there's no special whitespace regexp, treat
1824 	       spaces normally.  And don't try to do this recursively.  */
1825 	    if (!whitespace_regexp || in_subpattern)
1826 	      goto normal_char;
1827 
1828 	    /* Peek past following spaces.  */
1829 	    while (p1 != pend)
1830 	      {
1831 		if (*p1 != ' ')
1832 		  break;
1833 		p1++;
1834 	      }
1835 	    /* If the spaces are followed by a repetition op,
1836 	       treat them normally.  */
1837 	    if (p1 != pend
1838 		&& (*p1 == '*' || *p1 == '+' || *p1 == '?'
1839 		    || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
1840 	      goto normal_char;
1841 
1842 	    /* Replace the spaces with the whitespace regexp.  */
1843 	    in_subpattern = 1;
1844 	    main_p = p1;
1845 	    main_pend = pend;
1846 	    main_pattern = pattern;
1847 	    p = pattern = (re_char *) whitespace_regexp;
1848 	    pend = p + strlen (whitespace_regexp);
1849 	    break;
1850 	  }
1851 
1852 	case '^':
1853 	  if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
1854 	    goto normal_char;
1855 	  BUF_PUSH (begline);
1856 	  break;
1857 
1858 	case '$':
1859 	  if (! (p == pend || at_endline_loc_p (p, pend)))
1860 	    goto normal_char;
1861 	  BUF_PUSH (endline);
1862 	  break;
1863 
1864 
1865 	case '+':
1866 	case '?':
1867 	case '*':
1868 	  /* If there is no previous pattern...  */
1869 	  if (!laststart)
1870 	    goto normal_char;
1871 
1872 	  {
1873 	    /* 1 means zero (many) matches is allowed.  */
1874 	    bool zero_times_ok = false, many_times_ok = false;
1875 	    bool greedy = true;
1876 
1877 	    /* If there is a sequence of repetition chars, collapse it
1878 	       down to just one (the right one).  We can't combine
1879 	       interval operators with these because of, e.g., 'a{2}*',
1880 	       which should only match an even number of 'a's.  */
1881 
1882 	    for (;;)
1883 	      {
1884 		if (c == '?' && (zero_times_ok || many_times_ok))
1885 		  greedy = false;
1886 		else
1887 		  {
1888 		    zero_times_ok |= c != '+';
1889 		    many_times_ok |= c != '?';
1890 		  }
1891 
1892 		if (! (p < pend && (*p == '*' || *p == '+' || *p == '?')))
1893 		  break;
1894 		/* If we get here, we found another repeat character.  */
1895 		c = *p++;
1896 	       }
1897 
1898 	    /* Star, etc. applied to an empty pattern is equivalent
1899 	       to an empty pattern.  */
1900 	    if (!laststart || laststart == b)
1901 	      break;
1902 
1903 	    /* Now we know whether or not zero matches is allowed
1904 	       and also whether or not two or more matches is allowed.  */
1905 	    if (greedy)
1906 	      {
1907 		if (many_times_ok)
1908 		  {
1909 		    bool simple = skip_one_char (laststart) == b;
1910 		    ptrdiff_t startoffset = 0;
1911 		    re_opcode_t ofj =
1912 		      /* Check if the loop can match the empty string.  */
1913 		      (simple || !analyze_first (laststart, b, NULL, false))
1914 		      ? on_failure_jump : on_failure_jump_loop;
1915 		    eassert (skip_one_char (laststart) <= b);
1916 
1917 		    if (!zero_times_ok && simple)
1918 		      { /* Since simple * loops can be made faster by using
1919 			   on_failure_keep_string_jump, we turn simple P+
1920 			   into PP* if P is simple.  */
1921 			unsigned char *p1, *p2;
1922 			startoffset = b - laststart;
1923 			GET_BUFFER_SPACE (startoffset);
1924 			p1 = b; p2 = laststart;
1925 			while (p2 < p1)
1926 			  *b++ = *p2++;
1927 			zero_times_ok = 1;
1928 		      }
1929 
1930 		    GET_BUFFER_SPACE (6);
1931 		    if (!zero_times_ok)
1932 		      /* A + loop.  */
1933 		      STORE_JUMP (ofj, b, b + 6);
1934 		    else
1935 		      /* Simple * loops can use on_failure_keep_string_jump
1936 			 depending on what follows.  But since we don't know
1937 			 that yet, we leave the decision up to
1938 			 on_failure_jump_smart.  */
1939 		      INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
1940 				   laststart + startoffset, b + 6);
1941 		    b += 3;
1942 		    STORE_JUMP (jump, b, laststart + startoffset);
1943 		    b += 3;
1944 		  }
1945 		else
1946 		  {
1947 		    /* A simple ? pattern.  */
1948 		    eassert (zero_times_ok);
1949 		    GET_BUFFER_SPACE (3);
1950 		    INSERT_JUMP (on_failure_jump, laststart, b + 3);
1951 		    b += 3;
1952 		  }
1953 	      }
1954 	    else		/* not greedy */
1955 	      { /* I wish the greedy and non-greedy cases could be merged.  */
1956 
1957 		GET_BUFFER_SPACE (7); /* We might use less.  */
1958 		if (many_times_ok)
1959 		  {
1960 		    bool emptyp = !!analyze_first (laststart, b, NULL, false);
1961 
1962 		    /* The non-greedy multiple match looks like
1963 		       a repeat..until: we only need a conditional jump
1964 		       at the end of the loop.  */
1965 		    if (emptyp) BUF_PUSH (no_op);
1966 		    STORE_JUMP (emptyp ? on_failure_jump_nastyloop
1967 				: on_failure_jump, b, laststart);
1968 		    b += 3;
1969 		    if (zero_times_ok)
1970 		      {
1971 			/* The repeat...until naturally matches one or more.
1972 			   To also match zero times, we need to first jump to
1973 			   the end of the loop (its conditional jump).  */
1974 			INSERT_JUMP (jump, laststart, b);
1975 			b += 3;
1976 		      }
1977 		  }
1978 		else
1979 		  {
1980 		    /* non-greedy a?? */
1981 		    INSERT_JUMP (jump, laststart, b + 3);
1982 		    b += 3;
1983 		    INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
1984 		    b += 3;
1985 		  }
1986 	      }
1987 	  }
1988 	  pending_exact = 0;
1989 	  break;
1990 
1991 
1992 	case '.':
1993 	  laststart = b;
1994 	  BUF_PUSH (anychar);
1995 	  break;
1996 
1997 
1998 	case '[':
1999 	  {
2000 	    re_char *p1;
2001 
2002 	    CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2003 
2004 	    if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2005 
2006 	    /* Ensure that we have enough space to push a charset: the
2007 	       opcode, the length count, and the bitset; 34 bytes in all.  */
2008 	    GET_BUFFER_SPACE (34);
2009 
2010 	    laststart = b;
2011 
2012 	    /* Test '*p == '^' twice, instead of using an if
2013 	       statement, so we need only one BUF_PUSH.  */
2014 	    BUF_PUSH (*p == '^' ? charset_not : charset);
2015 	    if (*p == '^')
2016 	      p++;
2017 
2018 	    /* Remember the first position in the bracket expression.  */
2019 	    p1 = p;
2020 
2021 	    /* Push the number of bytes in the bitmap.  */
2022 	    BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2023 
2024 	    /* Clear the whole map.  */
2025 	    memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2026 
2027 	    /* Read in characters and ranges, setting map bits.  */
2028 	    for (;;)
2029 	      {
2030 		const unsigned char *p2 = p;
2031 		int ch;
2032 
2033 		if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2034 
2035 		/* See if we're at the beginning of a possible character
2036 		   class.  */
2037 		re_wctype_t cc = re_wctype_parse (&p, pend - p);
2038 		if (cc != -1)
2039 		  {
2040 		    if (cc == 0)
2041 		      FREE_STACK_RETURN (REG_ECTYPE);
2042 
2043 		    if (p == pend)
2044 		      FREE_STACK_RETURN (REG_EBRACK);
2045 
2046 		    /* Most character classes in a multibyte match just set
2047 		       a flag.  Exceptions are is_blank, is_digit, is_cntrl, and
2048 		       is_xdigit, since they can only match ASCII characters.
2049 		       We don't need to handle them for multibyte.  */
2050 
2051 		    /* Setup the gl_state object to its buffer-defined value.
2052 		       This hardcodes the buffer-global syntax-table for ASCII
2053 		       chars, while the other chars will obey syntax-table
2054 		       properties.  It's not ideal, but it's the way it's been
2055 		       done until now.  */
2056 		    SETUP_BUFFER_SYNTAX_TABLE ();
2057 
2058 		    for (c = 0; c < 0x80; ++c)
2059 		      if (re_iswctype (c, cc))
2060 			{
2061 			  SET_LIST_BIT (c);
2062 			  c1 = TRANSLATE (c);
2063 			  if (c1 == c)
2064 			    continue;
2065 			  if (ASCII_CHAR_P (c1))
2066 			    SET_LIST_BIT (c1);
2067 			  else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2068 			    SET_LIST_BIT (c1);
2069 			}
2070 		    SET_RANGE_TABLE_WORK_AREA_BIT
2071 		      (range_table_work, re_wctype_to_bit (cc));
2072 
2073 		    /* In most cases the matching rule for char classes only
2074 		       uses the syntax table for multibyte chars, so that the
2075 		       content of the syntax-table is not hardcoded in the
2076 		       range_table.  SPACE and WORD are the two exceptions.  */
2077 		    if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2078 		      bufp->used_syntax = true;
2079 
2080 		    /* Repeat the loop. */
2081 		    continue;
2082 		  }
2083 
2084 		/* Don't translate yet.  The range TRANSLATE(X..Y) cannot
2085 		   always be determined from TRANSLATE(X) and TRANSLATE(Y)
2086 		   So the translation is done later in a loop.  Example:
2087 		   (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
2088 		PATFETCH (c);
2089 
2090 		/* Could be the end of the bracket expression.  If it's
2091 		   not (i.e., when the bracket expression is '[]' so
2092 		   far), the ']' character bit gets set way below.  */
2093 		if (c == ']' && p2 != p1)
2094 		  break;
2095 
2096 		if (p < pend && p[0] == '-' && p[1] != ']')
2097 		  {
2098 
2099 		    /* Discard the '-'. */
2100 		    PATFETCH (c1);
2101 
2102 		    /* Fetch the character which ends the range. */
2103 		    PATFETCH (c1);
2104 
2105 		    if (CHAR_BYTE8_P (c1)
2106 			&& ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2107 		      /* Treat the range from a multibyte character to
2108 			 raw-byte character as empty.  */
2109 		      c = c1 + 1;
2110 		  }
2111 		else
2112 		  /* Range from C to C. */
2113 		  c1 = c;
2114 
2115 		if (c <= c1)
2116 		  {
2117 		    if (c < 128)
2118 		      {
2119 			ch = min (127, c1);
2120 			SETUP_ASCII_RANGE (range_table_work, c, ch);
2121 			c = ch + 1;
2122 			if (CHAR_BYTE8_P (c1))
2123 			  c = BYTE8_TO_CHAR (128);
2124 		      }
2125                     if (c <= c1)
2126                       {
2127                         if (CHAR_BYTE8_P (c))
2128                           {
2129                             c = CHAR_TO_BYTE8 (c);
2130                             c1 = CHAR_TO_BYTE8 (c1);
2131                             for (; c <= c1; c++)
2132                               SET_LIST_BIT (c);
2133                           }
2134                         else if (multibyte)
2135                           SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
2136                         else
2137                           SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
2138                       }
2139 		  }
2140 	      }
2141 
2142 	    /* Discard any (non)matching list bytes that are all 0 at the
2143 	       end of the map.  Decrease the map-length byte too.  */
2144 	    while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2145 	      b[-1]--;
2146 	    b += b[-1];
2147 
2148 	    /* Build real range table from work area.  */
2149 	    if (RANGE_TABLE_WORK_USED (range_table_work)
2150 		|| RANGE_TABLE_WORK_BITS (range_table_work))
2151 	      {
2152 		int i;
2153 		int used = RANGE_TABLE_WORK_USED (range_table_work);
2154 
2155 		/* Allocate space for COUNT + RANGE_TABLE.  Needs two
2156 		   bytes for flags, two for COUNT, and three bytes for
2157 		   each character.  */
2158 		GET_BUFFER_SPACE (4 + used * 3);
2159 
2160 		/* Indicate the existence of range table.  */
2161 		laststart[1] |= 0x80;
2162 
2163 		/* Store the character class flag bits into the range table.  */
2164 		*b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2165 		*b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2166 
2167 		STORE_NUMBER_AND_INCR (b, used / 2);
2168 		for (i = 0; i < used; i++)
2169 		  STORE_CHARACTER_AND_INCR
2170 		    (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2171 	      }
2172 	  }
2173 	  break;
2174 
2175 
2176 	case '\\':
2177 	  if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2178 
2179 	  /* Do not translate the character after the \, so that we can
2180 	     distinguish, e.g., \B from \b, even if we normally would
2181 	     translate, e.g., B to b.  */
2182 	  PATFETCH (c);
2183 
2184 	  switch (c)
2185 	    {
2186 	    case '(':
2187 	      {
2188 		bool shy = false;
2189 		regnum_t regnum = 0;
2190 		if (p+1 < pend)
2191 		  {
2192 		    /* Look for a special (?...) construct */
2193 		    if (*p == '?')
2194 		      {
2195 			PATFETCH (c); /* Gobble up the '?'.  */
2196 			while (!shy)
2197 			  {
2198 			    PATFETCH (c);
2199 			    switch (c)
2200 			      {
2201 			      case ':': shy = true; break;
2202 			      case '0':
2203 				/* An explicitly specified regnum must start
2204 				   with non-0. */
2205 				if (regnum == 0)
2206 				  FREE_STACK_RETURN (REG_BADPAT);
2207 				FALLTHROUGH;
2208 			      case '1': case '2': case '3': case '4':
2209 			      case '5': case '6': case '7': case '8': case '9':
2210 				if (INT_MULTIPLY_WRAPV (regnum, 10, &regnum)
2211 				    || INT_ADD_WRAPV (regnum, c - '0',
2212 						      &regnum))
2213 				  FREE_STACK_RETURN (REG_ESIZE);
2214 				break;
2215 			      default:
2216 				/* Only (?:...) is supported right now. */
2217 				FREE_STACK_RETURN (REG_BADPAT);
2218 			      }
2219 			  }
2220 		      }
2221 		  }
2222 
2223 		if (!shy)
2224 		  regnum = ++bufp->re_nsub;
2225 		else if (regnum)
2226 		  { /* It's actually not shy, but explicitly numbered.  */
2227 		    shy = false;
2228 		    if (regnum > bufp->re_nsub)
2229 		      bufp->re_nsub = regnum;
2230 		    else if (regnum > bufp->re_nsub
2231 			     /* Ideally, we'd want to check that the specified
2232 				group can't have matched (i.e. all subgroups
2233 				using the same regnum are in other branches of
2234 				OR patterns), but we don't currently keep track
2235 				of enough info to do that easily.  */
2236 			     || group_in_compile_stack (compile_stack, regnum))
2237 		      FREE_STACK_RETURN (REG_BADPAT);
2238 		  }
2239 		else
2240 		  /* It's really shy.  */
2241 		  regnum = - bufp->re_nsub;
2242 
2243 		if (COMPILE_STACK_FULL)
2244 		  compile_stack.stack
2245 		    = xpalloc (compile_stack.stack, &compile_stack.size,
2246 			       1, -1, sizeof *compile_stack.stack);
2247 
2248 		/* These are the values to restore when we hit end of this
2249 		   group.  They are all relative offsets, so that if the
2250 		   whole pattern moves because of realloc, they will still
2251 		   be valid.  */
2252 		COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2253 		COMPILE_STACK_TOP.fixup_alt_jump
2254 		  = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2255 		COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2256 		COMPILE_STACK_TOP.regnum = regnum;
2257 
2258 		/* Do not push a start_memory for groups beyond the last one
2259 		   we can represent in the compiled pattern.  */
2260 		if (regnum <= MAX_REGNUM && regnum > 0)
2261 		  BUF_PUSH_2 (start_memory, regnum);
2262 
2263 		compile_stack.avail++;
2264 
2265 		fixup_alt_jump = 0;
2266 		laststart = 0;
2267 		begalt = b;
2268 		/* If we've reached MAX_REGNUM groups, then this open
2269 		   won't actually generate any code, so we'll have to
2270 		   clear pending_exact explicitly.  */
2271 		pending_exact = 0;
2272 		break;
2273 	      }
2274 
2275 	    case ')':
2276 	      if (COMPILE_STACK_EMPTY)
2277 		FREE_STACK_RETURN (REG_ERPAREN);
2278 
2279 	      FIXUP_ALT_JUMP ();
2280 
2281 	      /* See similar code for backslashed left paren above.  */
2282 	      if (COMPILE_STACK_EMPTY)
2283 		FREE_STACK_RETURN (REG_ERPAREN);
2284 
2285 	      /* Since we just checked for an empty stack above, this
2286 		 "can't happen".  */
2287 	      eassert (compile_stack.avail != 0);
2288 	      {
2289 		/* We don't just want to restore into 'regnum', because
2290 		   later groups should continue to be numbered higher,
2291 		   as in '(ab)c(de)' -- the second group is #2.  */
2292 		regnum_t regnum;
2293 
2294 		compile_stack.avail--;
2295 		begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2296 		fixup_alt_jump
2297 		  = COMPILE_STACK_TOP.fixup_alt_jump
2298 		    ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2299 		    : 0;
2300 		laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2301 		regnum = COMPILE_STACK_TOP.regnum;
2302 		/* If we've reached MAX_REGNUM groups, then this open
2303 		   won't actually generate any code, so we'll have to
2304 		   clear pending_exact explicitly.  */
2305 		pending_exact = 0;
2306 
2307 		/* We're at the end of the group, so now we know how many
2308 		   groups were inside this one.  */
2309 		if (regnum <= MAX_REGNUM && regnum > 0)
2310 		  BUF_PUSH_2 (stop_memory, regnum);
2311 	      }
2312 	      break;
2313 
2314 
2315 	    case '|':					/* '\|'.  */
2316 	      /* Insert before the previous alternative a jump which
2317 		 jumps to this alternative if the former fails.  */
2318 	      GET_BUFFER_SPACE (3);
2319 	      INSERT_JUMP (on_failure_jump, begalt, b + 6);
2320 	      pending_exact = 0;
2321 	      b += 3;
2322 
2323 	      /* The alternative before this one has a jump after it
2324 		 which gets executed if it gets matched.  Adjust that
2325 		 jump so it will jump to this alternative's analogous
2326 		 jump (put in below, which in turn will jump to the next
2327 		 (if any) alternative's such jump, etc.).  The last such
2328 		 jump jumps to the correct final destination.  A picture:
2329 			  _____ _____
2330 			  |   | |   |
2331 			  |   v |   v
2332 			A | B	| C
2333 
2334 		 If we are at B, then fixup_alt_jump right now points to a
2335 		 three-byte space after A.  We'll put in the jump, set
2336 		 fixup_alt_jump to right after B, and leave behind three
2337 		 bytes which we'll fill in when we get to after C.  */
2338 
2339 	      FIXUP_ALT_JUMP ();
2340 
2341 	      /* Mark and leave space for a jump after this alternative,
2342 		 to be filled in later either by next alternative or
2343 		 when know we're at the end of a series of alternatives.  */
2344 	      fixup_alt_jump = b;
2345 	      GET_BUFFER_SPACE (3);
2346 	      b += 3;
2347 
2348 	      laststart = 0;
2349 	      begalt = b;
2350 	      break;
2351 
2352 
2353 	    case '{':
2354 	      {
2355 		/* At least (most) this many matches must be made.  */
2356 		int lower_bound = 0, upper_bound = -1;
2357 
2358 		beg_interval = p;
2359 
2360 		GET_INTERVAL_COUNT (lower_bound);
2361 
2362 		if (c == ',')
2363 		  GET_INTERVAL_COUNT (upper_bound);
2364 		else
2365 		  /* Interval such as '{1}' => match exactly once. */
2366 		  upper_bound = lower_bound;
2367 
2368 		if (lower_bound < 0
2369 		    || (0 <= upper_bound && upper_bound < lower_bound)
2370 		    || c != '\\')
2371 		  FREE_STACK_RETURN (REG_BADBR);
2372 		if (p == pend)
2373 		  FREE_STACK_RETURN (REG_EESCAPE);
2374 		if (*p++ != '}')
2375 		  FREE_STACK_RETURN (REG_BADBR);
2376 
2377 		/* We just parsed a valid interval.  */
2378 
2379 		/* If it's invalid to have no preceding re.  */
2380 		if (!laststart)
2381 		  goto unfetch_interval;
2382 
2383 		if (upper_bound == 0)
2384 		  /* If the upper bound is zero, just drop the sub pattern
2385 		     altogether.  */
2386 		  b = laststart;
2387 		else if (lower_bound == 1 && upper_bound == 1)
2388 		  /* Just match it once: nothing to do here.  */
2389 		  ;
2390 
2391 		/* Otherwise, we have a nontrivial interval.  When
2392 		   we're all done, the pattern will look like:
2393 		   set_number_at <jump count> <upper bound>
2394 		   set_number_at <succeed_n count> <lower bound>
2395 		   succeed_n <after jump addr> <succeed_n count>
2396 		   <body of loop>
2397 		   jump_n <succeed_n addr> <jump count>
2398 		   (The upper bound and 'jump_n' are omitted if
2399 		   'upper_bound' is 1, though.)  */
2400 		else
2401 		  { /* If the upper bound is > 1, we need to insert
2402 		       more at the end of the loop.  */
2403 		    int nbytes = upper_bound < 0 ? 3 : upper_bound > 1 ? 5 : 0;
2404 		    int startoffset = 0;
2405 
2406 		    GET_BUFFER_SPACE (20); /* We might use less.  */
2407 
2408 		    if (lower_bound == 0)
2409 		      {
2410 			/* A succeed_n that starts with 0 is really
2411 			   a simple on_failure_jump_loop.  */
2412 			INSERT_JUMP (on_failure_jump_loop, laststart,
2413 				     b + 3 + nbytes);
2414 			b += 3;
2415 		      }
2416 		    else
2417 		      {
2418 			/* Initialize lower bound of the 'succeed_n', even
2419 			   though it will be set during matching by its
2420 			   attendant 'set_number_at' (inserted next),
2421 			   because 're_compile_fastmap' needs to know.
2422 			   Jump to the 'jump_n' we might insert below.  */
2423 			INSERT_JUMP2 (succeed_n, laststart,
2424 				      b + 5 + nbytes,
2425 				      lower_bound);
2426 			b += 5;
2427 
2428 			/* Code to initialize the lower bound.  Insert
2429 			   before the 'succeed_n'.  The '5' is the last two
2430 			   bytes of this 'set_number_at', plus 3 bytes of
2431 			   the following 'succeed_n'.  */
2432 			insert_op2 (set_number_at, laststart, 5,
2433 				    lower_bound, b);
2434 			b += 5;
2435 			startoffset += 5;
2436 		      }
2437 
2438 		    if (upper_bound < 0)
2439 		      {
2440 			/* A negative upper bound stands for infinity,
2441 			   in which case it degenerates to a plain jump.  */
2442 			STORE_JUMP (jump, b, laststart + startoffset);
2443 			b += 3;
2444 		      }
2445 		    else if (upper_bound > 1)
2446 		      { /* More than one repetition is allowed, so
2447 			   append a backward jump to the 'succeed_n'
2448 			   that starts this interval.
2449 
2450 			   When we've reached this during matching,
2451 			   we'll have matched the interval once, so
2452 			   jump back only 'upper_bound - 1' times.  */
2453 			STORE_JUMP2 (jump_n, b, laststart + startoffset,
2454 				     upper_bound - 1);
2455 			b += 5;
2456 
2457 			/* The location we want to set is the second
2458 			   parameter of the 'jump_n'; that is 'b-2' as
2459 			   an absolute address.  'laststart' will be
2460 			   the 'set_number_at' we're about to insert;
2461 			   'laststart+3' the number to set, the source
2462 			   for the relative address.  But we are
2463 			   inserting into the middle of the pattern --
2464 			   so everything is getting moved up by 5.
2465 			   Conclusion: (b - 2) - (laststart + 3) + 5,
2466 			   i.e., b - laststart.
2467 
2468 			   Insert this at the beginning of the loop
2469 			   so that if we fail during matching, we'll
2470 			   reinitialize the bounds.  */
2471 			insert_op2 (set_number_at, laststart, b - laststart,
2472 				    upper_bound - 1, b);
2473 			b += 5;
2474 		      }
2475 		  }
2476 		pending_exact = 0;
2477 		beg_interval = NULL;
2478 	      }
2479 	      break;
2480 
2481 	    unfetch_interval:
2482 	      /* If an invalid interval, match the characters as literals.  */
2483 	       eassert (beg_interval);
2484 	       p = beg_interval;
2485 	       beg_interval = NULL;
2486 	       eassert (p > pattern && p[-1] == '\\');
2487 	       c = '{';
2488 	       goto normal_char;
2489 
2490 	    case '=':
2491 	      laststart = b;
2492 	      BUF_PUSH (at_dot);
2493 	      break;
2494 
2495 	    case 's':
2496 	      laststart = b;
2497 	      PATFETCH (c);
2498 	      BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2499 	      break;
2500 
2501 	    case 'S':
2502 	      laststart = b;
2503 	      PATFETCH (c);
2504 	      BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2505 	      break;
2506 
2507 	    case 'c':
2508 	      laststart = b;
2509 	      PATFETCH (c);
2510 	      BUF_PUSH_2 (categoryspec, c);
2511 	      break;
2512 
2513 	    case 'C':
2514 	      laststart = b;
2515 	      PATFETCH (c);
2516 	      BUF_PUSH_2 (notcategoryspec, c);
2517 	      break;
2518 
2519 	    case 'w':
2520 	      laststart = b;
2521 	      BUF_PUSH_2 (syntaxspec, Sword);
2522 	      break;
2523 
2524 
2525 	    case 'W':
2526 	      laststart = b;
2527 	      BUF_PUSH_2 (notsyntaxspec, Sword);
2528 	      break;
2529 
2530 
2531 	    case '<':
2532 	      laststart = b;
2533 	      BUF_PUSH (wordbeg);
2534 	      break;
2535 
2536 	    case '>':
2537 	      laststart = b;
2538 	      BUF_PUSH (wordend);
2539 	      break;
2540 
2541 	    case '_':
2542               laststart = b;
2543               PATFETCH (c);
2544               if (c == '<')
2545                 BUF_PUSH (symbeg);
2546               else if (c == '>')
2547                 BUF_PUSH (symend);
2548               else
2549                 FREE_STACK_RETURN (REG_BADPAT);
2550               break;
2551 
2552 	    case 'b':
2553 	      BUF_PUSH (wordbound);
2554 	      break;
2555 
2556 	    case 'B':
2557 	      BUF_PUSH (notwordbound);
2558 	      break;
2559 
2560 	    case '`':
2561 	      BUF_PUSH (begbuf);
2562 	      break;
2563 
2564 	    case '\'':
2565 	      BUF_PUSH (endbuf);
2566 	      break;
2567 
2568 	    case '1': case '2': case '3': case '4': case '5':
2569 	    case '6': case '7': case '8': case '9':
2570 	      {
2571 		regnum_t reg = c - '0';
2572 
2573 		if (reg > bufp->re_nsub || reg < 1
2574 		    /* Can't back reference to a subexp before its end.  */
2575 		    || group_in_compile_stack (compile_stack, reg))
2576 		  FREE_STACK_RETURN (REG_ESUBREG);
2577 
2578 		laststart = b;
2579 		BUF_PUSH_2 (duplicate, reg);
2580 	      }
2581 	      break;
2582 
2583 	    default:
2584 	      /* You might think it would be useful for \ to mean
2585 		 not to translate; but if we don't translate it
2586 		 it will never match anything.  */
2587 	      goto normal_char;
2588 	    }
2589 	  break;
2590 
2591 
2592 	default:
2593 	/* Expects the character in C.  */
2594 	normal_char:
2595 	  /* If no exactn currently being built.  */
2596 	  if (!pending_exact
2597 
2598 	      /* If last exactn not at current position.  */
2599 	      || pending_exact + *pending_exact + 1 != b
2600 
2601 	      /* Only one byte follows the exactn for the count.  */
2602 	      || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
2603 
2604 	      /* If followed by a repetition operator.  */
2605 	      || (p != pend
2606 		  && (*p == '*' || *p == '+' || *p == '?' || *p == '^'))
2607 	      || (p + 1 < pend && p[0] == '\\' && p[1] == '{'))
2608 	    {
2609 	      /* Start building a new exactn.  */
2610 
2611 	      laststart = b;
2612 
2613 	      BUF_PUSH_2 (exactn, 0);
2614 	      pending_exact = b - 1;
2615 	    }
2616 
2617 	  GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
2618 	  {
2619 	    int len;
2620 
2621 	    if (multibyte)
2622 	      {
2623 		c = TRANSLATE (c);
2624 		len = CHAR_STRING (c, b);
2625 		b += len;
2626 	      }
2627 	    else
2628 	      {
2629 		c1 = RE_CHAR_TO_MULTIBYTE (c);
2630 		if (! CHAR_BYTE8_P (c1))
2631 		  {
2632 		    int c2 = TRANSLATE (c1);
2633 
2634 		    if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
2635 		      c = c1;
2636 		  }
2637 		*b++ = c;
2638 		len = 1;
2639 	      }
2640 	    (*pending_exact) += len;
2641 	  }
2642 
2643 	  break;
2644 	} /* switch (c) */
2645     } /* while p != pend */
2646 
2647 
2648   /* Through the pattern now.  */
2649 
2650   FIXUP_ALT_JUMP ();
2651 
2652   if (!COMPILE_STACK_EMPTY)
2653     FREE_STACK_RETURN (REG_EPAREN);
2654 
2655   /* If we don't want backtracking, force success
2656      the first time we reach the end of the compiled pattern.  */
2657   if (!posix_backtracking)
2658     BUF_PUSH (succeed);
2659 
2660   /* Success; set the length of the buffer.  */
2661   bufp->used = b - bufp->buffer;
2662 
2663 #ifdef REGEX_EMACS_DEBUG
2664   if (regex_emacs_debug > 0)
2665     {
2666       re_compile_fastmap (bufp);
2667       DEBUG_PRINT ("\nCompiled pattern:\n");
2668       print_compiled_pattern (bufp);
2669     }
2670   regex_emacs_debug--;
2671 #endif
2672 
2673   FREE_STACK_RETURN (REG_NOERROR);
2674 
2675 } /* regex_compile */
2676 
2677 /* Subroutines for 'regex_compile'.  */
2678 
2679 /* Store OP at LOC followed by two-byte integer parameter ARG.  */
2680 
2681 static void
store_op1(re_opcode_t op,unsigned char * loc,int arg)2682 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
2683 {
2684   *loc = (unsigned char) op;
2685   STORE_NUMBER (loc + 1, arg);
2686 }
2687 
2688 
2689 /* Like 'store_op1', but for two two-byte parameters ARG1 and ARG2.  */
2690 
2691 static void
store_op2(re_opcode_t op,unsigned char * loc,int arg1,int arg2)2692 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
2693 {
2694   *loc = (unsigned char) op;
2695   STORE_NUMBER (loc + 1, arg1);
2696   STORE_NUMBER (loc + 3, arg2);
2697 }
2698 
2699 
2700 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2701    for OP followed by two-byte integer parameter ARG.  */
2702 
2703 static void
insert_op1(re_opcode_t op,unsigned char * loc,int arg,unsigned char * end)2704 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
2705 {
2706   register unsigned char *pfrom = end;
2707   register unsigned char *pto = end + 3;
2708 
2709   while (pfrom != loc)
2710     *--pto = *--pfrom;
2711 
2712   store_op1 (op, loc, arg);
2713 }
2714 
2715 
2716 /* Like 'insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
2717 
2718 static void
insert_op2(re_opcode_t op,unsigned char * loc,int arg1,int arg2,unsigned char * end)2719 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
2720 	    unsigned char *end)
2721 {
2722   register unsigned char *pfrom = end;
2723   register unsigned char *pto = end + 5;
2724 
2725   while (pfrom != loc)
2726     *--pto = *--pfrom;
2727 
2728   store_op2 (op, loc, arg1, arg2);
2729 }
2730 
2731 
2732 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
2733    after an alternative or a begin-subexpression.  Assume there is at
2734    least one character before the ^.  */
2735 
2736 static bool
at_begline_loc_p(re_char * pattern,re_char * p)2737 at_begline_loc_p (re_char *pattern, re_char *p)
2738 {
2739   re_char *prev = p - 2;
2740 
2741   switch (*prev)
2742     {
2743     case '(': /* After a subexpression.  */
2744     case '|': /* After an alternative.  */
2745       break;
2746 
2747     case ':': /* After a shy subexpression.  */
2748       /* Skip over optional regnum.  */
2749       while (prev > pattern && '0' <= prev[-1] && prev[-1] <= '9')
2750 	--prev;
2751 
2752       if (! (prev > pattern + 1 && prev[-1] == '?' && prev[-2] == '('))
2753 	return false;
2754       prev -= 2;
2755       break;
2756 
2757     default:
2758       return false;
2759     }
2760 
2761   /* Count the number of preceding backslashes.  */
2762   p = prev;
2763   while (prev > pattern && prev[-1] == '\\')
2764     --prev;
2765   return (p - prev) & 1;
2766 }
2767 
2768 
2769 /* The dual of at_begline_loc_p.  This one is for $.  Assume there is
2770    at least one character after the $, i.e., 'P < PEND'.  */
2771 
2772 static bool
at_endline_loc_p(re_char * p,re_char * pend)2773 at_endline_loc_p (re_char *p, re_char *pend)
2774 {
2775   /* Before a subexpression or an alternative?  */
2776   return *p == '\\' && p + 1 < pend && (p[1] == ')' || p[1] == '|');
2777 }
2778 
2779 
2780 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2781    false if it's not.  */
2782 
2783 static bool
group_in_compile_stack(compile_stack_type compile_stack,regnum_t regnum)2784 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
2785 {
2786   ptrdiff_t this_element;
2787 
2788   for (this_element = compile_stack.avail - 1;
2789        this_element >= 0;
2790        this_element--)
2791     if (compile_stack.stack[this_element].regnum == regnum)
2792       return true;
2793 
2794   return false;
2795 }
2796 
2797 /* analyze_first.
2798    If fastmap is non-NULL, go through the pattern and fill fastmap
2799    with all the possible leading chars.  If fastmap is NULL, don't
2800    bother filling it up (obviously) and only return whether the
2801    pattern could potentially match the empty string.
2802 
2803    Return 1  if p..pend might match the empty string.
2804    Return 0  if p..pend matches at least one char.
2805    Return -1 if fastmap was not updated accurately.  */
2806 
2807 static int
analyze_first(re_char * p,re_char * pend,char * fastmap,bool multibyte)2808 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2809 {
2810   int j, k;
2811   int nbits;
2812   bool not;
2813 
2814   /* If all elements for base leading-codes in fastmap is set, this
2815      flag is set true.  */
2816   bool match_any_multibyte_characters = false;
2817 
2818   eassert (p);
2819 
2820   /* The loop below works as follows:
2821      - It has a working-list kept in the PATTERN_STACK and which basically
2822        starts by only containing a pointer to the first operation.
2823      - If the opcode we're looking at is a match against some set of
2824        chars, then we add those chars to the fastmap and go on to the
2825        next work element from the worklist (done via 'break').
2826      - If the opcode is a control operator on the other hand, we either
2827        ignore it (if it's meaningless at this point, such as 'start_memory')
2828        or execute it (if it's a jump).  If the jump has several destinations
2829        (i.e. 'on_failure_jump'), then we push the other destination onto the
2830        worklist.
2831      We guarantee termination by ignoring backward jumps (more or less),
2832      so that P is monotonically increasing.  More to the point, we
2833      never set P (or push) anything '<= p1'.  */
2834 
2835   while (p < pend)
2836     {
2837       /* P1 is used as a marker of how far back a 'on_failure_jump'
2838 	 can go without being ignored.  It is normally equal to P
2839 	 (which prevents any backward 'on_failure_jump') except right
2840 	 after a plain 'jump', to allow patterns such as:
2841 	    0: jump 10
2842 	    3..9: <body>
2843 	    10: on_failure_jump 3
2844 	 as used for the *? operator.  */
2845       re_char *p1 = p;
2846 
2847       switch (*p++)
2848 	{
2849 	case succeed:
2850 	  return 1;
2851 
2852 	case duplicate:
2853 	  /* If the first character has to match a backreference, that means
2854 	     that the group was empty (since it already matched).  Since this
2855 	     is the only case that interests us here, we can assume that the
2856 	     backreference must match the empty string.  */
2857 	  p++;
2858 	  continue;
2859 
2860 
2861       /* Following are the cases which match a character.  These end
2862 	 with 'break'.  */
2863 
2864 	case exactn:
2865 	  if (fastmap)
2866 	    {
2867 	      /* If multibyte is nonzero, the first byte of each
2868 		 character is an ASCII or a leading code.  Otherwise,
2869 		 each byte is a character.  Thus, this works in both
2870 		 cases. */
2871 	      fastmap[p[1]] = 1;
2872 	      if (multibyte)
2873 		{
2874 		  /* Cover the case of matching a raw char in a
2875 		     multibyte regexp against unibyte.	*/
2876 		  if (CHAR_BYTE8_HEAD_P (p[1]))
2877 		    fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2878 		}
2879 	      else
2880 		{
2881 		  /* For the case of matching this unibyte regex
2882 		     against multibyte, we must set a leading code of
2883 		     the corresponding multibyte character.  */
2884 		  int c = RE_CHAR_TO_MULTIBYTE (p[1]);
2885 
2886 		  fastmap[CHAR_LEADING_CODE (c)] = 1;
2887 		}
2888 	    }
2889 	  break;
2890 
2891 
2892 	case anychar:
2893 	  /* We could put all the chars except for \n (and maybe \0)
2894 	     but we don't bother since it is generally not worth it.  */
2895 	  if (!fastmap) break;
2896 	  return -1;
2897 
2898 
2899 	case charset_not:
2900 	  if (!fastmap) break;
2901 	  {
2902 	    /* Chars beyond end of bitmap are possible matches.  */
2903 	    for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2904 		 j < (1 << BYTEWIDTH); j++)
2905 	      fastmap[j] = 1;
2906 	  }
2907 	  FALLTHROUGH;
2908 	case charset:
2909 	  if (!fastmap) break;
2910 	  not = (re_opcode_t) *(p - 1) == charset_not;
2911 	  nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2912 	  p++;
2913 	  for (j = 0; j < nbits; j++)
2914 	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2915 	      fastmap[j] = 1;
2916 
2917 	  /* To match raw bytes (in the 80..ff range) against multibyte
2918 	     strings, add their leading bytes to the fastmap.  */
2919 	  for (j = 0x80; j < nbits; j++)
2920 	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2921 	      fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2922 
2923 	  if (/* Any leading code can possibly start a character
2924 		 which doesn't match the specified set of characters.  */
2925 	      not
2926 	      ||
2927 	      /* If we can match a character class, we can match any
2928 		 multibyte characters.  */
2929 	      (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2930 	       && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
2931 
2932 	    {
2933 	      if (match_any_multibyte_characters == false)
2934 		{
2935 		  for (j = MIN_MULTIBYTE_LEADING_CODE;
2936 		       j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2937 		    fastmap[j] = 1;
2938 		  match_any_multibyte_characters = true;
2939 		}
2940 	    }
2941 
2942 	  else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2943 		   && match_any_multibyte_characters == false)
2944 	    {
2945 	      /* Set fastmap[I] to 1 where I is a leading code of each
2946 		 multibyte character in the range table. */
2947 	      int c, count;
2948 	      unsigned char lc1, lc2;
2949 
2950 	      /* Make P points the range table.  '+ 2' is to skip flag
2951 		 bits for a character class.  */
2952 	      p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
2953 
2954 	      /* Extract the number of ranges in range table into COUNT.  */
2955 	      EXTRACT_NUMBER_AND_INCR (count, p);
2956 	      for (; count > 0; count--, p += 3)
2957 		{
2958 		  /* Extract the start and end of each range.  */
2959 		  EXTRACT_CHARACTER (c, p);
2960 		  lc1 = CHAR_LEADING_CODE (c);
2961 		  p += 3;
2962 		  EXTRACT_CHARACTER (c, p);
2963 		  lc2 = CHAR_LEADING_CODE (c);
2964 		  for (j = lc1; j <= lc2; j++)
2965 		    fastmap[j] = 1;
2966 		}
2967 	    }
2968 	  break;
2969 
2970 	case syntaxspec:
2971 	case notsyntaxspec:
2972 	  if (!fastmap) break;
2973 	  /* This match depends on text properties.  These end with
2974 	     aborting optimizations.  */
2975 	  return -1;
2976 
2977 	case categoryspec:
2978 	case notcategoryspec:
2979 	  if (!fastmap) break;
2980 	  not = (re_opcode_t)p[-1] == notcategoryspec;
2981 	  k = *p++;
2982 	  for (j = (1 << BYTEWIDTH); j >= 0; j--)
2983 	    if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
2984 	      fastmap[j] = 1;
2985 
2986 	  /* Any leading code can possibly start a character which
2987 	     has or doesn't has the specified category.  */
2988 	  if (match_any_multibyte_characters == false)
2989 	    {
2990 	      for (j = MIN_MULTIBYTE_LEADING_CODE;
2991 		   j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2992 		fastmap[j] = 1;
2993 	      match_any_multibyte_characters = true;
2994 	    }
2995 	  break;
2996 
2997       /* All cases after this match the empty string.  These end with
2998 	 'continue'.  */
2999 
3000 	case at_dot:
3001 	case no_op:
3002 	case begline:
3003 	case endline:
3004 	case begbuf:
3005 	case endbuf:
3006 	case wordbound:
3007 	case notwordbound:
3008 	case wordbeg:
3009 	case wordend:
3010 	case symbeg:
3011 	case symend:
3012 	  continue;
3013 
3014 
3015 	case jump:
3016 	  EXTRACT_NUMBER_AND_INCR (j, p);
3017 	  if (j < 0)
3018 	    /* Backward jumps can only go back to code that we've already
3019 	       visited.  're_compile' should make sure this is true.  */
3020 	    break;
3021 	  p += j;
3022 	  switch (*p)
3023 	    {
3024 	    case on_failure_jump:
3025 	    case on_failure_keep_string_jump:
3026 	    case on_failure_jump_loop:
3027 	    case on_failure_jump_nastyloop:
3028 	    case on_failure_jump_smart:
3029 	      p++;
3030 	      break;
3031 	    default:
3032 	      continue;
3033 	    };
3034 	  /* Keep P1 to allow the 'on_failure_jump' we are jumping to
3035 	     to jump back to "just after here".  */
3036 	  FALLTHROUGH;
3037 	case on_failure_jump:
3038 	case on_failure_keep_string_jump:
3039 	case on_failure_jump_nastyloop:
3040 	case on_failure_jump_loop:
3041 	case on_failure_jump_smart:
3042 	  EXTRACT_NUMBER_AND_INCR (j, p);
3043 	  if (p + j <= p1)
3044 	    ; /* Backward jump to be ignored.  */
3045 	  else
3046 	    { /* We have to look down both arms.
3047 		 We first go down the "straight" path so as to minimize
3048 		 stack usage when going through alternatives.  */
3049 	      int r = analyze_first (p, pend, fastmap, multibyte);
3050 	      if (r) return r;
3051 	      p += j;
3052 	    }
3053 	  continue;
3054 
3055 
3056 	case jump_n:
3057 	  /* This code simply does not properly handle forward jump_n.  */
3058 	  DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
3059 	  p += 4;
3060 	  /* jump_n can either jump or fall through.  The (backward) jump
3061 	     case has already been handled, so we only need to look at the
3062 	     fallthrough case.  */
3063 	  continue;
3064 
3065 	case succeed_n:
3066 	  /* If N == 0, it should be an on_failure_jump_loop instead.  */
3067 	  DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
3068 	  p += 4;
3069 	  /* We only care about one iteration of the loop, so we don't
3070 	     need to consider the case where this behaves like an
3071 	     on_failure_jump.  */
3072 	  continue;
3073 
3074 
3075 	case set_number_at:
3076 	  p += 4;
3077 	  continue;
3078 
3079 
3080 	case start_memory:
3081 	case stop_memory:
3082 	  p += 1;
3083 	  continue;
3084 
3085 
3086 	default:
3087 	  abort (); /* We have listed all the cases.  */
3088 	} /* switch *p++ */
3089 
3090       /* Getting here means we have found the possible starting
3091 	 characters for one path of the pattern -- and that the empty
3092 	 string does not match.  We need not follow this path further.  */
3093       return 0;
3094     } /* while p */
3095 
3096   /* We reached the end without matching anything.  */
3097   return 1;
3098 
3099 } /* analyze_first */
3100 
3101 /* Compute a fastmap for the compiled pattern in BUFP.
3102    A fastmap records which of the (1 << BYTEWIDTH) possible
3103    characters can start a string that matches the pattern.  This fastmap
3104    is used by re_search to skip quickly over impossible starting points.
3105 
3106    Character codes above (1 << BYTEWIDTH) are not represented in the
3107    fastmap, but the leading codes are represented.  Thus, the fastmap
3108    indicates which character sets could start a match.
3109 
3110    The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3111    area as BUFP->fastmap.
3112 
3113    Set the 'fastmap', 'fastmap_accurate', and 'can_be_null' fields in
3114    the pattern buffer.  */
3115 
3116 static void
re_compile_fastmap(struct re_pattern_buffer * bufp)3117 re_compile_fastmap (struct re_pattern_buffer *bufp)
3118 {
3119   char *fastmap = bufp->fastmap;
3120   int analysis;
3121 
3122   eassert (fastmap && bufp->buffer);
3123 
3124   memset (fastmap, 0, 1 << BYTEWIDTH);  /* Assume nothing's valid.  */
3125 
3126   /* FIXME: Is the following assignment correct even when ANALYSIS < 0?  */
3127   bufp->fastmap_accurate = 1;	    /* It will be when we're done.  */
3128 
3129   analysis = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
3130 			    fastmap, RE_MULTIBYTE_P (bufp));
3131   bufp->can_be_null = (analysis != 0);
3132 } /* re_compile_fastmap */
3133 
3134 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3135    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
3136    this memory for recording register information.  STARTS and ENDS
3137    must be allocated using the malloc library routine, and must each
3138    be at least NUM_REGS * sizeof (ptrdiff_t) bytes long.
3139 
3140    If NUM_REGS == 0, then subsequent matches should allocate their own
3141    register data.
3142 
3143    Unless this function is called, the first search or match using
3144    PATTERN_BUFFER will allocate its own register data, without
3145    freeing the old data.  */
3146 
3147 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,ptrdiff_t num_regs,ptrdiff_t * starts,ptrdiff_t * ends)3148 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3149 		  ptrdiff_t num_regs, ptrdiff_t *starts, ptrdiff_t *ends)
3150 {
3151   if (num_regs)
3152     {
3153       bufp->regs_allocated = REGS_REALLOCATE;
3154       regs->num_regs = num_regs;
3155       regs->start = starts;
3156       regs->end = ends;
3157     }
3158   else
3159     {
3160       bufp->regs_allocated = REGS_UNALLOCATED;
3161       regs->num_regs = 0;
3162       regs->start = regs->end = 0;
3163     }
3164 }
3165 
3166 /* Searching routines.  */
3167 
3168 /* Like re_search_2, below, but only one string is specified, and
3169    doesn't let you say where to stop matching. */
3170 
3171 ptrdiff_t
re_search(struct re_pattern_buffer * bufp,const char * string,ptrdiff_t size,ptrdiff_t startpos,ptrdiff_t range,struct re_registers * regs)3172 re_search (struct re_pattern_buffer *bufp, const char *string, ptrdiff_t size,
3173 	   ptrdiff_t startpos, ptrdiff_t range, struct re_registers *regs)
3174 {
3175   return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3176 		      regs, size);
3177 }
3178 
3179 /* Address of POS in the concatenation of virtual string. */
3180 #define POS_ADDR_VSTRING(POS)					\
3181   (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3182 
3183 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3184    virtual concatenation of STRING1 and STRING2, starting first at index
3185    STARTPOS, then at STARTPOS + 1, and so on.
3186 
3187    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3188 
3189    RANGE is how far to scan while trying to match.  RANGE = 0 means try
3190    only at STARTPOS; in general, the last start tried is STARTPOS +
3191    RANGE.
3192 
3193    In REGS, return the indices of the virtual concatenation of STRING1
3194    and STRING2 that matched the entire BUFP->buffer and its contained
3195    subexpressions.
3196 
3197    Do not consider matching one past the index STOP in the virtual
3198    concatenation of STRING1 and STRING2.
3199 
3200    Return either the position in the strings at which the match was
3201    found, -1 if no match, or -2 if error (such as failure
3202    stack overflow).  */
3203 
3204 ptrdiff_t
re_search_2(struct re_pattern_buffer * bufp,const char * str1,ptrdiff_t size1,const char * str2,ptrdiff_t size2,ptrdiff_t startpos,ptrdiff_t range,struct re_registers * regs,ptrdiff_t stop)3205 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1,
3206 	     const char *str2, ptrdiff_t size2,
3207 	     ptrdiff_t startpos, ptrdiff_t range,
3208 	     struct re_registers *regs, ptrdiff_t stop)
3209 {
3210   ptrdiff_t val;
3211   re_char *string1 = (re_char *) str1;
3212   re_char *string2 = (re_char *) str2;
3213   char *fastmap = bufp->fastmap;
3214   Lisp_Object translate = bufp->translate;
3215   ptrdiff_t total_size = size1 + size2;
3216   ptrdiff_t endpos = startpos + range;
3217   bool anchored_start;
3218   /* Nonzero if we are searching multibyte string.  */
3219   bool multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3220 
3221   /* Check for out-of-range STARTPOS.  */
3222   if (startpos < 0 || startpos > total_size)
3223     return -1;
3224 
3225   /* Fix up RANGE if it might eventually take us outside
3226      the virtual concatenation of STRING1 and STRING2.
3227      Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE.  */
3228   if (endpos < 0)
3229     range = 0 - startpos;
3230   else if (endpos > total_size)
3231     range = total_size - startpos;
3232 
3233   /* If the search isn't to be a backwards one, don't waste time in a
3234      search for a pattern anchored at beginning of buffer.  */
3235   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3236     {
3237       if (startpos > 0)
3238 	return -1;
3239       else
3240 	range = 0;
3241     }
3242 
3243   /* In a forward search for something that starts with \=.
3244      don't keep searching past point.  */
3245   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3246     {
3247       range = PT_BYTE - BEGV_BYTE - startpos;
3248       if (range < 0)
3249 	return -1;
3250     }
3251 
3252   /* Update the fastmap now if not correct already.  */
3253   if (fastmap && !bufp->fastmap_accurate)
3254     re_compile_fastmap (bufp);
3255 
3256   /* See whether the pattern is anchored.  */
3257   anchored_start = (bufp->buffer[0] == begline);
3258 
3259   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3260   {
3261     ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3262 
3263     SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3264   }
3265 
3266   /* Loop through the string, looking for a place to start matching.  */
3267   for (;;)
3268     {
3269       /* If the pattern is anchored,
3270 	 skip quickly past places we cannot match.
3271 	 Don't bother to treat startpos == 0 specially
3272 	 because that case doesn't repeat.  */
3273       if (anchored_start && startpos > 0)
3274 	{
3275 	  if (! ((startpos <= size1 ? string1[startpos - 1]
3276 		  : string2[startpos - size1 - 1])
3277 		 == '\n'))
3278 	    goto advance;
3279 	}
3280 
3281       /* If a fastmap is supplied, skip quickly over characters that
3282 	 cannot be the start of a match.  If the pattern can match the
3283 	 null string, however, we don't need to skip characters; we want
3284 	 the first null string.  */
3285       if (fastmap && startpos < total_size && !bufp->can_be_null)
3286 	{
3287 	  re_char *d;
3288 	  int buf_ch;
3289 
3290 	  d = POS_ADDR_VSTRING (startpos);
3291 
3292 	  if (range > 0)	/* Searching forwards.  */
3293 	    {
3294 	      ptrdiff_t irange = range, lim = 0;
3295 
3296 	      if (startpos < size1 && startpos + range >= size1)
3297 		lim = range - (size1 - startpos);
3298 
3299 	      /* Written out as an if-else to avoid testing 'translate'
3300 		 inside the loop.  */
3301 	      if (!NILP (translate))
3302 		{
3303 		  if (multibyte)
3304 		    while (range > lim)
3305 		      {
3306 			int buf_charlen;
3307 
3308 			buf_ch = string_char_and_length (d, &buf_charlen);
3309 			buf_ch = RE_TRANSLATE (translate, buf_ch);
3310 			if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3311 			  break;
3312 
3313 			range -= buf_charlen;
3314 			d += buf_charlen;
3315 		      }
3316 		  else
3317 		    while (range > lim)
3318 		      {
3319 			buf_ch = *d;
3320 			int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3321 			int translated = RE_TRANSLATE (translate, ch);
3322 			if (translated != ch
3323 			    && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3324 			  buf_ch = ch;
3325 			if (fastmap[buf_ch])
3326 			  break;
3327 			d++;
3328 			range--;
3329 		      }
3330 		}
3331 	      else
3332 		{
3333 		  if (multibyte)
3334 		    while (range > lim)
3335 		      {
3336 			int buf_charlen;
3337 
3338 			buf_ch = string_char_and_length (d, &buf_charlen);
3339 			if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3340 			  break;
3341 			range -= buf_charlen;
3342 			d += buf_charlen;
3343 		      }
3344 		  else
3345 		    while (range > lim && !fastmap[*d])
3346 		      {
3347 			d++;
3348 			range--;
3349 		      }
3350 		}
3351 	      startpos += irange - range;
3352 	    }
3353 	  else				/* Searching backwards.  */
3354 	    {
3355 	      if (multibyte)
3356 		{
3357 		  buf_ch = STRING_CHAR (d);
3358 		  buf_ch = TRANSLATE (buf_ch);
3359 		  if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
3360 		    goto advance;
3361 		}
3362 	      else
3363 		{
3364 		  buf_ch = *d;
3365 		  int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3366 		  int translated = TRANSLATE (ch);
3367 		  if (translated != ch
3368 		      && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3369 		    buf_ch = ch;
3370 		  if (! fastmap[TRANSLATE (buf_ch)])
3371 		    goto advance;
3372 		}
3373 	    }
3374 	}
3375 
3376       /* If can't match the null string, and that's all we have left, fail.  */
3377       if (range >= 0 && startpos == total_size && fastmap
3378 	  && !bufp->can_be_null)
3379 	return -1;
3380 
3381       val = re_match_2_internal (bufp, string1, size1, string2, size2,
3382 				 startpos, regs, stop);
3383 
3384       if (val >= 0)
3385 	return startpos;
3386 
3387       if (val == -2)
3388 	return -2;
3389 
3390     advance:
3391       if (!range)
3392 	break;
3393       else if (range > 0)
3394 	{
3395 	  /* Update STARTPOS to the next character boundary.  */
3396 	  if (multibyte)
3397 	    {
3398 	      re_char *p = POS_ADDR_VSTRING (startpos);
3399 	      int len = BYTES_BY_CHAR_HEAD (*p);
3400 
3401 	      range -= len;
3402 	      if (range < 0)
3403 		break;
3404 	      startpos += len;
3405 	    }
3406 	  else
3407 	    {
3408 	      range--;
3409 	      startpos++;
3410 	    }
3411 	}
3412       else
3413 	{
3414 	  range++;
3415 	  startpos--;
3416 
3417 	  /* Update STARTPOS to the previous character boundary.  */
3418 	  if (multibyte)
3419 	    {
3420 	      re_char *p = POS_ADDR_VSTRING (startpos) + 1;
3421 	      int len = raw_prev_char_len (p);
3422 
3423 	      range += len - 1;
3424 	      if (range > 0)
3425 		break;
3426 	      startpos -= len - 1;
3427 	    }
3428 	}
3429     }
3430   return -1;
3431 } /* re_search_2 */
3432 
3433 /* Declarations and macros for re_match_2.  */
3434 
3435 static bool bcmp_translate (re_char *, re_char *, ptrdiff_t,
3436 			    Lisp_Object, bool);
3437 
3438 /* This converts PTR, a pointer into one of the search strings 'string1'
3439    and 'string2' into an offset from the beginning of that string.  */
3440 #define POINTER_TO_OFFSET(ptr)			\
3441   (FIRST_STRING_P (ptr)				\
3442    ? (ptr) - string1				\
3443    : (ptr) - string2 + (ptrdiff_t) size1)
3444 
3445 /* Call before fetching a character with *d.  This switches over to
3446    string2 if necessary.
3447    Check re_match_2_internal for a discussion of why end_match_2 might
3448    not be within string2 (but be equal to end_match_1 instead).  */
3449 #define PREFETCH()							\
3450   while (d == dend)							\
3451     {									\
3452       /* End of string2 => fail.  */					\
3453       if (dend == end_match_2)						\
3454 	goto fail;							\
3455       /* End of string1 => advance to string2.  */			\
3456       d = string2;							\
3457       dend = end_match_2;						\
3458     }
3459 
3460 /* Call before fetching a char with *d if you already checked other limits.
3461    This is meant for use in lookahead operations like wordend, etc..
3462    where we might need to look at parts of the string that might be
3463    outside of the LIMITs (i.e past 'stop').  */
3464 #define PREFETCH_NOLIMIT()						\
3465   if (d == end1)							\
3466      {									\
3467        d = string2;							\
3468        dend = end_match_2;						\
3469      }
3470 
3471 /* Test if at very beginning or at very end of the virtual concatenation
3472    of STRING1 and STRING2.  If only one string, it's STRING2.  */
3473 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3474 #define AT_STRINGS_END(d) ((d) == end2)
3475 
3476 /* Disabled due to a compiler bug -- see comment at case wordbound */
3477 
3478 /* The comment at case wordbound is following one, but we don't use
3479    AT_WORD_BOUNDARY anymore to support multibyte form.
3480 
3481    The DEC Alpha C compiler 3.x generates incorrect code for the
3482    test	 WORDCHAR_P (d - 1) != WORDCHAR_P (d)  in the expansion of
3483    AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
3484    macro and introducing temporary variables works around the bug.  */
3485 
3486 #if 0
3487 /* Test if D points to a character which is word-constituent.  We have
3488    two special cases to check for: if past the end of string1, look at
3489    the first character in string2; and if before the beginning of
3490    string2, look at the last character in string1.  */
3491 #define WORDCHAR_P(d)							\
3492   (SYNTAX ((d) == end1 ? *string2					\
3493 	   : (d) == string2 - 1 ? *(end1 - 1) : *(d))			\
3494    == Sword)
3495 
3496 /* Test if the character before D and the one at D differ with respect
3497    to being word-constituent.  */
3498 #define AT_WORD_BOUNDARY(d)						\
3499   (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)				\
3500    || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3501 #endif
3502 
3503 
3504 /* Optimization routines.  */
3505 
3506 /* If the operation is a match against one or more chars,
3507    return a pointer to the next operation, else return NULL.  */
3508 static re_char *
skip_one_char(re_char * p)3509 skip_one_char (re_char *p)
3510 {
3511   switch (*p++)
3512     {
3513     case anychar:
3514       break;
3515 
3516     case exactn:
3517       p += *p + 1;
3518       break;
3519 
3520     case charset_not:
3521     case charset:
3522       if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
3523 	{
3524 	  int mcnt;
3525 	  p = CHARSET_RANGE_TABLE (p - 1);
3526 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
3527 	  p = CHARSET_RANGE_TABLE_END (p, mcnt);
3528 	}
3529       else
3530 	p += 1 + CHARSET_BITMAP_SIZE (p - 1);
3531       break;
3532 
3533     case syntaxspec:
3534     case notsyntaxspec:
3535     case categoryspec:
3536     case notcategoryspec:
3537       p++;
3538       break;
3539 
3540     default:
3541       p = NULL;
3542     }
3543   return p;
3544 }
3545 
3546 
3547 /* Jump over non-matching operations.  */
3548 static re_char *
skip_noops(re_char * p,re_char * pend)3549 skip_noops (re_char *p, re_char *pend)
3550 {
3551   int mcnt;
3552   while (p < pend)
3553     {
3554       switch (*p)
3555 	{
3556 	case start_memory:
3557 	case stop_memory:
3558 	  p += 2; break;
3559 	case no_op:
3560 	  p += 1; break;
3561 	case jump:
3562 	  p += 1;
3563 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
3564 	  p += mcnt;
3565 	  break;
3566 	default:
3567 	  return p;
3568 	}
3569     }
3570   eassert (p == pend);
3571   return p;
3572 }
3573 
3574 /* Test if C matches charset op.  *PP points to the charset or charset_not
3575    opcode.  When the function finishes, *PP will be advanced past that opcode.
3576    C is character to test (possibly after translations) and CORIG is original
3577    character (i.e. without any translations).  UNIBYTE denotes whether c is
3578    unibyte or multibyte character.
3579    CANON_TABLE is the canonicalisation table for case folding or Qnil.  */
3580 static bool
execute_charset(re_char ** pp,int c,int corig,bool unibyte,Lisp_Object canon_table)3581 execute_charset (re_char **pp, int c, int corig, bool unibyte,
3582                  Lisp_Object canon_table)
3583 {
3584   eassume (0 <= c && 0 <= corig);
3585   re_char *p = *pp, *rtp = NULL;
3586   bool not = (re_opcode_t) *p == charset_not;
3587 
3588   if (CHARSET_RANGE_TABLE_EXISTS_P (p))
3589     {
3590       int count;
3591       rtp = CHARSET_RANGE_TABLE (p);
3592       EXTRACT_NUMBER_AND_INCR (count, rtp);
3593       *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
3594     }
3595   else
3596     *pp += 2 + CHARSET_BITMAP_SIZE (p);
3597 
3598   if (unibyte && c < (1 << BYTEWIDTH))
3599     {			/* Lookup bitmap.  */
3600       /* Cast to 'unsigned' instead of 'unsigned char' in
3601 	 case the bit list is a full 32 bytes long.  */
3602       if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
3603 	  && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3604 	return !not;
3605     }
3606   else if (rtp)
3607     {
3608       int class_bits = CHARSET_RANGE_TABLE_BITS (p);
3609       int range_start, range_end;
3610 
3611   /* Sort tests by the most commonly used classes with some adjustment to which
3612      tests are easiest to perform.  Take a look at comment in re_wctype_parse
3613      for table with frequencies of character class names. */
3614 
3615       if ((class_bits & BIT_MULTIBYTE) ||
3616 	  (class_bits & BIT_ALNUM && ISALNUM (c)) ||
3617 	  (class_bits & BIT_ALPHA && ISALPHA (c)) ||
3618 	  (class_bits & BIT_SPACE && ISSPACE (c)) ||
3619           (class_bits & BIT_BLANK && ISBLANK (c)) ||
3620 	  (class_bits & BIT_WORD  && ISWORD  (c)) ||
3621 	  ((class_bits & BIT_UPPER) &&
3622 	   (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
3623 	  ((class_bits & BIT_LOWER) &&
3624 	   (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
3625 	  (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
3626 	  (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
3627 	  (class_bits & BIT_PRINT && ISPRINT (c)))
3628 	return !not;
3629 
3630       for (p = *pp; rtp < p; rtp += 2 * 3)
3631 	{
3632 	  EXTRACT_CHARACTER (range_start, rtp);
3633 	  EXTRACT_CHARACTER (range_end, rtp + 3);
3634 	  if (range_start <= c && c <= range_end)
3635 	    return !not;
3636 	}
3637     }
3638 
3639   return not;
3640 }
3641 
3642 /* True if "p1 matches something" implies "p2 fails".  */
3643 static bool
mutually_exclusive_p(struct re_pattern_buffer * bufp,re_char * p1,re_char * p2)3644 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
3645 		      re_char *p2)
3646 {
3647   re_opcode_t op2;
3648   bool multibyte = RE_MULTIBYTE_P (bufp);
3649   unsigned char *pend = bufp->buffer + bufp->used;
3650 
3651   eassert (p1 >= bufp->buffer && p1 < pend
3652 	   && p2 >= bufp->buffer && p2 <= pend);
3653 
3654   /* Skip over open/close-group commands.
3655      If what follows this loop is a ...+ construct,
3656      look at what begins its body, since we will have to
3657      match at least one of that.  */
3658   p2 = skip_noops (p2, pend);
3659   /* The same skip can be done for p1, except that this function
3660      is only used in the case where p1 is a simple match operator.  */
3661   /* p1 = skip_noops (p1, pend); */
3662 
3663   eassert (p1 >= bufp->buffer && p1 < pend
3664 	   && p2 >= bufp->buffer && p2 <= pend);
3665 
3666   op2 = p2 == pend ? succeed : *p2;
3667 
3668   switch (op2)
3669     {
3670     case succeed:
3671     case endbuf:
3672       /* If we're at the end of the pattern, we can change.  */
3673       if (skip_one_char (p1))
3674 	{
3675 	  DEBUG_PRINT ("  End of pattern: fast loop.\n");
3676 	  return true;
3677 	}
3678       break;
3679 
3680     case endline:
3681     case exactn:
3682       {
3683 	int c
3684 	  = (re_opcode_t) *p2 == endline ? '\n'
3685 	  : RE_STRING_CHAR (p2 + 2, multibyte);
3686 
3687 	if ((re_opcode_t) *p1 == exactn)
3688 	  {
3689 	    if (c != RE_STRING_CHAR (p1 + 2, multibyte))
3690 	      {
3691 		DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
3692 		return true;
3693 	      }
3694 	  }
3695 
3696 	else if ((re_opcode_t) *p1 == charset
3697 		 || (re_opcode_t) *p1 == charset_not)
3698 	  {
3699 	    if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
3700                                   Qnil))
3701 	      {
3702 		DEBUG_PRINT ("	 No match => fast loop.\n");
3703 		return true;
3704 	      }
3705 	  }
3706 	else if ((re_opcode_t) *p1 == anychar
3707 		 && c == '\n')
3708 	  {
3709 	    DEBUG_PRINT ("   . != \\n => fast loop.\n");
3710 	    return true;
3711 	  }
3712       }
3713       break;
3714 
3715     case charset:
3716       {
3717 	if ((re_opcode_t) *p1 == exactn)
3718 	  /* Reuse the code above.  */
3719 	  return mutually_exclusive_p (bufp, p2, p1);
3720 
3721       /* It is hard to list up all the character in charset
3722 	 P2 if it includes multibyte character.  Give up in
3723 	 such case.  */
3724       else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
3725 	{
3726 	  /* Now, we are sure that P2 has no range table.
3727 	     So, for the size of bitmap in P2, 'p2[1]' is
3728 	     enough.  But P1 may have range table, so the
3729 	     size of bitmap table of P1 is extracted by
3730 	     using macro 'CHARSET_BITMAP_SIZE'.
3731 
3732 	     In a multibyte case, we know that all the character
3733 	     listed in P2 is ASCII.  In a unibyte case, P1 has only a
3734 	     bitmap table.  So, in both cases, it is enough to test
3735 	     only the bitmap table of P1.  */
3736 
3737 	  if ((re_opcode_t) *p1 == charset)
3738 	    {
3739 	      int idx;
3740 	      /* We win if the charset inside the loop
3741 		 has no overlap with the one after the loop.  */
3742 	      for (idx = 0;
3743 		   (idx < (int) p2[1]
3744 		    && idx < CHARSET_BITMAP_SIZE (p1));
3745 		   idx++)
3746 		if ((p2[2 + idx] & p1[2 + idx]) != 0)
3747 		  break;
3748 
3749 	      if (idx == p2[1]
3750 		  || idx == CHARSET_BITMAP_SIZE (p1))
3751 		{
3752 		  DEBUG_PRINT ("	 No match => fast loop.\n");
3753 		  return true;
3754 		}
3755 	    }
3756 	  else if ((re_opcode_t) *p1 == charset_not)
3757 	    {
3758 	      int idx;
3759 	      /* We win if the charset_not inside the loop lists
3760 		 every character listed in the charset after.  */
3761 	      for (idx = 0; idx < (int) p2[1]; idx++)
3762 		if (! (p2[2 + idx] == 0
3763 		       || (idx < CHARSET_BITMAP_SIZE (p1)
3764 			   && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
3765 		  break;
3766 
3767 	      if (idx == p2[1])
3768 		{
3769 		  DEBUG_PRINT ("	 No match => fast loop.\n");
3770 		  return true;
3771 		}
3772 	      }
3773 	  }
3774       }
3775       break;
3776 
3777     case charset_not:
3778       switch (*p1)
3779 	{
3780 	case exactn:
3781 	case charset:
3782 	  /* Reuse the code above.  */
3783 	  return mutually_exclusive_p (bufp, p2, p1);
3784 	case charset_not:
3785 	  /* When we have two charset_not, it's very unlikely that
3786 	     they don't overlap.  The union of the two sets of excluded
3787 	     chars should cover all possible chars, which, as a matter of
3788 	     fact, is virtually impossible in multibyte buffers.  */
3789 	  break;
3790 	}
3791       break;
3792 
3793     case wordend:
3794       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
3795     case symend:
3796       return ((re_opcode_t) *p1 == syntaxspec
3797               && (p1[1] == Ssymbol || p1[1] == Sword));
3798     case notsyntaxspec:
3799       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
3800 
3801     case wordbeg:
3802       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
3803     case symbeg:
3804       return ((re_opcode_t) *p1 == notsyntaxspec
3805               && (p1[1] == Ssymbol || p1[1] == Sword));
3806     case syntaxspec:
3807       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
3808 
3809     case wordbound:
3810       return (((re_opcode_t) *p1 == notsyntaxspec
3811 	       || (re_opcode_t) *p1 == syntaxspec)
3812 	      && p1[1] == Sword);
3813 
3814     case categoryspec:
3815       return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
3816     case notcategoryspec:
3817       return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
3818 
3819     default:
3820       ;
3821     }
3822 
3823   /* Safe default.  */
3824   return false;
3825 }
3826 
3827 
3828 /* Matching routines.  */
3829 
3830 /* re_match_2 matches the compiled pattern in BUFP against the
3831    (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3832    and SIZE2, respectively).  We start matching at POS, and stop
3833    matching at STOP.
3834 
3835    If REGS is non-null, store offsets for the substring each group
3836    matched in REGS.
3837 
3838    We return -1 if no match, -2 if an internal error (such as the
3839    failure stack overflowing).  Otherwise, we return the length of the
3840    matched substring.  */
3841 
3842 ptrdiff_t
re_match_2(struct re_pattern_buffer * bufp,char const * string1,ptrdiff_t size1,char const * string2,ptrdiff_t size2,ptrdiff_t pos,struct re_registers * regs,ptrdiff_t stop)3843 re_match_2 (struct re_pattern_buffer *bufp,
3844 	    char const *string1, ptrdiff_t size1,
3845 	    char const *string2, ptrdiff_t size2,
3846 	    ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3847 {
3848   ptrdiff_t result;
3849 
3850   ptrdiff_t charpos;
3851   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3852   charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
3853   SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3854 
3855   result = re_match_2_internal (bufp, (re_char *) string1, size1,
3856 				(re_char *) string2, size2,
3857 				pos, regs, stop);
3858   return result;
3859 }
3860 
3861 static void
unwind_re_match(void * ptr)3862 unwind_re_match (void *ptr)
3863 {
3864   struct buffer *b = (struct buffer *) ptr;
3865   b->text->inhibit_shrinking = 0;
3866 }
3867 
3868 /* This is a separate function so that we can force an alloca cleanup
3869    afterwards.  */
3870 static ptrdiff_t
re_match_2_internal(struct re_pattern_buffer * bufp,re_char * string1,ptrdiff_t size1,re_char * string2,ptrdiff_t size2,ptrdiff_t pos,struct re_registers * regs,ptrdiff_t stop)3871 re_match_2_internal (struct re_pattern_buffer *bufp,
3872 		     re_char *string1, ptrdiff_t size1,
3873 		     re_char *string2, ptrdiff_t size2,
3874 		     ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3875 {
3876   eassume (0 <= size1);
3877   eassume (0 <= size2);
3878   eassume (0 <= pos && pos <= stop && stop <= size1 + size2);
3879 
3880   /* General temporaries.  */
3881   int mcnt;
3882 
3883   /* Just past the end of the corresponding string.  */
3884   re_char *end1, *end2;
3885 
3886   /* Pointers into string1 and string2, just past the last characters in
3887      each to consider matching.  */
3888   re_char *end_match_1, *end_match_2;
3889 
3890   /* Where we are in the data, and the end of the current string.  */
3891   re_char *d, *dend;
3892 
3893   /* Used sometimes to remember where we were before starting matching
3894      an operator so that we can go back in case of failure.  This "atomic"
3895      behavior of matching opcodes is indispensable to the correctness
3896      of the on_failure_keep_string_jump optimization.  */
3897   re_char *dfail;
3898 
3899   /* Where we are in the pattern, and the end of the pattern.  */
3900   re_char *p = bufp->buffer;
3901   re_char *pend = p + bufp->used;
3902 
3903   /* We use this to map every character in the string.	*/
3904   Lisp_Object translate = bufp->translate;
3905 
3906   /* True if BUFP is setup from a multibyte regex.  */
3907   bool multibyte = RE_MULTIBYTE_P (bufp);
3908 
3909   /* True if STRING1/STRING2 are multibyte.  */
3910   bool target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3911 
3912   /* Failure point stack.  Each place that can handle a failure further
3913      down the line pushes a failure point on this stack.  It consists of
3914      regstart, and regend for all registers corresponding to
3915      the subexpressions we're currently inside, plus the number of such
3916      registers, and, finally, two char *'s.  The first char * is where
3917      to resume scanning the pattern; the second one is where to resume
3918      scanning the strings.  */
3919   fail_stack_type fail_stack;
3920 #ifdef DEBUG_COMPILES_ARGUMENTS
3921   ptrdiff_t nfailure_points_pushed = 0, nfailure_points_popped = 0;
3922 #endif
3923 
3924   /* We fill all the registers internally, independent of what we
3925      return, for use in backreferences.  The number here includes
3926      an element for register zero.  */
3927   ptrdiff_t num_regs = bufp->re_nsub + 1;
3928   eassume (0 < num_regs);
3929 
3930   /* Information on the contents of registers. These are pointers into
3931      the input strings; they record just what was matched (on this
3932      attempt) by a subexpression part of the pattern, that is, the
3933      regnum-th regstart pointer points to where in the pattern we began
3934      matching and the regnum-th regend points to right after where we
3935      stopped matching the regnum-th subexpression.  */
3936   re_char **regstart UNINIT, **regend UNINIT;
3937 
3938   /* The following record the register info as found in the above
3939      variables when we find a match better than any we've seen before.
3940      This happens as we backtrack through the failure points, which in
3941      turn happens only if we have not yet matched the entire string. */
3942   bool best_regs_set = false;
3943   re_char **best_regstart UNINIT, **best_regend UNINIT;
3944 
3945   /* Logically, this is 'best_regend[0]'.  But we don't want to have to
3946      allocate space for that if we're not allocating space for anything
3947      else (see below).  Also, we never need info about register 0 for
3948      any of the other register vectors, and it seems rather a kludge to
3949      treat 'best_regend' differently from the rest.  So we keep track of
3950      the end of the best match so far in a separate variable.  We
3951      initialize this to NULL so that when we backtrack the first time
3952      and need to test it, it's not garbage.  */
3953   re_char *match_end = NULL;
3954 
3955 #ifdef DEBUG_COMPILES_ARGUMENTS
3956   /* Counts the total number of registers pushed.  */
3957   ptrdiff_t num_regs_pushed = 0;
3958 #endif
3959 
3960   DEBUG_PRINT ("\nEntering re_match_2.\n");
3961 
3962   REGEX_USE_SAFE_ALLOCA;
3963 
3964   INIT_FAIL_STACK ();
3965 
3966   ptrdiff_t count = SPECPDL_INDEX ();
3967 
3968   /* Prevent shrinking and relocation of buffer text if GC happens
3969      while we are inside this function.  The calls to
3970      UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
3971      `internal--syntax-propertize`); these calls are careful to defend against
3972      buffer modifications, but even with no modifications, the buffer text may
3973      be relocated during GC by `compact_buffer` which would invalidate
3974      our C pointers to buffer text.  */
3975   if (!current_buffer->text->inhibit_shrinking)
3976     {
3977       record_unwind_protect_ptr (unwind_re_match, current_buffer);
3978       current_buffer->text->inhibit_shrinking = 1;
3979     }
3980 
3981   /* Do not bother to initialize all the register variables if there are
3982      no groups in the pattern, as it takes a fair amount of time.  If
3983      there are groups, we include space for register 0 (the whole
3984      pattern) in REGSTART[0], even though we never use it, to avoid
3985      the undefined behavior of subtracting 1 from REGSTART.  */
3986   ptrdiff_t re_nsub = num_regs - 1;
3987   if (0 < re_nsub)
3988     {
3989       regstart = SAFE_ALLOCA ((re_nsub * 4 + 1) * sizeof *regstart);
3990       regend = regstart + num_regs;
3991       best_regstart = regend + re_nsub;
3992       best_regend = best_regstart + re_nsub;
3993 
3994       /* Initialize subexpression text positions to unset, to mark ones
3995 	 that no start_memory/stop_memory has been seen for.  */
3996       for (re_char **apos = regstart + 1; apos < best_regstart + 1; apos++)
3997 	*apos = NULL;
3998     }
3999 
4000   /* We move 'string1' into 'string2' if the latter's empty -- but not if
4001      'string1' is null.  */
4002   if (size2 == 0 && string1 != NULL)
4003     {
4004       string2 = string1;
4005       size2 = size1;
4006       string1 = 0;
4007       size1 = 0;
4008     }
4009   end1 = string1 + size1;
4010   end2 = string2 + size2;
4011 
4012   /* P scans through the pattern as D scans through the data.
4013      DEND is the end of the input string that D points within.
4014      Advance D into the following input string whenever necessary, but
4015      this happens before fetching; therefore, at the beginning of the
4016      loop, D can be pointing at the end of a string, but it cannot
4017      equal STRING2.  */
4018   if (pos >= size1)
4019     {
4020       /* Only match within string2.  */
4021       d = string2 + pos - size1;
4022       dend = end_match_2 = string2 + stop - size1;
4023       end_match_1 = end1;	/* Just to give it a value.  */
4024     }
4025   else
4026     {
4027       if (stop < size1)
4028 	{
4029 	  /* Only match within string1.  */
4030 	  end_match_1 = string1 + stop;
4031 	  /* BEWARE!
4032 	     When we reach end_match_1, PREFETCH normally switches to string2.
4033 	     But in the present case, this means that just doing a PREFETCH
4034 	     makes us jump from 'stop' to 'gap' within the string.
4035 	     What we really want here is for the search to stop as
4036 	     soon as we hit end_match_1.  That's why we set end_match_2
4037 	     to end_match_1 (since PREFETCH fails as soon as we hit
4038 	     end_match_2).  */
4039 	  end_match_2 = end_match_1;
4040 	}
4041       else
4042 	{ /* It's important to use this code when STOP == SIZE so that
4043 	     moving D from end1 to string2 will not prevent the D == DEND
4044 	     check from catching the end of string.  */
4045 	  end_match_1 = end1;
4046 	  end_match_2 = string2 + stop - size1;
4047 	}
4048       d = string1 + pos;
4049       dend = end_match_1;
4050     }
4051 
4052   DEBUG_PRINT ("The compiled pattern is:\n");
4053   DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4054   DEBUG_PRINT ("The string to match is: \"");
4055   DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4056   DEBUG_PRINT ("\"\n");
4057 
4058   /* This loops over pattern commands.  It exits by returning from the
4059      function if the match is complete, or it drops through if the match
4060      fails at this starting point in the input data.  */
4061   for (;;)
4062     {
4063       DEBUG_PRINT ("\n%p: ", p);
4064 
4065       if (p == pend)
4066 	{
4067 	  /* End of pattern means we might have succeeded.  */
4068 	  DEBUG_PRINT ("end of pattern ... ");
4069 
4070 	  /* If we haven't matched the entire string, and we want the
4071 	     longest match, try backtracking.  */
4072 	  if (d != end_match_2)
4073 	    {
4074 	      /* True if this match is the best seen so far.  */
4075 	      bool best_match_p;
4076 
4077 	      {
4078 		/* True if this match ends in the same string (string1
4079 		   or string2) as the best previous match.  */
4080 		bool same_str_p = (FIRST_STRING_P (match_end)
4081 				   == FIRST_STRING_P (d));
4082 
4083 		/* AIX compiler got confused when this was combined
4084 		   with the previous declaration.  */
4085 		if (same_str_p)
4086 		  best_match_p = d > match_end;
4087 		else
4088 		  best_match_p = !FIRST_STRING_P (d);
4089 	      }
4090 
4091 	      DEBUG_PRINT ("backtracking.\n");
4092 
4093 	      if (!FAIL_STACK_EMPTY ())
4094 		{ /* More failure points to try.  */
4095 
4096 		  /* If exceeds best match so far, save it.  */
4097 		  if (!best_regs_set || best_match_p)
4098 		    {
4099 		      best_regs_set = true;
4100 		      match_end = d;
4101 
4102 		      DEBUG_PRINT ("\nSAVING match as best so far.\n");
4103 
4104 		      for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4105 			{
4106 			  best_regstart[reg] = regstart[reg];
4107 			  best_regend[reg] = regend[reg];
4108 			}
4109 		    }
4110 		  goto fail;
4111 		}
4112 
4113 	      /* If no failure points, don't restore garbage.  And if
4114 		 last match is real best match, don't restore second
4115 		 best one. */
4116 	      else if (best_regs_set && !best_match_p)
4117 		{
4118 		restore_best_regs:
4119 		  /* Restore best match.  It may happen that 'dend ==
4120 		     end_match_1' while the restored d is in string2.
4121 		     For example, the pattern 'x.*y.*z' against the
4122 		     strings 'x-' and 'y-z-', if the two strings are
4123 		     not consecutive in memory.  */
4124 		  DEBUG_PRINT ("Restoring best registers.\n");
4125 
4126 		  d = match_end;
4127 		  dend = ((d >= string1 && d <= end1)
4128 			   ? end_match_1 : end_match_2);
4129 
4130 		  for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4131 		    {
4132 		      regstart[reg] = best_regstart[reg];
4133 		      regend[reg] = best_regend[reg];
4134 		      eassert (REG_UNSET (regstart[reg])
4135 			       <= REG_UNSET (regend[reg]));
4136 		    }
4137 		}
4138 	    } /* d != end_match_2 */
4139 
4140 	succeed_label:
4141 	  DEBUG_PRINT ("Accepting match.\n");
4142 
4143 	  /* If caller wants register contents data back, do it.  */
4144 	  if (regs)
4145 	    {
4146 	      /* Have the register data arrays been allocated?	*/
4147 	      if (bufp->regs_allocated == REGS_UNALLOCATED)
4148 		{ /* No.  So allocate them with malloc.  */
4149 		  ptrdiff_t n = max (RE_NREGS, num_regs);
4150 		  regs->start = xnmalloc (n, sizeof *regs->start);
4151 		  regs->end = xnmalloc (n, sizeof *regs->end);
4152 		  regs->num_regs = n;
4153 		  bufp->regs_allocated = REGS_REALLOCATE;
4154 		}
4155 	      else if (bufp->regs_allocated == REGS_REALLOCATE)
4156 		{ /* Yes.  If we need more elements than were already
4157 		     allocated, reallocate them.  If we need fewer, just
4158 		     leave it alone.  */
4159 		  ptrdiff_t n = regs->num_regs;
4160 		  if (n < num_regs)
4161 		    {
4162 		      n = max (n + (n >> 1), num_regs);
4163 		      regs->start
4164 			= xnrealloc (regs->start, n, sizeof *regs->start);
4165 		      regs->end = xnrealloc (regs->end, n, sizeof *regs->end);
4166 		      regs->num_regs = n;
4167 		    }
4168 		}
4169 	      else
4170 		eassert (bufp->regs_allocated == REGS_FIXED);
4171 
4172 	      /* Convert the pointer data in 'regstart' and 'regend' to
4173 		 indices.  Register zero has to be set differently,
4174 		 since we haven't kept track of any info for it.  */
4175 	      if (regs->num_regs > 0)
4176 		{
4177 		  regs->start[0] = pos;
4178 		  regs->end[0] = POINTER_TO_OFFSET (d);
4179 		}
4180 
4181 	      for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4182 		{
4183 		  eassert (REG_UNSET (regstart[reg])
4184 			   <= REG_UNSET (regend[reg]));
4185 		  if (REG_UNSET (regend[reg]))
4186 		    regs->start[reg] = regs->end[reg] = -1;
4187 		  else
4188 		    {
4189 		      regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
4190 		      regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
4191 		    }
4192 		}
4193 
4194 	      /* If the regs structure we return has more elements than
4195 		 were in the pattern, set the extra elements to -1.  */
4196 	      for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++)
4197 		regs->start[reg] = regs->end[reg] = -1;
4198 	    }
4199 
4200 	  DEBUG_PRINT ("%td failure points pushed, %td popped (%td remain).\n",
4201 		       nfailure_points_pushed, nfailure_points_popped,
4202 		       nfailure_points_pushed - nfailure_points_popped);
4203 	  DEBUG_PRINT ("%td registers pushed.\n", num_regs_pushed);
4204 
4205 	  ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
4206 
4207 	  DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
4208 
4209 	  unbind_to (count, Qnil);
4210 	  SAFE_FREE ();
4211 	  return dcnt;
4212 	}
4213 
4214       /* Otherwise match next pattern command.  */
4215       switch (*p++)
4216 	{
4217 	/* Ignore these.  Used to ignore the n of succeed_n's which
4218 	   currently have n == 0.  */
4219 	case no_op:
4220 	  DEBUG_PRINT ("EXECUTING no_op.\n");
4221 	  break;
4222 
4223 	case succeed:
4224 	  DEBUG_PRINT ("EXECUTING succeed.\n");
4225 	  goto succeed_label;
4226 
4227 	/* Match the next n pattern characters exactly.  The following
4228 	   byte in the pattern defines n, and the n bytes after that
4229 	   are the characters to match.  */
4230 	case exactn:
4231 	  mcnt = *p++;
4232 	  DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
4233 
4234 	  /* Remember the start point to rollback upon failure.  */
4235 	  dfail = d;
4236 
4237 	  /* The cost of testing 'translate' is comparatively small.  */
4238 	  if (target_multibyte)
4239 	    do
4240 	      {
4241 		int pat_charlen, buf_charlen;
4242 		int pat_ch, buf_ch;
4243 
4244 		PREFETCH ();
4245 		if (multibyte)
4246 		  pat_ch = string_char_and_length (p, &pat_charlen);
4247 		else
4248 		  {
4249 		    pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
4250 		    pat_charlen = 1;
4251 		  }
4252 		buf_ch = string_char_and_length (d, &buf_charlen);
4253 
4254 		if (TRANSLATE (buf_ch) != pat_ch)
4255 		  {
4256 		    d = dfail;
4257 		    goto fail;
4258 		  }
4259 
4260 		p += pat_charlen;
4261 		d += buf_charlen;
4262 		mcnt -= pat_charlen;
4263 	      }
4264 	    while (mcnt > 0);
4265 	  else
4266 	    do
4267 	      {
4268 		int pat_charlen;
4269 		int pat_ch, buf_ch;
4270 
4271 		PREFETCH ();
4272 		if (multibyte)
4273 		  {
4274 		    pat_ch = string_char_and_length (p, &pat_charlen);
4275 		    pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
4276 		  }
4277 		else
4278 		  {
4279 		    pat_ch = *p;
4280 		    pat_charlen = 1;
4281 		  }
4282 		buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
4283 		if (! CHAR_BYTE8_P (buf_ch))
4284 		  {
4285 		    buf_ch = TRANSLATE (buf_ch);
4286 		    buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
4287 		    if (buf_ch < 0)
4288 		      buf_ch = *d;
4289 		  }
4290 		else
4291 		  buf_ch = *d;
4292 		if (buf_ch != pat_ch)
4293 		  {
4294 		    d = dfail;
4295 		    goto fail;
4296 		  }
4297 		p += pat_charlen;
4298 		d++;
4299 		mcnt -= pat_charlen;
4300 	      }
4301 	    while (mcnt > 0);
4302 
4303 	  break;
4304 
4305 
4306 	/* Match any character except newline.  */
4307 	case anychar:
4308 	  {
4309 	    int buf_charlen;
4310 	    int buf_ch;
4311 
4312 	    DEBUG_PRINT ("EXECUTING anychar.\n");
4313 
4314 	    PREFETCH ();
4315 	    buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
4316 						target_multibyte);
4317 	    buf_ch = TRANSLATE (buf_ch);
4318 	    if (buf_ch == '\n')
4319 	      goto fail;
4320 
4321 	    DEBUG_PRINT ("  Matched \"%d\".\n", *d);
4322 	    d += buf_charlen;
4323 	  }
4324 	  break;
4325 
4326 
4327 	case charset:
4328 	case charset_not:
4329 	  {
4330 	    /* Whether matching against a unibyte character.  */
4331 	    bool unibyte_char = false;
4332 
4333 	    DEBUG_PRINT ("EXECUTING charset%s.\n",
4334 			 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
4335 
4336 	    PREFETCH ();
4337 	    int len;
4338 	    int corig = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
4339 	    int c = corig;
4340 	    if (target_multibyte)
4341 	      {
4342 		int c1;
4343 
4344 		c = TRANSLATE (c);
4345 		c1 = RE_CHAR_TO_UNIBYTE (c);
4346 		if (c1 >= 0)
4347 		  {
4348 		    unibyte_char = true;
4349 		    c = c1;
4350 		  }
4351 	      }
4352 	    else
4353 	      {
4354 		int c1 = RE_CHAR_TO_MULTIBYTE (c);
4355 
4356 		if (! CHAR_BYTE8_P (c1))
4357 		  {
4358 		    c1 = TRANSLATE (c1);
4359 		    c1 = RE_CHAR_TO_UNIBYTE (c1);
4360 		    if (c1 >= 0)
4361 		      {
4362 			unibyte_char = true;
4363 			c = c1;
4364 		      }
4365 		  }
4366 		else
4367 		  unibyte_char = true;
4368 	      }
4369 
4370 	    p -= 1;
4371 	    if (!execute_charset (&p, c, corig, unibyte_char, translate))
4372 	      goto fail;
4373 
4374 	    d += len;
4375 	  }
4376 	  break;
4377 
4378 
4379 	/* The beginning of a group is represented by start_memory.
4380 	   The argument is the register number.  The text
4381 	   matched within the group is recorded (in the internal
4382 	   registers data structure) under the register number.  */
4383 	case start_memory:
4384 	  DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
4385 	  eassert (0 < *p && *p < num_regs);
4386 
4387 	  /* In case we need to undo this operation (via backtracking).  */
4388 	  PUSH_FAILURE_REG (*p);
4389 
4390 	  regstart[*p] = d;
4391 	  DEBUG_PRINT ("  regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
4392 
4393 	  /* Move past the register number and inner group count.  */
4394 	  p += 1;
4395 	  break;
4396 
4397 
4398 	/* The stop_memory opcode represents the end of a group.  Its
4399 	   argument is the same as start_memory's: the register number.  */
4400 	case stop_memory:
4401 	  DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
4402 
4403 	  eassert (0 < *p && *p < num_regs);
4404 	  eassert (!REG_UNSET (regstart[*p]));
4405 	  /* Strictly speaking, there should be code such as:
4406 
4407 		eassert (REG_UNSET (regend[*p]));
4408 		PUSH_FAILURE_REGSTOP (*p);
4409 
4410 	     But the only info to be pushed is regend[*p] and it is known to
4411 	     be UNSET, so there really isn't anything to push.
4412 	     Not pushing anything, on the other hand deprives us from the
4413 	     guarantee that regend[*p] is UNSET since undoing this operation
4414 	     will not reset its value properly.  This is not important since
4415 	     the value will only be read on the next start_memory or at
4416 	     the very end and both events can only happen if this stop_memory
4417 	     is *not* undone.  */
4418 
4419 	  regend[*p] = d;
4420 	  DEBUG_PRINT ("      regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
4421 
4422 	  /* Move past the register number and the inner group count.  */
4423 	  p += 1;
4424 	  break;
4425 
4426 
4427 	/* \<digit> has been turned into a 'duplicate' command which is
4428 	   followed by the numeric value of <digit> as the register number.  */
4429 	case duplicate:
4430 	  {
4431 	    re_char *d2, *dend2;
4432 	    int regno = *p++;	/* Get which register to match against.  */
4433 	    DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
4434 
4435 	    /* Can't back reference a group which we've never matched.  */
4436 	    eassert (0 < regno && regno < num_regs);
4437 	    eassert (REG_UNSET (regstart[regno]) <= REG_UNSET (regend[regno]));
4438 	    if (REG_UNSET (regend[regno]))
4439 	      goto fail;
4440 
4441 	    /* Where in input to try to start matching.  */
4442 	    d2 = regstart[regno];
4443 
4444 	    /* Remember the start point to rollback upon failure.  */
4445 	    dfail = d;
4446 
4447 	    /* Where to stop matching; if both the place to start and
4448 	       the place to stop matching are in the same string, then
4449 	       set to the place to stop, otherwise, for now have to use
4450 	       the end of the first string.  */
4451 
4452 	    dend2 = ((FIRST_STRING_P (regstart[regno])
4453 		      == FIRST_STRING_P (regend[regno]))
4454 		     ? regend[regno] : end_match_1);
4455 	    for (;;)
4456 	      {
4457 		ptrdiff_t dcnt;
4458 
4459 		/* If necessary, advance to next segment in register
4460 		   contents.  */
4461 		while (d2 == dend2)
4462 		  {
4463 		    if (dend2 == end_match_2) break;
4464 		    if (dend2 == regend[regno]) break;
4465 
4466 		    /* End of string1 => advance to string2. */
4467 		    d2 = string2;
4468 		    dend2 = regend[regno];
4469 		  }
4470 		/* At end of register contents => success */
4471 		if (d2 == dend2) break;
4472 
4473 		/* If necessary, advance to next segment in data.  */
4474 		PREFETCH ();
4475 
4476 		/* How many characters left in this segment to match.  */
4477 		dcnt = dend - d;
4478 
4479 		/* Want how many consecutive characters we can match in
4480 		   one shot, so, if necessary, adjust the count.  */
4481 		if (dcnt > dend2 - d2)
4482 		  dcnt = dend2 - d2;
4483 
4484 		/* Compare that many; failure if mismatch, else move
4485 		   past them.  */
4486 		if (!NILP (translate)
4487 		    ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
4488 		    : memcmp (d, d2, dcnt))
4489 		  {
4490 		    d = dfail;
4491 		    goto fail;
4492 		  }
4493 		d += dcnt, d2 += dcnt;
4494 	      }
4495 	  }
4496 	  break;
4497 
4498 
4499 	/* begline matches the empty string at the beginning of the string,
4500 	   and after newlines.  */
4501 	case begline:
4502 	  DEBUG_PRINT ("EXECUTING begline.\n");
4503 
4504 	  if (AT_STRINGS_BEG (d))
4505 	    break;
4506 	  else
4507 	    {
4508 	      unsigned c;
4509 	      GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
4510 	      if (c == '\n')
4511 		break;
4512 	    }
4513 	  goto fail;
4514 
4515 
4516 	/* endline is the dual of begline.  */
4517 	case endline:
4518 	  DEBUG_PRINT ("EXECUTING endline.\n");
4519 
4520 	  if (AT_STRINGS_END (d))
4521 	    break;
4522 	  PREFETCH_NOLIMIT ();
4523 	  if (*d == '\n')
4524 	    break;
4525 	  goto fail;
4526 
4527 
4528 	/* Match at the very beginning of the data.  */
4529 	case begbuf:
4530 	  DEBUG_PRINT ("EXECUTING begbuf.\n");
4531 	  if (AT_STRINGS_BEG (d))
4532 	    break;
4533 	  goto fail;
4534 
4535 
4536 	/* Match at the very end of the data.  */
4537 	case endbuf:
4538 	  DEBUG_PRINT ("EXECUTING endbuf.\n");
4539 	  if (AT_STRINGS_END (d))
4540 	    break;
4541 	  goto fail;
4542 
4543 
4544 	/* on_failure_keep_string_jump is used to optimize '.*\n'.  It
4545 	   pushes NULL as the value for the string on the stack.  Then
4546 	   'POP_FAILURE_POINT' will keep the current value for the
4547 	   string, instead of restoring it.  To see why, consider
4548 	   matching 'foo\nbar' against '.*\n'.  The .* matches the foo;
4549 	   then the . fails against the \n.  But the next thing we want
4550 	   to do is match the \n against the \n; if we restored the
4551 	   string value, we would be back at the foo.
4552 
4553 	   Because this is used only in specific cases, we don't need to
4554 	   check all the things that 'on_failure_jump' does, to make
4555 	   sure the right things get saved on the stack.  Hence we don't
4556 	   share its code.  The only reason to push anything on the
4557 	   stack at all is that otherwise we would have to change
4558 	   'anychar's code to do something besides goto fail in this
4559 	   case; that seems worse than this.  */
4560 	case on_failure_keep_string_jump:
4561 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
4562 	  DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
4563 		       mcnt, p + mcnt);
4564 
4565 	  PUSH_FAILURE_POINT (p - 3, NULL);
4566 	  break;
4567 
4568 	  /* A nasty loop is introduced by the non-greedy *? and +?.
4569 	     With such loops, the stack only ever contains one failure point
4570 	     at a time, so that a plain on_failure_jump_loop kind of
4571 	     cycle detection cannot work.  Worse yet, such a detection
4572 	     can not only fail to detect a cycle, but it can also wrongly
4573 	     detect a cycle (between different instantiations of the same
4574 	     loop).
4575 	     So the method used for those nasty loops is a little different:
4576 	     We use a special cycle-detection-stack-frame which is pushed
4577 	     when the on_failure_jump_nastyloop failure-point is *popped*.
4578 	     This special frame thus marks the beginning of one iteration
4579 	     through the loop and we can hence easily check right here
4580 	     whether something matched between the beginning and the end of
4581 	     the loop.  */
4582 	case on_failure_jump_nastyloop:
4583 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
4584 	  DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
4585 		       mcnt, p + mcnt);
4586 
4587 	  eassert ((re_opcode_t)p[-4] == no_op);
4588 	  {
4589 	    bool cycle = false;
4590 	    CHECK_INFINITE_LOOP (p - 4, d);
4591 	    if (!cycle)
4592 	      /* If there's a cycle, just continue without pushing
4593 		 this failure point.  The failure point is the "try again"
4594 		 option, which shouldn't be tried.
4595 		 We want (x?)*?y\1z to match both xxyz and xxyxz.  */
4596 	      PUSH_FAILURE_POINT (p - 3, d);
4597 	  }
4598 	  break;
4599 
4600 	  /* Simple loop detecting on_failure_jump:  just check on the
4601 	     failure stack if the same spot was already hit earlier.  */
4602 	case on_failure_jump_loop:
4603 	on_failure:
4604 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
4605 	  DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
4606 		       mcnt, p + mcnt);
4607 	  {
4608 	    bool cycle = false;
4609 	    CHECK_INFINITE_LOOP (p - 3, d);
4610 	    if (cycle)
4611 	      /* If there's a cycle, get out of the loop, as if the matching
4612 		 had failed.  We used to just 'goto fail' here, but that was
4613 		 aborting the search a bit too early: we want to keep the
4614 		 empty-loop-match and keep matching after the loop.
4615 		 We want (x?)*y\1z to match both xxyz and xxyxz.  */
4616 	      p += mcnt;
4617 	    else
4618 	      PUSH_FAILURE_POINT (p - 3, d);
4619 	  }
4620 	  break;
4621 
4622 
4623 	/* Uses of on_failure_jump:
4624 
4625 	   Each alternative starts with an on_failure_jump that points
4626 	   to the beginning of the next alternative.  Each alternative
4627 	   except the last ends with a jump that in effect jumps past
4628 	   the rest of the alternatives.  (They really jump to the
4629 	   ending jump of the following alternative, because tensioning
4630 	   these jumps is a hassle.)
4631 
4632 	   Repeats start with an on_failure_jump that points past both
4633 	   the repetition text and either the following jump or
4634 	   pop_failure_jump back to this on_failure_jump.  */
4635 	case on_failure_jump:
4636 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
4637 	  DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
4638 		       mcnt, p + mcnt);
4639 
4640 	  PUSH_FAILURE_POINT (p -3, d);
4641 	  break;
4642 
4643 	/* This operation is used for greedy *.
4644 	   Compare the beginning of the repeat with what in the
4645 	   pattern follows its end. If we can establish that there
4646 	   is nothing that they would both match, i.e., that we
4647 	   would have to backtrack because of (as in, e.g., 'a*a')
4648 	   then we can use a non-backtracking loop based on
4649 	   on_failure_keep_string_jump instead of on_failure_jump.  */
4650 	case on_failure_jump_smart:
4651 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
4652 	  DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
4653 		       mcnt, p + mcnt);
4654 	  {
4655 	    re_char *p1 = p; /* Next operation.  */
4656 	    /* Discard 'const', making re_search non-reentrant.  */
4657 	    unsigned char *p2 = (unsigned char *) p + mcnt; /* Jump dest.  */
4658 	    unsigned char *p3 = (unsigned char *) p - 3; /* opcode location.  */
4659 
4660 	    p -= 3;		/* Reset so that we will re-execute the
4661 				   instruction once it's been changed. */
4662 
4663 	    EXTRACT_NUMBER (mcnt, p2 - 2);
4664 
4665 	    /* Ensure this is indeed the trivial kind of loop
4666 	       we are expecting.  */
4667 	    eassert (skip_one_char (p1) == p2 - 3);
4668 	    eassert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
4669 	    DEBUG_STATEMENT (regex_emacs_debug += 2);
4670 	    if (mutually_exclusive_p (bufp, p1, p2))
4671 	      {
4672 		/* Use a fast 'on_failure_keep_string_jump' loop.  */
4673 		DEBUG_PRINT ("  smart exclusive => fast loop.\n");
4674 		*p3 = (unsigned char) on_failure_keep_string_jump;
4675 		STORE_NUMBER (p2 - 2, mcnt + 3);
4676 	      }
4677 	    else
4678 	      {
4679 		/* Default to a safe 'on_failure_jump' loop.  */
4680 		DEBUG_PRINT ("  smart default => slow loop.\n");
4681 		*p3 = (unsigned char) on_failure_jump;
4682 	      }
4683 	    DEBUG_STATEMENT (regex_emacs_debug -= 2);
4684 	  }
4685 	  break;
4686 
4687 	/* Unconditionally jump (without popping any failure points).  */
4688 	case jump:
4689 	unconditional_jump:
4690 	  maybe_quit ();
4691 	  EXTRACT_NUMBER_AND_INCR (mcnt, p);	/* Get the amount to jump.  */
4692 	  DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
4693 	  p += mcnt;				/* Do the jump.  */
4694 	  DEBUG_PRINT ("(to %p).\n", p);
4695 	  break;
4696 
4697 
4698 	/* Have to succeed matching what follows at least n times.
4699 	   After that, handle like 'on_failure_jump'.  */
4700 	case succeed_n:
4701 	  /* Signedness doesn't matter since we only compare MCNT to 0.  */
4702 	  EXTRACT_NUMBER (mcnt, p + 2);
4703 	  DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
4704 
4705 	  /* Originally, mcnt is how many times we HAVE to succeed.  */
4706 	  if (mcnt != 0)
4707 	    {
4708 	      /* Discard 'const', making re_search non-reentrant.  */
4709 	      unsigned char *p2 = (unsigned char *) p + 2; /* counter loc.  */
4710 	      mcnt--;
4711 	      p += 4;
4712 	      PUSH_NUMBER (p2, mcnt);
4713 	    }
4714 	  else
4715 	    /* The two bytes encoding mcnt == 0 are two no_op opcodes.  */
4716 	    goto on_failure;
4717 	  break;
4718 
4719 	case jump_n:
4720 	  /* Signedness doesn't matter since we only compare MCNT to 0.  */
4721 	  EXTRACT_NUMBER (mcnt, p + 2);
4722 	  DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
4723 
4724 	  /* Originally, this is how many times we CAN jump.  */
4725 	  if (mcnt != 0)
4726 	    {
4727 	      /* Discard 'const', making re_search non-reentrant.  */
4728 	      unsigned char *p2 = (unsigned char *) p + 2; /* counter loc.  */
4729 	      mcnt--;
4730 	      PUSH_NUMBER (p2, mcnt);
4731 	      goto unconditional_jump;
4732 	    }
4733 	  /* If don't have to jump any more, skip over the rest of command.  */
4734 	  else
4735 	    p += 4;
4736 	  break;
4737 
4738 	case set_number_at:
4739 	  {
4740 	    unsigned char *p2;	/* Location of the counter.  */
4741 	    DEBUG_PRINT ("EXECUTING set_number_at.\n");
4742 
4743 	    EXTRACT_NUMBER_AND_INCR (mcnt, p);
4744 	    /* Discard 'const', making re_search non-reentrant.  */
4745 	    p2 = (unsigned char *) p + mcnt;
4746 	    /* Signedness doesn't matter since we only copy MCNT's bits.  */
4747 	    EXTRACT_NUMBER_AND_INCR (mcnt, p);
4748 	    DEBUG_PRINT ("  Setting %p to %d.\n", p2, mcnt);
4749 	    PUSH_NUMBER (p2, mcnt);
4750 	    break;
4751 	  }
4752 
4753 	case wordbound:
4754 	case notwordbound:
4755 	  {
4756 	    bool not = (re_opcode_t) *(p - 1) == notwordbound;
4757 	    DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
4758 
4759 	    /* We SUCCEED (or FAIL) in one of the following cases: */
4760 
4761 	    /* Case 1: D is at the beginning or the end of string.  */
4762 	    if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
4763 	      not = !not;
4764 	    else
4765 	      {
4766 		/* C1 is the character before D, S1 is the syntax of C1, C2
4767 		   is the character at D, and S2 is the syntax of C2.  */
4768 		int c1, c2;
4769 		int s1, s2;
4770 		int dummy;
4771                 ptrdiff_t offset = PTR_TO_OFFSET (d);
4772                 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4773 		UPDATE_SYNTAX_TABLE (charpos);
4774 		GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4775 		s1 = SYNTAX (c1);
4776 		UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4777 		PREFETCH_NOLIMIT ();
4778 		GET_CHAR_AFTER (c2, d, dummy);
4779 		s2 = SYNTAX (c2);
4780 
4781 		if (/* Case 2: Only one of S1 and S2 is Sword.  */
4782 		    ((s1 == Sword) != (s2 == Sword))
4783 		    /* Case 3: Both of S1 and S2 are Sword, and macro
4784 		       WORD_BOUNDARY_P (C1, C2) returns nonzero.  */
4785 		    || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
4786 		  not = !not;
4787 	      }
4788 	    if (not)
4789 	      break;
4790 	    else
4791 	      goto fail;
4792 	  }
4793 
4794 	case wordbeg:
4795 	  DEBUG_PRINT ("EXECUTING wordbeg.\n");
4796 
4797 	  /* We FAIL in one of the following cases: */
4798 
4799 	  /* Case 1: D is at the end of string.  */
4800 	  if (AT_STRINGS_END (d))
4801 	    goto fail;
4802 	  else
4803 	    {
4804 	      /* C1 is the character before D, S1 is the syntax of C1, C2
4805 		 is the character at D, and S2 is the syntax of C2.  */
4806 	      int c1, c2;
4807 	      int s1, s2;
4808 	      int dummy;
4809 	      ptrdiff_t offset = PTR_TO_OFFSET (d);
4810 	      ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4811 	      UPDATE_SYNTAX_TABLE (charpos);
4812 	      PREFETCH ();
4813 	      GET_CHAR_AFTER (c2, d, dummy);
4814 	      s2 = SYNTAX (c2);
4815 
4816 	      /* Case 2: S2 is not Sword. */
4817 	      if (s2 != Sword)
4818 		goto fail;
4819 
4820 	      /* Case 3: D is not at the beginning of string ... */
4821 	      if (!AT_STRINGS_BEG (d))
4822 		{
4823 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4824 		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4825 		  s1 = SYNTAX (c1);
4826 
4827 		  /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
4828 		     returns 0.  */
4829 		  if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4830 		    goto fail;
4831 		}
4832 	    }
4833 	  break;
4834 
4835 	case wordend:
4836 	  DEBUG_PRINT ("EXECUTING wordend.\n");
4837 
4838 	  /* We FAIL in one of the following cases: */
4839 
4840 	  /* Case 1: D is at the beginning of string.  */
4841 	  if (AT_STRINGS_BEG (d))
4842 	    goto fail;
4843 	  else
4844 	    {
4845 	      /* C1 is the character before D, S1 is the syntax of C1, C2
4846 		 is the character at D, and S2 is the syntax of C2.  */
4847 	      int c1, c2;
4848 	      int s1, s2;
4849 	      int dummy;
4850               ptrdiff_t offset = PTR_TO_OFFSET (d);
4851               ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4852 	      UPDATE_SYNTAX_TABLE (charpos);
4853 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4854 	      s1 = SYNTAX (c1);
4855 
4856 	      /* Case 2: S1 is not Sword.  */
4857 	      if (s1 != Sword)
4858 		goto fail;
4859 
4860 	      /* Case 3: D is not at the end of string ... */
4861 	      if (!AT_STRINGS_END (d))
4862 		{
4863 		  PREFETCH_NOLIMIT ();
4864 		  GET_CHAR_AFTER (c2, d, dummy);
4865                   UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4866 		  s2 = SYNTAX (c2);
4867 
4868 		  /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
4869 		     returns 0.  */
4870 		  if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4871 	  goto fail;
4872 		}
4873 	    }
4874 	  break;
4875 
4876 	case symbeg:
4877 	  DEBUG_PRINT ("EXECUTING symbeg.\n");
4878 
4879 	  /* We FAIL in one of the following cases: */
4880 
4881 	  /* Case 1: D is at the end of string.  */
4882 	  if (AT_STRINGS_END (d))
4883 	    goto fail;
4884 	  else
4885 	    {
4886 	      /* C1 is the character before D, S1 is the syntax of C1, C2
4887 		 is the character at D, and S2 is the syntax of C2.  */
4888 	      int c1, c2;
4889 	      int s1, s2;
4890 	      ptrdiff_t offset = PTR_TO_OFFSET (d);
4891 	      ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4892 	      UPDATE_SYNTAX_TABLE (charpos);
4893 	      PREFETCH ();
4894 	      c2 = RE_STRING_CHAR (d, target_multibyte);
4895 	      s2 = SYNTAX (c2);
4896 
4897 	      /* Case 2: S2 is neither Sword nor Ssymbol. */
4898 	      if (s2 != Sword && s2 != Ssymbol)
4899 		goto fail;
4900 
4901 	      /* Case 3: D is not at the beginning of string ... */
4902 	      if (!AT_STRINGS_BEG (d))
4903 		{
4904 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4905 		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4906 		  s1 = SYNTAX (c1);
4907 
4908 		  /* ... and S1 is Sword or Ssymbol.  */
4909 		  if (s1 == Sword || s1 == Ssymbol)
4910 		    goto fail;
4911 		}
4912 	    }
4913 	  break;
4914 
4915 	case symend:
4916 	  DEBUG_PRINT ("EXECUTING symend.\n");
4917 
4918 	  /* We FAIL in one of the following cases: */
4919 
4920 	  /* Case 1: D is at the beginning of string.  */
4921 	  if (AT_STRINGS_BEG (d))
4922 	    goto fail;
4923 	  else
4924 	    {
4925 	      /* C1 is the character before D, S1 is the syntax of C1, C2
4926 		 is the character at D, and S2 is the syntax of C2.  */
4927 	      int c1, c2;
4928 	      int s1, s2;
4929               ptrdiff_t offset = PTR_TO_OFFSET (d);
4930               ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4931 	      UPDATE_SYNTAX_TABLE (charpos);
4932 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4933 	      s1 = SYNTAX (c1);
4934 
4935 	      /* Case 2: S1 is neither Ssymbol nor Sword.  */
4936 	      if (s1 != Sword && s1 != Ssymbol)
4937 		goto fail;
4938 
4939 	      /* Case 3: D is not at the end of string ... */
4940 	      if (!AT_STRINGS_END (d))
4941 		{
4942 		  PREFETCH_NOLIMIT ();
4943 		  c2 = RE_STRING_CHAR (d, target_multibyte);
4944 		  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4945 		  s2 = SYNTAX (c2);
4946 
4947 		  /* ... and S2 is Sword or Ssymbol.  */
4948 		  if (s2 == Sword || s2 == Ssymbol)
4949                     goto fail;
4950 		}
4951 	    }
4952 	  break;
4953 
4954 	case syntaxspec:
4955 	case notsyntaxspec:
4956 	  {
4957 	    bool not = (re_opcode_t) *(p - 1) == notsyntaxspec;
4958 	    mcnt = *p++;
4959 	    DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
4960 			 mcnt);
4961 	    PREFETCH ();
4962 	    {
4963 	      ptrdiff_t offset = PTR_TO_OFFSET (d);
4964 	      ptrdiff_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4965 	      UPDATE_SYNTAX_TABLE (pos1);
4966 	    }
4967 	    {
4968 	      int len;
4969 	      int c;
4970 
4971 	      GET_CHAR_AFTER (c, d, len);
4972 	      if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
4973 		goto fail;
4974 	      d += len;
4975 	    }
4976 	  }
4977 	  break;
4978 
4979 	case at_dot:
4980 	  DEBUG_PRINT ("EXECUTING at_dot.\n");
4981 	  if (PTR_BYTE_POS (d) != PT_BYTE)
4982 	    goto fail;
4983 	  break;
4984 
4985 	case categoryspec:
4986 	case notcategoryspec:
4987 	  {
4988 	    bool not = (re_opcode_t) *(p - 1) == notcategoryspec;
4989 	    mcnt = *p++;
4990 	    DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
4991 			 not ? "not" : "", mcnt);
4992 	    PREFETCH ();
4993 
4994 	    {
4995 	      int len;
4996 	      int c;
4997 	      GET_CHAR_AFTER (c, d, len);
4998 	      if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
4999 		goto fail;
5000 	      d += len;
5001 	    }
5002 	  }
5003 	  break;
5004 
5005 	default:
5006 	  abort ();
5007 	}
5008       continue;  /* Successfully executed one pattern command; keep going.  */
5009 
5010 
5011     /* We goto here if a matching operation fails. */
5012     fail:
5013       maybe_quit ();
5014       if (!FAIL_STACK_EMPTY ())
5015 	{
5016 	  re_char *str, *pat;
5017 	  /* A restart point is known.  Restore to that state.  */
5018 	  DEBUG_PRINT ("\nFAIL:\n");
5019 	  POP_FAILURE_POINT (str, pat);
5020 	  switch (*pat++)
5021 	    {
5022 	    case on_failure_keep_string_jump:
5023 	      eassert (str == NULL);
5024 	      goto continue_failure_jump;
5025 
5026 	    case on_failure_jump_nastyloop:
5027 	      eassert ((re_opcode_t)pat[-2] == no_op);
5028 	      PUSH_FAILURE_POINT (pat - 2, str);
5029 	      FALLTHROUGH;
5030 	    case on_failure_jump_loop:
5031 	    case on_failure_jump:
5032 	    case succeed_n:
5033 	      d = str;
5034 	    continue_failure_jump:
5035 	      EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5036 	      p = pat + mcnt;
5037 	      break;
5038 
5039 	    case no_op:
5040 	      /* A special frame used for nastyloops. */
5041 	      goto fail;
5042 
5043 	    default:
5044 	      abort ();
5045 	    }
5046 
5047 	  eassert (p >= bufp->buffer && p <= pend);
5048 
5049 	  if (d >= string1 && d <= end1)
5050 	    dend = end_match_1;
5051 	}
5052       else
5053 	break;   /* Matching at this starting point really fails.  */
5054     } /* for (;;) */
5055 
5056   if (best_regs_set)
5057     goto restore_best_regs;
5058 
5059   unbind_to (count, Qnil);
5060   SAFE_FREE ();
5061 
5062   return -1;				/* Failure to match.  */
5063 }
5064 
5065 /* Subroutine definitions for re_match_2.  */
5066 
5067 /* Return true if TRANSLATE[S1] and TRANSLATE[S2] are not identical
5068    for LEN bytes.  */
5069 
5070 static bool
bcmp_translate(re_char * s1,re_char * s2,ptrdiff_t len,Lisp_Object translate,bool target_multibyte)5071 bcmp_translate (re_char *s1, re_char *s2, ptrdiff_t len,
5072 		Lisp_Object translate, bool target_multibyte)
5073 {
5074   re_char *p1 = s1, *p2 = s2;
5075   re_char *p1_end = s1 + len;
5076   re_char *p2_end = s2 + len;
5077 
5078   /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5079      different lengths, but relying on a single LEN would break this. -sm  */
5080   while (p1 < p1_end && p2 < p2_end)
5081     {
5082       int p1_charlen, p2_charlen;
5083       int p1_ch, p2_ch;
5084 
5085       GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5086       GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5087 
5088       if (RE_TRANSLATE (translate, p1_ch)
5089 	  != RE_TRANSLATE (translate, p2_ch))
5090 	return true;
5091 
5092       p1 += p1_charlen, p2 += p2_charlen;
5093     }
5094 
5095   return p1 != p1_end || p2 != p2_end;
5096 }
5097 
5098 /* Entry points for GNU code.  */
5099 
5100 /* re_compile_pattern is the GNU regular expression compiler: it
5101    compiles PATTERN (of length SIZE) and puts the result in BUFP.
5102    Returns 0 if the pattern was valid, otherwise an error string.
5103 
5104    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
5105    are set in BUFP on entry.
5106 
5107    We call regex_compile to do the actual compilation.  */
5108 
5109 const char *
re_compile_pattern(const char * pattern,ptrdiff_t length,bool posix_backtracking,const char * whitespace_regexp,struct re_pattern_buffer * bufp)5110 re_compile_pattern (const char *pattern, ptrdiff_t length,
5111 		    bool posix_backtracking, const char *whitespace_regexp,
5112 		    struct re_pattern_buffer *bufp)
5113 {
5114   bufp->regs_allocated = REGS_UNALLOCATED;
5115 
5116   reg_errcode_t ret
5117       = regex_compile ((re_char *) pattern, length,
5118 		       posix_backtracking,
5119 		       whitespace_regexp,
5120 		       bufp);
5121 
5122   if (!ret)
5123     return NULL;
5124   return re_error_msgid[ret];
5125 }
5126