1 /* Emacs regular expression matching and search
2
3 Copyright (C) 1993-2021 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 /* TODO:
19 - structure the opcode space into opcode+flag.
20 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21 need to modify the compiled regexp so that re_search can be reentrant.
22 - get rid of on_failure_jump_smart by doing the optimization in re_comp
23 rather than at run-time, so that re_search can be reentrant.
24 */
25
26 #include <config.h>
27
28 #include "regex-emacs.h"
29
30 #include <stdlib.h>
31
32 #include "character.h"
33 #include "buffer.h"
34 #include "syntax.h"
35 #include "category.h"
36
37 /* Maximum number of duplicates an interval can allow. Some systems
38 define this in other header files, but we want our value, so remove
39 any previous define. Repeat counts are stored in opcodes as 2-byte
40 unsigned integers. */
41 #ifdef RE_DUP_MAX
42 # undef RE_DUP_MAX
43 #endif
44 #define RE_DUP_MAX (0xffff)
45
46 /* Make syntax table lookup grant data in gl_state. */
47 #define SYNTAX(c) syntax_property (c, 1)
48
49 /* Convert the pointer to the char to BEG-based offset from the start. */
50 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51 /* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52 result to get the right base index. */
53 #define POS_AS_IN_BUFFER(p) \
54 ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55
56 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
57 #define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
58 #define RE_STRING_CHAR(p, multibyte) \
59 (multibyte ? STRING_CHAR (p) : *(p))
60 #define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61 (multibyte ? string_char_and_length (p, &(len)) : ((len) = 1, *(p)))
62
63 #define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64
65 #define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66
67 /* Set C a (possibly converted to multibyte) character before P. P
68 points into a string which is the virtual concatenation of STR1
69 (which ends at END1) or STR2 (which ends at END2). */
70 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
71 do { \
72 if (target_multibyte) \
73 { \
74 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
75 re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1); \
76 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)) \
77 continue; \
78 c = STRING_CHAR (dtemp); \
79 } \
80 else \
81 { \
82 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
83 (c) = RE_CHAR_TO_MULTIBYTE (c); \
84 } \
85 } while (false)
86
87 /* Set C a (possibly converted to multibyte) character at P, and set
88 LEN to the byte length of that character. */
89 #define GET_CHAR_AFTER(c, p, len) \
90 do { \
91 if (target_multibyte) \
92 (c) = string_char_and_length (p, &(len)); \
93 else \
94 { \
95 (c) = *p; \
96 len = 1; \
97 (c) = RE_CHAR_TO_MULTIBYTE (c); \
98 } \
99 } while (false)
100
101 /* 1 if C is an ASCII character. */
102 #define IS_REAL_ASCII(c) ((c) < 0200)
103
104 /* 1 if C is a unibyte character. */
105 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106
107 /* The Emacs definitions should not be directly affected by locales. */
108
109 /* In Emacs, these are only used for single-byte characters. */
110 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
111 #define ISCNTRL(c) ((c) < ' ')
112 #define ISXDIGIT(c) (0 <= char_hexdigit (c))
113
114 /* The rest must handle multibyte characters. */
115
116 #define ISBLANK(c) (IS_REAL_ASCII (c) \
117 ? ((c) == ' ' || (c) == '\t') \
118 : blankp (c))
119
120 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
121 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
122 : graphicp (c))
123
124 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
125 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
126 : printablep (c))
127
128 #define ISALNUM(c) (IS_REAL_ASCII (c) \
129 ? (((c) >= 'a' && (c) <= 'z') \
130 || ((c) >= 'A' && (c) <= 'Z') \
131 || ((c) >= '0' && (c) <= '9')) \
132 : alphanumericp (c))
133
134 #define ISALPHA(c) (IS_REAL_ASCII (c) \
135 ? (((c) >= 'a' && (c) <= 'z') \
136 || ((c) >= 'A' && (c) <= 'Z')) \
137 : alphabeticp (c))
138
139 #define ISLOWER(c) lowercasep (c)
140
141 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
142 ? ((c) > ' ' && (c) < 0177 \
143 && !(((c) >= 'a' && (c) <= 'z') \
144 || ((c) >= 'A' && (c) <= 'Z') \
145 || ((c) >= '0' && (c) <= '9'))) \
146 : SYNTAX (c) != Sword)
147
148 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149
150 #define ISUPPER(c) uppercasep (c)
151
152 #define ISWORD(c) (SYNTAX (c) == Sword)
153
154 /* Use alloca instead of malloc. This is because using malloc in
155 re_search* or re_match* could cause memory leaks when C-g is used
156 in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157 via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158 memory is freed even in case of non-local exits); also, malloc is
159 slower and causes storage fragmentation. On the other hand, malloc
160 is more portable, and easier to debug.
161
162 Because we sometimes use alloca, some routines have to be macros,
163 not functions -- 'alloca'-allocated space disappears at the end of the
164 function it is called in. */
165
166 /* This may be adjusted in main(), if the stack is successfully grown. */
167 ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
168 /* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca. */
169 #define REGEX_USE_SAFE_ALLOCA \
170 USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171
172 /* Assumes a 'char *destination' variable. */
173 #define REGEX_REALLOCATE(source, osize, nsize) \
174 (destination = SAFE_ALLOCA (nsize), \
175 memcpy (destination, source, osize))
176
177 /* True if 'size1' is non-NULL and PTR is pointing anywhere inside
178 'string1' or just past its end. This works if PTR is NULL, which is
179 a good thing. */
180 #define FIRST_STRING_P(ptr) \
181 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
182
183 #define BYTEWIDTH 8 /* In bits. */
184
185 /* Type of source-pattern and string chars. */
186 typedef const unsigned char re_char;
187
188 static void re_compile_fastmap (struct re_pattern_buffer *);
189 static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 re_char *string1, ptrdiff_t size1,
191 re_char *string2, ptrdiff_t size2,
192 ptrdiff_t pos,
193 struct re_registers *regs,
194 ptrdiff_t stop);
195
196 /* These are the command codes that appear in compiled regular
197 expressions. Some opcodes are followed by argument bytes. A
198 command code can specify any interpretation whatsoever for its
199 arguments. Zero bytes may appear in the compiled regular expression. */
200
201 typedef enum
202 {
203 no_op = 0,
204
205 /* Succeed right away--no more backtracking. */
206 succeed,
207
208 /* Followed by one byte giving n, then by n literal bytes. */
209 exactn,
210
211 /* Matches any (more or less) character. */
212 anychar,
213
214 /* Matches any one char belonging to specified set. First
215 following byte is number of bitmap bytes. Then come bytes
216 for a bitmap saying which chars are in. Bits in each byte
217 are ordered low-bit-first. A character is in the set if its
218 bit is 1. A character too large to have a bit in the map is
219 automatically not in the set.
220
221 If the length byte has the 0x80 bit set, then that stuff
222 is followed by a range table:
223 2 bytes of flags for character sets (low 8 bits, high 8 bits)
224 See RANGE_TABLE_WORK_BITS below.
225 2 bytes, the number of pairs that follow (upto 32767)
226 pairs, each 2 multibyte characters,
227 each multibyte character represented as 3 bytes. */
228 charset,
229
230 /* Same parameters as charset, but match any character that is
231 not one of those specified. */
232 charset_not,
233
234 /* Start remembering the text that is matched, for storing in a
235 register. Followed by one byte with the register number, in
236 the range 0 to one less than the pattern buffer's re_nsub
237 field. */
238 start_memory,
239
240 /* Stop remembering the text that is matched and store it in a
241 memory register. Followed by one byte with the register
242 number, in the range 0 to one less than 're_nsub' in the
243 pattern buffer. */
244 stop_memory,
245
246 /* Match a duplicate of something remembered. Followed by one
247 byte containing the register number. */
248 duplicate,
249
250 /* Fail unless at beginning of line. */
251 begline,
252
253 /* Fail unless at end of line. */
254 endline,
255
256 /* Succeeds if at beginning of buffer. */
257 begbuf,
258
259 /* Analogously, for end of buffer/string. */
260 endbuf,
261
262 /* Followed by two byte relative address to which to jump. */
263 jump,
264
265 /* Followed by two-byte relative address of place to resume at
266 in case of failure. */
267 on_failure_jump,
268
269 /* Like on_failure_jump, but pushes a placeholder instead of the
270 current string position when executed. */
271 on_failure_keep_string_jump,
272
273 /* Just like 'on_failure_jump', except that it checks that we
274 don't get stuck in an infinite loop (matching an empty string
275 indefinitely). */
276 on_failure_jump_loop,
277
278 /* Just like 'on_failure_jump_loop', except that it checks for
279 a different kind of loop (the kind that shows up with non-greedy
280 operators). This operation has to be immediately preceded
281 by a 'no_op'. */
282 on_failure_jump_nastyloop,
283
284 /* A smart 'on_failure_jump' used for greedy * and + operators.
285 It analyzes the loop before which it is put and if the
286 loop does not require backtracking, it changes itself to
287 'on_failure_keep_string_jump' and short-circuits the loop,
288 else it just defaults to changing itself into 'on_failure_jump'.
289 It assumes that it is pointing to just past a 'jump'. */
290 on_failure_jump_smart,
291
292 /* Followed by two-byte relative address and two-byte number n.
293 After matching N times, jump to the address upon failure.
294 Does not work if N starts at 0: use on_failure_jump_loop
295 instead. */
296 succeed_n,
297
298 /* Followed by two-byte relative address, and two-byte number n.
299 Jump to the address N times, then fail. */
300 jump_n,
301
302 /* Set the following two-byte relative address to the
303 subsequent two-byte number. The address *includes* the two
304 bytes of number. */
305 set_number_at,
306
307 wordbeg, /* Succeeds if at word beginning. */
308 wordend, /* Succeeds if at word end. */
309
310 wordbound, /* Succeeds if at a word boundary. */
311 notwordbound, /* Succeeds if not at a word boundary. */
312
313 symbeg, /* Succeeds if at symbol beginning. */
314 symend, /* Succeeds if at symbol end. */
315
316 /* Matches any character whose syntax is specified. Followed by
317 a byte which contains a syntax code, e.g., Sword. */
318 syntaxspec,
319
320 /* Matches any character whose syntax is not that specified. */
321 notsyntaxspec,
322
323 at_dot, /* Succeeds if at point. */
324
325 /* Matches any character whose category-set contains the specified
326 category. The operator is followed by a byte which contains a
327 category code (mnemonic ASCII character). */
328 categoryspec,
329
330 /* Matches any character whose category-set does not contain the
331 specified category. The operator is followed by a byte which
332 contains the category code (mnemonic ASCII character). */
333 notcategoryspec
334 } re_opcode_t;
335
336 /* Common operations on the compiled pattern. */
337
338 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
339
340 #define STORE_NUMBER(destination, number) \
341 do { \
342 (destination)[0] = (number) & 0377; \
343 (destination)[1] = (number) >> 8; \
344 } while (false)
345
346 /* Same as STORE_NUMBER, except increment DESTINATION to
347 the byte after where the number is stored. Therefore, DESTINATION
348 must be an lvalue. */
349
350 #define STORE_NUMBER_AND_INCR(destination, number) \
351 do { \
352 STORE_NUMBER (destination, number); \
353 (destination) += 2; \
354 } while (false)
355
356 /* Put into DESTINATION a number stored in two contiguous bytes starting
357 at SOURCE. */
358
359 #define EXTRACT_NUMBER(destination, source) \
360 ((destination) = extract_number (source))
361
362 static int
extract_number(re_char * source)363 extract_number (re_char *source)
364 {
365 signed char leading_byte = source[1];
366 return leading_byte * 256 + source[0];
367 }
368
369 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
370 SOURCE must be an lvalue. */
371
372 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
373 ((destination) = extract_number_and_incr (&source))
374
375 static int
extract_number_and_incr(re_char ** source)376 extract_number_and_incr (re_char **source)
377 {
378 int num = extract_number (*source);
379 *source += 2;
380 return num;
381 }
382
383 /* Store a multibyte character in three contiguous bytes starting
384 DESTINATION, and increment DESTINATION to the byte after where the
385 character is stored. Therefore, DESTINATION must be an lvalue. */
386
387 #define STORE_CHARACTER_AND_INCR(destination, character) \
388 do { \
389 (destination)[0] = (character) & 0377; \
390 (destination)[1] = ((character) >> 8) & 0377; \
391 (destination)[2] = (character) >> 16; \
392 (destination) += 3; \
393 } while (false)
394
395 /* Put into DESTINATION a character stored in three contiguous bytes
396 starting at SOURCE. */
397
398 #define EXTRACT_CHARACTER(destination, source) \
399 do { \
400 (destination) = ((source)[0] \
401 | ((source)[1] << 8) \
402 | ((source)[2] << 16)); \
403 } while (false)
404
405
406 /* Macros for charset. */
407
408 /* Size of bitmap of charset P in bytes. P is a start of charset,
409 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
410 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
411
412 /* Nonzero if charset P has range table. */
413 #define CHARSET_RANGE_TABLE_EXISTS_P(p) (((p)[1] & 0x80) != 0)
414
415 /* Return the address of range table of charset P. But not the start
416 of table itself, but the before where the number of ranges is
417 stored. '2 +' means to skip re_opcode_t and size of bitmap,
418 and the 2 bytes of flags at the start of the range table. */
419 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
420
421 /* Extract the bit flags that start a range table. */
422 #define CHARSET_RANGE_TABLE_BITS(p) \
423 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
424 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425
426 /* Return the address of end of RANGE_TABLE. COUNT is number of
427 ranges (which is a pair of (start, end)) in the RANGE_TABLE. '* 2'
428 is start of range and end of range. '* 3' is size of each start
429 and end. */
430 #define CHARSET_RANGE_TABLE_END(range_table, count) \
431 ((range_table) + (count) * 2 * 3)
432
433 /* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
434 (if the variable regex_emacs_debug is positive). */
435
436 #ifdef REGEX_EMACS_DEBUG
437
438 /* Use standard I/O for debugging. */
439 # include "sysstdio.h"
440
441 static int regex_emacs_debug = -100000;
442
443 # define DEBUG_STATEMENT(e) e
444 # define DEBUG_PRINT(...) \
445 if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
446 # define DEBUG_COMPILES_ARGUMENTS
447 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
448 if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
449 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
450 if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
451
452 static void
debug_putchar(int c)453 debug_putchar (int c)
454 {
455 if (c >= 32 && c <= 126)
456 putc (c, stderr);
457 else
458 {
459 unsigned int uc = c;
460 fprintf (stderr, "{%02x}", uc);
461 }
462 }
463
464 /* Print the fastmap in human-readable form. */
465
466 static void
print_fastmap(char * fastmap)467 print_fastmap (char *fastmap)
468 {
469 bool was_a_range = false;
470 int i = 0;
471
472 while (i < (1 << BYTEWIDTH))
473 {
474 if (fastmap[i++])
475 {
476 was_a_range = false;
477 debug_putchar (i - 1);
478 while (i < (1 << BYTEWIDTH) && fastmap[i])
479 {
480 was_a_range = true;
481 i++;
482 }
483 if (was_a_range)
484 {
485 debug_putchar ('-');
486 debug_putchar (i - 1);
487 }
488 }
489 }
490 putc ('\n', stderr);
491 }
492
493
494 /* Print a compiled pattern string in human-readable form, starting at
495 the START pointer into it and ending just before the pointer END. */
496
497 static void
print_partial_compiled_pattern(re_char * start,re_char * end)498 print_partial_compiled_pattern (re_char *start, re_char *end)
499 {
500 int mcnt, mcnt2;
501 re_char *p = start;
502 re_char *pend = end;
503
504 if (start == NULL)
505 {
506 fputs ("(null)\n", stderr);
507 return;
508 }
509
510 /* Loop over pattern commands. */
511 while (p < pend)
512 {
513 fprintf (stderr, "%td:\t", p - start);
514
515 switch ((re_opcode_t) *p++)
516 {
517 case no_op:
518 fputs ("/no_op", stderr);
519 break;
520
521 case succeed:
522 fputs ("/succeed", stderr);
523 break;
524
525 case exactn:
526 mcnt = *p++;
527 fprintf (stderr, "/exactn/%d", mcnt);
528 do
529 {
530 debug_putchar ('/');
531 debug_putchar (*p++);
532 }
533 while (--mcnt);
534 break;
535
536 case start_memory:
537 fprintf (stderr, "/start_memory/%d", *p++);
538 break;
539
540 case stop_memory:
541 fprintf (stderr, "/stop_memory/%d", *p++);
542 break;
543
544 case duplicate:
545 fprintf (stderr, "/duplicate/%d", *p++);
546 break;
547
548 case anychar:
549 fputs ("/anychar", stderr);
550 break;
551
552 case charset:
553 case charset_not:
554 {
555 int c, last = -100;
556 bool in_range = false;
557 int length = CHARSET_BITMAP_SIZE (p - 1);
558 bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
559
560 fprintf (stderr, "/charset [%s",
561 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
562
563 if (p + *p >= pend)
564 fputs (" !extends past end of pattern! ", stderr);
565
566 for (c = 0; c < 256; c++)
567 if (c / 8 < length
568 && (p[1 + (c/8)] & (1 << (c % 8))))
569 {
570 /* Are we starting a range? */
571 if (last + 1 == c && ! in_range)
572 {
573 debug_putchar ('-');
574 in_range = true;
575 }
576 /* Have we broken a range? */
577 else if (last + 1 != c && in_range)
578 {
579 debug_putchar (last);
580 in_range = false;
581 }
582
583 if (! in_range)
584 debug_putchar (c);
585
586 last = c;
587 }
588
589 if (in_range)
590 debug_putchar (last);
591
592 debug_putchar (']');
593
594 p += 1 + length;
595
596 if (has_range_table)
597 {
598 int count;
599 fputs ("has-range-table", stderr);
600
601 /* ??? Should print the range table; for now, just skip it. */
602 p += 2; /* skip range table bits */
603 EXTRACT_NUMBER_AND_INCR (count, p);
604 p = CHARSET_RANGE_TABLE_END (p, count);
605 }
606 }
607 break;
608
609 case begline:
610 fputs ("/begline", stderr);
611 break;
612
613 case endline:
614 fputs ("/endline", stderr);
615 break;
616
617 case on_failure_jump:
618 EXTRACT_NUMBER_AND_INCR (mcnt, p);
619 fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
620 break;
621
622 case on_failure_keep_string_jump:
623 EXTRACT_NUMBER_AND_INCR (mcnt, p);
624 fprintf (stderr, "/on_failure_keep_string_jump to %td",
625 p + mcnt - start);
626 break;
627
628 case on_failure_jump_nastyloop:
629 EXTRACT_NUMBER_AND_INCR (mcnt, p);
630 fprintf (stderr, "/on_failure_jump_nastyloop to %td",
631 p + mcnt - start);
632 break;
633
634 case on_failure_jump_loop:
635 EXTRACT_NUMBER_AND_INCR (mcnt, p);
636 fprintf (stderr, "/on_failure_jump_loop to %td",
637 p + mcnt - start);
638 break;
639
640 case on_failure_jump_smart:
641 EXTRACT_NUMBER_AND_INCR (mcnt, p);
642 fprintf (stderr, "/on_failure_jump_smart to %td",
643 p + mcnt - start);
644 break;
645
646 case jump:
647 EXTRACT_NUMBER_AND_INCR (mcnt, p);
648 fprintf (stderr, "/jump to %td", p + mcnt - start);
649 break;
650
651 case succeed_n:
652 EXTRACT_NUMBER_AND_INCR (mcnt, p);
653 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
654 fprintf (stderr, "/succeed_n to %td, %d times",
655 p - 2 + mcnt - start, mcnt2);
656 break;
657
658 case jump_n:
659 EXTRACT_NUMBER_AND_INCR (mcnt, p);
660 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
661 fprintf (stderr, "/jump_n to %td, %d times",
662 p - 2 + mcnt - start, mcnt2);
663 break;
664
665 case set_number_at:
666 EXTRACT_NUMBER_AND_INCR (mcnt, p);
667 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
668 fprintf (stderr, "/set_number_at location %td to %d",
669 p - 2 + mcnt - start, mcnt2);
670 break;
671
672 case wordbound:
673 fputs ("/wordbound", stderr);
674 break;
675
676 case notwordbound:
677 fputs ("/notwordbound", stderr);
678 break;
679
680 case wordbeg:
681 fputs ("/wordbeg", stderr);
682 break;
683
684 case wordend:
685 fputs ("/wordend", stderr);
686 break;
687
688 case symbeg:
689 fputs ("/symbeg", stderr);
690 break;
691
692 case symend:
693 fputs ("/symend", stderr);
694 break;
695
696 case syntaxspec:
697 fputs ("/syntaxspec", stderr);
698 mcnt = *p++;
699 fprintf (stderr, "/%d", mcnt);
700 break;
701
702 case notsyntaxspec:
703 fputs ("/notsyntaxspec", stderr);
704 mcnt = *p++;
705 fprintf (stderr, "/%d", mcnt);
706 break;
707
708 case at_dot:
709 fputs ("/at_dot", stderr);
710 break;
711
712 case categoryspec:
713 fputs ("/categoryspec", stderr);
714 mcnt = *p++;
715 fprintf (stderr, "/%d", mcnt);
716 break;
717
718 case notcategoryspec:
719 fputs ("/notcategoryspec", stderr);
720 mcnt = *p++;
721 fprintf (stderr, "/%d", mcnt);
722 break;
723
724 case begbuf:
725 fputs ("/begbuf", stderr);
726 break;
727
728 case endbuf:
729 fputs ("/endbuf", stderr);
730 break;
731
732 default:
733 fprintf (stderr, "?%d", *(p-1));
734 }
735
736 putc ('\n', stderr);
737 }
738
739 fprintf (stderr, "%td:\tend of pattern.\n", p - start);
740 }
741
742
743 static void
print_compiled_pattern(struct re_pattern_buffer * bufp)744 print_compiled_pattern (struct re_pattern_buffer *bufp)
745 {
746 re_char *buffer = bufp->buffer;
747
748 print_partial_compiled_pattern (buffer, buffer + bufp->used);
749 fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
750 bufp->used, bufp->allocated);
751
752 if (bufp->fastmap_accurate && bufp->fastmap)
753 {
754 fputs ("fastmap: ", stderr);
755 print_fastmap (bufp->fastmap);
756 }
757
758 fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
759 fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
760 fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
761 /* Perhaps we should print the translate table? */
762 }
763
764
765 static void
print_double_string(re_char * where,re_char * string1,ptrdiff_t size1,re_char * string2,ptrdiff_t size2)766 print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
767 re_char *string2, ptrdiff_t size2)
768 {
769 if (where == NULL)
770 fputs ("(null)", stderr);
771 else
772 {
773 int i;
774 if (FIRST_STRING_P (where))
775 {
776 for (i = 0; i < string1 + size1 - where; i++)
777 debug_putchar (where[i]);
778 where = string2;
779 }
780
781 for (i = 0; i < string2 + size2 - where; i++)
782 debug_putchar (where[i]);
783 }
784 }
785
786 #else /* not REGEX_EMACS_DEBUG */
787
788 # define DEBUG_STATEMENT(e)
789 # define DEBUG_PRINT(...)
790 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
791 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
792
793 #endif /* not REGEX_EMACS_DEBUG */
794
795 typedef enum
796 {
797 REG_NOERROR = 0, /* Success. */
798 REG_NOMATCH, /* Didn't find a match (for regexec). */
799
800 /* POSIX regcomp return error codes. (In the order listed in the
801 standard.) An older version of this code supported the POSIX
802 API; this version continues to use these names internally. */
803 REG_BADPAT, /* Invalid pattern. */
804 REG_ECOLLATE, /* Not implemented. */
805 REG_ECTYPE, /* Invalid character class name. */
806 REG_EESCAPE, /* Trailing backslash. */
807 REG_ESUBREG, /* Invalid back reference. */
808 REG_EBRACK, /* Unmatched left bracket. */
809 REG_EPAREN, /* Parenthesis imbalance. */
810 REG_EBRACE, /* Unmatched \{. */
811 REG_BADBR, /* Invalid contents of \{\}. */
812 REG_ERANGE, /* Invalid range end. */
813 REG_ESPACE, /* Ran out of memory. */
814 REG_BADRPT, /* No preceding re for repetition op. */
815
816 /* Error codes we've added. */
817 REG_EEND, /* Premature end. */
818 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
819 REG_ERPAREN, /* Unmatched ) or \); not returned from regcomp. */
820 REG_ERANGEX, /* Range striding over charsets. */
821 REG_ESIZEBR /* n or m too big in \{n,m\} */
822 } reg_errcode_t;
823
824 static const char *re_error_msgid[] =
825 {
826 [REG_NOERROR] = "Success",
827 [REG_NOMATCH] = "No match",
828 [REG_BADPAT] = "Invalid regular expression",
829 [REG_ECOLLATE] = "Invalid collation character",
830 [REG_ECTYPE] = "Invalid character class name",
831 [REG_EESCAPE] = "Trailing backslash",
832 [REG_ESUBREG] = "Invalid back reference",
833 [REG_EBRACK] = "Unmatched [ or [^",
834 [REG_EPAREN] = "Unmatched ( or \\(",
835 [REG_EBRACE] = "Unmatched \\{",
836 [REG_BADBR] = "Invalid content of \\{\\}",
837 [REG_ERANGE] = "Invalid range end",
838 [REG_ESPACE] = "Memory exhausted",
839 [REG_BADRPT] = "Invalid preceding regular expression",
840 [REG_EEND] = "Premature end of regular expression",
841 [REG_ESIZE] = "Regular expression too big",
842 [REG_ERPAREN] = "Unmatched ) or \\)",
843 [REG_ERANGEX ] = "Range striding over charsets",
844 [REG_ESIZEBR ] = "Invalid content of \\{\\}",
845 };
846
847 /* For 'regs_allocated'. */
848 enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
849
850 /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
851 're_match_2' returns information about at least this many registers
852 the first time a 'regs' structure is passed. */
853 enum { RE_NREGS = 30 };
854
855 /* The searching and matching functions allocate memory for the
856 failure stack and registers. Otherwise searching and matching
857 routines would have much smaller memory resources at their
858 disposal, and therefore might fail to handle complex regexps. */
859
860 /* Failure stack declarations and macros; both re_compile_fastmap and
861 re_match_2 use a failure stack. These have to be macros because of
862 SAFE_ALLOCA. */
863
864
865 /* Approximate number of failure points for which to initially allocate space
866 when matching. If this number is exceeded, we allocate more
867 space, so it is not a hard limit. */
868 #define INIT_FAILURE_ALLOC 20
869
870 /* Roughly the maximum number of failure points on the stack. Would be
871 exactly that if failure always used TYPICAL_FAILURE_SIZE items.
872 This is a variable only so users of regex can assign to it; we never
873 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
874 before using it, so it should probably be a byte-count instead. */
875 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
876 whose default stack limit is 2mb. In order for a larger
877 value to work reliably, you have to try to make it accord
878 with the process stack limit. */
879 ptrdiff_t emacs_re_max_failures = 40000;
880
881 union fail_stack_elt
882 {
883 re_char *pointer;
884 intptr_t integer;
885 };
886
887 typedef union fail_stack_elt fail_stack_elt_t;
888
889 typedef struct
890 {
891 fail_stack_elt_t *stack;
892 ptrdiff_t size;
893 ptrdiff_t avail; /* Offset of next open position. */
894 ptrdiff_t frame; /* Offset of the cur constructed frame. */
895 } fail_stack_type;
896
897 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
898
899
900 /* Define macros to initialize and free the failure stack. */
901
902 #define INIT_FAIL_STACK() \
903 do { \
904 fail_stack.stack = \
905 SAFE_ALLOCA (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
906 * sizeof (fail_stack_elt_t)); \
907 fail_stack.size = INIT_FAILURE_ALLOC; \
908 fail_stack.avail = 0; \
909 fail_stack.frame = 0; \
910 } while (false)
911
912
913 /* Double the size of FAIL_STACK, up to a limit
914 which allows approximately 'emacs_re_max_failures' items.
915
916 Return 1 if succeeds, and 0 if either ran out of memory
917 allocating space for it or it was already too large.
918
919 REGEX_REALLOCATE requires 'destination' be declared. */
920
921 /* Factor to increase the failure stack size by.
922 This used to be 2, but 2 was too wasteful
923 because the old discarded stacks added up to as much space
924 were as ultimate, maximum-size stack. */
925 #define FAIL_STACK_GROWTH_FACTOR 4
926
927 #define GROW_FAIL_STACK(fail_stack) \
928 (((fail_stack).size >= emacs_re_max_failures * TYPICAL_FAILURE_SIZE) \
929 ? 0 \
930 : ((fail_stack).stack \
931 = REGEX_REALLOCATE ((fail_stack).stack, \
932 (fail_stack).avail * sizeof (fail_stack_elt_t), \
933 min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
934 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)) \
935 * sizeof (fail_stack_elt_t)), \
936 ((fail_stack).size \
937 = (min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
938 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)))), \
939 1))
940
941
942 /* Push a pointer value onto the failure stack.
943 Assumes the variable 'fail_stack'. Probably should only
944 be called from within 'PUSH_FAILURE_POINT'. */
945 #define PUSH_FAILURE_POINTER(item) \
946 fail_stack.stack[fail_stack.avail++].pointer = (item)
947
948 /* This pushes an integer-valued item onto the failure stack.
949 Assumes the variable 'fail_stack'. Probably should only
950 be called from within 'PUSH_FAILURE_POINT'. */
951 #define PUSH_FAILURE_INT(item) \
952 fail_stack.stack[fail_stack.avail++].integer = (item)
953
954 /* These POP... operations complement the PUSH... operations.
955 All assume that 'fail_stack' is nonempty. */
956 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
957 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
958
959 /* Individual items aside from the registers. */
960 #define NUM_NONREG_ITEMS 3
961
962 /* Used to examine the stack (to detect infinite loops). */
963 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
964 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
965 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
966 #define TOP_FAILURE_HANDLE() fail_stack.frame
967
968
969 #define ENSURE_FAIL_STACK(space) \
970 while (REMAINING_AVAIL_SLOTS <= space) { \
971 if (!GROW_FAIL_STACK (fail_stack)) \
972 { \
973 unbind_to (count, Qnil); \
974 SAFE_FREE (); \
975 return -2; \
976 } \
977 DEBUG_PRINT ("\n Doubled stack; size now: %td\n", fail_stack.size); \
978 DEBUG_PRINT (" slots available: %td\n", REMAINING_AVAIL_SLOTS);\
979 }
980
981 /* Push register NUM onto the stack. */
982 #define PUSH_FAILURE_REG(num) \
983 do { \
984 char *destination; \
985 intptr_t n = num; \
986 eassert (0 < n && n < num_regs); \
987 eassert (REG_UNSET (regstart[n]) <= REG_UNSET (regend[n])); \
988 ENSURE_FAIL_STACK(3); \
989 DEBUG_PRINT (" Push reg %"PRIdPTR" (spanning %p -> %p)\n", \
990 n, regstart[n], regend[n]); \
991 PUSH_FAILURE_POINTER (regstart[n]); \
992 PUSH_FAILURE_POINTER (regend[n]); \
993 PUSH_FAILURE_INT (n); \
994 } while (false)
995
996 /* Change the counter's value to VAL, but make sure that it will
997 be reset when backtracking. */
998 #define PUSH_NUMBER(ptr,val) \
999 do { \
1000 char *destination; \
1001 int c; \
1002 ENSURE_FAIL_STACK(3); \
1003 EXTRACT_NUMBER (c, ptr); \
1004 DEBUG_PRINT (" Push number %p = %d -> %d\n", ptr, c, val); \
1005 PUSH_FAILURE_INT (c); \
1006 PUSH_FAILURE_POINTER (ptr); \
1007 PUSH_FAILURE_INT (-1); \
1008 STORE_NUMBER (ptr, val); \
1009 } while (false)
1010
1011 /* Pop a saved register off the stack. */
1012 #define POP_FAILURE_REG_OR_COUNT() \
1013 do { \
1014 intptr_t pfreg = POP_FAILURE_INT (); \
1015 if (pfreg == -1) \
1016 { \
1017 /* It's a counter. */ \
1018 /* Discard 'const', making re_search non-reentrant. */ \
1019 unsigned char *ptr = (unsigned char *) POP_FAILURE_POINTER (); \
1020 pfreg = POP_FAILURE_INT (); \
1021 STORE_NUMBER (ptr, pfreg); \
1022 DEBUG_PRINT (" Pop counter %p = %"PRIdPTR"\n", ptr, pfreg); \
1023 } \
1024 else \
1025 { \
1026 eassert (0 < pfreg && pfreg < num_regs); \
1027 regend[pfreg] = POP_FAILURE_POINTER (); \
1028 regstart[pfreg] = POP_FAILURE_POINTER (); \
1029 eassert (REG_UNSET (regstart[pfreg]) <= REG_UNSET (regend[pfreg])); \
1030 DEBUG_PRINT (" Pop reg %ld (spanning %p -> %p)\n", \
1031 pfreg, regstart[pfreg], regend[pfreg]); \
1032 } \
1033 } while (false)
1034
1035 /* Check that we are not stuck in an infinite loop. */
1036 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1037 do { \
1038 ptrdiff_t failure = TOP_FAILURE_HANDLE (); \
1039 /* Check for infinite matching loops */ \
1040 while (failure > 0 \
1041 && (FAILURE_STR (failure) == string_place \
1042 || FAILURE_STR (failure) == NULL)) \
1043 { \
1044 eassert (FAILURE_PAT (failure) >= bufp->buffer \
1045 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1046 if (FAILURE_PAT (failure) == pat_cur) \
1047 { \
1048 cycle = true; \
1049 break; \
1050 } \
1051 DEBUG_PRINT (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1052 failure = NEXT_FAILURE_HANDLE(failure); \
1053 } \
1054 DEBUG_PRINT (" Other string: %p\n", FAILURE_STR (failure)); \
1055 } while (false)
1056
1057 /* Push the information about the state we will need
1058 if we ever fail back to it.
1059
1060 Requires variables fail_stack, regstart, regend and
1061 num_regs be declared. GROW_FAIL_STACK requires 'destination' be
1062 declared.
1063
1064 Does 'return FAILURE_CODE' if runs out of memory. */
1065
1066 #define PUSH_FAILURE_POINT(pattern, string_place) \
1067 do { \
1068 char *destination; \
1069 DEBUG_STATEMENT (nfailure_points_pushed++); \
1070 DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n"); \
1071 DEBUG_PRINT (" Before push, next avail: %td\n", fail_stack.avail); \
1072 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1073 \
1074 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1075 \
1076 DEBUG_PRINT ("\n"); \
1077 \
1078 DEBUG_PRINT (" Push frame index: %td\n", fail_stack.frame); \
1079 PUSH_FAILURE_INT (fail_stack.frame); \
1080 \
1081 DEBUG_PRINT (" Push string %p: \"", string_place); \
1082 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1083 DEBUG_PRINT ("\"\n"); \
1084 PUSH_FAILURE_POINTER (string_place); \
1085 \
1086 DEBUG_PRINT (" Push pattern %p: ", pattern); \
1087 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1088 PUSH_FAILURE_POINTER (pattern); \
1089 \
1090 /* Close the frame by moving the frame pointer past it. */ \
1091 fail_stack.frame = fail_stack.avail; \
1092 } while (false)
1093
1094 /* Estimate the size of data pushed by a typical failure stack entry.
1095 An estimate is all we need, because all we use this for
1096 is to choose a limit for how big to make the failure stack. */
1097 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1098 #define TYPICAL_FAILURE_SIZE 20
1099
1100 /* How many items can still be added to the stack without overflowing it. */
1101 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1102
1103
1104 /* Pop what PUSH_FAIL_STACK pushes.
1105
1106 Restore into the parameters, all of which should be lvalues:
1107 STR -- the saved data position.
1108 PAT -- the saved pattern position.
1109 REGSTART, REGEND -- arrays of string positions.
1110
1111 Also assume the variables FAIL_STACK and (if debugging) BUFP, PEND,
1112 STRING1, SIZE1, STRING2, and SIZE2. */
1113
1114 #define POP_FAILURE_POINT(str, pat) \
1115 do { \
1116 eassert (!FAIL_STACK_EMPTY ()); \
1117 \
1118 /* Remove failure points and point to how many regs pushed. */ \
1119 DEBUG_PRINT ("POP_FAILURE_POINT:\n"); \
1120 DEBUG_PRINT (" Before pop, next avail: %td\n", fail_stack.avail); \
1121 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1122 \
1123 /* Pop the saved registers. */ \
1124 while (fail_stack.frame < fail_stack.avail) \
1125 POP_FAILURE_REG_OR_COUNT (); \
1126 \
1127 pat = POP_FAILURE_POINTER (); \
1128 DEBUG_PRINT (" Popping pattern %p: ", pat); \
1129 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1130 \
1131 /* If the saved string location is NULL, it came from an \
1132 on_failure_keep_string_jump opcode, and we want to throw away the \
1133 saved NULL, thus retaining our current position in the string. */ \
1134 str = POP_FAILURE_POINTER (); \
1135 DEBUG_PRINT (" Popping string %p: \"", str); \
1136 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1137 DEBUG_PRINT ("\"\n"); \
1138 \
1139 fail_stack.frame = POP_FAILURE_INT (); \
1140 DEBUG_PRINT (" Popping frame index: %td\n", fail_stack.frame); \
1141 \
1142 eassert (fail_stack.avail >= 0); \
1143 eassert (fail_stack.frame <= fail_stack.avail); \
1144 \
1145 DEBUG_STATEMENT (nfailure_points_popped++); \
1146 } while (false) /* POP_FAILURE_POINT */
1147
1148
1149
1150 /* Registers are set to a sentinel when they haven't yet matched. */
1151 #define REG_UNSET(e) ((e) == NULL)
1152
1153 /* Subroutine declarations and macros for regex_compile. */
1154
1155 static reg_errcode_t regex_compile (re_char *pattern, ptrdiff_t size,
1156 bool posix_backtracking,
1157 const char *whitespace_regexp,
1158 struct re_pattern_buffer *bufp);
1159 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1160 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1161 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1162 int arg, unsigned char *end);
1163 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1164 int arg1, int arg2, unsigned char *end);
1165 static bool at_begline_loc_p (re_char *pattern, re_char *p);
1166 static bool at_endline_loc_p (re_char *p, re_char *pend);
1167 static re_char *skip_one_char (re_char *p);
1168 static int analyze_first (re_char *p, re_char *pend,
1169 char *fastmap, bool multibyte);
1170
1171 /* Fetch the next character in the uncompiled pattern, with no
1172 translation. */
1173 #define PATFETCH(c) \
1174 do { \
1175 int len; \
1176 if (p == pend) return REG_EEND; \
1177 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
1178 p += len; \
1179 } while (false)
1180
1181
1182 #define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
1183 #define TRANSLATE(d) (!NILP (translate) ? RE_TRANSLATE (translate, d) : (d))
1184
1185 /* Macros for outputting the compiled pattern into 'buffer'. */
1186
1187 /* If the buffer isn't allocated when it comes in, use this. */
1188 #define INIT_BUF_SIZE 32
1189
1190 /* Ensure at least N more bytes of space in buffer. */
1191 #define GET_BUFFER_SPACE(n) \
1192 if (bufp->buffer + bufp->allocated - b < (n)) \
1193 EXTEND_BUFFER ((n) - (bufp->buffer + bufp->allocated - b))
1194
1195 /* Ensure one more byte of buffer space and then add C to it. */
1196 #define BUF_PUSH(c) \
1197 do { \
1198 GET_BUFFER_SPACE (1); \
1199 *b++ = (unsigned char) (c); \
1200 } while (false)
1201
1202
1203 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1204 #define BUF_PUSH_2(c1, c2) \
1205 do { \
1206 GET_BUFFER_SPACE (2); \
1207 *b++ = (unsigned char) (c1); \
1208 *b++ = (unsigned char) (c2); \
1209 } while (false)
1210
1211
1212 /* Store a jump with opcode OP at LOC to location TO. Store a
1213 relative address offset by the three bytes the jump itself occupies. */
1214 #define STORE_JUMP(op, loc, to) \
1215 store_op1 (op, loc, (to) - (loc) - 3)
1216
1217 /* Likewise, for a two-argument jump. */
1218 #define STORE_JUMP2(op, loc, to, arg) \
1219 store_op2 (op, loc, (to) - (loc) - 3, arg)
1220
1221 /* Like 'STORE_JUMP', but for inserting. Assume B is the buffer end. */
1222 #define INSERT_JUMP(op, loc, to) \
1223 insert_op1 (op, loc, (to) - (loc) - 3, b)
1224
1225 /* Like 'STORE_JUMP2', but for inserting. Assume B is the buffer end. */
1226 #define INSERT_JUMP2(op, loc, to, arg) \
1227 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1228
1229
1230 /* This is not an arbitrary limit: the arguments which represent offsets
1231 into the pattern are two bytes long. So if 2^15 bytes turns out to
1232 be too small, many things would have to change. */
1233 # define MAX_BUF_SIZE (1 << 15)
1234
1235 /* Extend the buffer by at least N bytes via realloc and
1236 reset the pointers that pointed into the old block to point to the
1237 correct places in the new one. If extending the buffer results in it
1238 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1239 #define EXTEND_BUFFER(n) \
1240 do { \
1241 ptrdiff_t requested_extension = n; \
1242 unsigned char *old_buffer = bufp->buffer; \
1243 if (MAX_BUF_SIZE - bufp->allocated < requested_extension) \
1244 return REG_ESIZE; \
1245 ptrdiff_t b_off = b - old_buffer; \
1246 ptrdiff_t begalt_off = begalt - old_buffer; \
1247 bool fixup_alt_jump_set = !!fixup_alt_jump; \
1248 bool laststart_set = !!laststart; \
1249 bool pending_exact_set = !!pending_exact; \
1250 ptrdiff_t fixup_alt_jump_off, laststart_off, pending_exact_off; \
1251 if (fixup_alt_jump_set) fixup_alt_jump_off = fixup_alt_jump - old_buffer; \
1252 if (laststart_set) laststart_off = laststart - old_buffer; \
1253 if (pending_exact_set) pending_exact_off = pending_exact - old_buffer; \
1254 bufp->buffer = xpalloc (bufp->buffer, &bufp->allocated, \
1255 requested_extension, MAX_BUF_SIZE, 1); \
1256 unsigned char *new_buffer = bufp->buffer; \
1257 b = new_buffer + b_off; \
1258 begalt = new_buffer + begalt_off; \
1259 if (fixup_alt_jump_set) fixup_alt_jump = new_buffer + fixup_alt_jump_off; \
1260 if (laststart_set) laststart = new_buffer + laststart_off; \
1261 if (pending_exact_set) pending_exact = new_buffer + pending_exact_off; \
1262 } while (false)
1263
1264
1265 /* Since we have one byte reserved for the register number argument to
1266 {start,stop}_memory, the maximum number of groups we can report
1267 things about is what fits in that byte. */
1268 #define MAX_REGNUM 255
1269
1270 /* But patterns can have more than 'MAX_REGNUM' registers. Just
1271 ignore the excess. */
1272 typedef int regnum_t;
1273
1274
1275 /* Macros for the compile stack. */
1276
1277 typedef long pattern_offset_t;
1278 verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
1279
1280 typedef struct
1281 {
1282 pattern_offset_t begalt_offset;
1283 pattern_offset_t fixup_alt_jump;
1284 pattern_offset_t laststart_offset;
1285 regnum_t regnum;
1286 } compile_stack_elt_t;
1287
1288
1289 typedef struct
1290 {
1291 compile_stack_elt_t *stack;
1292 ptrdiff_t size;
1293 ptrdiff_t avail; /* Offset of next open position. */
1294 } compile_stack_type;
1295
1296
1297 #define INIT_COMPILE_STACK_SIZE 32
1298
1299 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1300 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1301
1302 /* The next available element. */
1303 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1304
1305 /* Structure to manage work area for range table. */
1306 struct range_table_work_area
1307 {
1308 int *table; /* actual work area. */
1309 int allocated; /* allocated size for work area in bytes. */
1310 int used; /* actually used size in words. */
1311 int bits; /* flag to record character classes */
1312 };
1313
1314 /* Make sure that WORK_AREA can hold N more multibyte characters.
1315 If it can't get the space, it returns from the surrounding function. */
1316
1317 #define EXTEND_RANGE_TABLE(work_area, n) \
1318 do { \
1319 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1320 { \
1321 extend_range_table_work_area (&work_area); \
1322 if ((work_area).table == 0) \
1323 return (REG_ESPACE); \
1324 } \
1325 } while (false)
1326
1327 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1328 (work_area).bits |= (bit)
1329
1330 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1331 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1332 do { \
1333 EXTEND_RANGE_TABLE ((work_area), 2); \
1334 (work_area).table[(work_area).used++] = (range_start); \
1335 (work_area).table[(work_area).used++] = (range_end); \
1336 } while (false)
1337
1338 /* Free allocated memory for WORK_AREA. */
1339 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1340 do { \
1341 if ((work_area).table) \
1342 xfree ((work_area).table); \
1343 } while (false)
1344
1345 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
1346 ((work_area).used = 0, (work_area).bits = 0)
1347 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1348 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1349 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1350
1351 /* Bits used to implement the multibyte-part of the various character classes
1352 such as [:alnum:] in a charset's range table. The code currently assumes
1353 that only the low 16 bits are used. */
1354 #define BIT_WORD 0x1
1355 #define BIT_LOWER 0x2
1356 #define BIT_PUNCT 0x4
1357 #define BIT_SPACE 0x8
1358 #define BIT_UPPER 0x10
1359 #define BIT_MULTIBYTE 0x20
1360 #define BIT_ALPHA 0x40
1361 #define BIT_ALNUM 0x80
1362 #define BIT_GRAPH 0x100
1363 #define BIT_PRINT 0x200
1364 #define BIT_BLANK 0x400
1365
1366
1367 /* Set the bit for character C in a list. */
1368 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1369
1370
1371 /* Store characters in the range FROM to TO in the bitmap at B (for
1372 ASCII and unibyte characters) and WORK_AREA (for multibyte
1373 characters) while translating them and paying attention to the
1374 continuity of translated characters.
1375
1376 Implementation note: It is better to implement these fairly big
1377 macros by a function, but it's not that easy because macros called
1378 in this macro assume various local variables already declared. */
1379
1380 /* Both FROM and TO are ASCII characters. */
1381
1382 #define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1383 do { \
1384 int C0, C1; \
1385 \
1386 for (C0 = (FROM); C0 <= (TO); C0++) \
1387 { \
1388 C1 = TRANSLATE (C0); \
1389 if (! ASCII_CHAR_P (C1)) \
1390 { \
1391 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1392 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1393 C1 = C0; \
1394 } \
1395 SET_LIST_BIT (C1); \
1396 } \
1397 } while (false)
1398
1399
1400 /* Both FROM and TO are unibyte characters (0x80..0xFF). */
1401
1402 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1403 do { \
1404 int C0, C1, C2, I; \
1405 int USED = RANGE_TABLE_WORK_USED (work_area); \
1406 \
1407 for (C0 = (FROM); C0 <= (TO); C0++) \
1408 { \
1409 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1410 if (CHAR_BYTE8_P (C1)) \
1411 SET_LIST_BIT (C0); \
1412 else \
1413 { \
1414 C2 = TRANSLATE (C1); \
1415 if (C2 == C1 \
1416 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1417 C1 = C0; \
1418 SET_LIST_BIT (C1); \
1419 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1420 { \
1421 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1422 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1423 \
1424 if (C2 >= from - 1 && C2 <= to + 1) \
1425 { \
1426 if (C2 == from - 1) \
1427 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1428 else if (C2 == to + 1) \
1429 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1430 break; \
1431 } \
1432 } \
1433 if (I < USED) \
1434 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1435 } \
1436 } \
1437 } while (false)
1438
1439
1440 /* Both FROM and TO are multibyte characters. */
1441
1442 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1443 do { \
1444 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1445 \
1446 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1447 for (C0 = (FROM); C0 <= (TO); C0++) \
1448 { \
1449 C1 = TRANSLATE (C0); \
1450 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1451 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1452 SET_LIST_BIT (C2); \
1453 if (C1 >= (FROM) && C1 <= (TO)) \
1454 continue; \
1455 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1456 { \
1457 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1458 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1459 \
1460 if (C1 >= from - 1 && C1 <= to + 1) \
1461 { \
1462 if (C1 == from - 1) \
1463 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1464 else if (C1 == to + 1) \
1465 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1466 break; \
1467 } \
1468 } \
1469 if (I < USED) \
1470 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1471 } \
1472 } while (false)
1473
1474 /* Get the next unsigned number in the uncompiled pattern. */
1475 #define GET_INTERVAL_COUNT(num) \
1476 do { \
1477 if (p == pend) \
1478 FREE_STACK_RETURN (REG_EBRACE); \
1479 else \
1480 { \
1481 PATFETCH (c); \
1482 while ('0' <= c && c <= '9') \
1483 { \
1484 if (num < 0) \
1485 num = 0; \
1486 if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num) \
1487 FREE_STACK_RETURN (REG_ESIZEBR); \
1488 num = num * 10 + c - '0'; \
1489 if (p == pend) \
1490 FREE_STACK_RETURN (REG_EBRACE); \
1491 PATFETCH (c); \
1492 } \
1493 } \
1494 } while (false)
1495
1496 /* Parse a character class, i.e. string such as "[:name:]". *strp
1497 points to the string to be parsed and limit is length, in bytes, of
1498 that string.
1499
1500 If *strp point to a string that begins with "[:name:]", where name is
1501 a non-empty sequence of lower case letters, *strp will be advanced past the
1502 closing square bracket and RECC_* constant which maps to the name will be
1503 returned. If name is not a valid character class name zero, or RECC_ERROR,
1504 is returned.
1505
1506 Otherwise, if *strp doesn't begin with "[:name:]", -1 is returned.
1507
1508 The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
1509 */
1510 re_wctype_t
re_wctype_parse(const unsigned char ** strp,ptrdiff_t limit)1511 re_wctype_parse (const unsigned char **strp, ptrdiff_t limit)
1512 {
1513 const char *beg = (const char *)*strp, *it;
1514
1515 if (limit < 4 || beg[0] != '[' || beg[1] != ':')
1516 return -1;
1517
1518 beg += 2; /* skip opening "[:" */
1519 limit -= 3; /* opening "[:" and half of closing ":]"; --limit handles rest */
1520 for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
1521 if (!--limit)
1522 return -1;
1523
1524 *strp = (const unsigned char *)(it + 2);
1525
1526 /* Sort tests in the length=five case by frequency the classes to minimize
1527 number of times we fail the comparison. The frequencies of character class
1528 names used in Emacs sources as of 2016-07-27:
1529
1530 $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
1531 sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
1532 213 [:alnum:]
1533 104 [:alpha:]
1534 62 [:space:]
1535 39 [:digit:]
1536 36 [:blank:]
1537 26 [:word:]
1538 26 [:upper:]
1539 21 [:lower:]
1540 10 [:xdigit:]
1541 10 [:punct:]
1542 10 [:ascii:]
1543 4 [:nonascii:]
1544 4 [:graph:]
1545 2 [:print:]
1546 2 [:cntrl:]
1547 1 [:ff:]
1548
1549 If you update this list, consider also updating chain of or'ed conditions
1550 in execute_charset function.
1551 */
1552
1553 switch (it - beg) {
1554 case 4:
1555 if (!memcmp (beg, "word", 4)) return RECC_WORD;
1556 break;
1557 case 5:
1558 if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM;
1559 if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA;
1560 if (!memcmp (beg, "space", 5)) return RECC_SPACE;
1561 if (!memcmp (beg, "digit", 5)) return RECC_DIGIT;
1562 if (!memcmp (beg, "blank", 5)) return RECC_BLANK;
1563 if (!memcmp (beg, "upper", 5)) return RECC_UPPER;
1564 if (!memcmp (beg, "lower", 5)) return RECC_LOWER;
1565 if (!memcmp (beg, "punct", 5)) return RECC_PUNCT;
1566 if (!memcmp (beg, "ascii", 5)) return RECC_ASCII;
1567 if (!memcmp (beg, "graph", 5)) return RECC_GRAPH;
1568 if (!memcmp (beg, "print", 5)) return RECC_PRINT;
1569 if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL;
1570 break;
1571 case 6:
1572 if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT;
1573 break;
1574 case 7:
1575 if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE;
1576 break;
1577 case 8:
1578 if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII;
1579 break;
1580 case 9:
1581 if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
1582 break;
1583 }
1584
1585 return RECC_ERROR;
1586 }
1587
1588 /* True if CH is in the char class CC. */
1589 bool
re_iswctype(int ch,re_wctype_t cc)1590 re_iswctype (int ch, re_wctype_t cc)
1591 {
1592 switch (cc)
1593 {
1594 case RECC_ALNUM: return ISALNUM (ch) != 0;
1595 case RECC_ALPHA: return ISALPHA (ch) != 0;
1596 case RECC_BLANK: return ISBLANK (ch) != 0;
1597 case RECC_CNTRL: return ISCNTRL (ch) != 0;
1598 case RECC_DIGIT: return ISDIGIT (ch) != 0;
1599 case RECC_GRAPH: return ISGRAPH (ch) != 0;
1600 case RECC_LOWER: return ISLOWER (ch) != 0;
1601 case RECC_PRINT: return ISPRINT (ch) != 0;
1602 case RECC_PUNCT: return ISPUNCT (ch) != 0;
1603 case RECC_SPACE: return ISSPACE (ch) != 0;
1604 case RECC_UPPER: return ISUPPER (ch) != 0;
1605 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
1606 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
1607 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
1608 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
1609 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
1610 case RECC_WORD: return ISWORD (ch) != 0;
1611 case RECC_ERROR: return false;
1612 default:
1613 abort ();
1614 }
1615 }
1616
1617 /* Return a bit-pattern to use in the range-table bits to match multibyte
1618 chars of class CC. */
1619 static int
re_wctype_to_bit(re_wctype_t cc)1620 re_wctype_to_bit (re_wctype_t cc)
1621 {
1622 switch (cc)
1623 {
1624 case RECC_NONASCII:
1625 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
1626 case RECC_ALPHA: return BIT_ALPHA;
1627 case RECC_ALNUM: return BIT_ALNUM;
1628 case RECC_WORD: return BIT_WORD;
1629 case RECC_LOWER: return BIT_LOWER;
1630 case RECC_UPPER: return BIT_UPPER;
1631 case RECC_PUNCT: return BIT_PUNCT;
1632 case RECC_SPACE: return BIT_SPACE;
1633 case RECC_GRAPH: return BIT_GRAPH;
1634 case RECC_PRINT: return BIT_PRINT;
1635 case RECC_BLANK: return BIT_BLANK;
1636 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
1637 case RECC_UNIBYTE: case RECC_ERROR: return 0;
1638 default:
1639 abort ();
1640 }
1641 }
1642
1643 /* Filling in the work area of a range. */
1644
1645 /* Actually extend the space in WORK_AREA. */
1646
1647 static void
extend_range_table_work_area(struct range_table_work_area * work_area)1648 extend_range_table_work_area (struct range_table_work_area *work_area)
1649 {
1650 work_area->allocated += 16 * sizeof (int);
1651 work_area->table = xrealloc (work_area->table, work_area->allocated);
1652 }
1653
1654 /* regex_compile and helpers. */
1655
1656 static bool group_in_compile_stack (compile_stack_type, regnum_t);
1657
1658 /* Insert the 'jump' from the end of last alternative to "here".
1659 The space for the jump has already been allocated. */
1660 #define FIXUP_ALT_JUMP() \
1661 do { \
1662 if (fixup_alt_jump) \
1663 STORE_JUMP (jump, fixup_alt_jump, b); \
1664 } while (false)
1665
1666
1667 /* Return, freeing storage we allocated. */
1668 #define FREE_STACK_RETURN(value) \
1669 do { \
1670 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1671 xfree (compile_stack.stack); \
1672 return value; \
1673 } while (false)
1674
1675 /* Compile PATTERN (of length SIZE) according to SYNTAX.
1676 Return a nonzero error code on failure, or zero for success.
1677
1678 If WHITESPACE_REGEXP is given, use it instead of a space
1679 character in PATTERN.
1680
1681 Assume the 'allocated' (and perhaps 'buffer') and 'translate'
1682 fields are set in BUFP on entry.
1683
1684 If successful, put results in *BUFP (otherwise the
1685 contents of *BUFP are undefined):
1686 'buffer' is the compiled pattern;
1687 'syntax' is set to SYNTAX;
1688 'used' is set to the length of the compiled pattern;
1689 'fastmap_accurate' is false;
1690 're_nsub' is the number of subexpressions in PATTERN;
1691
1692 The 'fastmap' field is neither examined nor set. */
1693
1694 static reg_errcode_t
regex_compile(re_char * pattern,ptrdiff_t size,bool posix_backtracking,const char * whitespace_regexp,struct re_pattern_buffer * bufp)1695 regex_compile (re_char *pattern, ptrdiff_t size,
1696 bool posix_backtracking,
1697 const char *whitespace_regexp,
1698 struct re_pattern_buffer *bufp)
1699 {
1700 /* Fetch characters from PATTERN here. */
1701 int c, c1;
1702
1703 /* Points to the end of the buffer, where we should append. */
1704 unsigned char *b;
1705
1706 /* Keeps track of unclosed groups. */
1707 compile_stack_type compile_stack;
1708
1709 /* Points to the current (ending) position in the pattern. */
1710 re_char *p = pattern;
1711 re_char *pend = pattern + size;
1712
1713 /* How to translate the characters in the pattern. */
1714 Lisp_Object translate = bufp->translate;
1715
1716 /* Address of the count-byte of the most recently inserted 'exactn'
1717 command. This makes it possible to tell if a new exact-match
1718 character can be added to that command or if the character requires
1719 a new 'exactn' command. */
1720 unsigned char *pending_exact = 0;
1721
1722 /* Address of start of the most recently finished expression.
1723 This tells, e.g., postfix * where to find the start of its
1724 operand. Reset at the beginning of groups and alternatives. */
1725 unsigned char *laststart = 0;
1726
1727 /* Address of beginning of regexp, or inside of last group. */
1728 unsigned char *begalt;
1729
1730 /* Place in the uncompiled pattern (i.e., the {) to
1731 which to go back if the interval is invalid. */
1732 re_char *beg_interval;
1733
1734 /* Address of the place where a forward jump should go to the end of
1735 the containing expression. Each alternative of an 'or' -- except the
1736 last -- ends with a forward jump of this sort. */
1737 unsigned char *fixup_alt_jump = 0;
1738
1739 /* Work area for range table of charset. */
1740 struct range_table_work_area range_table_work;
1741
1742 /* If the regular expression is multibyte. */
1743 bool multibyte = RE_MULTIBYTE_P (bufp);
1744
1745 /* Nonzero if we have pushed down into a subpattern. */
1746 int in_subpattern = 0;
1747
1748 /* These hold the values of p, pattern, and pend from the main
1749 pattern when we have pushed into a subpattern. */
1750 re_char *main_p;
1751 re_char *main_pattern;
1752 re_char *main_pend;
1753
1754 #ifdef REGEX_EMACS_DEBUG
1755 regex_emacs_debug++;
1756 DEBUG_PRINT ("\nCompiling pattern: ");
1757 if (regex_emacs_debug > 0)
1758 {
1759 for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
1760 debug_putchar (pattern[debug_count]);
1761 putc ('\n', stderr);
1762 }
1763 #endif
1764
1765 /* Initialize the compile stack. */
1766 compile_stack.stack = xmalloc (INIT_COMPILE_STACK_SIZE
1767 * sizeof *compile_stack.stack);
1768 __lsan_ignore_object (compile_stack.stack);
1769 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1770 compile_stack.avail = 0;
1771
1772 range_table_work.table = 0;
1773 range_table_work.allocated = 0;
1774
1775 /* Initialize the pattern buffer. */
1776 bufp->fastmap_accurate = false;
1777 bufp->used_syntax = false;
1778
1779 /* Set 'used' to zero, so that if we return an error, the pattern
1780 printer (for debugging) will think there's no pattern. We reset it
1781 at the end. */
1782 bufp->used = 0;
1783
1784 bufp->re_nsub = 0;
1785
1786 if (bufp->allocated == 0)
1787 {
1788 /* This loses if BUFP->buffer is bogus, but that is the user's
1789 responsibility. */
1790 bufp->buffer = xrealloc (bufp->buffer, INIT_BUF_SIZE);
1791 bufp->allocated = INIT_BUF_SIZE;
1792 }
1793
1794 begalt = b = bufp->buffer;
1795
1796 /* Loop through the uncompiled pattern until we're at the end. */
1797 while (1)
1798 {
1799 if (p == pend)
1800 {
1801 /* If this is the end of an included regexp,
1802 pop back to the main regexp and try again. */
1803 if (in_subpattern)
1804 {
1805 in_subpattern = 0;
1806 pattern = main_pattern;
1807 p = main_p;
1808 pend = main_pend;
1809 continue;
1810 }
1811 /* If this is the end of the main regexp, we are done. */
1812 break;
1813 }
1814
1815 PATFETCH (c);
1816
1817 switch (c)
1818 {
1819 case ' ':
1820 {
1821 re_char *p1 = p;
1822
1823 /* If there's no special whitespace regexp, treat
1824 spaces normally. And don't try to do this recursively. */
1825 if (!whitespace_regexp || in_subpattern)
1826 goto normal_char;
1827
1828 /* Peek past following spaces. */
1829 while (p1 != pend)
1830 {
1831 if (*p1 != ' ')
1832 break;
1833 p1++;
1834 }
1835 /* If the spaces are followed by a repetition op,
1836 treat them normally. */
1837 if (p1 != pend
1838 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
1839 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
1840 goto normal_char;
1841
1842 /* Replace the spaces with the whitespace regexp. */
1843 in_subpattern = 1;
1844 main_p = p1;
1845 main_pend = pend;
1846 main_pattern = pattern;
1847 p = pattern = (re_char *) whitespace_regexp;
1848 pend = p + strlen (whitespace_regexp);
1849 break;
1850 }
1851
1852 case '^':
1853 if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
1854 goto normal_char;
1855 BUF_PUSH (begline);
1856 break;
1857
1858 case '$':
1859 if (! (p == pend || at_endline_loc_p (p, pend)))
1860 goto normal_char;
1861 BUF_PUSH (endline);
1862 break;
1863
1864
1865 case '+':
1866 case '?':
1867 case '*':
1868 /* If there is no previous pattern... */
1869 if (!laststart)
1870 goto normal_char;
1871
1872 {
1873 /* 1 means zero (many) matches is allowed. */
1874 bool zero_times_ok = false, many_times_ok = false;
1875 bool greedy = true;
1876
1877 /* If there is a sequence of repetition chars, collapse it
1878 down to just one (the right one). We can't combine
1879 interval operators with these because of, e.g., 'a{2}*',
1880 which should only match an even number of 'a's. */
1881
1882 for (;;)
1883 {
1884 if (c == '?' && (zero_times_ok || many_times_ok))
1885 greedy = false;
1886 else
1887 {
1888 zero_times_ok |= c != '+';
1889 many_times_ok |= c != '?';
1890 }
1891
1892 if (! (p < pend && (*p == '*' || *p == '+' || *p == '?')))
1893 break;
1894 /* If we get here, we found another repeat character. */
1895 c = *p++;
1896 }
1897
1898 /* Star, etc. applied to an empty pattern is equivalent
1899 to an empty pattern. */
1900 if (!laststart || laststart == b)
1901 break;
1902
1903 /* Now we know whether or not zero matches is allowed
1904 and also whether or not two or more matches is allowed. */
1905 if (greedy)
1906 {
1907 if (many_times_ok)
1908 {
1909 bool simple = skip_one_char (laststart) == b;
1910 ptrdiff_t startoffset = 0;
1911 re_opcode_t ofj =
1912 /* Check if the loop can match the empty string. */
1913 (simple || !analyze_first (laststart, b, NULL, false))
1914 ? on_failure_jump : on_failure_jump_loop;
1915 eassert (skip_one_char (laststart) <= b);
1916
1917 if (!zero_times_ok && simple)
1918 { /* Since simple * loops can be made faster by using
1919 on_failure_keep_string_jump, we turn simple P+
1920 into PP* if P is simple. */
1921 unsigned char *p1, *p2;
1922 startoffset = b - laststart;
1923 GET_BUFFER_SPACE (startoffset);
1924 p1 = b; p2 = laststart;
1925 while (p2 < p1)
1926 *b++ = *p2++;
1927 zero_times_ok = 1;
1928 }
1929
1930 GET_BUFFER_SPACE (6);
1931 if (!zero_times_ok)
1932 /* A + loop. */
1933 STORE_JUMP (ofj, b, b + 6);
1934 else
1935 /* Simple * loops can use on_failure_keep_string_jump
1936 depending on what follows. But since we don't know
1937 that yet, we leave the decision up to
1938 on_failure_jump_smart. */
1939 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
1940 laststart + startoffset, b + 6);
1941 b += 3;
1942 STORE_JUMP (jump, b, laststart + startoffset);
1943 b += 3;
1944 }
1945 else
1946 {
1947 /* A simple ? pattern. */
1948 eassert (zero_times_ok);
1949 GET_BUFFER_SPACE (3);
1950 INSERT_JUMP (on_failure_jump, laststart, b + 3);
1951 b += 3;
1952 }
1953 }
1954 else /* not greedy */
1955 { /* I wish the greedy and non-greedy cases could be merged. */
1956
1957 GET_BUFFER_SPACE (7); /* We might use less. */
1958 if (many_times_ok)
1959 {
1960 bool emptyp = !!analyze_first (laststart, b, NULL, false);
1961
1962 /* The non-greedy multiple match looks like
1963 a repeat..until: we only need a conditional jump
1964 at the end of the loop. */
1965 if (emptyp) BUF_PUSH (no_op);
1966 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
1967 : on_failure_jump, b, laststart);
1968 b += 3;
1969 if (zero_times_ok)
1970 {
1971 /* The repeat...until naturally matches one or more.
1972 To also match zero times, we need to first jump to
1973 the end of the loop (its conditional jump). */
1974 INSERT_JUMP (jump, laststart, b);
1975 b += 3;
1976 }
1977 }
1978 else
1979 {
1980 /* non-greedy a?? */
1981 INSERT_JUMP (jump, laststart, b + 3);
1982 b += 3;
1983 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
1984 b += 3;
1985 }
1986 }
1987 }
1988 pending_exact = 0;
1989 break;
1990
1991
1992 case '.':
1993 laststart = b;
1994 BUF_PUSH (anychar);
1995 break;
1996
1997
1998 case '[':
1999 {
2000 re_char *p1;
2001
2002 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2003
2004 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2005
2006 /* Ensure that we have enough space to push a charset: the
2007 opcode, the length count, and the bitset; 34 bytes in all. */
2008 GET_BUFFER_SPACE (34);
2009
2010 laststart = b;
2011
2012 /* Test '*p == '^' twice, instead of using an if
2013 statement, so we need only one BUF_PUSH. */
2014 BUF_PUSH (*p == '^' ? charset_not : charset);
2015 if (*p == '^')
2016 p++;
2017
2018 /* Remember the first position in the bracket expression. */
2019 p1 = p;
2020
2021 /* Push the number of bytes in the bitmap. */
2022 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2023
2024 /* Clear the whole map. */
2025 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2026
2027 /* Read in characters and ranges, setting map bits. */
2028 for (;;)
2029 {
2030 const unsigned char *p2 = p;
2031 int ch;
2032
2033 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2034
2035 /* See if we're at the beginning of a possible character
2036 class. */
2037 re_wctype_t cc = re_wctype_parse (&p, pend - p);
2038 if (cc != -1)
2039 {
2040 if (cc == 0)
2041 FREE_STACK_RETURN (REG_ECTYPE);
2042
2043 if (p == pend)
2044 FREE_STACK_RETURN (REG_EBRACK);
2045
2046 /* Most character classes in a multibyte match just set
2047 a flag. Exceptions are is_blank, is_digit, is_cntrl, and
2048 is_xdigit, since they can only match ASCII characters.
2049 We don't need to handle them for multibyte. */
2050
2051 /* Setup the gl_state object to its buffer-defined value.
2052 This hardcodes the buffer-global syntax-table for ASCII
2053 chars, while the other chars will obey syntax-table
2054 properties. It's not ideal, but it's the way it's been
2055 done until now. */
2056 SETUP_BUFFER_SYNTAX_TABLE ();
2057
2058 for (c = 0; c < 0x80; ++c)
2059 if (re_iswctype (c, cc))
2060 {
2061 SET_LIST_BIT (c);
2062 c1 = TRANSLATE (c);
2063 if (c1 == c)
2064 continue;
2065 if (ASCII_CHAR_P (c1))
2066 SET_LIST_BIT (c1);
2067 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2068 SET_LIST_BIT (c1);
2069 }
2070 SET_RANGE_TABLE_WORK_AREA_BIT
2071 (range_table_work, re_wctype_to_bit (cc));
2072
2073 /* In most cases the matching rule for char classes only
2074 uses the syntax table for multibyte chars, so that the
2075 content of the syntax-table is not hardcoded in the
2076 range_table. SPACE and WORD are the two exceptions. */
2077 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2078 bufp->used_syntax = true;
2079
2080 /* Repeat the loop. */
2081 continue;
2082 }
2083
2084 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2085 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2086 So the translation is done later in a loop. Example:
2087 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
2088 PATFETCH (c);
2089
2090 /* Could be the end of the bracket expression. If it's
2091 not (i.e., when the bracket expression is '[]' so
2092 far), the ']' character bit gets set way below. */
2093 if (c == ']' && p2 != p1)
2094 break;
2095
2096 if (p < pend && p[0] == '-' && p[1] != ']')
2097 {
2098
2099 /* Discard the '-'. */
2100 PATFETCH (c1);
2101
2102 /* Fetch the character which ends the range. */
2103 PATFETCH (c1);
2104
2105 if (CHAR_BYTE8_P (c1)
2106 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2107 /* Treat the range from a multibyte character to
2108 raw-byte character as empty. */
2109 c = c1 + 1;
2110 }
2111 else
2112 /* Range from C to C. */
2113 c1 = c;
2114
2115 if (c <= c1)
2116 {
2117 if (c < 128)
2118 {
2119 ch = min (127, c1);
2120 SETUP_ASCII_RANGE (range_table_work, c, ch);
2121 c = ch + 1;
2122 if (CHAR_BYTE8_P (c1))
2123 c = BYTE8_TO_CHAR (128);
2124 }
2125 if (c <= c1)
2126 {
2127 if (CHAR_BYTE8_P (c))
2128 {
2129 c = CHAR_TO_BYTE8 (c);
2130 c1 = CHAR_TO_BYTE8 (c1);
2131 for (; c <= c1; c++)
2132 SET_LIST_BIT (c);
2133 }
2134 else if (multibyte)
2135 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
2136 else
2137 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
2138 }
2139 }
2140 }
2141
2142 /* Discard any (non)matching list bytes that are all 0 at the
2143 end of the map. Decrease the map-length byte too. */
2144 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2145 b[-1]--;
2146 b += b[-1];
2147
2148 /* Build real range table from work area. */
2149 if (RANGE_TABLE_WORK_USED (range_table_work)
2150 || RANGE_TABLE_WORK_BITS (range_table_work))
2151 {
2152 int i;
2153 int used = RANGE_TABLE_WORK_USED (range_table_work);
2154
2155 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2156 bytes for flags, two for COUNT, and three bytes for
2157 each character. */
2158 GET_BUFFER_SPACE (4 + used * 3);
2159
2160 /* Indicate the existence of range table. */
2161 laststart[1] |= 0x80;
2162
2163 /* Store the character class flag bits into the range table. */
2164 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2165 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2166
2167 STORE_NUMBER_AND_INCR (b, used / 2);
2168 for (i = 0; i < used; i++)
2169 STORE_CHARACTER_AND_INCR
2170 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2171 }
2172 }
2173 break;
2174
2175
2176 case '\\':
2177 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2178
2179 /* Do not translate the character after the \, so that we can
2180 distinguish, e.g., \B from \b, even if we normally would
2181 translate, e.g., B to b. */
2182 PATFETCH (c);
2183
2184 switch (c)
2185 {
2186 case '(':
2187 {
2188 bool shy = false;
2189 regnum_t regnum = 0;
2190 if (p+1 < pend)
2191 {
2192 /* Look for a special (?...) construct */
2193 if (*p == '?')
2194 {
2195 PATFETCH (c); /* Gobble up the '?'. */
2196 while (!shy)
2197 {
2198 PATFETCH (c);
2199 switch (c)
2200 {
2201 case ':': shy = true; break;
2202 case '0':
2203 /* An explicitly specified regnum must start
2204 with non-0. */
2205 if (regnum == 0)
2206 FREE_STACK_RETURN (REG_BADPAT);
2207 FALLTHROUGH;
2208 case '1': case '2': case '3': case '4':
2209 case '5': case '6': case '7': case '8': case '9':
2210 if (INT_MULTIPLY_WRAPV (regnum, 10, ®num)
2211 || INT_ADD_WRAPV (regnum, c - '0',
2212 ®num))
2213 FREE_STACK_RETURN (REG_ESIZE);
2214 break;
2215 default:
2216 /* Only (?:...) is supported right now. */
2217 FREE_STACK_RETURN (REG_BADPAT);
2218 }
2219 }
2220 }
2221 }
2222
2223 if (!shy)
2224 regnum = ++bufp->re_nsub;
2225 else if (regnum)
2226 { /* It's actually not shy, but explicitly numbered. */
2227 shy = false;
2228 if (regnum > bufp->re_nsub)
2229 bufp->re_nsub = regnum;
2230 else if (regnum > bufp->re_nsub
2231 /* Ideally, we'd want to check that the specified
2232 group can't have matched (i.e. all subgroups
2233 using the same regnum are in other branches of
2234 OR patterns), but we don't currently keep track
2235 of enough info to do that easily. */
2236 || group_in_compile_stack (compile_stack, regnum))
2237 FREE_STACK_RETURN (REG_BADPAT);
2238 }
2239 else
2240 /* It's really shy. */
2241 regnum = - bufp->re_nsub;
2242
2243 if (COMPILE_STACK_FULL)
2244 compile_stack.stack
2245 = xpalloc (compile_stack.stack, &compile_stack.size,
2246 1, -1, sizeof *compile_stack.stack);
2247
2248 /* These are the values to restore when we hit end of this
2249 group. They are all relative offsets, so that if the
2250 whole pattern moves because of realloc, they will still
2251 be valid. */
2252 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2253 COMPILE_STACK_TOP.fixup_alt_jump
2254 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2255 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2256 COMPILE_STACK_TOP.regnum = regnum;
2257
2258 /* Do not push a start_memory for groups beyond the last one
2259 we can represent in the compiled pattern. */
2260 if (regnum <= MAX_REGNUM && regnum > 0)
2261 BUF_PUSH_2 (start_memory, regnum);
2262
2263 compile_stack.avail++;
2264
2265 fixup_alt_jump = 0;
2266 laststart = 0;
2267 begalt = b;
2268 /* If we've reached MAX_REGNUM groups, then this open
2269 won't actually generate any code, so we'll have to
2270 clear pending_exact explicitly. */
2271 pending_exact = 0;
2272 break;
2273 }
2274
2275 case ')':
2276 if (COMPILE_STACK_EMPTY)
2277 FREE_STACK_RETURN (REG_ERPAREN);
2278
2279 FIXUP_ALT_JUMP ();
2280
2281 /* See similar code for backslashed left paren above. */
2282 if (COMPILE_STACK_EMPTY)
2283 FREE_STACK_RETURN (REG_ERPAREN);
2284
2285 /* Since we just checked for an empty stack above, this
2286 "can't happen". */
2287 eassert (compile_stack.avail != 0);
2288 {
2289 /* We don't just want to restore into 'regnum', because
2290 later groups should continue to be numbered higher,
2291 as in '(ab)c(de)' -- the second group is #2. */
2292 regnum_t regnum;
2293
2294 compile_stack.avail--;
2295 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2296 fixup_alt_jump
2297 = COMPILE_STACK_TOP.fixup_alt_jump
2298 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2299 : 0;
2300 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2301 regnum = COMPILE_STACK_TOP.regnum;
2302 /* If we've reached MAX_REGNUM groups, then this open
2303 won't actually generate any code, so we'll have to
2304 clear pending_exact explicitly. */
2305 pending_exact = 0;
2306
2307 /* We're at the end of the group, so now we know how many
2308 groups were inside this one. */
2309 if (regnum <= MAX_REGNUM && regnum > 0)
2310 BUF_PUSH_2 (stop_memory, regnum);
2311 }
2312 break;
2313
2314
2315 case '|': /* '\|'. */
2316 /* Insert before the previous alternative a jump which
2317 jumps to this alternative if the former fails. */
2318 GET_BUFFER_SPACE (3);
2319 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2320 pending_exact = 0;
2321 b += 3;
2322
2323 /* The alternative before this one has a jump after it
2324 which gets executed if it gets matched. Adjust that
2325 jump so it will jump to this alternative's analogous
2326 jump (put in below, which in turn will jump to the next
2327 (if any) alternative's such jump, etc.). The last such
2328 jump jumps to the correct final destination. A picture:
2329 _____ _____
2330 | | | |
2331 | v | v
2332 A | B | C
2333
2334 If we are at B, then fixup_alt_jump right now points to a
2335 three-byte space after A. We'll put in the jump, set
2336 fixup_alt_jump to right after B, and leave behind three
2337 bytes which we'll fill in when we get to after C. */
2338
2339 FIXUP_ALT_JUMP ();
2340
2341 /* Mark and leave space for a jump after this alternative,
2342 to be filled in later either by next alternative or
2343 when know we're at the end of a series of alternatives. */
2344 fixup_alt_jump = b;
2345 GET_BUFFER_SPACE (3);
2346 b += 3;
2347
2348 laststart = 0;
2349 begalt = b;
2350 break;
2351
2352
2353 case '{':
2354 {
2355 /* At least (most) this many matches must be made. */
2356 int lower_bound = 0, upper_bound = -1;
2357
2358 beg_interval = p;
2359
2360 GET_INTERVAL_COUNT (lower_bound);
2361
2362 if (c == ',')
2363 GET_INTERVAL_COUNT (upper_bound);
2364 else
2365 /* Interval such as '{1}' => match exactly once. */
2366 upper_bound = lower_bound;
2367
2368 if (lower_bound < 0
2369 || (0 <= upper_bound && upper_bound < lower_bound)
2370 || c != '\\')
2371 FREE_STACK_RETURN (REG_BADBR);
2372 if (p == pend)
2373 FREE_STACK_RETURN (REG_EESCAPE);
2374 if (*p++ != '}')
2375 FREE_STACK_RETURN (REG_BADBR);
2376
2377 /* We just parsed a valid interval. */
2378
2379 /* If it's invalid to have no preceding re. */
2380 if (!laststart)
2381 goto unfetch_interval;
2382
2383 if (upper_bound == 0)
2384 /* If the upper bound is zero, just drop the sub pattern
2385 altogether. */
2386 b = laststart;
2387 else if (lower_bound == 1 && upper_bound == 1)
2388 /* Just match it once: nothing to do here. */
2389 ;
2390
2391 /* Otherwise, we have a nontrivial interval. When
2392 we're all done, the pattern will look like:
2393 set_number_at <jump count> <upper bound>
2394 set_number_at <succeed_n count> <lower bound>
2395 succeed_n <after jump addr> <succeed_n count>
2396 <body of loop>
2397 jump_n <succeed_n addr> <jump count>
2398 (The upper bound and 'jump_n' are omitted if
2399 'upper_bound' is 1, though.) */
2400 else
2401 { /* If the upper bound is > 1, we need to insert
2402 more at the end of the loop. */
2403 int nbytes = upper_bound < 0 ? 3 : upper_bound > 1 ? 5 : 0;
2404 int startoffset = 0;
2405
2406 GET_BUFFER_SPACE (20); /* We might use less. */
2407
2408 if (lower_bound == 0)
2409 {
2410 /* A succeed_n that starts with 0 is really
2411 a simple on_failure_jump_loop. */
2412 INSERT_JUMP (on_failure_jump_loop, laststart,
2413 b + 3 + nbytes);
2414 b += 3;
2415 }
2416 else
2417 {
2418 /* Initialize lower bound of the 'succeed_n', even
2419 though it will be set during matching by its
2420 attendant 'set_number_at' (inserted next),
2421 because 're_compile_fastmap' needs to know.
2422 Jump to the 'jump_n' we might insert below. */
2423 INSERT_JUMP2 (succeed_n, laststart,
2424 b + 5 + nbytes,
2425 lower_bound);
2426 b += 5;
2427
2428 /* Code to initialize the lower bound. Insert
2429 before the 'succeed_n'. The '5' is the last two
2430 bytes of this 'set_number_at', plus 3 bytes of
2431 the following 'succeed_n'. */
2432 insert_op2 (set_number_at, laststart, 5,
2433 lower_bound, b);
2434 b += 5;
2435 startoffset += 5;
2436 }
2437
2438 if (upper_bound < 0)
2439 {
2440 /* A negative upper bound stands for infinity,
2441 in which case it degenerates to a plain jump. */
2442 STORE_JUMP (jump, b, laststart + startoffset);
2443 b += 3;
2444 }
2445 else if (upper_bound > 1)
2446 { /* More than one repetition is allowed, so
2447 append a backward jump to the 'succeed_n'
2448 that starts this interval.
2449
2450 When we've reached this during matching,
2451 we'll have matched the interval once, so
2452 jump back only 'upper_bound - 1' times. */
2453 STORE_JUMP2 (jump_n, b, laststart + startoffset,
2454 upper_bound - 1);
2455 b += 5;
2456
2457 /* The location we want to set is the second
2458 parameter of the 'jump_n'; that is 'b-2' as
2459 an absolute address. 'laststart' will be
2460 the 'set_number_at' we're about to insert;
2461 'laststart+3' the number to set, the source
2462 for the relative address. But we are
2463 inserting into the middle of the pattern --
2464 so everything is getting moved up by 5.
2465 Conclusion: (b - 2) - (laststart + 3) + 5,
2466 i.e., b - laststart.
2467
2468 Insert this at the beginning of the loop
2469 so that if we fail during matching, we'll
2470 reinitialize the bounds. */
2471 insert_op2 (set_number_at, laststart, b - laststart,
2472 upper_bound - 1, b);
2473 b += 5;
2474 }
2475 }
2476 pending_exact = 0;
2477 beg_interval = NULL;
2478 }
2479 break;
2480
2481 unfetch_interval:
2482 /* If an invalid interval, match the characters as literals. */
2483 eassert (beg_interval);
2484 p = beg_interval;
2485 beg_interval = NULL;
2486 eassert (p > pattern && p[-1] == '\\');
2487 c = '{';
2488 goto normal_char;
2489
2490 case '=':
2491 laststart = b;
2492 BUF_PUSH (at_dot);
2493 break;
2494
2495 case 's':
2496 laststart = b;
2497 PATFETCH (c);
2498 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2499 break;
2500
2501 case 'S':
2502 laststart = b;
2503 PATFETCH (c);
2504 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2505 break;
2506
2507 case 'c':
2508 laststart = b;
2509 PATFETCH (c);
2510 BUF_PUSH_2 (categoryspec, c);
2511 break;
2512
2513 case 'C':
2514 laststart = b;
2515 PATFETCH (c);
2516 BUF_PUSH_2 (notcategoryspec, c);
2517 break;
2518
2519 case 'w':
2520 laststart = b;
2521 BUF_PUSH_2 (syntaxspec, Sword);
2522 break;
2523
2524
2525 case 'W':
2526 laststart = b;
2527 BUF_PUSH_2 (notsyntaxspec, Sword);
2528 break;
2529
2530
2531 case '<':
2532 laststart = b;
2533 BUF_PUSH (wordbeg);
2534 break;
2535
2536 case '>':
2537 laststart = b;
2538 BUF_PUSH (wordend);
2539 break;
2540
2541 case '_':
2542 laststart = b;
2543 PATFETCH (c);
2544 if (c == '<')
2545 BUF_PUSH (symbeg);
2546 else if (c == '>')
2547 BUF_PUSH (symend);
2548 else
2549 FREE_STACK_RETURN (REG_BADPAT);
2550 break;
2551
2552 case 'b':
2553 BUF_PUSH (wordbound);
2554 break;
2555
2556 case 'B':
2557 BUF_PUSH (notwordbound);
2558 break;
2559
2560 case '`':
2561 BUF_PUSH (begbuf);
2562 break;
2563
2564 case '\'':
2565 BUF_PUSH (endbuf);
2566 break;
2567
2568 case '1': case '2': case '3': case '4': case '5':
2569 case '6': case '7': case '8': case '9':
2570 {
2571 regnum_t reg = c - '0';
2572
2573 if (reg > bufp->re_nsub || reg < 1
2574 /* Can't back reference to a subexp before its end. */
2575 || group_in_compile_stack (compile_stack, reg))
2576 FREE_STACK_RETURN (REG_ESUBREG);
2577
2578 laststart = b;
2579 BUF_PUSH_2 (duplicate, reg);
2580 }
2581 break;
2582
2583 default:
2584 /* You might think it would be useful for \ to mean
2585 not to translate; but if we don't translate it
2586 it will never match anything. */
2587 goto normal_char;
2588 }
2589 break;
2590
2591
2592 default:
2593 /* Expects the character in C. */
2594 normal_char:
2595 /* If no exactn currently being built. */
2596 if (!pending_exact
2597
2598 /* If last exactn not at current position. */
2599 || pending_exact + *pending_exact + 1 != b
2600
2601 /* Only one byte follows the exactn for the count. */
2602 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
2603
2604 /* If followed by a repetition operator. */
2605 || (p != pend
2606 && (*p == '*' || *p == '+' || *p == '?' || *p == '^'))
2607 || (p + 1 < pend && p[0] == '\\' && p[1] == '{'))
2608 {
2609 /* Start building a new exactn. */
2610
2611 laststart = b;
2612
2613 BUF_PUSH_2 (exactn, 0);
2614 pending_exact = b - 1;
2615 }
2616
2617 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
2618 {
2619 int len;
2620
2621 if (multibyte)
2622 {
2623 c = TRANSLATE (c);
2624 len = CHAR_STRING (c, b);
2625 b += len;
2626 }
2627 else
2628 {
2629 c1 = RE_CHAR_TO_MULTIBYTE (c);
2630 if (! CHAR_BYTE8_P (c1))
2631 {
2632 int c2 = TRANSLATE (c1);
2633
2634 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
2635 c = c1;
2636 }
2637 *b++ = c;
2638 len = 1;
2639 }
2640 (*pending_exact) += len;
2641 }
2642
2643 break;
2644 } /* switch (c) */
2645 } /* while p != pend */
2646
2647
2648 /* Through the pattern now. */
2649
2650 FIXUP_ALT_JUMP ();
2651
2652 if (!COMPILE_STACK_EMPTY)
2653 FREE_STACK_RETURN (REG_EPAREN);
2654
2655 /* If we don't want backtracking, force success
2656 the first time we reach the end of the compiled pattern. */
2657 if (!posix_backtracking)
2658 BUF_PUSH (succeed);
2659
2660 /* Success; set the length of the buffer. */
2661 bufp->used = b - bufp->buffer;
2662
2663 #ifdef REGEX_EMACS_DEBUG
2664 if (regex_emacs_debug > 0)
2665 {
2666 re_compile_fastmap (bufp);
2667 DEBUG_PRINT ("\nCompiled pattern:\n");
2668 print_compiled_pattern (bufp);
2669 }
2670 regex_emacs_debug--;
2671 #endif
2672
2673 FREE_STACK_RETURN (REG_NOERROR);
2674
2675 } /* regex_compile */
2676
2677 /* Subroutines for 'regex_compile'. */
2678
2679 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2680
2681 static void
store_op1(re_opcode_t op,unsigned char * loc,int arg)2682 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
2683 {
2684 *loc = (unsigned char) op;
2685 STORE_NUMBER (loc + 1, arg);
2686 }
2687
2688
2689 /* Like 'store_op1', but for two two-byte parameters ARG1 and ARG2. */
2690
2691 static void
store_op2(re_opcode_t op,unsigned char * loc,int arg1,int arg2)2692 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
2693 {
2694 *loc = (unsigned char) op;
2695 STORE_NUMBER (loc + 1, arg1);
2696 STORE_NUMBER (loc + 3, arg2);
2697 }
2698
2699
2700 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2701 for OP followed by two-byte integer parameter ARG. */
2702
2703 static void
insert_op1(re_opcode_t op,unsigned char * loc,int arg,unsigned char * end)2704 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
2705 {
2706 register unsigned char *pfrom = end;
2707 register unsigned char *pto = end + 3;
2708
2709 while (pfrom != loc)
2710 *--pto = *--pfrom;
2711
2712 store_op1 (op, loc, arg);
2713 }
2714
2715
2716 /* Like 'insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2717
2718 static void
insert_op2(re_opcode_t op,unsigned char * loc,int arg1,int arg2,unsigned char * end)2719 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
2720 unsigned char *end)
2721 {
2722 register unsigned char *pfrom = end;
2723 register unsigned char *pto = end + 5;
2724
2725 while (pfrom != loc)
2726 *--pto = *--pfrom;
2727
2728 store_op2 (op, loc, arg1, arg2);
2729 }
2730
2731
2732 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2733 after an alternative or a begin-subexpression. Assume there is at
2734 least one character before the ^. */
2735
2736 static bool
at_begline_loc_p(re_char * pattern,re_char * p)2737 at_begline_loc_p (re_char *pattern, re_char *p)
2738 {
2739 re_char *prev = p - 2;
2740
2741 switch (*prev)
2742 {
2743 case '(': /* After a subexpression. */
2744 case '|': /* After an alternative. */
2745 break;
2746
2747 case ':': /* After a shy subexpression. */
2748 /* Skip over optional regnum. */
2749 while (prev > pattern && '0' <= prev[-1] && prev[-1] <= '9')
2750 --prev;
2751
2752 if (! (prev > pattern + 1 && prev[-1] == '?' && prev[-2] == '('))
2753 return false;
2754 prev -= 2;
2755 break;
2756
2757 default:
2758 return false;
2759 }
2760
2761 /* Count the number of preceding backslashes. */
2762 p = prev;
2763 while (prev > pattern && prev[-1] == '\\')
2764 --prev;
2765 return (p - prev) & 1;
2766 }
2767
2768
2769 /* The dual of at_begline_loc_p. This one is for $. Assume there is
2770 at least one character after the $, i.e., 'P < PEND'. */
2771
2772 static bool
at_endline_loc_p(re_char * p,re_char * pend)2773 at_endline_loc_p (re_char *p, re_char *pend)
2774 {
2775 /* Before a subexpression or an alternative? */
2776 return *p == '\\' && p + 1 < pend && (p[1] == ')' || p[1] == '|');
2777 }
2778
2779
2780 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2781 false if it's not. */
2782
2783 static bool
group_in_compile_stack(compile_stack_type compile_stack,regnum_t regnum)2784 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
2785 {
2786 ptrdiff_t this_element;
2787
2788 for (this_element = compile_stack.avail - 1;
2789 this_element >= 0;
2790 this_element--)
2791 if (compile_stack.stack[this_element].regnum == regnum)
2792 return true;
2793
2794 return false;
2795 }
2796
2797 /* analyze_first.
2798 If fastmap is non-NULL, go through the pattern and fill fastmap
2799 with all the possible leading chars. If fastmap is NULL, don't
2800 bother filling it up (obviously) and only return whether the
2801 pattern could potentially match the empty string.
2802
2803 Return 1 if p..pend might match the empty string.
2804 Return 0 if p..pend matches at least one char.
2805 Return -1 if fastmap was not updated accurately. */
2806
2807 static int
analyze_first(re_char * p,re_char * pend,char * fastmap,bool multibyte)2808 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2809 {
2810 int j, k;
2811 int nbits;
2812 bool not;
2813
2814 /* If all elements for base leading-codes in fastmap is set, this
2815 flag is set true. */
2816 bool match_any_multibyte_characters = false;
2817
2818 eassert (p);
2819
2820 /* The loop below works as follows:
2821 - It has a working-list kept in the PATTERN_STACK and which basically
2822 starts by only containing a pointer to the first operation.
2823 - If the opcode we're looking at is a match against some set of
2824 chars, then we add those chars to the fastmap and go on to the
2825 next work element from the worklist (done via 'break').
2826 - If the opcode is a control operator on the other hand, we either
2827 ignore it (if it's meaningless at this point, such as 'start_memory')
2828 or execute it (if it's a jump). If the jump has several destinations
2829 (i.e. 'on_failure_jump'), then we push the other destination onto the
2830 worklist.
2831 We guarantee termination by ignoring backward jumps (more or less),
2832 so that P is monotonically increasing. More to the point, we
2833 never set P (or push) anything '<= p1'. */
2834
2835 while (p < pend)
2836 {
2837 /* P1 is used as a marker of how far back a 'on_failure_jump'
2838 can go without being ignored. It is normally equal to P
2839 (which prevents any backward 'on_failure_jump') except right
2840 after a plain 'jump', to allow patterns such as:
2841 0: jump 10
2842 3..9: <body>
2843 10: on_failure_jump 3
2844 as used for the *? operator. */
2845 re_char *p1 = p;
2846
2847 switch (*p++)
2848 {
2849 case succeed:
2850 return 1;
2851
2852 case duplicate:
2853 /* If the first character has to match a backreference, that means
2854 that the group was empty (since it already matched). Since this
2855 is the only case that interests us here, we can assume that the
2856 backreference must match the empty string. */
2857 p++;
2858 continue;
2859
2860
2861 /* Following are the cases which match a character. These end
2862 with 'break'. */
2863
2864 case exactn:
2865 if (fastmap)
2866 {
2867 /* If multibyte is nonzero, the first byte of each
2868 character is an ASCII or a leading code. Otherwise,
2869 each byte is a character. Thus, this works in both
2870 cases. */
2871 fastmap[p[1]] = 1;
2872 if (multibyte)
2873 {
2874 /* Cover the case of matching a raw char in a
2875 multibyte regexp against unibyte. */
2876 if (CHAR_BYTE8_HEAD_P (p[1]))
2877 fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2878 }
2879 else
2880 {
2881 /* For the case of matching this unibyte regex
2882 against multibyte, we must set a leading code of
2883 the corresponding multibyte character. */
2884 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
2885
2886 fastmap[CHAR_LEADING_CODE (c)] = 1;
2887 }
2888 }
2889 break;
2890
2891
2892 case anychar:
2893 /* We could put all the chars except for \n (and maybe \0)
2894 but we don't bother since it is generally not worth it. */
2895 if (!fastmap) break;
2896 return -1;
2897
2898
2899 case charset_not:
2900 if (!fastmap) break;
2901 {
2902 /* Chars beyond end of bitmap are possible matches. */
2903 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2904 j < (1 << BYTEWIDTH); j++)
2905 fastmap[j] = 1;
2906 }
2907 FALLTHROUGH;
2908 case charset:
2909 if (!fastmap) break;
2910 not = (re_opcode_t) *(p - 1) == charset_not;
2911 nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2912 p++;
2913 for (j = 0; j < nbits; j++)
2914 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2915 fastmap[j] = 1;
2916
2917 /* To match raw bytes (in the 80..ff range) against multibyte
2918 strings, add their leading bytes to the fastmap. */
2919 for (j = 0x80; j < nbits; j++)
2920 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2921 fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2922
2923 if (/* Any leading code can possibly start a character
2924 which doesn't match the specified set of characters. */
2925 not
2926 ||
2927 /* If we can match a character class, we can match any
2928 multibyte characters. */
2929 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2930 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
2931
2932 {
2933 if (match_any_multibyte_characters == false)
2934 {
2935 for (j = MIN_MULTIBYTE_LEADING_CODE;
2936 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2937 fastmap[j] = 1;
2938 match_any_multibyte_characters = true;
2939 }
2940 }
2941
2942 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2943 && match_any_multibyte_characters == false)
2944 {
2945 /* Set fastmap[I] to 1 where I is a leading code of each
2946 multibyte character in the range table. */
2947 int c, count;
2948 unsigned char lc1, lc2;
2949
2950 /* Make P points the range table. '+ 2' is to skip flag
2951 bits for a character class. */
2952 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
2953
2954 /* Extract the number of ranges in range table into COUNT. */
2955 EXTRACT_NUMBER_AND_INCR (count, p);
2956 for (; count > 0; count--, p += 3)
2957 {
2958 /* Extract the start and end of each range. */
2959 EXTRACT_CHARACTER (c, p);
2960 lc1 = CHAR_LEADING_CODE (c);
2961 p += 3;
2962 EXTRACT_CHARACTER (c, p);
2963 lc2 = CHAR_LEADING_CODE (c);
2964 for (j = lc1; j <= lc2; j++)
2965 fastmap[j] = 1;
2966 }
2967 }
2968 break;
2969
2970 case syntaxspec:
2971 case notsyntaxspec:
2972 if (!fastmap) break;
2973 /* This match depends on text properties. These end with
2974 aborting optimizations. */
2975 return -1;
2976
2977 case categoryspec:
2978 case notcategoryspec:
2979 if (!fastmap) break;
2980 not = (re_opcode_t)p[-1] == notcategoryspec;
2981 k = *p++;
2982 for (j = (1 << BYTEWIDTH); j >= 0; j--)
2983 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
2984 fastmap[j] = 1;
2985
2986 /* Any leading code can possibly start a character which
2987 has or doesn't has the specified category. */
2988 if (match_any_multibyte_characters == false)
2989 {
2990 for (j = MIN_MULTIBYTE_LEADING_CODE;
2991 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2992 fastmap[j] = 1;
2993 match_any_multibyte_characters = true;
2994 }
2995 break;
2996
2997 /* All cases after this match the empty string. These end with
2998 'continue'. */
2999
3000 case at_dot:
3001 case no_op:
3002 case begline:
3003 case endline:
3004 case begbuf:
3005 case endbuf:
3006 case wordbound:
3007 case notwordbound:
3008 case wordbeg:
3009 case wordend:
3010 case symbeg:
3011 case symend:
3012 continue;
3013
3014
3015 case jump:
3016 EXTRACT_NUMBER_AND_INCR (j, p);
3017 if (j < 0)
3018 /* Backward jumps can only go back to code that we've already
3019 visited. 're_compile' should make sure this is true. */
3020 break;
3021 p += j;
3022 switch (*p)
3023 {
3024 case on_failure_jump:
3025 case on_failure_keep_string_jump:
3026 case on_failure_jump_loop:
3027 case on_failure_jump_nastyloop:
3028 case on_failure_jump_smart:
3029 p++;
3030 break;
3031 default:
3032 continue;
3033 };
3034 /* Keep P1 to allow the 'on_failure_jump' we are jumping to
3035 to jump back to "just after here". */
3036 FALLTHROUGH;
3037 case on_failure_jump:
3038 case on_failure_keep_string_jump:
3039 case on_failure_jump_nastyloop:
3040 case on_failure_jump_loop:
3041 case on_failure_jump_smart:
3042 EXTRACT_NUMBER_AND_INCR (j, p);
3043 if (p + j <= p1)
3044 ; /* Backward jump to be ignored. */
3045 else
3046 { /* We have to look down both arms.
3047 We first go down the "straight" path so as to minimize
3048 stack usage when going through alternatives. */
3049 int r = analyze_first (p, pend, fastmap, multibyte);
3050 if (r) return r;
3051 p += j;
3052 }
3053 continue;
3054
3055
3056 case jump_n:
3057 /* This code simply does not properly handle forward jump_n. */
3058 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
3059 p += 4;
3060 /* jump_n can either jump or fall through. The (backward) jump
3061 case has already been handled, so we only need to look at the
3062 fallthrough case. */
3063 continue;
3064
3065 case succeed_n:
3066 /* If N == 0, it should be an on_failure_jump_loop instead. */
3067 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
3068 p += 4;
3069 /* We only care about one iteration of the loop, so we don't
3070 need to consider the case where this behaves like an
3071 on_failure_jump. */
3072 continue;
3073
3074
3075 case set_number_at:
3076 p += 4;
3077 continue;
3078
3079
3080 case start_memory:
3081 case stop_memory:
3082 p += 1;
3083 continue;
3084
3085
3086 default:
3087 abort (); /* We have listed all the cases. */
3088 } /* switch *p++ */
3089
3090 /* Getting here means we have found the possible starting
3091 characters for one path of the pattern -- and that the empty
3092 string does not match. We need not follow this path further. */
3093 return 0;
3094 } /* while p */
3095
3096 /* We reached the end without matching anything. */
3097 return 1;
3098
3099 } /* analyze_first */
3100
3101 /* Compute a fastmap for the compiled pattern in BUFP.
3102 A fastmap records which of the (1 << BYTEWIDTH) possible
3103 characters can start a string that matches the pattern. This fastmap
3104 is used by re_search to skip quickly over impossible starting points.
3105
3106 Character codes above (1 << BYTEWIDTH) are not represented in the
3107 fastmap, but the leading codes are represented. Thus, the fastmap
3108 indicates which character sets could start a match.
3109
3110 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3111 area as BUFP->fastmap.
3112
3113 Set the 'fastmap', 'fastmap_accurate', and 'can_be_null' fields in
3114 the pattern buffer. */
3115
3116 static void
re_compile_fastmap(struct re_pattern_buffer * bufp)3117 re_compile_fastmap (struct re_pattern_buffer *bufp)
3118 {
3119 char *fastmap = bufp->fastmap;
3120 int analysis;
3121
3122 eassert (fastmap && bufp->buffer);
3123
3124 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3125
3126 /* FIXME: Is the following assignment correct even when ANALYSIS < 0? */
3127 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3128
3129 analysis = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
3130 fastmap, RE_MULTIBYTE_P (bufp));
3131 bufp->can_be_null = (analysis != 0);
3132 } /* re_compile_fastmap */
3133
3134 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3135 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3136 this memory for recording register information. STARTS and ENDS
3137 must be allocated using the malloc library routine, and must each
3138 be at least NUM_REGS * sizeof (ptrdiff_t) bytes long.
3139
3140 If NUM_REGS == 0, then subsequent matches should allocate their own
3141 register data.
3142
3143 Unless this function is called, the first search or match using
3144 PATTERN_BUFFER will allocate its own register data, without
3145 freeing the old data. */
3146
3147 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,ptrdiff_t num_regs,ptrdiff_t * starts,ptrdiff_t * ends)3148 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3149 ptrdiff_t num_regs, ptrdiff_t *starts, ptrdiff_t *ends)
3150 {
3151 if (num_regs)
3152 {
3153 bufp->regs_allocated = REGS_REALLOCATE;
3154 regs->num_regs = num_regs;
3155 regs->start = starts;
3156 regs->end = ends;
3157 }
3158 else
3159 {
3160 bufp->regs_allocated = REGS_UNALLOCATED;
3161 regs->num_regs = 0;
3162 regs->start = regs->end = 0;
3163 }
3164 }
3165
3166 /* Searching routines. */
3167
3168 /* Like re_search_2, below, but only one string is specified, and
3169 doesn't let you say where to stop matching. */
3170
3171 ptrdiff_t
re_search(struct re_pattern_buffer * bufp,const char * string,ptrdiff_t size,ptrdiff_t startpos,ptrdiff_t range,struct re_registers * regs)3172 re_search (struct re_pattern_buffer *bufp, const char *string, ptrdiff_t size,
3173 ptrdiff_t startpos, ptrdiff_t range, struct re_registers *regs)
3174 {
3175 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3176 regs, size);
3177 }
3178
3179 /* Address of POS in the concatenation of virtual string. */
3180 #define POS_ADDR_VSTRING(POS) \
3181 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3182
3183 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3184 virtual concatenation of STRING1 and STRING2, starting first at index
3185 STARTPOS, then at STARTPOS + 1, and so on.
3186
3187 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3188
3189 RANGE is how far to scan while trying to match. RANGE = 0 means try
3190 only at STARTPOS; in general, the last start tried is STARTPOS +
3191 RANGE.
3192
3193 In REGS, return the indices of the virtual concatenation of STRING1
3194 and STRING2 that matched the entire BUFP->buffer and its contained
3195 subexpressions.
3196
3197 Do not consider matching one past the index STOP in the virtual
3198 concatenation of STRING1 and STRING2.
3199
3200 Return either the position in the strings at which the match was
3201 found, -1 if no match, or -2 if error (such as failure
3202 stack overflow). */
3203
3204 ptrdiff_t
re_search_2(struct re_pattern_buffer * bufp,const char * str1,ptrdiff_t size1,const char * str2,ptrdiff_t size2,ptrdiff_t startpos,ptrdiff_t range,struct re_registers * regs,ptrdiff_t stop)3205 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1,
3206 const char *str2, ptrdiff_t size2,
3207 ptrdiff_t startpos, ptrdiff_t range,
3208 struct re_registers *regs, ptrdiff_t stop)
3209 {
3210 ptrdiff_t val;
3211 re_char *string1 = (re_char *) str1;
3212 re_char *string2 = (re_char *) str2;
3213 char *fastmap = bufp->fastmap;
3214 Lisp_Object translate = bufp->translate;
3215 ptrdiff_t total_size = size1 + size2;
3216 ptrdiff_t endpos = startpos + range;
3217 bool anchored_start;
3218 /* Nonzero if we are searching multibyte string. */
3219 bool multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3220
3221 /* Check for out-of-range STARTPOS. */
3222 if (startpos < 0 || startpos > total_size)
3223 return -1;
3224
3225 /* Fix up RANGE if it might eventually take us outside
3226 the virtual concatenation of STRING1 and STRING2.
3227 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3228 if (endpos < 0)
3229 range = 0 - startpos;
3230 else if (endpos > total_size)
3231 range = total_size - startpos;
3232
3233 /* If the search isn't to be a backwards one, don't waste time in a
3234 search for a pattern anchored at beginning of buffer. */
3235 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3236 {
3237 if (startpos > 0)
3238 return -1;
3239 else
3240 range = 0;
3241 }
3242
3243 /* In a forward search for something that starts with \=.
3244 don't keep searching past point. */
3245 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3246 {
3247 range = PT_BYTE - BEGV_BYTE - startpos;
3248 if (range < 0)
3249 return -1;
3250 }
3251
3252 /* Update the fastmap now if not correct already. */
3253 if (fastmap && !bufp->fastmap_accurate)
3254 re_compile_fastmap (bufp);
3255
3256 /* See whether the pattern is anchored. */
3257 anchored_start = (bufp->buffer[0] == begline);
3258
3259 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3260 {
3261 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3262
3263 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3264 }
3265
3266 /* Loop through the string, looking for a place to start matching. */
3267 for (;;)
3268 {
3269 /* If the pattern is anchored,
3270 skip quickly past places we cannot match.
3271 Don't bother to treat startpos == 0 specially
3272 because that case doesn't repeat. */
3273 if (anchored_start && startpos > 0)
3274 {
3275 if (! ((startpos <= size1 ? string1[startpos - 1]
3276 : string2[startpos - size1 - 1])
3277 == '\n'))
3278 goto advance;
3279 }
3280
3281 /* If a fastmap is supplied, skip quickly over characters that
3282 cannot be the start of a match. If the pattern can match the
3283 null string, however, we don't need to skip characters; we want
3284 the first null string. */
3285 if (fastmap && startpos < total_size && !bufp->can_be_null)
3286 {
3287 re_char *d;
3288 int buf_ch;
3289
3290 d = POS_ADDR_VSTRING (startpos);
3291
3292 if (range > 0) /* Searching forwards. */
3293 {
3294 ptrdiff_t irange = range, lim = 0;
3295
3296 if (startpos < size1 && startpos + range >= size1)
3297 lim = range - (size1 - startpos);
3298
3299 /* Written out as an if-else to avoid testing 'translate'
3300 inside the loop. */
3301 if (!NILP (translate))
3302 {
3303 if (multibyte)
3304 while (range > lim)
3305 {
3306 int buf_charlen;
3307
3308 buf_ch = string_char_and_length (d, &buf_charlen);
3309 buf_ch = RE_TRANSLATE (translate, buf_ch);
3310 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3311 break;
3312
3313 range -= buf_charlen;
3314 d += buf_charlen;
3315 }
3316 else
3317 while (range > lim)
3318 {
3319 buf_ch = *d;
3320 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3321 int translated = RE_TRANSLATE (translate, ch);
3322 if (translated != ch
3323 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3324 buf_ch = ch;
3325 if (fastmap[buf_ch])
3326 break;
3327 d++;
3328 range--;
3329 }
3330 }
3331 else
3332 {
3333 if (multibyte)
3334 while (range > lim)
3335 {
3336 int buf_charlen;
3337
3338 buf_ch = string_char_and_length (d, &buf_charlen);
3339 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3340 break;
3341 range -= buf_charlen;
3342 d += buf_charlen;
3343 }
3344 else
3345 while (range > lim && !fastmap[*d])
3346 {
3347 d++;
3348 range--;
3349 }
3350 }
3351 startpos += irange - range;
3352 }
3353 else /* Searching backwards. */
3354 {
3355 if (multibyte)
3356 {
3357 buf_ch = STRING_CHAR (d);
3358 buf_ch = TRANSLATE (buf_ch);
3359 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
3360 goto advance;
3361 }
3362 else
3363 {
3364 buf_ch = *d;
3365 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3366 int translated = TRANSLATE (ch);
3367 if (translated != ch
3368 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3369 buf_ch = ch;
3370 if (! fastmap[TRANSLATE (buf_ch)])
3371 goto advance;
3372 }
3373 }
3374 }
3375
3376 /* If can't match the null string, and that's all we have left, fail. */
3377 if (range >= 0 && startpos == total_size && fastmap
3378 && !bufp->can_be_null)
3379 return -1;
3380
3381 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3382 startpos, regs, stop);
3383
3384 if (val >= 0)
3385 return startpos;
3386
3387 if (val == -2)
3388 return -2;
3389
3390 advance:
3391 if (!range)
3392 break;
3393 else if (range > 0)
3394 {
3395 /* Update STARTPOS to the next character boundary. */
3396 if (multibyte)
3397 {
3398 re_char *p = POS_ADDR_VSTRING (startpos);
3399 int len = BYTES_BY_CHAR_HEAD (*p);
3400
3401 range -= len;
3402 if (range < 0)
3403 break;
3404 startpos += len;
3405 }
3406 else
3407 {
3408 range--;
3409 startpos++;
3410 }
3411 }
3412 else
3413 {
3414 range++;
3415 startpos--;
3416
3417 /* Update STARTPOS to the previous character boundary. */
3418 if (multibyte)
3419 {
3420 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
3421 int len = raw_prev_char_len (p);
3422
3423 range += len - 1;
3424 if (range > 0)
3425 break;
3426 startpos -= len - 1;
3427 }
3428 }
3429 }
3430 return -1;
3431 } /* re_search_2 */
3432
3433 /* Declarations and macros for re_match_2. */
3434
3435 static bool bcmp_translate (re_char *, re_char *, ptrdiff_t,
3436 Lisp_Object, bool);
3437
3438 /* This converts PTR, a pointer into one of the search strings 'string1'
3439 and 'string2' into an offset from the beginning of that string. */
3440 #define POINTER_TO_OFFSET(ptr) \
3441 (FIRST_STRING_P (ptr) \
3442 ? (ptr) - string1 \
3443 : (ptr) - string2 + (ptrdiff_t) size1)
3444
3445 /* Call before fetching a character with *d. This switches over to
3446 string2 if necessary.
3447 Check re_match_2_internal for a discussion of why end_match_2 might
3448 not be within string2 (but be equal to end_match_1 instead). */
3449 #define PREFETCH() \
3450 while (d == dend) \
3451 { \
3452 /* End of string2 => fail. */ \
3453 if (dend == end_match_2) \
3454 goto fail; \
3455 /* End of string1 => advance to string2. */ \
3456 d = string2; \
3457 dend = end_match_2; \
3458 }
3459
3460 /* Call before fetching a char with *d if you already checked other limits.
3461 This is meant for use in lookahead operations like wordend, etc..
3462 where we might need to look at parts of the string that might be
3463 outside of the LIMITs (i.e past 'stop'). */
3464 #define PREFETCH_NOLIMIT() \
3465 if (d == end1) \
3466 { \
3467 d = string2; \
3468 dend = end_match_2; \
3469 }
3470
3471 /* Test if at very beginning or at very end of the virtual concatenation
3472 of STRING1 and STRING2. If only one string, it's STRING2. */
3473 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3474 #define AT_STRINGS_END(d) ((d) == end2)
3475
3476 /* Disabled due to a compiler bug -- see comment at case wordbound */
3477
3478 /* The comment at case wordbound is following one, but we don't use
3479 AT_WORD_BOUNDARY anymore to support multibyte form.
3480
3481 The DEC Alpha C compiler 3.x generates incorrect code for the
3482 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3483 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
3484 macro and introducing temporary variables works around the bug. */
3485
3486 #if 0
3487 /* Test if D points to a character which is word-constituent. We have
3488 two special cases to check for: if past the end of string1, look at
3489 the first character in string2; and if before the beginning of
3490 string2, look at the last character in string1. */
3491 #define WORDCHAR_P(d) \
3492 (SYNTAX ((d) == end1 ? *string2 \
3493 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3494 == Sword)
3495
3496 /* Test if the character before D and the one at D differ with respect
3497 to being word-constituent. */
3498 #define AT_WORD_BOUNDARY(d) \
3499 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3500 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3501 #endif
3502
3503
3504 /* Optimization routines. */
3505
3506 /* If the operation is a match against one or more chars,
3507 return a pointer to the next operation, else return NULL. */
3508 static re_char *
skip_one_char(re_char * p)3509 skip_one_char (re_char *p)
3510 {
3511 switch (*p++)
3512 {
3513 case anychar:
3514 break;
3515
3516 case exactn:
3517 p += *p + 1;
3518 break;
3519
3520 case charset_not:
3521 case charset:
3522 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
3523 {
3524 int mcnt;
3525 p = CHARSET_RANGE_TABLE (p - 1);
3526 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3527 p = CHARSET_RANGE_TABLE_END (p, mcnt);
3528 }
3529 else
3530 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
3531 break;
3532
3533 case syntaxspec:
3534 case notsyntaxspec:
3535 case categoryspec:
3536 case notcategoryspec:
3537 p++;
3538 break;
3539
3540 default:
3541 p = NULL;
3542 }
3543 return p;
3544 }
3545
3546
3547 /* Jump over non-matching operations. */
3548 static re_char *
skip_noops(re_char * p,re_char * pend)3549 skip_noops (re_char *p, re_char *pend)
3550 {
3551 int mcnt;
3552 while (p < pend)
3553 {
3554 switch (*p)
3555 {
3556 case start_memory:
3557 case stop_memory:
3558 p += 2; break;
3559 case no_op:
3560 p += 1; break;
3561 case jump:
3562 p += 1;
3563 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3564 p += mcnt;
3565 break;
3566 default:
3567 return p;
3568 }
3569 }
3570 eassert (p == pend);
3571 return p;
3572 }
3573
3574 /* Test if C matches charset op. *PP points to the charset or charset_not
3575 opcode. When the function finishes, *PP will be advanced past that opcode.
3576 C is character to test (possibly after translations) and CORIG is original
3577 character (i.e. without any translations). UNIBYTE denotes whether c is
3578 unibyte or multibyte character.
3579 CANON_TABLE is the canonicalisation table for case folding or Qnil. */
3580 static bool
execute_charset(re_char ** pp,int c,int corig,bool unibyte,Lisp_Object canon_table)3581 execute_charset (re_char **pp, int c, int corig, bool unibyte,
3582 Lisp_Object canon_table)
3583 {
3584 eassume (0 <= c && 0 <= corig);
3585 re_char *p = *pp, *rtp = NULL;
3586 bool not = (re_opcode_t) *p == charset_not;
3587
3588 if (CHARSET_RANGE_TABLE_EXISTS_P (p))
3589 {
3590 int count;
3591 rtp = CHARSET_RANGE_TABLE (p);
3592 EXTRACT_NUMBER_AND_INCR (count, rtp);
3593 *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
3594 }
3595 else
3596 *pp += 2 + CHARSET_BITMAP_SIZE (p);
3597
3598 if (unibyte && c < (1 << BYTEWIDTH))
3599 { /* Lookup bitmap. */
3600 /* Cast to 'unsigned' instead of 'unsigned char' in
3601 case the bit list is a full 32 bytes long. */
3602 if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
3603 && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3604 return !not;
3605 }
3606 else if (rtp)
3607 {
3608 int class_bits = CHARSET_RANGE_TABLE_BITS (p);
3609 int range_start, range_end;
3610
3611 /* Sort tests by the most commonly used classes with some adjustment to which
3612 tests are easiest to perform. Take a look at comment in re_wctype_parse
3613 for table with frequencies of character class names. */
3614
3615 if ((class_bits & BIT_MULTIBYTE) ||
3616 (class_bits & BIT_ALNUM && ISALNUM (c)) ||
3617 (class_bits & BIT_ALPHA && ISALPHA (c)) ||
3618 (class_bits & BIT_SPACE && ISSPACE (c)) ||
3619 (class_bits & BIT_BLANK && ISBLANK (c)) ||
3620 (class_bits & BIT_WORD && ISWORD (c)) ||
3621 ((class_bits & BIT_UPPER) &&
3622 (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
3623 ((class_bits & BIT_LOWER) &&
3624 (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
3625 (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
3626 (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
3627 (class_bits & BIT_PRINT && ISPRINT (c)))
3628 return !not;
3629
3630 for (p = *pp; rtp < p; rtp += 2 * 3)
3631 {
3632 EXTRACT_CHARACTER (range_start, rtp);
3633 EXTRACT_CHARACTER (range_end, rtp + 3);
3634 if (range_start <= c && c <= range_end)
3635 return !not;
3636 }
3637 }
3638
3639 return not;
3640 }
3641
3642 /* True if "p1 matches something" implies "p2 fails". */
3643 static bool
mutually_exclusive_p(struct re_pattern_buffer * bufp,re_char * p1,re_char * p2)3644 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
3645 re_char *p2)
3646 {
3647 re_opcode_t op2;
3648 bool multibyte = RE_MULTIBYTE_P (bufp);
3649 unsigned char *pend = bufp->buffer + bufp->used;
3650
3651 eassert (p1 >= bufp->buffer && p1 < pend
3652 && p2 >= bufp->buffer && p2 <= pend);
3653
3654 /* Skip over open/close-group commands.
3655 If what follows this loop is a ...+ construct,
3656 look at what begins its body, since we will have to
3657 match at least one of that. */
3658 p2 = skip_noops (p2, pend);
3659 /* The same skip can be done for p1, except that this function
3660 is only used in the case where p1 is a simple match operator. */
3661 /* p1 = skip_noops (p1, pend); */
3662
3663 eassert (p1 >= bufp->buffer && p1 < pend
3664 && p2 >= bufp->buffer && p2 <= pend);
3665
3666 op2 = p2 == pend ? succeed : *p2;
3667
3668 switch (op2)
3669 {
3670 case succeed:
3671 case endbuf:
3672 /* If we're at the end of the pattern, we can change. */
3673 if (skip_one_char (p1))
3674 {
3675 DEBUG_PRINT (" End of pattern: fast loop.\n");
3676 return true;
3677 }
3678 break;
3679
3680 case endline:
3681 case exactn:
3682 {
3683 int c
3684 = (re_opcode_t) *p2 == endline ? '\n'
3685 : RE_STRING_CHAR (p2 + 2, multibyte);
3686
3687 if ((re_opcode_t) *p1 == exactn)
3688 {
3689 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
3690 {
3691 DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
3692 return true;
3693 }
3694 }
3695
3696 else if ((re_opcode_t) *p1 == charset
3697 || (re_opcode_t) *p1 == charset_not)
3698 {
3699 if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
3700 Qnil))
3701 {
3702 DEBUG_PRINT (" No match => fast loop.\n");
3703 return true;
3704 }
3705 }
3706 else if ((re_opcode_t) *p1 == anychar
3707 && c == '\n')
3708 {
3709 DEBUG_PRINT (" . != \\n => fast loop.\n");
3710 return true;
3711 }
3712 }
3713 break;
3714
3715 case charset:
3716 {
3717 if ((re_opcode_t) *p1 == exactn)
3718 /* Reuse the code above. */
3719 return mutually_exclusive_p (bufp, p2, p1);
3720
3721 /* It is hard to list up all the character in charset
3722 P2 if it includes multibyte character. Give up in
3723 such case. */
3724 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
3725 {
3726 /* Now, we are sure that P2 has no range table.
3727 So, for the size of bitmap in P2, 'p2[1]' is
3728 enough. But P1 may have range table, so the
3729 size of bitmap table of P1 is extracted by
3730 using macro 'CHARSET_BITMAP_SIZE'.
3731
3732 In a multibyte case, we know that all the character
3733 listed in P2 is ASCII. In a unibyte case, P1 has only a
3734 bitmap table. So, in both cases, it is enough to test
3735 only the bitmap table of P1. */
3736
3737 if ((re_opcode_t) *p1 == charset)
3738 {
3739 int idx;
3740 /* We win if the charset inside the loop
3741 has no overlap with the one after the loop. */
3742 for (idx = 0;
3743 (idx < (int) p2[1]
3744 && idx < CHARSET_BITMAP_SIZE (p1));
3745 idx++)
3746 if ((p2[2 + idx] & p1[2 + idx]) != 0)
3747 break;
3748
3749 if (idx == p2[1]
3750 || idx == CHARSET_BITMAP_SIZE (p1))
3751 {
3752 DEBUG_PRINT (" No match => fast loop.\n");
3753 return true;
3754 }
3755 }
3756 else if ((re_opcode_t) *p1 == charset_not)
3757 {
3758 int idx;
3759 /* We win if the charset_not inside the loop lists
3760 every character listed in the charset after. */
3761 for (idx = 0; idx < (int) p2[1]; idx++)
3762 if (! (p2[2 + idx] == 0
3763 || (idx < CHARSET_BITMAP_SIZE (p1)
3764 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
3765 break;
3766
3767 if (idx == p2[1])
3768 {
3769 DEBUG_PRINT (" No match => fast loop.\n");
3770 return true;
3771 }
3772 }
3773 }
3774 }
3775 break;
3776
3777 case charset_not:
3778 switch (*p1)
3779 {
3780 case exactn:
3781 case charset:
3782 /* Reuse the code above. */
3783 return mutually_exclusive_p (bufp, p2, p1);
3784 case charset_not:
3785 /* When we have two charset_not, it's very unlikely that
3786 they don't overlap. The union of the two sets of excluded
3787 chars should cover all possible chars, which, as a matter of
3788 fact, is virtually impossible in multibyte buffers. */
3789 break;
3790 }
3791 break;
3792
3793 case wordend:
3794 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
3795 case symend:
3796 return ((re_opcode_t) *p1 == syntaxspec
3797 && (p1[1] == Ssymbol || p1[1] == Sword));
3798 case notsyntaxspec:
3799 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
3800
3801 case wordbeg:
3802 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
3803 case symbeg:
3804 return ((re_opcode_t) *p1 == notsyntaxspec
3805 && (p1[1] == Ssymbol || p1[1] == Sword));
3806 case syntaxspec:
3807 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
3808
3809 case wordbound:
3810 return (((re_opcode_t) *p1 == notsyntaxspec
3811 || (re_opcode_t) *p1 == syntaxspec)
3812 && p1[1] == Sword);
3813
3814 case categoryspec:
3815 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
3816 case notcategoryspec:
3817 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
3818
3819 default:
3820 ;
3821 }
3822
3823 /* Safe default. */
3824 return false;
3825 }
3826
3827
3828 /* Matching routines. */
3829
3830 /* re_match_2 matches the compiled pattern in BUFP against the
3831 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3832 and SIZE2, respectively). We start matching at POS, and stop
3833 matching at STOP.
3834
3835 If REGS is non-null, store offsets for the substring each group
3836 matched in REGS.
3837
3838 We return -1 if no match, -2 if an internal error (such as the
3839 failure stack overflowing). Otherwise, we return the length of the
3840 matched substring. */
3841
3842 ptrdiff_t
re_match_2(struct re_pattern_buffer * bufp,char const * string1,ptrdiff_t size1,char const * string2,ptrdiff_t size2,ptrdiff_t pos,struct re_registers * regs,ptrdiff_t stop)3843 re_match_2 (struct re_pattern_buffer *bufp,
3844 char const *string1, ptrdiff_t size1,
3845 char const *string2, ptrdiff_t size2,
3846 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3847 {
3848 ptrdiff_t result;
3849
3850 ptrdiff_t charpos;
3851 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3852 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
3853 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3854
3855 result = re_match_2_internal (bufp, (re_char *) string1, size1,
3856 (re_char *) string2, size2,
3857 pos, regs, stop);
3858 return result;
3859 }
3860
3861 static void
unwind_re_match(void * ptr)3862 unwind_re_match (void *ptr)
3863 {
3864 struct buffer *b = (struct buffer *) ptr;
3865 b->text->inhibit_shrinking = 0;
3866 }
3867
3868 /* This is a separate function so that we can force an alloca cleanup
3869 afterwards. */
3870 static ptrdiff_t
re_match_2_internal(struct re_pattern_buffer * bufp,re_char * string1,ptrdiff_t size1,re_char * string2,ptrdiff_t size2,ptrdiff_t pos,struct re_registers * regs,ptrdiff_t stop)3871 re_match_2_internal (struct re_pattern_buffer *bufp,
3872 re_char *string1, ptrdiff_t size1,
3873 re_char *string2, ptrdiff_t size2,
3874 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3875 {
3876 eassume (0 <= size1);
3877 eassume (0 <= size2);
3878 eassume (0 <= pos && pos <= stop && stop <= size1 + size2);
3879
3880 /* General temporaries. */
3881 int mcnt;
3882
3883 /* Just past the end of the corresponding string. */
3884 re_char *end1, *end2;
3885
3886 /* Pointers into string1 and string2, just past the last characters in
3887 each to consider matching. */
3888 re_char *end_match_1, *end_match_2;
3889
3890 /* Where we are in the data, and the end of the current string. */
3891 re_char *d, *dend;
3892
3893 /* Used sometimes to remember where we were before starting matching
3894 an operator so that we can go back in case of failure. This "atomic"
3895 behavior of matching opcodes is indispensable to the correctness
3896 of the on_failure_keep_string_jump optimization. */
3897 re_char *dfail;
3898
3899 /* Where we are in the pattern, and the end of the pattern. */
3900 re_char *p = bufp->buffer;
3901 re_char *pend = p + bufp->used;
3902
3903 /* We use this to map every character in the string. */
3904 Lisp_Object translate = bufp->translate;
3905
3906 /* True if BUFP is setup from a multibyte regex. */
3907 bool multibyte = RE_MULTIBYTE_P (bufp);
3908
3909 /* True if STRING1/STRING2 are multibyte. */
3910 bool target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3911
3912 /* Failure point stack. Each place that can handle a failure further
3913 down the line pushes a failure point on this stack. It consists of
3914 regstart, and regend for all registers corresponding to
3915 the subexpressions we're currently inside, plus the number of such
3916 registers, and, finally, two char *'s. The first char * is where
3917 to resume scanning the pattern; the second one is where to resume
3918 scanning the strings. */
3919 fail_stack_type fail_stack;
3920 #ifdef DEBUG_COMPILES_ARGUMENTS
3921 ptrdiff_t nfailure_points_pushed = 0, nfailure_points_popped = 0;
3922 #endif
3923
3924 /* We fill all the registers internally, independent of what we
3925 return, for use in backreferences. The number here includes
3926 an element for register zero. */
3927 ptrdiff_t num_regs = bufp->re_nsub + 1;
3928 eassume (0 < num_regs);
3929
3930 /* Information on the contents of registers. These are pointers into
3931 the input strings; they record just what was matched (on this
3932 attempt) by a subexpression part of the pattern, that is, the
3933 regnum-th regstart pointer points to where in the pattern we began
3934 matching and the regnum-th regend points to right after where we
3935 stopped matching the regnum-th subexpression. */
3936 re_char **regstart UNINIT, **regend UNINIT;
3937
3938 /* The following record the register info as found in the above
3939 variables when we find a match better than any we've seen before.
3940 This happens as we backtrack through the failure points, which in
3941 turn happens only if we have not yet matched the entire string. */
3942 bool best_regs_set = false;
3943 re_char **best_regstart UNINIT, **best_regend UNINIT;
3944
3945 /* Logically, this is 'best_regend[0]'. But we don't want to have to
3946 allocate space for that if we're not allocating space for anything
3947 else (see below). Also, we never need info about register 0 for
3948 any of the other register vectors, and it seems rather a kludge to
3949 treat 'best_regend' differently from the rest. So we keep track of
3950 the end of the best match so far in a separate variable. We
3951 initialize this to NULL so that when we backtrack the first time
3952 and need to test it, it's not garbage. */
3953 re_char *match_end = NULL;
3954
3955 #ifdef DEBUG_COMPILES_ARGUMENTS
3956 /* Counts the total number of registers pushed. */
3957 ptrdiff_t num_regs_pushed = 0;
3958 #endif
3959
3960 DEBUG_PRINT ("\nEntering re_match_2.\n");
3961
3962 REGEX_USE_SAFE_ALLOCA;
3963
3964 INIT_FAIL_STACK ();
3965
3966 ptrdiff_t count = SPECPDL_INDEX ();
3967
3968 /* Prevent shrinking and relocation of buffer text if GC happens
3969 while we are inside this function. The calls to
3970 UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
3971 `internal--syntax-propertize`); these calls are careful to defend against
3972 buffer modifications, but even with no modifications, the buffer text may
3973 be relocated during GC by `compact_buffer` which would invalidate
3974 our C pointers to buffer text. */
3975 if (!current_buffer->text->inhibit_shrinking)
3976 {
3977 record_unwind_protect_ptr (unwind_re_match, current_buffer);
3978 current_buffer->text->inhibit_shrinking = 1;
3979 }
3980
3981 /* Do not bother to initialize all the register variables if there are
3982 no groups in the pattern, as it takes a fair amount of time. If
3983 there are groups, we include space for register 0 (the whole
3984 pattern) in REGSTART[0], even though we never use it, to avoid
3985 the undefined behavior of subtracting 1 from REGSTART. */
3986 ptrdiff_t re_nsub = num_regs - 1;
3987 if (0 < re_nsub)
3988 {
3989 regstart = SAFE_ALLOCA ((re_nsub * 4 + 1) * sizeof *regstart);
3990 regend = regstart + num_regs;
3991 best_regstart = regend + re_nsub;
3992 best_regend = best_regstart + re_nsub;
3993
3994 /* Initialize subexpression text positions to unset, to mark ones
3995 that no start_memory/stop_memory has been seen for. */
3996 for (re_char **apos = regstart + 1; apos < best_regstart + 1; apos++)
3997 *apos = NULL;
3998 }
3999
4000 /* We move 'string1' into 'string2' if the latter's empty -- but not if
4001 'string1' is null. */
4002 if (size2 == 0 && string1 != NULL)
4003 {
4004 string2 = string1;
4005 size2 = size1;
4006 string1 = 0;
4007 size1 = 0;
4008 }
4009 end1 = string1 + size1;
4010 end2 = string2 + size2;
4011
4012 /* P scans through the pattern as D scans through the data.
4013 DEND is the end of the input string that D points within.
4014 Advance D into the following input string whenever necessary, but
4015 this happens before fetching; therefore, at the beginning of the
4016 loop, D can be pointing at the end of a string, but it cannot
4017 equal STRING2. */
4018 if (pos >= size1)
4019 {
4020 /* Only match within string2. */
4021 d = string2 + pos - size1;
4022 dend = end_match_2 = string2 + stop - size1;
4023 end_match_1 = end1; /* Just to give it a value. */
4024 }
4025 else
4026 {
4027 if (stop < size1)
4028 {
4029 /* Only match within string1. */
4030 end_match_1 = string1 + stop;
4031 /* BEWARE!
4032 When we reach end_match_1, PREFETCH normally switches to string2.
4033 But in the present case, this means that just doing a PREFETCH
4034 makes us jump from 'stop' to 'gap' within the string.
4035 What we really want here is for the search to stop as
4036 soon as we hit end_match_1. That's why we set end_match_2
4037 to end_match_1 (since PREFETCH fails as soon as we hit
4038 end_match_2). */
4039 end_match_2 = end_match_1;
4040 }
4041 else
4042 { /* It's important to use this code when STOP == SIZE so that
4043 moving D from end1 to string2 will not prevent the D == DEND
4044 check from catching the end of string. */
4045 end_match_1 = end1;
4046 end_match_2 = string2 + stop - size1;
4047 }
4048 d = string1 + pos;
4049 dend = end_match_1;
4050 }
4051
4052 DEBUG_PRINT ("The compiled pattern is:\n");
4053 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4054 DEBUG_PRINT ("The string to match is: \"");
4055 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4056 DEBUG_PRINT ("\"\n");
4057
4058 /* This loops over pattern commands. It exits by returning from the
4059 function if the match is complete, or it drops through if the match
4060 fails at this starting point in the input data. */
4061 for (;;)
4062 {
4063 DEBUG_PRINT ("\n%p: ", p);
4064
4065 if (p == pend)
4066 {
4067 /* End of pattern means we might have succeeded. */
4068 DEBUG_PRINT ("end of pattern ... ");
4069
4070 /* If we haven't matched the entire string, and we want the
4071 longest match, try backtracking. */
4072 if (d != end_match_2)
4073 {
4074 /* True if this match is the best seen so far. */
4075 bool best_match_p;
4076
4077 {
4078 /* True if this match ends in the same string (string1
4079 or string2) as the best previous match. */
4080 bool same_str_p = (FIRST_STRING_P (match_end)
4081 == FIRST_STRING_P (d));
4082
4083 /* AIX compiler got confused when this was combined
4084 with the previous declaration. */
4085 if (same_str_p)
4086 best_match_p = d > match_end;
4087 else
4088 best_match_p = !FIRST_STRING_P (d);
4089 }
4090
4091 DEBUG_PRINT ("backtracking.\n");
4092
4093 if (!FAIL_STACK_EMPTY ())
4094 { /* More failure points to try. */
4095
4096 /* If exceeds best match so far, save it. */
4097 if (!best_regs_set || best_match_p)
4098 {
4099 best_regs_set = true;
4100 match_end = d;
4101
4102 DEBUG_PRINT ("\nSAVING match as best so far.\n");
4103
4104 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4105 {
4106 best_regstart[reg] = regstart[reg];
4107 best_regend[reg] = regend[reg];
4108 }
4109 }
4110 goto fail;
4111 }
4112
4113 /* If no failure points, don't restore garbage. And if
4114 last match is real best match, don't restore second
4115 best one. */
4116 else if (best_regs_set && !best_match_p)
4117 {
4118 restore_best_regs:
4119 /* Restore best match. It may happen that 'dend ==
4120 end_match_1' while the restored d is in string2.
4121 For example, the pattern 'x.*y.*z' against the
4122 strings 'x-' and 'y-z-', if the two strings are
4123 not consecutive in memory. */
4124 DEBUG_PRINT ("Restoring best registers.\n");
4125
4126 d = match_end;
4127 dend = ((d >= string1 && d <= end1)
4128 ? end_match_1 : end_match_2);
4129
4130 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4131 {
4132 regstart[reg] = best_regstart[reg];
4133 regend[reg] = best_regend[reg];
4134 eassert (REG_UNSET (regstart[reg])
4135 <= REG_UNSET (regend[reg]));
4136 }
4137 }
4138 } /* d != end_match_2 */
4139
4140 succeed_label:
4141 DEBUG_PRINT ("Accepting match.\n");
4142
4143 /* If caller wants register contents data back, do it. */
4144 if (regs)
4145 {
4146 /* Have the register data arrays been allocated? */
4147 if (bufp->regs_allocated == REGS_UNALLOCATED)
4148 { /* No. So allocate them with malloc. */
4149 ptrdiff_t n = max (RE_NREGS, num_regs);
4150 regs->start = xnmalloc (n, sizeof *regs->start);
4151 regs->end = xnmalloc (n, sizeof *regs->end);
4152 regs->num_regs = n;
4153 bufp->regs_allocated = REGS_REALLOCATE;
4154 }
4155 else if (bufp->regs_allocated == REGS_REALLOCATE)
4156 { /* Yes. If we need more elements than were already
4157 allocated, reallocate them. If we need fewer, just
4158 leave it alone. */
4159 ptrdiff_t n = regs->num_regs;
4160 if (n < num_regs)
4161 {
4162 n = max (n + (n >> 1), num_regs);
4163 regs->start
4164 = xnrealloc (regs->start, n, sizeof *regs->start);
4165 regs->end = xnrealloc (regs->end, n, sizeof *regs->end);
4166 regs->num_regs = n;
4167 }
4168 }
4169 else
4170 eassert (bufp->regs_allocated == REGS_FIXED);
4171
4172 /* Convert the pointer data in 'regstart' and 'regend' to
4173 indices. Register zero has to be set differently,
4174 since we haven't kept track of any info for it. */
4175 if (regs->num_regs > 0)
4176 {
4177 regs->start[0] = pos;
4178 regs->end[0] = POINTER_TO_OFFSET (d);
4179 }
4180
4181 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4182 {
4183 eassert (REG_UNSET (regstart[reg])
4184 <= REG_UNSET (regend[reg]));
4185 if (REG_UNSET (regend[reg]))
4186 regs->start[reg] = regs->end[reg] = -1;
4187 else
4188 {
4189 regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
4190 regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
4191 }
4192 }
4193
4194 /* If the regs structure we return has more elements than
4195 were in the pattern, set the extra elements to -1. */
4196 for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++)
4197 regs->start[reg] = regs->end[reg] = -1;
4198 }
4199
4200 DEBUG_PRINT ("%td failure points pushed, %td popped (%td remain).\n",
4201 nfailure_points_pushed, nfailure_points_popped,
4202 nfailure_points_pushed - nfailure_points_popped);
4203 DEBUG_PRINT ("%td registers pushed.\n", num_regs_pushed);
4204
4205 ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
4206
4207 DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
4208
4209 unbind_to (count, Qnil);
4210 SAFE_FREE ();
4211 return dcnt;
4212 }
4213
4214 /* Otherwise match next pattern command. */
4215 switch (*p++)
4216 {
4217 /* Ignore these. Used to ignore the n of succeed_n's which
4218 currently have n == 0. */
4219 case no_op:
4220 DEBUG_PRINT ("EXECUTING no_op.\n");
4221 break;
4222
4223 case succeed:
4224 DEBUG_PRINT ("EXECUTING succeed.\n");
4225 goto succeed_label;
4226
4227 /* Match the next n pattern characters exactly. The following
4228 byte in the pattern defines n, and the n bytes after that
4229 are the characters to match. */
4230 case exactn:
4231 mcnt = *p++;
4232 DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
4233
4234 /* Remember the start point to rollback upon failure. */
4235 dfail = d;
4236
4237 /* The cost of testing 'translate' is comparatively small. */
4238 if (target_multibyte)
4239 do
4240 {
4241 int pat_charlen, buf_charlen;
4242 int pat_ch, buf_ch;
4243
4244 PREFETCH ();
4245 if (multibyte)
4246 pat_ch = string_char_and_length (p, &pat_charlen);
4247 else
4248 {
4249 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
4250 pat_charlen = 1;
4251 }
4252 buf_ch = string_char_and_length (d, &buf_charlen);
4253
4254 if (TRANSLATE (buf_ch) != pat_ch)
4255 {
4256 d = dfail;
4257 goto fail;
4258 }
4259
4260 p += pat_charlen;
4261 d += buf_charlen;
4262 mcnt -= pat_charlen;
4263 }
4264 while (mcnt > 0);
4265 else
4266 do
4267 {
4268 int pat_charlen;
4269 int pat_ch, buf_ch;
4270
4271 PREFETCH ();
4272 if (multibyte)
4273 {
4274 pat_ch = string_char_and_length (p, &pat_charlen);
4275 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
4276 }
4277 else
4278 {
4279 pat_ch = *p;
4280 pat_charlen = 1;
4281 }
4282 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
4283 if (! CHAR_BYTE8_P (buf_ch))
4284 {
4285 buf_ch = TRANSLATE (buf_ch);
4286 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
4287 if (buf_ch < 0)
4288 buf_ch = *d;
4289 }
4290 else
4291 buf_ch = *d;
4292 if (buf_ch != pat_ch)
4293 {
4294 d = dfail;
4295 goto fail;
4296 }
4297 p += pat_charlen;
4298 d++;
4299 mcnt -= pat_charlen;
4300 }
4301 while (mcnt > 0);
4302
4303 break;
4304
4305
4306 /* Match any character except newline. */
4307 case anychar:
4308 {
4309 int buf_charlen;
4310 int buf_ch;
4311
4312 DEBUG_PRINT ("EXECUTING anychar.\n");
4313
4314 PREFETCH ();
4315 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
4316 target_multibyte);
4317 buf_ch = TRANSLATE (buf_ch);
4318 if (buf_ch == '\n')
4319 goto fail;
4320
4321 DEBUG_PRINT (" Matched \"%d\".\n", *d);
4322 d += buf_charlen;
4323 }
4324 break;
4325
4326
4327 case charset:
4328 case charset_not:
4329 {
4330 /* Whether matching against a unibyte character. */
4331 bool unibyte_char = false;
4332
4333 DEBUG_PRINT ("EXECUTING charset%s.\n",
4334 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
4335
4336 PREFETCH ();
4337 int len;
4338 int corig = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
4339 int c = corig;
4340 if (target_multibyte)
4341 {
4342 int c1;
4343
4344 c = TRANSLATE (c);
4345 c1 = RE_CHAR_TO_UNIBYTE (c);
4346 if (c1 >= 0)
4347 {
4348 unibyte_char = true;
4349 c = c1;
4350 }
4351 }
4352 else
4353 {
4354 int c1 = RE_CHAR_TO_MULTIBYTE (c);
4355
4356 if (! CHAR_BYTE8_P (c1))
4357 {
4358 c1 = TRANSLATE (c1);
4359 c1 = RE_CHAR_TO_UNIBYTE (c1);
4360 if (c1 >= 0)
4361 {
4362 unibyte_char = true;
4363 c = c1;
4364 }
4365 }
4366 else
4367 unibyte_char = true;
4368 }
4369
4370 p -= 1;
4371 if (!execute_charset (&p, c, corig, unibyte_char, translate))
4372 goto fail;
4373
4374 d += len;
4375 }
4376 break;
4377
4378
4379 /* The beginning of a group is represented by start_memory.
4380 The argument is the register number. The text
4381 matched within the group is recorded (in the internal
4382 registers data structure) under the register number. */
4383 case start_memory:
4384 DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
4385 eassert (0 < *p && *p < num_regs);
4386
4387 /* In case we need to undo this operation (via backtracking). */
4388 PUSH_FAILURE_REG (*p);
4389
4390 regstart[*p] = d;
4391 DEBUG_PRINT (" regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
4392
4393 /* Move past the register number and inner group count. */
4394 p += 1;
4395 break;
4396
4397
4398 /* The stop_memory opcode represents the end of a group. Its
4399 argument is the same as start_memory's: the register number. */
4400 case stop_memory:
4401 DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
4402
4403 eassert (0 < *p && *p < num_regs);
4404 eassert (!REG_UNSET (regstart[*p]));
4405 /* Strictly speaking, there should be code such as:
4406
4407 eassert (REG_UNSET (regend[*p]));
4408 PUSH_FAILURE_REGSTOP (*p);
4409
4410 But the only info to be pushed is regend[*p] and it is known to
4411 be UNSET, so there really isn't anything to push.
4412 Not pushing anything, on the other hand deprives us from the
4413 guarantee that regend[*p] is UNSET since undoing this operation
4414 will not reset its value properly. This is not important since
4415 the value will only be read on the next start_memory or at
4416 the very end and both events can only happen if this stop_memory
4417 is *not* undone. */
4418
4419 regend[*p] = d;
4420 DEBUG_PRINT (" regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
4421
4422 /* Move past the register number and the inner group count. */
4423 p += 1;
4424 break;
4425
4426
4427 /* \<digit> has been turned into a 'duplicate' command which is
4428 followed by the numeric value of <digit> as the register number. */
4429 case duplicate:
4430 {
4431 re_char *d2, *dend2;
4432 int regno = *p++; /* Get which register to match against. */
4433 DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
4434
4435 /* Can't back reference a group which we've never matched. */
4436 eassert (0 < regno && regno < num_regs);
4437 eassert (REG_UNSET (regstart[regno]) <= REG_UNSET (regend[regno]));
4438 if (REG_UNSET (regend[regno]))
4439 goto fail;
4440
4441 /* Where in input to try to start matching. */
4442 d2 = regstart[regno];
4443
4444 /* Remember the start point to rollback upon failure. */
4445 dfail = d;
4446
4447 /* Where to stop matching; if both the place to start and
4448 the place to stop matching are in the same string, then
4449 set to the place to stop, otherwise, for now have to use
4450 the end of the first string. */
4451
4452 dend2 = ((FIRST_STRING_P (regstart[regno])
4453 == FIRST_STRING_P (regend[regno]))
4454 ? regend[regno] : end_match_1);
4455 for (;;)
4456 {
4457 ptrdiff_t dcnt;
4458
4459 /* If necessary, advance to next segment in register
4460 contents. */
4461 while (d2 == dend2)
4462 {
4463 if (dend2 == end_match_2) break;
4464 if (dend2 == regend[regno]) break;
4465
4466 /* End of string1 => advance to string2. */
4467 d2 = string2;
4468 dend2 = regend[regno];
4469 }
4470 /* At end of register contents => success */
4471 if (d2 == dend2) break;
4472
4473 /* If necessary, advance to next segment in data. */
4474 PREFETCH ();
4475
4476 /* How many characters left in this segment to match. */
4477 dcnt = dend - d;
4478
4479 /* Want how many consecutive characters we can match in
4480 one shot, so, if necessary, adjust the count. */
4481 if (dcnt > dend2 - d2)
4482 dcnt = dend2 - d2;
4483
4484 /* Compare that many; failure if mismatch, else move
4485 past them. */
4486 if (!NILP (translate)
4487 ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
4488 : memcmp (d, d2, dcnt))
4489 {
4490 d = dfail;
4491 goto fail;
4492 }
4493 d += dcnt, d2 += dcnt;
4494 }
4495 }
4496 break;
4497
4498
4499 /* begline matches the empty string at the beginning of the string,
4500 and after newlines. */
4501 case begline:
4502 DEBUG_PRINT ("EXECUTING begline.\n");
4503
4504 if (AT_STRINGS_BEG (d))
4505 break;
4506 else
4507 {
4508 unsigned c;
4509 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
4510 if (c == '\n')
4511 break;
4512 }
4513 goto fail;
4514
4515
4516 /* endline is the dual of begline. */
4517 case endline:
4518 DEBUG_PRINT ("EXECUTING endline.\n");
4519
4520 if (AT_STRINGS_END (d))
4521 break;
4522 PREFETCH_NOLIMIT ();
4523 if (*d == '\n')
4524 break;
4525 goto fail;
4526
4527
4528 /* Match at the very beginning of the data. */
4529 case begbuf:
4530 DEBUG_PRINT ("EXECUTING begbuf.\n");
4531 if (AT_STRINGS_BEG (d))
4532 break;
4533 goto fail;
4534
4535
4536 /* Match at the very end of the data. */
4537 case endbuf:
4538 DEBUG_PRINT ("EXECUTING endbuf.\n");
4539 if (AT_STRINGS_END (d))
4540 break;
4541 goto fail;
4542
4543
4544 /* on_failure_keep_string_jump is used to optimize '.*\n'. It
4545 pushes NULL as the value for the string on the stack. Then
4546 'POP_FAILURE_POINT' will keep the current value for the
4547 string, instead of restoring it. To see why, consider
4548 matching 'foo\nbar' against '.*\n'. The .* matches the foo;
4549 then the . fails against the \n. But the next thing we want
4550 to do is match the \n against the \n; if we restored the
4551 string value, we would be back at the foo.
4552
4553 Because this is used only in specific cases, we don't need to
4554 check all the things that 'on_failure_jump' does, to make
4555 sure the right things get saved on the stack. Hence we don't
4556 share its code. The only reason to push anything on the
4557 stack at all is that otherwise we would have to change
4558 'anychar's code to do something besides goto fail in this
4559 case; that seems worse than this. */
4560 case on_failure_keep_string_jump:
4561 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4562 DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
4563 mcnt, p + mcnt);
4564
4565 PUSH_FAILURE_POINT (p - 3, NULL);
4566 break;
4567
4568 /* A nasty loop is introduced by the non-greedy *? and +?.
4569 With such loops, the stack only ever contains one failure point
4570 at a time, so that a plain on_failure_jump_loop kind of
4571 cycle detection cannot work. Worse yet, such a detection
4572 can not only fail to detect a cycle, but it can also wrongly
4573 detect a cycle (between different instantiations of the same
4574 loop).
4575 So the method used for those nasty loops is a little different:
4576 We use a special cycle-detection-stack-frame which is pushed
4577 when the on_failure_jump_nastyloop failure-point is *popped*.
4578 This special frame thus marks the beginning of one iteration
4579 through the loop and we can hence easily check right here
4580 whether something matched between the beginning and the end of
4581 the loop. */
4582 case on_failure_jump_nastyloop:
4583 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4584 DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
4585 mcnt, p + mcnt);
4586
4587 eassert ((re_opcode_t)p[-4] == no_op);
4588 {
4589 bool cycle = false;
4590 CHECK_INFINITE_LOOP (p - 4, d);
4591 if (!cycle)
4592 /* If there's a cycle, just continue without pushing
4593 this failure point. The failure point is the "try again"
4594 option, which shouldn't be tried.
4595 We want (x?)*?y\1z to match both xxyz and xxyxz. */
4596 PUSH_FAILURE_POINT (p - 3, d);
4597 }
4598 break;
4599
4600 /* Simple loop detecting on_failure_jump: just check on the
4601 failure stack if the same spot was already hit earlier. */
4602 case on_failure_jump_loop:
4603 on_failure:
4604 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4605 DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
4606 mcnt, p + mcnt);
4607 {
4608 bool cycle = false;
4609 CHECK_INFINITE_LOOP (p - 3, d);
4610 if (cycle)
4611 /* If there's a cycle, get out of the loop, as if the matching
4612 had failed. We used to just 'goto fail' here, but that was
4613 aborting the search a bit too early: we want to keep the
4614 empty-loop-match and keep matching after the loop.
4615 We want (x?)*y\1z to match both xxyz and xxyxz. */
4616 p += mcnt;
4617 else
4618 PUSH_FAILURE_POINT (p - 3, d);
4619 }
4620 break;
4621
4622
4623 /* Uses of on_failure_jump:
4624
4625 Each alternative starts with an on_failure_jump that points
4626 to the beginning of the next alternative. Each alternative
4627 except the last ends with a jump that in effect jumps past
4628 the rest of the alternatives. (They really jump to the
4629 ending jump of the following alternative, because tensioning
4630 these jumps is a hassle.)
4631
4632 Repeats start with an on_failure_jump that points past both
4633 the repetition text and either the following jump or
4634 pop_failure_jump back to this on_failure_jump. */
4635 case on_failure_jump:
4636 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4637 DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
4638 mcnt, p + mcnt);
4639
4640 PUSH_FAILURE_POINT (p -3, d);
4641 break;
4642
4643 /* This operation is used for greedy *.
4644 Compare the beginning of the repeat with what in the
4645 pattern follows its end. If we can establish that there
4646 is nothing that they would both match, i.e., that we
4647 would have to backtrack because of (as in, e.g., 'a*a')
4648 then we can use a non-backtracking loop based on
4649 on_failure_keep_string_jump instead of on_failure_jump. */
4650 case on_failure_jump_smart:
4651 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4652 DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
4653 mcnt, p + mcnt);
4654 {
4655 re_char *p1 = p; /* Next operation. */
4656 /* Discard 'const', making re_search non-reentrant. */
4657 unsigned char *p2 = (unsigned char *) p + mcnt; /* Jump dest. */
4658 unsigned char *p3 = (unsigned char *) p - 3; /* opcode location. */
4659
4660 p -= 3; /* Reset so that we will re-execute the
4661 instruction once it's been changed. */
4662
4663 EXTRACT_NUMBER (mcnt, p2 - 2);
4664
4665 /* Ensure this is indeed the trivial kind of loop
4666 we are expecting. */
4667 eassert (skip_one_char (p1) == p2 - 3);
4668 eassert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
4669 DEBUG_STATEMENT (regex_emacs_debug += 2);
4670 if (mutually_exclusive_p (bufp, p1, p2))
4671 {
4672 /* Use a fast 'on_failure_keep_string_jump' loop. */
4673 DEBUG_PRINT (" smart exclusive => fast loop.\n");
4674 *p3 = (unsigned char) on_failure_keep_string_jump;
4675 STORE_NUMBER (p2 - 2, mcnt + 3);
4676 }
4677 else
4678 {
4679 /* Default to a safe 'on_failure_jump' loop. */
4680 DEBUG_PRINT (" smart default => slow loop.\n");
4681 *p3 = (unsigned char) on_failure_jump;
4682 }
4683 DEBUG_STATEMENT (regex_emacs_debug -= 2);
4684 }
4685 break;
4686
4687 /* Unconditionally jump (without popping any failure points). */
4688 case jump:
4689 unconditional_jump:
4690 maybe_quit ();
4691 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4692 DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
4693 p += mcnt; /* Do the jump. */
4694 DEBUG_PRINT ("(to %p).\n", p);
4695 break;
4696
4697
4698 /* Have to succeed matching what follows at least n times.
4699 After that, handle like 'on_failure_jump'. */
4700 case succeed_n:
4701 /* Signedness doesn't matter since we only compare MCNT to 0. */
4702 EXTRACT_NUMBER (mcnt, p + 2);
4703 DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
4704
4705 /* Originally, mcnt is how many times we HAVE to succeed. */
4706 if (mcnt != 0)
4707 {
4708 /* Discard 'const', making re_search non-reentrant. */
4709 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4710 mcnt--;
4711 p += 4;
4712 PUSH_NUMBER (p2, mcnt);
4713 }
4714 else
4715 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
4716 goto on_failure;
4717 break;
4718
4719 case jump_n:
4720 /* Signedness doesn't matter since we only compare MCNT to 0. */
4721 EXTRACT_NUMBER (mcnt, p + 2);
4722 DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
4723
4724 /* Originally, this is how many times we CAN jump. */
4725 if (mcnt != 0)
4726 {
4727 /* Discard 'const', making re_search non-reentrant. */
4728 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4729 mcnt--;
4730 PUSH_NUMBER (p2, mcnt);
4731 goto unconditional_jump;
4732 }
4733 /* If don't have to jump any more, skip over the rest of command. */
4734 else
4735 p += 4;
4736 break;
4737
4738 case set_number_at:
4739 {
4740 unsigned char *p2; /* Location of the counter. */
4741 DEBUG_PRINT ("EXECUTING set_number_at.\n");
4742
4743 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4744 /* Discard 'const', making re_search non-reentrant. */
4745 p2 = (unsigned char *) p + mcnt;
4746 /* Signedness doesn't matter since we only copy MCNT's bits. */
4747 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4748 DEBUG_PRINT (" Setting %p to %d.\n", p2, mcnt);
4749 PUSH_NUMBER (p2, mcnt);
4750 break;
4751 }
4752
4753 case wordbound:
4754 case notwordbound:
4755 {
4756 bool not = (re_opcode_t) *(p - 1) == notwordbound;
4757 DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
4758
4759 /* We SUCCEED (or FAIL) in one of the following cases: */
4760
4761 /* Case 1: D is at the beginning or the end of string. */
4762 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
4763 not = !not;
4764 else
4765 {
4766 /* C1 is the character before D, S1 is the syntax of C1, C2
4767 is the character at D, and S2 is the syntax of C2. */
4768 int c1, c2;
4769 int s1, s2;
4770 int dummy;
4771 ptrdiff_t offset = PTR_TO_OFFSET (d);
4772 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4773 UPDATE_SYNTAX_TABLE (charpos);
4774 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4775 s1 = SYNTAX (c1);
4776 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4777 PREFETCH_NOLIMIT ();
4778 GET_CHAR_AFTER (c2, d, dummy);
4779 s2 = SYNTAX (c2);
4780
4781 if (/* Case 2: Only one of S1 and S2 is Sword. */
4782 ((s1 == Sword) != (s2 == Sword))
4783 /* Case 3: Both of S1 and S2 are Sword, and macro
4784 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
4785 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
4786 not = !not;
4787 }
4788 if (not)
4789 break;
4790 else
4791 goto fail;
4792 }
4793
4794 case wordbeg:
4795 DEBUG_PRINT ("EXECUTING wordbeg.\n");
4796
4797 /* We FAIL in one of the following cases: */
4798
4799 /* Case 1: D is at the end of string. */
4800 if (AT_STRINGS_END (d))
4801 goto fail;
4802 else
4803 {
4804 /* C1 is the character before D, S1 is the syntax of C1, C2
4805 is the character at D, and S2 is the syntax of C2. */
4806 int c1, c2;
4807 int s1, s2;
4808 int dummy;
4809 ptrdiff_t offset = PTR_TO_OFFSET (d);
4810 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4811 UPDATE_SYNTAX_TABLE (charpos);
4812 PREFETCH ();
4813 GET_CHAR_AFTER (c2, d, dummy);
4814 s2 = SYNTAX (c2);
4815
4816 /* Case 2: S2 is not Sword. */
4817 if (s2 != Sword)
4818 goto fail;
4819
4820 /* Case 3: D is not at the beginning of string ... */
4821 if (!AT_STRINGS_BEG (d))
4822 {
4823 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4824 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4825 s1 = SYNTAX (c1);
4826
4827 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
4828 returns 0. */
4829 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4830 goto fail;
4831 }
4832 }
4833 break;
4834
4835 case wordend:
4836 DEBUG_PRINT ("EXECUTING wordend.\n");
4837
4838 /* We FAIL in one of the following cases: */
4839
4840 /* Case 1: D is at the beginning of string. */
4841 if (AT_STRINGS_BEG (d))
4842 goto fail;
4843 else
4844 {
4845 /* C1 is the character before D, S1 is the syntax of C1, C2
4846 is the character at D, and S2 is the syntax of C2. */
4847 int c1, c2;
4848 int s1, s2;
4849 int dummy;
4850 ptrdiff_t offset = PTR_TO_OFFSET (d);
4851 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4852 UPDATE_SYNTAX_TABLE (charpos);
4853 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4854 s1 = SYNTAX (c1);
4855
4856 /* Case 2: S1 is not Sword. */
4857 if (s1 != Sword)
4858 goto fail;
4859
4860 /* Case 3: D is not at the end of string ... */
4861 if (!AT_STRINGS_END (d))
4862 {
4863 PREFETCH_NOLIMIT ();
4864 GET_CHAR_AFTER (c2, d, dummy);
4865 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4866 s2 = SYNTAX (c2);
4867
4868 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
4869 returns 0. */
4870 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4871 goto fail;
4872 }
4873 }
4874 break;
4875
4876 case symbeg:
4877 DEBUG_PRINT ("EXECUTING symbeg.\n");
4878
4879 /* We FAIL in one of the following cases: */
4880
4881 /* Case 1: D is at the end of string. */
4882 if (AT_STRINGS_END (d))
4883 goto fail;
4884 else
4885 {
4886 /* C1 is the character before D, S1 is the syntax of C1, C2
4887 is the character at D, and S2 is the syntax of C2. */
4888 int c1, c2;
4889 int s1, s2;
4890 ptrdiff_t offset = PTR_TO_OFFSET (d);
4891 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4892 UPDATE_SYNTAX_TABLE (charpos);
4893 PREFETCH ();
4894 c2 = RE_STRING_CHAR (d, target_multibyte);
4895 s2 = SYNTAX (c2);
4896
4897 /* Case 2: S2 is neither Sword nor Ssymbol. */
4898 if (s2 != Sword && s2 != Ssymbol)
4899 goto fail;
4900
4901 /* Case 3: D is not at the beginning of string ... */
4902 if (!AT_STRINGS_BEG (d))
4903 {
4904 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4905 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4906 s1 = SYNTAX (c1);
4907
4908 /* ... and S1 is Sword or Ssymbol. */
4909 if (s1 == Sword || s1 == Ssymbol)
4910 goto fail;
4911 }
4912 }
4913 break;
4914
4915 case symend:
4916 DEBUG_PRINT ("EXECUTING symend.\n");
4917
4918 /* We FAIL in one of the following cases: */
4919
4920 /* Case 1: D is at the beginning of string. */
4921 if (AT_STRINGS_BEG (d))
4922 goto fail;
4923 else
4924 {
4925 /* C1 is the character before D, S1 is the syntax of C1, C2
4926 is the character at D, and S2 is the syntax of C2. */
4927 int c1, c2;
4928 int s1, s2;
4929 ptrdiff_t offset = PTR_TO_OFFSET (d);
4930 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4931 UPDATE_SYNTAX_TABLE (charpos);
4932 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4933 s1 = SYNTAX (c1);
4934
4935 /* Case 2: S1 is neither Ssymbol nor Sword. */
4936 if (s1 != Sword && s1 != Ssymbol)
4937 goto fail;
4938
4939 /* Case 3: D is not at the end of string ... */
4940 if (!AT_STRINGS_END (d))
4941 {
4942 PREFETCH_NOLIMIT ();
4943 c2 = RE_STRING_CHAR (d, target_multibyte);
4944 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4945 s2 = SYNTAX (c2);
4946
4947 /* ... and S2 is Sword or Ssymbol. */
4948 if (s2 == Sword || s2 == Ssymbol)
4949 goto fail;
4950 }
4951 }
4952 break;
4953
4954 case syntaxspec:
4955 case notsyntaxspec:
4956 {
4957 bool not = (re_opcode_t) *(p - 1) == notsyntaxspec;
4958 mcnt = *p++;
4959 DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
4960 mcnt);
4961 PREFETCH ();
4962 {
4963 ptrdiff_t offset = PTR_TO_OFFSET (d);
4964 ptrdiff_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4965 UPDATE_SYNTAX_TABLE (pos1);
4966 }
4967 {
4968 int len;
4969 int c;
4970
4971 GET_CHAR_AFTER (c, d, len);
4972 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
4973 goto fail;
4974 d += len;
4975 }
4976 }
4977 break;
4978
4979 case at_dot:
4980 DEBUG_PRINT ("EXECUTING at_dot.\n");
4981 if (PTR_BYTE_POS (d) != PT_BYTE)
4982 goto fail;
4983 break;
4984
4985 case categoryspec:
4986 case notcategoryspec:
4987 {
4988 bool not = (re_opcode_t) *(p - 1) == notcategoryspec;
4989 mcnt = *p++;
4990 DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
4991 not ? "not" : "", mcnt);
4992 PREFETCH ();
4993
4994 {
4995 int len;
4996 int c;
4997 GET_CHAR_AFTER (c, d, len);
4998 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
4999 goto fail;
5000 d += len;
5001 }
5002 }
5003 break;
5004
5005 default:
5006 abort ();
5007 }
5008 continue; /* Successfully executed one pattern command; keep going. */
5009
5010
5011 /* We goto here if a matching operation fails. */
5012 fail:
5013 maybe_quit ();
5014 if (!FAIL_STACK_EMPTY ())
5015 {
5016 re_char *str, *pat;
5017 /* A restart point is known. Restore to that state. */
5018 DEBUG_PRINT ("\nFAIL:\n");
5019 POP_FAILURE_POINT (str, pat);
5020 switch (*pat++)
5021 {
5022 case on_failure_keep_string_jump:
5023 eassert (str == NULL);
5024 goto continue_failure_jump;
5025
5026 case on_failure_jump_nastyloop:
5027 eassert ((re_opcode_t)pat[-2] == no_op);
5028 PUSH_FAILURE_POINT (pat - 2, str);
5029 FALLTHROUGH;
5030 case on_failure_jump_loop:
5031 case on_failure_jump:
5032 case succeed_n:
5033 d = str;
5034 continue_failure_jump:
5035 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5036 p = pat + mcnt;
5037 break;
5038
5039 case no_op:
5040 /* A special frame used for nastyloops. */
5041 goto fail;
5042
5043 default:
5044 abort ();
5045 }
5046
5047 eassert (p >= bufp->buffer && p <= pend);
5048
5049 if (d >= string1 && d <= end1)
5050 dend = end_match_1;
5051 }
5052 else
5053 break; /* Matching at this starting point really fails. */
5054 } /* for (;;) */
5055
5056 if (best_regs_set)
5057 goto restore_best_regs;
5058
5059 unbind_to (count, Qnil);
5060 SAFE_FREE ();
5061
5062 return -1; /* Failure to match. */
5063 }
5064
5065 /* Subroutine definitions for re_match_2. */
5066
5067 /* Return true if TRANSLATE[S1] and TRANSLATE[S2] are not identical
5068 for LEN bytes. */
5069
5070 static bool
bcmp_translate(re_char * s1,re_char * s2,ptrdiff_t len,Lisp_Object translate,bool target_multibyte)5071 bcmp_translate (re_char *s1, re_char *s2, ptrdiff_t len,
5072 Lisp_Object translate, bool target_multibyte)
5073 {
5074 re_char *p1 = s1, *p2 = s2;
5075 re_char *p1_end = s1 + len;
5076 re_char *p2_end = s2 + len;
5077
5078 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5079 different lengths, but relying on a single LEN would break this. -sm */
5080 while (p1 < p1_end && p2 < p2_end)
5081 {
5082 int p1_charlen, p2_charlen;
5083 int p1_ch, p2_ch;
5084
5085 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5086 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5087
5088 if (RE_TRANSLATE (translate, p1_ch)
5089 != RE_TRANSLATE (translate, p2_ch))
5090 return true;
5091
5092 p1 += p1_charlen, p2 += p2_charlen;
5093 }
5094
5095 return p1 != p1_end || p2 != p2_end;
5096 }
5097
5098 /* Entry points for GNU code. */
5099
5100 /* re_compile_pattern is the GNU regular expression compiler: it
5101 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5102 Returns 0 if the pattern was valid, otherwise an error string.
5103
5104 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
5105 are set in BUFP on entry.
5106
5107 We call regex_compile to do the actual compilation. */
5108
5109 const char *
re_compile_pattern(const char * pattern,ptrdiff_t length,bool posix_backtracking,const char * whitespace_regexp,struct re_pattern_buffer * bufp)5110 re_compile_pattern (const char *pattern, ptrdiff_t length,
5111 bool posix_backtracking, const char *whitespace_regexp,
5112 struct re_pattern_buffer *bufp)
5113 {
5114 bufp->regs_allocated = REGS_UNALLOCATED;
5115
5116 reg_errcode_t ret
5117 = regex_compile ((re_char *) pattern, length,
5118 posix_backtracking,
5119 whitespace_regexp,
5120 bufp);
5121
5122 if (!ret)
5123 return NULL;
5124 return re_error_msgid[ret];
5125 }
5126