1 /* -*-C-*-
2
3 Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
4 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
5 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Massachusetts
6 Institute of Technology
7
8 This file is part of MIT/GNU Scheme.
9
10 MIT/GNU Scheme is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or (at
13 your option) any later version.
14
15 MIT/GNU Scheme is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with MIT/GNU Scheme; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
23 USA.
24
25 */
26
27 /* Regular expression matching and search */
28
29 /* NOTE: This program was created by translation from the regular
30 expression code of GNU Emacs; it was translated from the original C
31 to 68000 assembly language (in 1986), and then translated back from
32 68000 assembly language to C (in 1987). */
33
34 #include "scheme.h"
35 #include "syntax.h"
36 #include "regex.h"
37
38 #if defined(__IRIX__) || defined(_AIX)
39 #define SIGN_EXTEND_CHAR(x) \
40 ((((int) (x)) >= 0x80) ? (((int) (x)) - 0x100) : ((int) (x)))
41 #endif
42
43 #ifndef SIGN_EXTEND_CHAR
44 #define SIGN_EXTEND_CHAR(x) (x)
45 #endif /* not SIGN_EXTEND_CHAR */
46
47 #ifndef SWITCH_ENUM
48 #define SWITCH_ENUM(enum_type, expression) \
49 switch ((enum enum_type) (expression))
50 #endif /* not SWITCH_ENUM */
51
52 #ifndef RE_NFAILURES
53 #define RE_NFAILURES 512
54 #endif
55
56 #define FOR_INDEX_RANGE(index, start, end) \
57 for (index = (start); (index < (end)); index += 1)
58
59 #define FOR_INDEX_BELOW(index, limit) \
60 FOR_INDEX_RANGE (index, 0, (limit))
61
62 #define FOR_ALL_ASCII(index) \
63 FOR_INDEX_BELOW (index, MAX_ASCII)
64
65 #define FOR_ALL_ASCII_SUCH_THAT(index, expression) \
66 FOR_ALL_ASCII (index) \
67 if (expression)
68
69 #define TRANSLATE_CHAR(ascii) \
70 ((translation == NULL) ? (ascii) : (translation [(ascii)]))
71
72 #define WORD_CONSTITUENT_P(ascii) \
73 (SYNTAX_CONSTITUENT_P (syntaxcode_word, (ascii)))
74
75 #define SYNTAX_CONSTITUENT_P(code, ascii) \
76 ((SYNTAX_ENTRY_CODE (SYNTAX_TABLE_REF (syntax_table, (ascii)))) == (code))
77
78 #define CHAR_SET_MEMBER_P(length, char_set, ascii) \
79 (((ascii) < ((length) * ASCII_LENGTH)) && \
80 (CHAR_SET_MEMBER_P_INTERNAL (char_set, ascii)))
81
82 #define CHAR_SET_MEMBER_P_INTERNAL(char_set, ascii) \
83 ((((char_set) [((ascii) / ASCII_LENGTH)]) & \
84 (1 << ((ascii) % ASCII_LENGTH))) \
85 != 0)
86
87 #define READ_PATTERN_CHAR(target) do \
88 { \
89 if (pattern_pc >= pattern_end) \
90 BAD_PATTERN (); \
91 (target) = (*pattern_pc++); \
92 } while (0)
93
94 #define READ_PATTERN_OFFSET(target) do \
95 { \
96 signed char _fetched; \
97 if ((pattern_pc + 1) >= pattern_end) \
98 BAD_PATTERN (); \
99 (target) = (*pattern_pc++); \
100 _fetched = (* ((signed char *) (pattern_pc++))); \
101 (target) += ((SIGN_EXTEND_CHAR (_fetched)) << ASCII_LENGTH); \
102 if (((pattern_pc + (target)) < pattern_start) || \
103 ((pattern_pc + (target)) > pattern_end)) \
104 BAD_PATTERN (); \
105 } while (0)
106
107 #define READ_PATTERN_LENGTH(target) do \
108 { \
109 int _len; \
110 \
111 if (pattern_pc >= pattern_end) \
112 BAD_PATTERN (); \
113 _len = ((int) (*pattern_pc++)); \
114 if ((pattern_pc + _len) > pattern_end) \
115 BAD_PATTERN (); \
116 (target) = _len; \
117 } while (0)
118
119 #define READ_PATTERN_REGISTER(target) do \
120 { \
121 if ((pattern_pc >= pattern_end) || \
122 (((target) = (*pattern_pc++)) >= RE_NREGS)) \
123 BAD_PATTERN (); \
124 } while (0)
125
126 #define READ_PATTERN_SYNTAXCODE(target) do \
127 { \
128 if ((pattern_pc >= pattern_end) || \
129 (((int) ((target) = ((enum syntaxcode) (*pattern_pc++)))) \
130 >= ((int) syntaxcode_max))) \
131 BAD_PATTERN (); \
132 } while (0)
133
134 #define BAD_PATTERN() RE_RETURN (-2)
135
136 #define PUSH_FAILURE_POINT(pattern_pc, match_pc) do \
137 { \
138 if (stack_pointer == stack_end) \
139 { \
140 long stack_length; \
141 unsigned char **stack_temporary; \
142 \
143 stack_length = ((stack_end - stack_start) * 2); \
144 if (stack_length > (re_max_failures * 2)) \
145 RE_RETURN (-4); \
146 stack_temporary = \
147 ((unsigned char **) \
148 (realloc \
149 (stack_start, (stack_length * (sizeof (unsigned char *)))))); \
150 if (stack_temporary == NULL) \
151 RE_RETURN (-3); \
152 stack_end = (& (stack_temporary [stack_length])); \
153 stack_pointer = \
154 (& (stack_temporary [(stack_pointer - stack_start)])); \
155 stack_start = stack_temporary; \
156 } \
157 (*stack_pointer++) = (pattern_pc); \
158 (*stack_pointer++) = (match_pc); \
159 } while (0)
160
161 #define RE_RETURN(value) \
162 { \
163 return_value = (value); \
164 goto return_point; \
165 }
166
167 void
re_buffer_initialize(struct re_buffer * buffer,unsigned char * translation,SYNTAX_TABLE_TYPE syntax_table,unsigned char * text,unsigned long text_start_index,unsigned long text_end_index,unsigned long gap_start_index,unsigned long gap_end_index)168 re_buffer_initialize (struct re_buffer * buffer,
169 unsigned char * translation,
170 SYNTAX_TABLE_TYPE syntax_table,
171 unsigned char * text,
172 unsigned long text_start_index,
173 unsigned long text_end_index,
174 unsigned long gap_start_index,
175 unsigned long gap_end_index)
176 {
177 unsigned char *text_start, *text_end, *gap_start, *gap_end;
178
179 /* Assumes that
180 ((text_start_index <= gap_start_index) &&
181 (gap_start_index <= gap_end_index) &&
182 (gap_end_index <= text_end_index)) */
183
184 text_start = (text + text_start_index);
185 text_end = (text + text_end_index);
186 gap_start = (text + gap_start_index);
187 gap_end = (text + gap_end_index);
188
189 (buffer -> translation) = translation;
190 (buffer -> syntax_table) = syntax_table;
191 (buffer -> text) = text;
192 (buffer -> text_start) = ((text_start == gap_start) ? gap_end : text_start);
193 (buffer -> text_end) = ((text_end == gap_end) ? gap_start : text_end);
194 (buffer -> gap_start) = gap_start;
195 (buffer -> gap_end) = gap_end;
196 return;
197 }
198
199 /* Given a compiled pattern between `pattern_start' and `pattern_end',
200 generate a character set which is true of all characters which can
201 be the first character of a match.
202
203 See the documentation of `struct re_buffer' for a description of
204 `translation' and `syntax_table'.
205
206 `fastmap' is the resulting character set. It is a character array
207 whose elements are either `FASTMAP_FALSE' or `FASTMAP_TRUE'.
208
209 Return values:
210 0 => pattern cannot match the null string.
211 1 => pattern can match the null string.
212 2 => pattern can match the null string, but only at end of match
213 text or to left of a character in `fastmap'.
214 -2 => the pattern is improperly formed.
215 else => undefined. */
216
217 #define FASTMAP_FALSE '\0'
218 #define FASTMAP_TRUE '\1'
219
220 int
re_compile_fastmap(unsigned char * pattern_start,unsigned char * pattern_end,unsigned char * translation,SYNTAX_TABLE_TYPE syntax_table,unsigned char * fastmap)221 re_compile_fastmap (unsigned char * pattern_start,
222 unsigned char * pattern_end,
223 unsigned char * translation,
224 SYNTAX_TABLE_TYPE syntax_table,
225 unsigned char * fastmap)
226 {
227 unsigned char *pattern_pc;
228 unsigned char *stack_start[RE_NFAILURES];
229 unsigned char **stack_pointer;
230 int return_value;
231
232 pattern_pc = pattern_start;
233 return_value = 0;
234 stack_pointer = stack_start;
235
236 {
237 int i;
238
239 FOR_ALL_ASCII (i)
240 (fastmap [i]) = FASTMAP_FALSE;
241 }
242
243 loop:
244 if (pattern_pc >= pattern_end)
245 RE_RETURN (1);
246
247 SWITCH_ENUM (regexpcode, (*pattern_pc++))
248 {
249 case regexpcode_unused:
250 case regexpcode_line_start:
251 case regexpcode_buffer_start:
252 case regexpcode_buffer_end:
253 case regexpcode_word_start:
254 case regexpcode_word_end:
255 case regexpcode_word_bound:
256 case regexpcode_not_word_bound:
257 goto loop;
258
259 case regexpcode_line_end:
260 {
261 (fastmap [(TRANSLATE_CHAR ('\n'))]) = FASTMAP_TRUE;
262 if (return_value == 0)
263 return_value = 2;
264 goto next;
265 }
266
267 case regexpcode_exact_1:
268 {
269 int ascii;
270
271 READ_PATTERN_CHAR (ascii);
272 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
273 goto next;
274 }
275
276 case regexpcode_exact_n:
277 {
278 int length;
279
280 READ_PATTERN_LENGTH (length);
281 if (length == 0)
282 goto loop;
283 (fastmap [(TRANSLATE_CHAR (pattern_pc [0]))]) = FASTMAP_TRUE;
284 goto next;
285 }
286
287 case regexpcode_any_char:
288 {
289 int ascii;
290
291 FOR_ALL_ASCII_SUCH_THAT (ascii, (ascii != '\n'))
292 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
293 if (return_value != 0)
294 goto return_point;
295 goto next;
296 }
297
298 case regexpcode_char_set:
299 {
300 int length;
301 int ascii;
302
303 READ_PATTERN_LENGTH (length);
304 length = (length * ASCII_LENGTH);
305 FOR_INDEX_BELOW (ascii, length)
306 if (CHAR_SET_MEMBER_P_INTERNAL (pattern_pc, ascii))
307 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
308 goto next;
309 }
310
311 case regexpcode_not_char_set:
312 {
313 int length;
314 int ascii;
315
316 READ_PATTERN_LENGTH (length);
317 length = (length * ASCII_LENGTH);
318 FOR_INDEX_BELOW (ascii, length)
319 if (! (CHAR_SET_MEMBER_P_INTERNAL (pattern_pc, ascii)))
320 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
321 FOR_INDEX_RANGE (ascii, length, MAX_ASCII)
322 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
323 goto next;
324 }
325
326 case regexpcode_word_char:
327 {
328 int ascii;
329
330 FOR_ALL_ASCII_SUCH_THAT (ascii, (WORD_CONSTITUENT_P (ascii)))
331 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
332 goto next;
333 }
334
335 case regexpcode_not_word_char:
336 {
337 int ascii;
338
339 FOR_ALL_ASCII_SUCH_THAT (ascii, (! (WORD_CONSTITUENT_P (ascii))))
340 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
341 goto next;
342 }
343
344 case regexpcode_syntax_spec:
345 {
346 enum syntaxcode code;
347 int ascii;
348
349 READ_PATTERN_SYNTAXCODE (code);
350 FOR_ALL_ASCII_SUCH_THAT (ascii, (SYNTAX_CONSTITUENT_P (code, ascii)))
351 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
352 goto next;
353 }
354
355 case regexpcode_not_syntax_spec:
356 {
357 enum syntaxcode code;
358 int ascii;
359
360 READ_PATTERN_SYNTAXCODE (code);
361 FOR_ALL_ASCII_SUCH_THAT (ascii,
362 (! (SYNTAX_CONSTITUENT_P (code, ascii))))
363 (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
364 goto next;
365 }
366
367 case regexpcode_start_memory:
368 case regexpcode_stop_memory:
369 {
370 int register_number;
371
372 READ_PATTERN_REGISTER (register_number);
373 goto loop;
374 }
375
376 case regexpcode_duplicate:
377 {
378 int register_number;
379 int ascii;
380
381 READ_PATTERN_REGISTER (register_number);
382 FOR_ALL_ASCII (ascii)
383 (fastmap [ascii]) = FASTMAP_TRUE;
384 RE_RETURN (1);
385 }
386
387 case regexpcode_jump:
388 case regexpcode_finalize_jump:
389 case regexpcode_maybe_finalize_jump:
390 case regexpcode_dummy_failure_jump:
391 {
392 int offset;
393
394 return_value = 1;
395 READ_PATTERN_OFFSET (offset);
396 pattern_pc += offset;
397 if (offset > 0)
398 goto loop;
399
400 /* Jump backward reached implies we just went through the
401 body of a loop and matched nothing. Opcode jumped to
402 should be an on_failure_jump. Just treat it like an
403 ordinary jump. For a * loop, it has pushed its failure
404 point already; if so, discard that as redundant. */
405 if (pattern_pc >= pattern_end)
406 BAD_PATTERN ();
407 if (((enum regexpcode) (pattern_pc [0])) !=
408 regexpcode_on_failure_jump)
409 goto loop;
410 READ_PATTERN_OFFSET (offset);
411 pattern_pc += offset;
412 if ((stack_pointer != stack_start) &&
413 ((stack_pointer [-1]) == pattern_pc))
414 stack_pointer -= 1;
415 goto loop;
416 }
417
418 case regexpcode_on_failure_jump:
419 {
420 int offset;
421
422 READ_PATTERN_OFFSET (offset);
423 (*stack_pointer++) = (pattern_pc + offset);
424 goto loop;
425 }
426
427 default:
428 BAD_PATTERN ();
429 }
430
431 next:
432 if (stack_pointer != stack_start)
433 {
434 pattern_pc = (*--stack_pointer);
435 goto loop;
436 }
437
438 return_point:
439 return (return_value);
440 }
441
442 /* Match the compiled pattern described by `pattern_start' and
443 `pattern_end' against the characters in `buffer' between
444 `match_start' and `match_end'.
445
446 `registers', if not NULL, will be filled with the start and end
447 indices of the match registers if the match succeeds.
448
449 It is assumed that the following is true:
450
451 (! ((gap_start < gap_end) &&
452 (match_start < match_end) &&
453 ((match_start == gap_start) || (match_end == gap_end))))
454
455 Return values:
456
457 non-negative => the end index (exclusive) of the match.
458 -1 => no match.
459 -2 => the pattern is badly formed.
460 -3 => memory allocation error.
461 -4 => match stack overflow.
462 other => undefined. */
463
464 #define RE_MATCH_FAILED (-1)
465
466 /* This macro is used by search procedures to decide when a match at a
467 particular place has failed. If true, the search continues by
468 advancing to the next match point. */
469 #define RE_MATCH_FAILURE_RESULT(result) \
470 (((result) == RE_MATCH_FAILED) || ((result) == (-4)))
471
472 #define ADDRESS_TO_INDEX(address) \
473 ((((address) > gap_start) ? ((address) - gap_length) : (address)) \
474 - (buffer -> text))
475
476 #define READ_MATCH_CHAR(target) do \
477 { \
478 if (match_pc >= match_end) \
479 goto re_match_fail; \
480 (target) = (TRANSLATE_CHAR (*match_pc++)); \
481 if (match_pc == gap_start) \
482 match_pc = gap_end; \
483 } while (0)
484
485 static bool
beq_translate(unsigned char * scan1,unsigned char * scan2,long length,unsigned char * translation)486 beq_translate (unsigned char * scan1,
487 unsigned char * scan2,
488 long length,
489 unsigned char * translation)
490 {
491 while ((length--) > 0)
492 if ((TRANSLATE_CHAR (*scan1++)) != (TRANSLATE_CHAR (*scan2++)))
493 return (false);
494 return (true);
495 }
496
497 int re_max_failures = 1000;
498
499 int
re_match(unsigned char * pattern_start,unsigned char * pattern_end,struct re_buffer * buffer,struct re_registers * registers,unsigned char * match_start,unsigned char * match_end)500 re_match (unsigned char * pattern_start,
501 unsigned char * pattern_end,
502 struct re_buffer * buffer,
503 struct re_registers * registers,
504 unsigned char * match_start,
505 unsigned char * match_end)
506 {
507 unsigned char *pattern_pc, *match_pc;
508 unsigned char *gap_start, *gap_end;
509 unsigned char *translation;
510 SYNTAX_TABLE_TYPE syntax_table;
511 long gap_length;
512 int return_value;
513
514 /* Failure point stack. Each place that can handle a failure
515 further down the line pushes a failure point on this stack. It
516 consists of two char *'s. The first one pushed is where to
517 resume scanning the pattern; the second pushed is where to resume
518 scanning the match text. If the latter is NULL, the failure
519 point is a "dummy". If a failure happens and the innermost
520 failure point is dormant, it discards that failure point and
521 tries the next one. */
522
523 unsigned char **stack_start, **stack_end, **stack_pointer;
524
525 /* Information on the "contents" of registers. These are pointers
526 into the match text; they record just what was matched (on this
527 attempt) by some part of the pattern. The start_memory command
528 stores the start of a register's contents and the stop_memory
529 command stores the end.
530
531 At that point, (register_start [regnum]) points to the first
532 character in the register, and (register_end [regnum]) points to
533 the first character beyond the end of the register. */
534
535 unsigned char *register_start[RE_NREGS];
536 unsigned char *register_end[RE_NREGS];
537
538 pattern_pc = pattern_start;
539 match_pc = match_start;
540 gap_start = (buffer -> gap_start);
541 gap_end = (buffer -> gap_end);
542 gap_length = (gap_end - gap_start);
543 translation = (buffer -> translation);
544 syntax_table = (buffer -> syntax_table);
545
546 stack_start = (malloc ((2 * RE_NFAILURES) * (sizeof (*stack_start))));
547 if (stack_start == NULL)
548 RE_RETURN (-3);
549
550 stack_end = (& (stack_start [(2 * RE_NFAILURES)]));
551 stack_pointer = stack_start;
552
553 {
554 int i;
555
556 FOR_INDEX_BELOW (i, RE_NREGS)
557 {
558 (register_start [i]) = NULL;
559 (register_end [i]) = NULL;
560 }
561 }
562
563 re_match_loop:
564 if (pattern_pc >= pattern_end)
565 {
566 /* Reaching here indicates that match was successful. */
567 if (registers != NULL)
568 {
569 int i;
570
571 (register_start [0]) = match_start;
572 (register_end [0]) = match_pc;
573 FOR_INDEX_BELOW (i, RE_NREGS)
574 {
575 ((registers -> start) [i]) =
576 (((register_start [i]) == NULL)
577 ? -1
578 : (ADDRESS_TO_INDEX (register_start [i])));
579 ((registers -> end) [i]) =
580 (((register_end [i]) == NULL)
581 ? -1
582 : (ADDRESS_TO_INDEX (register_end [i])));
583 }
584 }
585 RE_RETURN (ADDRESS_TO_INDEX (match_pc));
586 }
587
588 SWITCH_ENUM (regexpcode, (*pattern_pc++))
589 {
590 case regexpcode_unused:
591 goto re_match_loop;
592
593 case regexpcode_exact_1:
594 {
595 int ascii;
596 int ascii_p;
597
598 READ_MATCH_CHAR (ascii);
599 READ_PATTERN_CHAR (ascii_p);
600 if (ascii == ascii_p)
601 goto re_match_loop;
602 goto re_match_fail;
603 }
604
605 case regexpcode_exact_n:
606 {
607 int length;
608 int ascii;
609
610 READ_PATTERN_LENGTH (length);
611 while ((length--) > 0)
612 {
613 READ_MATCH_CHAR (ascii);
614 if (ascii != (*pattern_pc++))
615 goto re_match_fail;
616 }
617 goto re_match_loop;
618 }
619
620 case regexpcode_any_char:
621 {
622 int ascii;
623
624 READ_MATCH_CHAR (ascii);
625 if (ascii == '\n')
626 goto re_match_fail;
627 goto re_match_loop;
628 }
629
630 #define RE_MATCH_CHAR_SET(winning_label, losing_label) \
631 { \
632 int ascii; \
633 int length; \
634 \
635 READ_MATCH_CHAR (ascii); \
636 READ_PATTERN_LENGTH (length); \
637 if (CHAR_SET_MEMBER_P (length, pattern_pc, ascii)) \
638 { \
639 pattern_pc += length; \
640 goto winning_label; \
641 } \
642 else \
643 { \
644 pattern_pc += length; \
645 goto losing_label; \
646 } \
647 }
648
649 case regexpcode_char_set:
650 RE_MATCH_CHAR_SET (re_match_loop, re_match_fail);
651
652 case regexpcode_not_char_set:
653 RE_MATCH_CHAR_SET (re_match_fail, re_match_loop);
654
655 #undef RE_MATCH_CHAR_SET
656
657 /* \( is represented by a start_memory, \) by a stop_memory. Both
658 of those commands contain a "register number" argument. The
659 text matched within the \( and \) is recorded under that
660 number. Then, \<digit> turns into a `duplicate' command which
661 is followed by the numeric value of <digit> as the register
662 number. */
663
664 case regexpcode_start_memory:
665 {
666 int register_number;
667
668 READ_PATTERN_REGISTER (register_number);
669 (register_start [register_number]) = match_pc;
670 goto re_match_loop;
671 }
672
673 case regexpcode_stop_memory:
674 {
675 int register_number;
676
677 READ_PATTERN_REGISTER (register_number);
678 (register_end [register_number]) =
679 ((match_pc == gap_end) ? gap_start : match_pc);
680 goto re_match_loop;
681 }
682
683 case regexpcode_duplicate:
684 {
685 int register_number;
686 unsigned char *start, *end, *new_end;
687 long length;
688
689 READ_PATTERN_REGISTER (register_number);
690 start = (register_start [register_number]);
691 end = (register_end [register_number]);
692 length = (end - start);
693 if (length <= 0)
694 goto re_match_loop;
695 new_end = (match_pc + length);
696 if (new_end > match_end)
697 goto re_match_fail;
698 if ((match_pc <= gap_start) && (new_end > gap_start))
699 {
700 long length1, length2;
701
702 new_end += gap_length;
703 if (new_end > match_end)
704 goto re_match_fail;
705 length1 = (gap_start - match_pc);
706 length2 = (length - length1);
707 if (!
708 ((beq_translate (match_pc, start, length1, translation)) &&
709 (beq_translate (gap_end, (start + length1), length2,
710 translation))))
711 goto re_match_fail;
712 }
713 else if ((start <= gap_start) && (end > gap_start))
714 {
715 long length1, length2;
716
717 length1 = (gap_start - start);
718 length2 = (end - gap_end);
719 if (!
720 ((beq_translate (match_pc, start, length1, translation)) &&
721 (beq_translate ((match_pc + length1), gap_end, length2,
722 translation))))
723 goto re_match_fail;
724 }
725 else
726 {
727 if (! (beq_translate (match_pc, start, length, translation)))
728 goto re_match_fail;
729 }
730 match_pc = ((new_end == gap_start) ? gap_end : new_end);
731 goto re_match_loop;
732 }
733
734 case regexpcode_buffer_start:
735 {
736 if ((ADDRESS_TO_INDEX (match_pc))
737 == (ADDRESS_TO_INDEX (buffer -> text_start)))
738 goto re_match_loop;
739 goto re_match_fail;
740 }
741
742 case regexpcode_buffer_end:
743 {
744 if ((ADDRESS_TO_INDEX (match_pc))
745 == (ADDRESS_TO_INDEX (buffer -> text_end)))
746 goto re_match_loop;
747 goto re_match_fail;
748 }
749
750 case regexpcode_line_start:
751 {
752 if ((ADDRESS_TO_INDEX (match_pc))
753 == (ADDRESS_TO_INDEX (buffer -> text_start)))
754 goto re_match_loop;
755 if ((TRANSLATE_CHAR
756 (((match_pc == gap_end) ? gap_start : match_pc) [-1]))
757 == '\n')
758 goto re_match_loop;
759 goto re_match_fail;
760 }
761
762 case regexpcode_line_end:
763 {
764 if (((ADDRESS_TO_INDEX (match_pc))
765 == (ADDRESS_TO_INDEX (buffer -> text_end)))
766 || ((TRANSLATE_CHAR (match_pc [0])) == '\n'))
767 goto re_match_loop;
768 goto re_match_fail;
769 }
770
771 #define RE_MATCH_WORD_BOUND(word_bound_p) \
772 if ((match_pc == gap_end) \
773 ? (word_bound_p \
774 (((gap_start != (buffer -> text_start)) \
775 && (WORD_CONSTITUENT_P (TRANSLATE_CHAR (gap_start[-1])))), \
776 ((gap_end != (buffer -> text_end)) \
777 && (WORD_CONSTITUENT_P (TRANSLATE_CHAR (gap_end[0])))))) \
778 : (word_bound_p \
779 (((match_pc != (buffer -> text_start)) \
780 && (WORD_CONSTITUENT_P (TRANSLATE_CHAR (match_pc[-1])))), \
781 ((match_pc != (buffer -> text_end)) \
782 && (WORD_CONSTITUENT_P (TRANSLATE_CHAR (match_pc[0]))))))) \
783 goto re_match_loop; \
784 goto re_match_fail
785
786 case regexpcode_word_bound:
787 #define WORD_BOUND_P(left_p, right_p) ((left_p) != (right_p))
788 RE_MATCH_WORD_BOUND (WORD_BOUND_P);
789 #undef WORD_BOUND_P
790
791 case regexpcode_not_word_bound:
792 #define NOT_WORD_BOUND_P(left_p, right_p) ((left_p) == (right_p))
793 RE_MATCH_WORD_BOUND (NOT_WORD_BOUND_P);
794 #undef NOT_WORD_BOUND_P
795
796 case regexpcode_word_start:
797 #define WORD_START_P(left_p, right_p) ((! (left_p)) && (right_p))
798 RE_MATCH_WORD_BOUND (WORD_START_P);
799 #undef WORD_START_P
800
801 case regexpcode_word_end:
802 #define WORD_END_P(left_p, right_p) ((left_p) && (! (right_p)))
803 RE_MATCH_WORD_BOUND (WORD_END_P);
804 #undef WORD_END_P
805
806 #undef RE_MATCH_WORD_BOUND
807
808 case regexpcode_syntax_spec:
809 {
810 int ascii;
811 enum syntaxcode code;
812
813 READ_MATCH_CHAR (ascii);
814 READ_PATTERN_SYNTAXCODE (code);
815 if (SYNTAX_CONSTITUENT_P (code, ascii))
816 goto re_match_loop;
817 goto re_match_fail;
818 }
819
820 case regexpcode_not_syntax_spec:
821 {
822 int ascii;
823 enum syntaxcode code;
824
825 READ_MATCH_CHAR (ascii);
826 READ_PATTERN_SYNTAXCODE (code);
827 if (! (SYNTAX_CONSTITUENT_P (code, ascii)))
828 goto re_match_loop;
829 goto re_match_fail;
830 }
831
832 case regexpcode_word_char:
833 {
834 int ascii;
835
836 READ_MATCH_CHAR (ascii);
837 if (WORD_CONSTITUENT_P (ascii))
838 goto re_match_loop;
839 goto re_match_fail;
840 }
841
842 case regexpcode_not_word_char:
843 {
844 int ascii;
845
846 READ_MATCH_CHAR (ascii);
847 if (! (WORD_CONSTITUENT_P (ascii)))
848 goto re_match_loop;
849 goto re_match_fail;
850 }
851
852 /* "or" constructs ("|") are handled by starting each alternative
853 with an on_failure_jump that points to the start of the next
854 alternative. Each alternative except the last ends with a jump
855 to the joining point. (Actually, each jump except for the last
856 one really jumps to the following jump, because tensioning the
857 jumps is a hassle.)
858
859 The start of a stupid repeat has an on_failure_jump that points
860 past the end of the repeat text. This makes a failure point so
861 that, on failure to match a repetition, matching restarts past
862 as many repetitions have been found with no way to fail and
863 look for another one.
864
865 A smart repeat is similar but loops back to the on_failure_jump
866 so that each repetition makes another failure point. */
867
868 case regexpcode_on_failure_jump:
869 {
870 long offset;
871
872 READ_PATTERN_OFFSET (offset);
873 PUSH_FAILURE_POINT ((pattern_pc + offset), match_pc);
874 goto re_match_loop;
875 }
876
877 /* The end of a smart repeat has a maybe_finalize_jump back.
878 Change it either to a finalize_jump or an ordinary jump. */
879
880 case regexpcode_maybe_finalize_jump:
881 {
882 long offset;
883 long ascii;
884
885 READ_PATTERN_OFFSET (offset);
886 if (pattern_pc == pattern_end)
887 goto finalize_jump;
888
889 /* Compare what follows with the beginning of the repeat.
890 If we can establish that there is nothing that they
891 would both match, we can change to `finalize_jump'. */
892
893 SWITCH_ENUM (regexpcode, (pattern_pc [0]))
894 {
895 case regexpcode_exact_1:
896 ascii = (pattern_pc [1]);
897 break;
898
899 case regexpcode_exact_n:
900 ascii = (pattern_pc [2]);
901 break;
902
903 case regexpcode_line_end:
904 ascii = ('\n');
905 break;
906
907 default:
908 goto dont_finalize_jump;
909 }
910
911 /* (pattern_pc [(offset - 3)]) is an `on_failure_jump'.
912 Examine what follows that. */
913 SWITCH_ENUM (regexpcode, (pattern_pc [offset]))
914 {
915 case regexpcode_exact_1:
916 {
917 if (ascii != (pattern_pc [(offset + 1)]))
918 goto finalize_jump;
919 goto dont_finalize_jump;
920 }
921
922 case regexpcode_exact_n:
923 {
924 if (ascii != (pattern_pc [(offset + 2)]))
925 goto finalize_jump;
926 goto dont_finalize_jump;
927 }
928
929 case regexpcode_char_set:
930 {
931 if (CHAR_SET_MEMBER_P ((pattern_pc [(offset + 1)]),
932 (& (pattern_pc [(offset + 2)])),
933 ascii))
934 goto dont_finalize_jump;
935 goto finalize_jump;
936 }
937
938 case regexpcode_not_char_set:
939 {
940 if (CHAR_SET_MEMBER_P ((pattern_pc [(offset + 1)]),
941 (& (pattern_pc [(offset + 2)])),
942 ascii))
943 goto finalize_jump;
944 goto dont_finalize_jump;
945 }
946
947 default:
948 goto dont_finalize_jump;
949 }
950
951 finalize_jump:
952 pattern_pc -= 2;
953 (pattern_pc [-1]) = ((unsigned char) regexpcode_finalize_jump);
954 goto re_match_finalize_jump;
955
956 dont_finalize_jump:
957 pattern_pc -= 2;
958 (pattern_pc [-1]) = ((unsigned char) regexpcode_jump);
959 goto re_match_jump;
960 }
961
962 case regexpcode_finalize_jump:
963 re_match_finalize_jump:
964 {
965 stack_pointer -= 2;
966 goto re_match_jump;
967 }
968
969 case regexpcode_jump:
970 re_match_jump:
971 {
972 long offset;
973
974 READ_PATTERN_OFFSET (offset);
975 pattern_pc += offset;
976 goto re_match_loop;
977 }
978
979 case regexpcode_dummy_failure_jump:
980 {
981 PUSH_FAILURE_POINT (NULL, NULL);
982 goto re_match_jump;
983 }
984
985 default:
986 {
987 BAD_PATTERN ();
988 }
989 }
990
991 re_match_fail:
992 if (stack_pointer == stack_start)
993 RE_RETURN (RE_MATCH_FAILED);
994 match_pc = (*--stack_pointer);
995 pattern_pc = (*--stack_pointer);
996 if (pattern_pc != NULL)
997 goto re_match_loop;
998 goto re_match_fail;
999
1000 return_point:
1001 if (stack_start != NULL)
1002 free (stack_start);
1003 return (return_value);
1004 }
1005
1006 #define DEFINE_RE_SEARCH(name) \
1007 int \
1008 name (unsigned char * pattern_start, \
1009 unsigned char * pattern_end, \
1010 struct re_buffer * buffer, \
1011 struct re_registers * registers, \
1012 unsigned char * match_start, \
1013 unsigned char * match_end)
1014
1015 #define INITIALIZE_RE_SEARCH(pc, limit, gap_limit) \
1016 int can_be_null; \
1017 unsigned char *translation; \
1018 int match_result; \
1019 \
1020 unsigned char *match_pc; \
1021 unsigned char *match_limit; \
1022 unsigned char *gap_limit; \
1023 unsigned char *fastmap; \
1024 unsigned char fastmap_array[MAX_ASCII]; \
1025 \
1026 fastmap = &fastmap_array[0]; \
1027 translation = (buffer -> translation); \
1028 can_be_null = \
1029 (re_compile_fastmap \
1030 (pattern_start, pattern_end, translation, \
1031 (buffer -> syntax_table), fastmap)); \
1032 if (can_be_null < 0) \
1033 return (can_be_null); \
1034 \
1035 match_pc = (pc); \
1036 match_limit = (limit); \
1037 gap_limit = (buffer -> gap_limit)
1038
1039 #define RE_SEARCH_TEST(start) \
1040 (re_match \
1041 (pattern_start, pattern_end, buffer, registers, (start), match_end))
1042
1043 #define RE_SEARCH_FORWARD_FAST(limit) do \
1044 { \
1045 while (true) \
1046 { \
1047 if (match_pc >= (limit)) \
1048 break; \
1049 \
1050 if ((fastmap [(TRANSLATE_CHAR (*match_pc++))]) == FASTMAP_FALSE) \
1051 continue; \
1052 \
1053 match_result = (RE_SEARCH_TEST (match_pc - 1)); \
1054 if (RE_MATCH_FAILURE_RESULT (match_result)) \
1055 continue; \
1056 \
1057 return (match_result); \
1058 } \
1059 } while (0)
1060
DEFINE_RE_SEARCH(re_search_forward)1061 DEFINE_RE_SEARCH (re_search_forward)
1062 {
1063 INITIALIZE_RE_SEARCH (match_start, match_end, gap_start);
1064
1065 if (can_be_null != 1)
1066 {
1067 if ((match_pc < gap_start) && (gap_start < match_limit))
1068 RE_SEARCH_FORWARD_FAST (gap_start);
1069 if (match_pc == gap_start)
1070 match_pc = (buffer -> gap_end);
1071 RE_SEARCH_FORWARD_FAST (match_limit);
1072 return
1073 ((can_be_null == 0)
1074 ? RE_MATCH_FAILED
1075 : (RE_SEARCH_TEST (match_limit)));
1076 }
1077 else
1078 {
1079 while (true)
1080 {
1081 match_result = (RE_SEARCH_TEST (match_pc));
1082 if (! (RE_MATCH_FAILURE_RESULT (match_result)))
1083 return (match_result);
1084 match_pc += 1;
1085 if (match_pc == gap_start)
1086 match_pc = (buffer -> gap_end);
1087 if (match_pc > match_limit)
1088 return (match_result);
1089 }
1090 }
1091 }
1092
1093 #define RE_SEARCH_BACKWARD_FAST(limit) do \
1094 { \
1095 while (true) \
1096 { \
1097 if (match_pc <= (limit)) \
1098 break; \
1099 \
1100 if ((fastmap [(TRANSLATE_CHAR (*--match_pc))]) == FASTMAP_FALSE) \
1101 continue; \
1102 \
1103 match_result = (RE_SEARCH_TEST (match_pc)); \
1104 if (RE_MATCH_FAILURE_RESULT (match_result)) \
1105 continue; \
1106 \
1107 RE_SEARCH_BACKWARD_RETURN (match_pc); \
1108 } \
1109 } while (0)
1110
1111 #define RE_SEARCH_BACKWARD_RETURN(start) \
1112 return \
1113 ((match_result < 0) \
1114 ? match_result \
1115 : ((((start) > (buffer -> gap_start)) \
1116 ? ((start) - (gap_end - (buffer -> gap_start))) \
1117 : (start)) \
1118 - (buffer -> text)))
1119
DEFINE_RE_SEARCH(re_search_backward)1120 DEFINE_RE_SEARCH (re_search_backward)
1121 {
1122 INITIALIZE_RE_SEARCH (match_end, match_start, gap_end);
1123
1124 if (can_be_null != 1)
1125 {
1126 if ((match_pc > gap_end) && (gap_end > match_limit))
1127 RE_SEARCH_BACKWARD_FAST (gap_end);
1128 if (match_pc == gap_end)
1129 match_pc = (buffer -> gap_start);
1130 RE_SEARCH_BACKWARD_FAST (match_limit);
1131 if (can_be_null == 0)
1132 return (RE_MATCH_FAILED);
1133 match_result = (RE_SEARCH_TEST (match_limit));
1134 RE_SEARCH_BACKWARD_RETURN (match_limit);
1135 }
1136 else
1137 {
1138 while (true)
1139 {
1140 match_result = (RE_SEARCH_TEST (match_pc));
1141 if (! (RE_MATCH_FAILURE_RESULT (match_result)))
1142 RE_SEARCH_BACKWARD_RETURN (match_pc);
1143 if (match_pc == gap_end)
1144 match_pc = (buffer -> gap_start);
1145 match_pc -= 1;
1146 if (match_pc < match_limit)
1147 RE_SEARCH_BACKWARD_RETURN (match_pc);
1148 }
1149 }
1150 }
1151