1 /*
2 regexpr.c
3 
4 Author: Tatu Ylonen <ylo@ngs.fi>
5 
6 Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 
8 Permission to use, copy, modify, distribute, and sell this software
9 and its documentation is hereby granted without fee, provided that the
10 above copyright notice appears in all source code copies, the name of
11 Tatu Ylonen is not used to advertise products containing this software
12 or a derivation thereof, and all modified versions are clearly marked
13 as such.
14 
15 This software is provided "as is" without express or implied warranty.
16 
17 Created: Thu Sep 26 17:14:05 1991 ylo
18 Last modified: Sun Mar 29 16:47:31 1992 ylo
19 
20 This code draws many ideas from the regular expression packages by
21 Henry Spencer of the University of Toronto and Richard Stallman of the
22 Free Software Foundation.
23 
24 Emacs-specific code and syntax table code is almost directly borrowed
25 from GNU regexp.
26 
27 $Header: /u/src/lib/tools/RCS/regexpr.c,v 1.4 92/03/29 16:52:04 ylo Exp $
28 
29 */
30 
31 #include <stdio.h>
32 #include <assert.h>
33 #include "regexpr.h"
34 
35 char *malloc();
36 void free();
37 char *realloc();
38 
39 #define MACRO_BEGIN do {
40 #define MACRO_END } while (0)
41 
42 enum regexp_compiled_ops /* opcodes for compiled regexp */
43 {
44   Cend,			/* end of pattern reached */
45   Cbol,			/* beginning of line */
46   Ceol,			/* end of line */
47   Cset,			/* character set.  Followed by 32 bytes of set. */
48   Cexact,		/* followed by a byte to match */
49   Canychar,		/* matches any character except newline */
50   Cstart_memory,	/* set register start addr (followed by reg number) */
51   Cend_memory,		/* set register end addr (followed by reg number) */
52   Cmatch_memory,	/* match a duplicate of reg contents (regnum follows)*/
53   Cjump,		/* followed by two bytes (lsb,msb) of displacement. */
54   Cstar_jump,		/* will change to jump/update_failure_jump at runtime */
55   Cfailure_jump,	/* jump to addr on failure */
56   Cupdate_failure_jump,	/* update topmost failure point and jump */
57   Cdummy_failure_jump,	/* push a dummy failure point and jump */
58   Cbegbuf,		/* match at beginning of buffer */
59   Cendbuf,		/* match at end of buffer */
60   Cwordbeg,		/* match at beginning of word */
61   Cwordend,		/* match at end of word */
62   Cwordbound,		/* match if at word boundary */
63   Cnotwordbound,	/* match if not at word boundary */
64 #ifdef emacs
65   Cemacs_at_dot,	/* emacs only: matches at dot */
66 #endif /* emacs */
67   Csyntaxspec,		/* matches syntax code (1 byte follows) */
68   Cnotsyntaxspec	/* matches if syntax code does not match (1 byte foll)*/
69 };
70 
71 enum regexp_syntax_op	/* syntax codes for plain and quoted characters */
72 {
73   Rend,			/* special code for end of regexp */
74   Rnormal,		/* normal character */
75   Ranychar,		/* any character except newline */
76   Rquote,		/* the quote character */
77   Rbol,			/* match beginning of line */
78   Reol,			/* match end of line */
79   Roptional,		/* match preceding expression optionally */
80   Rstar,		/* match preceding expr zero or more times */
81   Rplus,		/* match preceding expr one or more times */
82   Ror,			/* match either of alternatives */
83   Ropenpar,		/* opening parenthesis */
84   Rclosepar,		/* closing parenthesis */
85   Rmemory,		/* match memory register */
86   Rextended_memory,	/* \vnn to match registers 10-99 */
87   Ropenset,		/* open set.  Internal syntax hard-coded below. */
88   /* the following are gnu extensions to "normal" regexp syntax */
89   Rbegbuf,		/* beginning of buffer */
90   Rendbuf,		/* end of buffer */
91   Rwordchar,		/* word character */
92   Rnotwordchar,		/* not word character */
93   Rwordbeg,		/* beginning of word */
94   Rwordend,		/* end of word */
95   Rwordbound,		/* word bound */
96   Rnotwordbound,	/* not word bound */
97 #ifdef emacs
98   Remacs_at_dot,	/* emacs: at dot */
99   Remacs_syntaxspec,	/* syntaxspec */
100   Remacs_notsyntaxspec,	/* notsyntaxspec */
101 #endif /* emacs */
102   Rnum_ops
103 };
104 
105 static int re_compile_initialized = 0;
106 static int regexp_syntax = 0;
107 static unsigned char regexp_plain_ops[256];
108 static unsigned char regexp_quoted_ops[256];
109 static unsigned char regexp_precedences[Rnum_ops];
110 static int regexp_context_indep_ops;
111 static int regexp_ansi_sequences;
112 
113 #define NUM_LEVELS  5    /* number of precedence levels in use */
114 #define MAX_NESTING 100  /* max nesting level of operators */
115 
116 #ifdef emacs
117 
118 /* This code is for emacs compatibility only. */
119 
120 #include "config.h"
121 #include "lisp.h"
122 #include "buffer.h"
123 #include "syntax.h"
124 
125 /* emacs defines NULL in some strange way? */
126 #undef NULL
127 #define NULL 0
128 
129 #else /* emacs */
130 
131 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
132 #define Sword 1
133 
134 /* 94/01/19 (cjk) made re_syntax_table unsigned */
135 #ifdef SYNTAX_TABLE
136 unsigned char *re_syntax_table;
137 #else
138 static unsigned char re_syntax_table[256];
139 #endif /* SYNTAX_TABLE */
140 
141 #endif /* emacs */
142 
143 /* 94/01/19 (cjk) forward declaration for memset added */
144 /* CFS - MacOS X Server continues to not declare memset */
145 //#if defined(NeXT) || defined(__APPLE__)
146 //  void *memset(void *, unsigned char, unsigned int);
147 //#endif /* defined(NeXT) */
148 
149 //MacOS X 1.0.0 finally defines memset - all these years of waiting are over!
150 
re_compile_initialize()151 static void re_compile_initialize()
152 {
153   int a;
154 
155 #if !defined(emacs) && !defined(SYNTAX_TABLE)
156   static int syntax_table_inited = 0;
157 
158   if (!syntax_table_inited)
159     {
160       syntax_table_inited = 1;
161       memset(re_syntax_table, 0, 256);
162       for (a = 'a'; a <= 'z'; a++)
163 	re_syntax_table[a] = Sword;
164       for (a = 'A'; a <= 'Z'; a++)
165 	re_syntax_table[a] = Sword;
166       for (a = '0'; a <= '9'; a++)
167 	re_syntax_table[a] = Sword;
168     }
169 #endif /* !emacs && !SYNTAX_TABLE */
170   re_compile_initialized = 1;
171   for (a = 0; a < 256; a++)
172     {
173       regexp_plain_ops[a] = Rnormal;
174       regexp_quoted_ops[a] = Rnormal;
175     }
176   for (a = '0'; a <= '9'; a++)
177     regexp_quoted_ops[a] = Rmemory;
178   regexp_plain_ops['\134'] = Rquote;
179   if (regexp_syntax & RE_NO_BK_PARENS)
180     {
181       regexp_plain_ops['('] = Ropenpar;
182       regexp_plain_ops[')'] = Rclosepar;
183     }
184   else
185     {
186       regexp_quoted_ops['('] = Ropenpar;
187       regexp_quoted_ops[')'] = Rclosepar;
188     }
189   if (regexp_syntax & RE_NO_BK_VBAR)
190     regexp_plain_ops['\174'] = Ror;
191   else
192     regexp_quoted_ops['\174'] = Ror;
193   regexp_plain_ops['*'] = Rstar;
194   if (regexp_syntax & RE_BK_PLUS_QM)
195     {
196       regexp_quoted_ops['+'] = Rplus;
197       regexp_quoted_ops['?'] = Roptional;
198     }
199   else
200     {
201       regexp_plain_ops['+'] = Rplus;
202       regexp_plain_ops['?'] = Roptional;
203     }
204   if (regexp_syntax & RE_NEWLINE_OR)
205     regexp_plain_ops['\n'] = Ror;
206   regexp_plain_ops['\133'] = Ropenset;
207   regexp_plain_ops['\136'] = Rbol;
208   regexp_plain_ops['$'] = Reol;
209   regexp_plain_ops['.'] = Ranychar;
210   if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
211     {
212 #ifdef emacs
213       regexp_quoted_ops['='] = Remacs_at_dot;
214       regexp_quoted_ops['s'] = Remacs_syntaxspec;
215       regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
216 #endif /* emacs */
217       regexp_quoted_ops['w'] = Rwordchar;
218       regexp_quoted_ops['W'] = Rnotwordchar;
219       regexp_quoted_ops['<'] = Rwordbeg;
220       regexp_quoted_ops['>'] = Rwordend;
221       regexp_quoted_ops['b'] = Rwordbound;
222       regexp_quoted_ops['B'] = Rnotwordbound;
223       regexp_quoted_ops['`'] = Rbegbuf;
224       regexp_quoted_ops['\''] = Rendbuf;
225     }
226   if (regexp_syntax & RE_ANSI_HEX)
227     regexp_quoted_ops['v'] = Rextended_memory;
228   for (a = 0; a < Rnum_ops; a++)
229     regexp_precedences[a] = 4;
230   if (regexp_syntax & RE_TIGHT_VBAR)
231     {
232       regexp_precedences[Ror] = 3;
233       regexp_precedences[Rbol] = 2;
234       regexp_precedences[Reol] = 2;
235     }
236   else
237     {
238       regexp_precedences[Ror] = 2;
239       regexp_precedences[Rbol] = 3;
240       regexp_precedences[Reol] = 3;
241     }
242   regexp_precedences[Rclosepar] = 1;
243   regexp_precedences[Rend] = 0;
244   regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
245   regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
246 }
247 
re_set_syntax(syntax)248 int re_set_syntax(syntax)
249 int syntax;
250 {
251   int ret;
252 
253   ret = regexp_syntax;
254   regexp_syntax = syntax;
255   re_compile_initialize();
256   return ret;
257 }
258 
hex_char_to_decimal(ch)259 static int hex_char_to_decimal(ch)
260 int ch;
261 {
262   if (ch >= '0' && ch <= '9')
263     return ch - '0';
264   if (ch >= 'a' && ch <= 'f')
265     return ch - 'a' + 10;
266   if (ch >= 'A' && ch <= 'F')
267     return ch - 'A' + 10;
268   return 16;
269 }
270 
re_compile_pattern(regex,size,bufp)271 char *re_compile_pattern(regex, size, bufp)
272 char *regex;
273 int size;
274 regexp_t bufp;
275 {
276   int a, pos, op, current_level, level, opcode;
277   int pattern_offset=0, alloc;  /* 94/01/19 (cjk) pattern_offset initialized */
278   int starts[NUM_LEVELS * MAX_NESTING], starts_base;
279   int future_jumps[MAX_NESTING], num_jumps;
280   unsigned char ch=0;  /* 94/01/19 (cjk) ch initialized */
281   char *pattern, *translate;
282   int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
283   int beginning_context;
284 
285 #define NEXTCHAR(var)			\
286   MACRO_BEGIN				\
287     if (pos >= size)			\
288       goto ends_prematurely;		\
289     (var) = regex[pos];			\
290     pos++;				\
291   MACRO_END
292 
293 #define ALLOC(amount)				\
294   MACRO_BEGIN					\
295     if (pattern_offset+(amount) > alloc)	\
296       {						\
297 	alloc += 256 + (amount);		\
298 	pattern = realloc(pattern, alloc);	\
299 	if (!pattern)				\
300 	  goto out_of_memory;			\
301       }						\
302   MACRO_END
303 
304 #define STORE(ch) pattern[pattern_offset++] = (ch)
305 
306 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
307 
308 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
309 
310 #define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
311 		            starts_base += NUM_LEVELS;			\
312                           else						\
313 			    goto too_complex
314 
315 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
316 
317 #define PUT_ADDR(offset,addr)				\
318   MACRO_BEGIN						\
319     int disp = (addr) - (offset) - 2;			\
320     pattern[(offset)] = disp & 0xff;			\
321     pattern[(offset)+1] = (disp>>8) & 0xff;		\
322   MACRO_END
323 
324 #define INSERT_JUMP(pos,type,addr)			\
325   MACRO_BEGIN						\
326     int a, p = (pos), t = (type), ad = (addr);		\
327     for (a = pattern_offset - 1; a >= p; a--)		\
328       pattern[a + 3] = pattern[a];			\
329     pattern[p] = t;					\
330     PUT_ADDR(p+1,ad);					\
331     pattern_offset += 3;				\
332   MACRO_END
333 
334 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
335 
336 #define SET_FIELDS				\
337   MACRO_BEGIN					\
338     bufp->allocated = alloc;			\
339     bufp->buffer = pattern;			\
340     bufp->used = pattern_offset;		\
341   MACRO_END
342 
343 #define GETHEX(var)						\
344   MACRO_BEGIN							\
345     char gethex_ch, gethex_value;				\
346     NEXTCHAR(gethex_ch);					\
347     gethex_value = hex_char_to_decimal(gethex_ch);		\
348     if (gethex_value == 16)					\
349       goto hex_error;						\
350     NEXTCHAR(gethex_ch);					\
351     gethex_ch = hex_char_to_decimal(gethex_ch);			\
352     if (gethex_ch == 16)					\
353       goto hex_error;						\
354     (var) = gethex_value * 16 + gethex_ch;			\
355   MACRO_END
356 
357 #define ANSI_TRANSLATE(ch)				\
358   MACRO_BEGIN						\
359     switch (ch)						\
360       {							\
361       case 'a':						\
362       case 'A':						\
363 	ch = 7; /* audible bell */			\
364 	break;						\
365       case 'b':						\
366       case 'B':						\
367 	ch = 8; /* backspace */				\
368 	break;						\
369       case 'f':						\
370       case 'F':						\
371 	ch = 12; /* form feed */			\
372 	break;						\
373       case 'n':						\
374       case 'N':						\
375 	ch = 10; /* line feed */			\
376 	break;						\
377       case 'r':						\
378       case 'R':						\
379 	ch = 13; /* carriage return */			\
380 	break;						\
381       case 't':						\
382       case 'T':						\
383 	ch = 9; /* tab */				\
384 	break;						\
385       case 'v':						\
386       case 'V':						\
387 	ch = 11; /* vertical tab */			\
388 	break;						\
389       case 'x': /* hex code */				\
390       case 'X':						\
391 	GETHEX(ch);					\
392 	break;						\
393       default:						\
394 	/* other characters passed through */		\
395 	if (translate)					\
396 	  ch = translate[(unsigned char)ch];		\
397 	break;						\
398       }							\
399   MACRO_END
400 
401   if (!re_compile_initialized)
402     re_compile_initialize();
403   bufp->used = 0;
404   bufp->fastmap_accurate = 0;
405   bufp->uses_registers = 0;
406   translate = bufp->translate;
407   pattern = bufp->buffer;
408   alloc = bufp->allocated;
409   if (alloc == 0 || pattern == NULL)
410     {
411       alloc = 256;
412       pattern = malloc(alloc);
413       if (!pattern)
414 	goto out_of_memory;
415     }
416   pattern_offset = 0;
417   starts_base = 0;
418   num_jumps = 0;
419   current_level = 0;
420   SET_LEVEL_START;
421   num_open_registers = 0;
422   next_register = 1;
423   paren_depth = 0;
424   beginning_context = 1;
425   op = -1;
426   /* we use Rend dummy to ensure that pending jumps are updated (due to
427      low priority of Rend) before exiting the loop. */
428   pos = 0;
429   while (op != Rend)
430     {
431       if (pos >= size)
432 	op = Rend;
433       else
434 	{
435 	  NEXTCHAR(ch);
436 	  if (translate)
437 	    ch = translate[(unsigned char)ch];
438 	  op = regexp_plain_ops[(unsigned char)ch];
439 	  if (op == Rquote)
440 	    {
441 	      NEXTCHAR(ch);
442 	      op = regexp_quoted_ops[(unsigned char)ch];
443 	      if (op == Rnormal && regexp_ansi_sequences)
444 		ANSI_TRANSLATE(ch);
445 	    }
446 	}
447       level = regexp_precedences[op];
448       /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
449 	     ch, op, level, current_level, CURRENT_LEVEL_START); */
450       if (level > current_level)
451 	{
452 	  for (current_level++; current_level < level; current_level++)
453 	    SET_LEVEL_START;
454 	  SET_LEVEL_START;
455 	}
456       else
457 	if (level < current_level)
458 	  {
459 	    current_level = level;
460 	    for (;num_jumps > 0 &&
461 		 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
462 		 num_jumps--)
463 	      PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
464 	  }
465       switch (op)
466 	{
467 	case Rend:
468 	  break;
469 	case Rnormal:
470 	normal_char:
471 	  opcode = Cexact;
472 	store_opcode_and_arg: /* opcode & ch must be set */
473 	  SET_LEVEL_START;
474 	  ALLOC(2);
475 	  STORE(opcode);
476 	  STORE(ch);
477 	  break;
478 	case Ranychar:
479 	  opcode = Canychar;
480 	store_opcode:
481 	  SET_LEVEL_START;
482 	  ALLOC(1);
483 	  STORE(opcode);
484 	  break;
485 	case Rquote:
486 	  abort();
487 	  /*NOTREACHED*/
488 	case Rbol:
489 	  if (!beginning_context)
490 	    if (regexp_context_indep_ops)
491 	      goto op_error;
492 	    else
493 	      goto normal_char;
494 	  opcode = Cbol;
495 	  goto store_opcode;
496 	case Reol:
497 	  if (!((pos >= size) ||
498 		((regexp_syntax & RE_NO_BK_VBAR) ?
499 		 (regex[pos] == '\174') :
500 		 (pos+1 < size && regex[pos] == '\134' &&
501 		  regex[pos+1] == '\174')) ||
502 		((regexp_syntax & RE_NO_BK_PARENS)?
503 		 (regex[pos] == ')'):
504 		 (pos+1 < size && regex[pos] == '\134' &&
505 		  regex[pos+1] == ')'))))
506 	    if (regexp_context_indep_ops)
507 	      goto op_error;
508 	    else
509 	      goto normal_char;
510 	  opcode = Ceol;
511 	  goto store_opcode;
512 	  break;
513 	case Roptional:
514 	  if (beginning_context)
515 	    if (regexp_context_indep_ops)
516 	      goto op_error;
517 	    else
518 	      goto normal_char;
519 	  if (CURRENT_LEVEL_START == pattern_offset)
520 	    break; /* ignore empty patterns for ? */
521 	  ALLOC(3);
522 	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
523 		      pattern_offset + 3);
524 	  break;
525 	case Rstar:
526 	case Rplus:
527 	  if (beginning_context)
528 	    if (regexp_context_indep_ops)
529 	      goto op_error;
530 	    else
531 	      goto normal_char;
532 	  if (CURRENT_LEVEL_START == pattern_offset)
533 	    break; /* ignore empty patterns for + and * */
534 	  ALLOC(9);
535 	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
536 		      pattern_offset + 6);
537 	  INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
538 	  if (op == Rplus)  /* jump over initial failure_jump */
539 	    INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
540 			CURRENT_LEVEL_START + 6);
541 	  break;
542 	case Ror:
543 	  ALLOC(6);
544 	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
545 		      pattern_offset + 6);
546 	  if (num_jumps >= MAX_NESTING)
547 	    goto too_complex;
548 	  STORE(Cjump);
549 	  future_jumps[num_jumps++] = pattern_offset;
550 	  STORE(0);
551 	  STORE(0);
552 	  SET_LEVEL_START;
553 	  break;
554 	case Ropenpar:
555 	  SET_LEVEL_START;
556 	  if (next_register < RE_NREGS)
557 	    {
558 	      bufp->uses_registers = 1;
559 	      ALLOC(2);
560 	      STORE(Cstart_memory);
561 	      STORE(next_register);
562 	      open_registers[num_open_registers++] = next_register;
563 	      next_register++;
564 	    }
565 	  paren_depth++;
566 	  PUSH_LEVEL_STARTS;
567 	  current_level = 0;
568 	  SET_LEVEL_START;
569 	  break;
570 	case Rclosepar:
571 	  if (paren_depth <= 0)
572 	    goto parenthesis_error;
573 	  POP_LEVEL_STARTS;
574 	  current_level = regexp_precedences[Ropenpar];
575 	  paren_depth--;
576 	  if (paren_depth < num_open_registers)
577 	    {
578 	      bufp->uses_registers = 1;
579 	      ALLOC(2);
580 	      STORE(Cend_memory);
581 	      num_open_registers--;
582 	      STORE(open_registers[num_open_registers]);
583 	    }
584 	  break;
585 	case Rmemory:
586 	  if (ch == '0')
587 	    goto bad_match_register;
588 	  assert(ch >= '0' && ch <= '9');
589 	  bufp->uses_registers = 1;
590 	  opcode = Cmatch_memory;
591 	  ch -= '0';
592 	  goto store_opcode_and_arg;
593 	case Rextended_memory:
594 	  NEXTCHAR(ch);
595 	  if (ch < '0' || ch > '9')
596 	    goto bad_match_register;
597 	  NEXTCHAR(a);
598 	  if (a < '0' || a > '9')
599 	    goto bad_match_register;
600 	  ch = 10 * (a - '0') + ch - '0';
601 	  if (ch <= 0 || ch >= RE_NREGS)
602 	    goto bad_match_register;
603 	  bufp->uses_registers = 1;
604 	  opcode = Cmatch_memory;
605 	  goto store_opcode_and_arg;
606 	case Ropenset:
607 	  {
608 	    int complement,prev,offset,range,firstchar;
609 
610 	    SET_LEVEL_START;
611 	    ALLOC(1+256/8);
612 	    STORE(Cset);
613 	    offset = pattern_offset;
614 	    for (a = 0; a < 256/8; a++)
615 	      STORE(0);
616 	    NEXTCHAR(ch);
617 	    if (translate)
618 	      ch = translate[(unsigned char)ch];
619 	    if (ch == '\136')
620 	      {
621 		complement = 1;
622 		NEXTCHAR(ch);
623 		if (translate)
624 		  ch = translate[(unsigned char)ch];
625 	      }
626 	    else
627 	      complement = 0;
628 	    prev = -1;
629 	    range = 0;
630 	    firstchar = 1;
631 	    while (ch != '\135' || firstchar)
632 	      {
633 		firstchar = 0;
634 		if (regexp_ansi_sequences && ch == '\134')
635 		  {
636 		    NEXTCHAR(ch);
637 		    ANSI_TRANSLATE(ch);
638 		  }
639 		if (range)
640 		  {
641 		    for (a = prev; a <= ch; a++)
642 		      SETBIT(pattern, offset, a);
643 		    prev = -1;
644 		    range = 0;
645 		  }
646 		else
647 		  if (prev != -1 && ch == '-')
648 		    range = 1;
649 		  else
650 		    {
651 		      SETBIT(pattern, offset, ch);
652 		      prev = ch;
653 		    }
654 		NEXTCHAR(ch);
655 		if (translate)
656 		  ch = translate[(unsigned char)ch];
657 	      }
658 	    if (range)
659 	      SETBIT(pattern, offset, '-');
660 	    if (complement)
661 	      {
662 		for (a = 0; a < 256/8; a++)
663 		  pattern[offset+a] ^= 0xff;
664 	      }
665 	    break;
666 	  }
667 	case Rbegbuf:
668 	  opcode = Cbegbuf;
669 	  goto store_opcode;
670 	case Rendbuf:
671 	  opcode = Cendbuf;
672 	  goto store_opcode;
673 	case Rwordchar:
674 	  opcode = Csyntaxspec;
675 	  ch = Sword;
676 	  goto store_opcode_and_arg;
677 	case Rnotwordchar:
678 	  opcode = Cnotsyntaxspec;
679 	  ch = Sword;
680 	  goto store_opcode_and_arg;
681 	case Rwordbeg:
682 	  opcode = Cwordbeg;
683 	  goto store_opcode;
684 	case Rwordend:
685 	  opcode = Cwordend;
686 	  goto store_opcode;
687 	case Rwordbound:
688 	  opcode = Cwordbound;
689 	  goto store_opcode;
690 	case Rnotwordbound:
691 	  opcode = Cnotwordbound;
692 	  goto store_opcode;
693 #ifdef emacs
694 	case Remacs_at_dot:
695 	  opcode = Cemacs_at_dot;
696 	  goto store_opcode;
697 	case Remacs_syntaxspec:
698 	  NEXTCHAR(ch);
699 	  if (translate)
700 	    ch = translate[(unsigned char)ch];
701 	  opcode = Csyntaxspec;
702 	  ch = syntax_spec_code[(unsigned char)ch];
703 	  goto store_opcode_and_arg;
704 	case Remacs_notsyntaxspec:
705 	  NEXTCHAR(ch);
706 	  if (translate)
707 	    ch = translate[(unsigned char)ch];
708 	  opcode = Cnotsyntaxspec;
709 	  ch = syntax_spec_code[(unsigned char)ch];
710 	  goto store_opcode_and_arg;
711 #endif /* emacs */
712 	default:
713 	  abort();
714 	}
715       beginning_context = (op == Ropenpar || op == Ror);
716     }
717   if (starts_base != 0)
718     goto parenthesis_error;
719   assert(num_jumps == 0);
720   ALLOC(1);
721   STORE(Cend);
722   SET_FIELDS;
723   return NULL;
724 
725  op_error:
726   SET_FIELDS;
727   return "Badly placed special character";
728 
729  bad_match_register:
730   SET_FIELDS;
731   return "Bad match register number";
732 
733  hex_error:
734   SET_FIELDS;
735   return "Bad hexadecimal number";
736 
737  parenthesis_error:
738   SET_FIELDS;
739   return "Badly placed parenthesis";
740 
741  out_of_memory:
742   SET_FIELDS;
743   return "Out of memory";
744 
745  ends_prematurely:
746   SET_FIELDS;
747   return "Regular expression ends prematurely";
748 
749  too_complex:
750   SET_FIELDS;
751   return "Regular expression too complex";
752 }
753 #undef CHARAT
754 #undef NEXTCHAR
755 #undef GETHEX
756 #undef ALLOC
757 #undef STORE
758 #undef CURRENT_LEVEL_START
759 #undef SET_LEVEL_START
760 #undef PUSH_LEVEL_STARTS
761 #undef POP_LEVEL_STARTS
762 #undef PUT_ADDR
763 #undef INSERT_JUMP
764 #undef SETBIT
765 #undef SET_FIELDS
766 
re_compile_fastmap_aux(code,pos,visited,can_be_null,fastmap)767 static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
768 char *code, *visited, *can_be_null, *fastmap;
769 int pos;
770 {
771   int a, b, syntaxcode;
772 
773   if (visited[pos])
774     return;  /* we have already been here */
775   visited[pos] = 1;
776   for (;;)
777     switch (code[pos++])
778       {
779       case Cend:
780 	*can_be_null = 1;
781 	return;
782       case Cbol:
783       case Cbegbuf:
784       case Cendbuf:
785       case Cwordbeg:
786       case Cwordend:
787       case Cwordbound:
788       case Cnotwordbound:
789 #ifdef emacs
790       case Cemacs_at_dot:
791 #endif /* emacs */
792 	break;
793       case Csyntaxspec:
794 	syntaxcode = code[pos++];
795 	for (a = 0; a < 256; a++)
796 	  if (SYNTAX(a) == syntaxcode)
797 	    fastmap[a] = 1;
798 	return;
799       case Cnotsyntaxspec:
800 	syntaxcode = code[pos++];
801 	for (a = 0; a < 256; a++)
802 	  if (SYNTAX(a) != syntaxcode)
803 	    fastmap[a] = 1;
804 	return;
805       case Ceol:
806 	fastmap['\n'] = 1;
807 	if (*can_be_null == 0)
808 	  *can_be_null = 2;  /* can match null, but only at end of buffer*/
809 	return;
810       case Cset:
811 	for (a = 0; a < 256/8; a++)
812 	  if (code[pos + a] != 0)
813 	    for (b = 0; b < 8; b++)
814 	      if (code[pos + a] & (1 << b))
815 		fastmap[(a << 3) + b] = 1;
816 	pos += 256/8;
817 	return;
818       case Cexact:
819 	fastmap[(unsigned char)code[pos]] = 1;
820 	return;
821       case Canychar:
822 	for (a = 0; a < 256; a++)
823 	  if (a != '\n')
824 	    fastmap[a] = 1;
825 	return;
826       case Cstart_memory:
827       case Cend_memory:
828 	pos++;
829 	break;
830       case Cmatch_memory:
831 	for (a = 0; a < 256; a++)
832 	  fastmap[a] = 1;
833 	*can_be_null = 1;
834 	return;
835       case Cjump:
836       case Cdummy_failure_jump:
837       case Cupdate_failure_jump:
838       case Cstar_jump:
839 	a = (unsigned char)code[pos++];
840 	a |= (unsigned char)code[pos++] << 8;
841 	pos += (int)(short)a;
842 	if (visited[pos])
843 	  {
844 	    /* argh... the regexp contains empty loops.  This is not
845 	       good, as this may cause a failure stack overflow when
846 	       matching.  Oh well. */
847 	    /* this path leads nowhere; pursue other paths. */
848 	    return;
849 	  }
850 	visited[pos] = 1;
851 	break;
852       case Cfailure_jump:
853 	a = (unsigned char)code[pos++];
854 	a |= (unsigned char)code[pos++] << 8;
855 	a = pos + (int)(short)a;
856 	re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
857 	break;
858       default:
859 	abort();  /* probably some opcode is missing from this switch */
860 	/*NOTREACHED*/
861       }
862 }
863 
re_do_compile_fastmap(buffer,used,pos,can_be_null,fastmap)864 static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
865 char *buffer, *fastmap, *can_be_null;
866 int used, pos;
867 {
868   char small_visited[512], *visited;
869 
870   if (used <= sizeof(small_visited))
871     visited = small_visited;
872   else
873     {
874       visited = malloc(used);
875       if (!visited)
876 	return 0;
877     }
878   *can_be_null = 0;
879   memset(fastmap, 0, 256);
880   memset(visited, 0, used);
881   re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
882   if (visited != small_visited)
883     free(visited);
884   return 1;
885 }
886 
re_compile_fastmap(bufp)887 void re_compile_fastmap(bufp)
888 regexp_t bufp;
889 {
890   if (!bufp->fastmap || bufp->fastmap_accurate)
891     return;
892   assert(bufp->used > 0);
893   if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
894 			     bufp->fastmap))
895     return;
896   if (bufp->buffer[0] == Cbol)
897     bufp->anchor = 1;   /* begline */
898   else
899     if (bufp->buffer[0] == Cbegbuf)
900       bufp->anchor = 2; /* begbuf */
901     else
902       bufp->anchor = 0; /* none */
903   bufp->fastmap_accurate = 1;
904 }
905 
906 #define INITIAL_FAILURES  128  /* initial # failure points to allocate */
907 #define MAX_FAILURES     4100  /* max # of failure points before failing */
908 
re_match_pattern_2(bufp,string1,size1,string2,size2,pos,regs,mstop)909 int re_match_pattern_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
910 regexp_t bufp;
911 char *string1, *string2;
912 int size1, size2, pos, mstop;
913 regexp_registers_t regs;
914 {
915   struct failure_point { char *text, *partend, *code; }
916     *failure_stack_start, *failure_sp, *failure_stack_end,
917     initial_failure_stack[INITIAL_FAILURES];
918   char *code, *translate, *text, *textend, *partend, *part_2_end;
919   char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
920   char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
921   int a, b, ch, reg, regch, match_end;
922   char *regtext, *regpartend, *regtextend;
923 
924 #define PREFETCH					\
925   MACRO_BEGIN						\
926     if (text == partend)				\
927       {							\
928 	if (text == textend)				\
929 	  goto fail;					\
930 	text = string2;					\
931 	partend = part_2_end;				\
932       }							\
933   MACRO_END
934 
935 #define NEXTCHAR(var)				\
936   MACRO_BEGIN					\
937     PREFETCH;					\
938     (var) = (unsigned char)*text++;		\
939     if (translate)				\
940       (var) = (unsigned char)translate[(var)];	\
941   MACRO_END
942 
943   assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
944   assert(mstop <= size1 + size2);
945   assert(pos <= mstop);
946 
947   if (pos <= size1)
948     {
949       text = string1 + pos;
950       if (mstop <= size1)
951 	{
952 	  partend = string1 + mstop;
953 	  textend = partend;
954 	}
955       else
956 	{
957 	  partend = string1 + size1;
958 	  textend = string2 + mstop - size1;
959 	}
960       part_2_end = string2 + mstop - size1;
961     }
962   else
963     {
964       text = string2 + pos - size1;
965       partend = string2 + mstop - size1;
966       textend = partend;
967       part_2_end = partend;
968     }
969 
970   if (bufp->uses_registers && regs != NULL)
971     for (a = 0; a < RE_NREGS; a++)
972       regend_text[a] = NULL;
973 
974   code = bufp->buffer;
975   translate = bufp->translate;
976   failure_stack_start = failure_sp = initial_failure_stack;
977   failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
978 
979 #if 0
980   /* re_search_pattern_2 has already done this, and otherwise we get little benefit
981      from this.  So I'll leave this out. */
982   if (bufp->fastmap_accurate && !bufp->can_be_null &&
983       text != textend &&
984       !bufp->fastmap[translate ?
985 		     (unsigned char)translate[(unsigned char)*text] :
986 		     (unsigned char)*text])
987     return -1;  /* it can't possibly match */
988 #endif
989 
990  continue_matching:
991   for (;;)
992     {
993       switch (*code++)
994 	{
995 	case Cend:
996 	  if (partend != part_2_end)
997 	    match_end = text - string1;
998 	  else
999 	    match_end = text - string2 + size1;
1000 	  if (regs)
1001 	    {
1002 	      regs->start[0] = pos;
1003 	      regs->end[0] = match_end;
1004 	      if (!bufp->uses_registers)
1005 		{
1006 		  for (a = 1; a < RE_NREGS; a++)
1007 		    {
1008 		      regs->start[a] = -1;
1009 		      regs->end[a] = -1;
1010 		    }
1011 		}
1012 	      else
1013 		{
1014 		  for (a = 1; a < RE_NREGS; a++)
1015 		    {
1016 		      if (regend_text[a] == NULL)
1017 			{
1018 			  regs->start[a] = -1;
1019 			  regs->end[a] = -1;
1020 			  continue;
1021 			}
1022 		      if (regstart_partend[a] != part_2_end)
1023 			regs->start[a] = regstart_text[a] - string1;
1024 		      else
1025 			regs->start[a] = regstart_text[a] - string2 + size1;
1026 		      if (regend_partend[a] != part_2_end)
1027 			regs->end[a] = regend_text[a] - string1;
1028 		      else
1029 			regs->end[a] = regend_text[a] - string2 + size1;
1030 		    }
1031 		}
1032 	    }
1033 	  if (failure_stack_start != initial_failure_stack)
1034 	    free((char *)failure_stack_start);
1035 	  return match_end - pos;
1036 	case Cbol:
1037 	  if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1038 	    break;
1039 	  goto fail;
1040 	case Ceol:
1041 	  if (text == string2 + size2 ||
1042 	      (text == string1 + size1 ?
1043 	       (size2 == 0 || *string2 == '\n') :
1044 	       *text == '\n'))
1045 	    break;
1046 	  goto fail;
1047 	case Cset:
1048 	  NEXTCHAR(ch);
1049 	  if (code[ch/8] & (1<<(ch & 7)))
1050 	    {
1051 	      code += 256/8;
1052 	      break;
1053 	    }
1054 	  goto fail;
1055 	case Cexact:
1056 	  NEXTCHAR(ch);
1057 	  if (ch != (unsigned char)*code++)
1058 	    goto fail;
1059 	  break;
1060 	case Canychar:
1061 	  NEXTCHAR(ch);
1062 	  if (ch == '\n')
1063 	    goto fail;
1064 	  break;
1065 	case Cstart_memory:
1066 	  reg = *code++;
1067 	  regstart_text[reg] = text;
1068 	  regstart_partend[reg] = partend;
1069 	  break;
1070 	case Cend_memory:
1071 	  reg = *code++;
1072 	  regend_text[reg] = text;
1073 	  regend_partend[reg] = partend;
1074 	  break;
1075 	case Cmatch_memory:
1076 	  reg = *code++;
1077 	  if (regend_text[reg] == NULL)
1078 	    goto fail;  /* or should we just match nothing? */
1079 	  regtext = regstart_text[reg];
1080 	  regtextend = regend_text[reg];
1081 	  if (regstart_partend[reg] == regend_partend[reg])
1082 	    regpartend = regtextend;
1083 	  else
1084 	    regpartend = string1 + size1;
1085 
1086 	  for (;regtext != regtextend;)
1087 	    {
1088 	      NEXTCHAR(ch);
1089 	      if (regtext == regpartend)
1090 		regtext = string2;
1091 	      regch = (unsigned char)*regtext++;
1092 	      if (translate)
1093 		regch = (unsigned char)translate[regch];
1094 	      if (regch != ch)
1095 		goto fail;
1096 	    }
1097 	  break;
1098 	case Cstar_jump:
1099 	  /* star is coded as:
1100 	       1: failure_jump 2
1101 	          ... code for operand of star
1102 		  star_jump 1
1103 	       2: ... code after star
1104 	     We change the star_jump to update_failure_jump if we can determine
1105 	     that it is safe to do so; otherwise we change it to an ordinary
1106 	     jump.
1107 	     plus is coded as
1108 	          jump 2
1109 	       1: failure_jump 3
1110 	       2: ... code for operand of plus
1111 	          star_jump 1
1112 	       3: ... code after plus
1113 	     For star_jump considerations this is processed identically
1114 	     to star. */
1115 	  a = (unsigned char)*code++;
1116 	  a |= (unsigned char)*code++ << 8;
1117 	  a = (int)(short)a;
1118 	  {
1119 	    char map[256], can_be_null;
1120 	    char *p1, *p2;
1121 
1122 	    p1 = code + a + 3; /* skip the failure_jump */
1123 	    assert(p1[-3] == Cfailure_jump);
1124 	    p2 = code;
1125 	    /* p1 points inside loop, p2 points to after loop */
1126 	    if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1127 				       p2 - bufp->buffer, &can_be_null, map))
1128 	      goto make_normal_jump;
1129 	    /* If we might introduce a new update point inside the loop,
1130 	       we can't optimize because then update_jump would update a
1131 	       wrong failure point.  Thus we have to be quite careful here. */
1132 	  loop_p1:
1133 	    /* loop until we find something that consumes a character */
1134 	    switch (*p1++)
1135 	      {
1136               case Cbol:
1137               case Ceol:
1138               case Cbegbuf:
1139               case Cendbuf:
1140               case Cwordbeg:
1141               case Cwordend:
1142               case Cwordbound:
1143               case Cnotwordbound:
1144 #ifdef emacs
1145               case Cemacs_at_dot:
1146 #endif /* emacs */
1147                 goto loop_p1;
1148               case Cstart_memory:
1149               case Cend_memory:
1150                 p1++;
1151                 goto loop_p1;
1152 	      case Cexact:
1153 		ch = (unsigned char)*p1++;
1154 		if (map[ch])
1155 		  goto make_normal_jump;
1156 		break;
1157 	      case Canychar:
1158 		for (b = 0; b < 256; b++)
1159 		  if (b != '\n' && map[b])
1160 		    goto make_normal_jump;
1161 		break;
1162 	      case Cset:
1163 		for (b = 0; b < 256; b++)
1164 		  if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1165 		    goto make_normal_jump;
1166 		p1 += 256/8;
1167 		break;
1168 	      default:
1169 		goto make_normal_jump;
1170 	      }
1171 	    /* now we know that we can't backtrack. */
1172 	    while (p1 != p2 - 3)
1173 	      {
1174 		switch (*p1++)
1175 		  {
1176 		  case Cend:
1177 		    abort();  /* we certainly shouldn't get this inside loop */
1178 		    /*NOTREACHED*/
1179 		  case Cbol:
1180 		  case Ceol:
1181 		  case Canychar:
1182 		  case Cbegbuf:
1183 		  case Cendbuf:
1184 		  case Cwordbeg:
1185 		  case Cwordend:
1186 		  case Cwordbound:
1187 		  case Cnotwordbound:
1188 #ifdef emacs
1189 		  case Cemacs_at_dot:
1190 #endif /* emacs */
1191 		    break;
1192 		  case Cset:
1193 		    p1 += 256/8;
1194 		    break;
1195 		  case Cexact:
1196 		  case Cstart_memory:
1197 		  case Cend_memory:
1198 		  case Cmatch_memory:
1199 		  case Csyntaxspec:
1200 		  case Cnotsyntaxspec:
1201 		    p1++;
1202 		    break;
1203 		  case Cjump:
1204 		  case Cstar_jump:
1205 		  case Cfailure_jump:
1206 		  case Cupdate_failure_jump:
1207 		  case Cdummy_failure_jump:
1208 		    goto make_normal_jump;
1209 		  default:
1210 		    printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1211 		    break;
1212 		  }
1213 	      }
1214 	    goto make_update_jump;
1215 	  }
1216 	make_normal_jump:
1217 	  /* printf("changing to normal jump\n"); */
1218 	  code -= 3;
1219 	  *code = Cjump;
1220 	  break;
1221 	make_update_jump:
1222 	  /* printf("changing to update jump\n"); */
1223 	  code -= 2;
1224 	  a += 3;  /* jump to after the Cfailure_jump */
1225 	  code[-1] = Cupdate_failure_jump;
1226 	  code[0] = a & 0xff;
1227 	  code[1] = a >> 8;
1228 	  /* fall to next case */
1229 	case Cupdate_failure_jump:
1230 	  failure_sp[-1].text = text;
1231 	  failure_sp[-1].partend = partend;
1232 	  /* fall to next case */
1233 	case Cjump:
1234 	  a = (unsigned char)*code++;
1235 	  a |= (unsigned char)*code++ << 8;
1236 	  code += (int)(short)a;
1237 	  break;
1238 	case Cdummy_failure_jump:
1239 	case Cfailure_jump:
1240 	  if (failure_sp == failure_stack_end)
1241 	    {
1242 	      if (failure_stack_start != initial_failure_stack)
1243 		goto error;
1244 	      failure_stack_start = (struct failure_point *)
1245 		malloc(MAX_FAILURES * sizeof(*failure_stack_start));
1246 	      failure_stack_end = failure_stack_start + MAX_FAILURES;
1247 	      memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1248 		     INITIAL_FAILURES * sizeof(*failure_stack_start));
1249 	      failure_sp = failure_stack_start + INITIAL_FAILURES;
1250 	    }
1251 	  a = (unsigned char)*code++;
1252 	  a |= (unsigned char)*code++ << 8;
1253 	  a = (int)(short)a;
1254 	  if (code[-3] == Cdummy_failure_jump)
1255 	    { /* this is only used in plus */
1256 	      assert(*code == Cfailure_jump);
1257 	      b = (unsigned char)code[1];
1258 	      b |= (unsigned char)code[2] << 8;
1259 	      failure_sp->code = code + (int)(short)b + 3;
1260 	      failure_sp->text = NULL;
1261 	      code += a;
1262 	    }
1263 	  else
1264 	    {
1265 	      failure_sp->code = code + a;
1266 	      failure_sp->text = text;
1267 	      failure_sp->partend = partend;
1268 	    }
1269 	  failure_sp++;
1270 	  break;
1271 	case Cbegbuf:
1272 	  if (text == string1)
1273 	    break;
1274 	  goto fail;
1275 	case Cendbuf:
1276 	  if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1277 	    break;
1278 	  goto fail;
1279 	case Cwordbeg:
1280 	  if (text == string2 + size2)
1281 	    goto fail;
1282 	  if (size2 == 0 && text == string1 + size1)
1283 	    goto fail;
1284 	  if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1285 	    goto fail;
1286 	  if (text == string1)
1287 	    break;
1288 	  if (SYNTAX(text[-1]) != Sword)
1289 	    break;
1290 	  goto fail;
1291 	case Cwordend:
1292 	  if (text == string1)
1293 	    goto fail;
1294 	  if (SYNTAX(text[-1]) != Sword)
1295 	    goto fail;
1296 	  if (text == string2 + size2)
1297 	    break;
1298 	  if (size2 == 0 && text == string1 + size1)
1299 	    break;
1300 	  if (SYNTAX(*text) == Sword)
1301 	    goto fail;
1302 	  break;
1303 	case Cwordbound:
1304 	  /* Note: as in gnu regexp, this also matches at the beginning
1305 	     and end of buffer. */
1306 	  if (text == string1 || text == string2 + size2 ||
1307 	      (size2 == 0 && text == string1 + size1))
1308 	    break;
1309 	  if ((SYNTAX(text[-1]) == Sword) ^
1310 	      (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1311 	    break;
1312 	  goto fail;
1313 	case Cnotwordbound:
1314 	  /* Note: as in gnu regexp, this never matches at the beginning
1315 	     and end of buffer. */
1316 	  if (text == string1 || text == string2 + size2 ||
1317 	      (size2 == 0 && text == string1 + size1))
1318 	    goto fail;
1319 	  if (!((SYNTAX(text[-1]) == Sword) ^
1320 		(SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1321 	    goto fail;
1322 	  break;
1323 	case Csyntaxspec:
1324 	  NEXTCHAR(ch);
1325 	  if (SYNTAX(ch) != (unsigned char)*code++)
1326 	    goto fail;
1327 	  break;
1328 	case Cnotsyntaxspec:
1329 	  NEXTCHAR(ch);
1330 	  if (SYNTAX(ch) != (unsigned char)*code++)
1331 	    break;
1332 	  goto fail;
1333 #ifdef emacs
1334 	case Cemacs_at_dot:
1335 	  if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1336 	    goto fail;
1337 	  break;
1338 #endif /* emacs */
1339 	default:
1340 	  abort();
1341 	  /*NOTREACHED*/
1342 	}
1343     }
1344   abort();
1345   /*NOTREACHED*/
1346 
1347  fail:
1348   if (failure_sp != failure_stack_start)
1349     {
1350       failure_sp--;
1351       text = failure_sp->text;
1352       if (text == NULL)
1353 	goto fail;
1354       partend = failure_sp->partend;
1355       code = failure_sp->code;
1356       goto continue_matching;
1357     }
1358   if (failure_stack_start != initial_failure_stack)
1359     free((char *)failure_stack_start);
1360   return -1;
1361 
1362  error:
1363   if (failure_stack_start != initial_failure_stack)
1364     free((char *)failure_stack_start);
1365   return -2;
1366 }
1367 
1368 #undef PREFETCH
1369 #undef NEXTCHAR
1370 #undef PUSH_FAILURE
1371 
re_match_pattern(bufp,string,size,pos,regs)1372 int re_match_pattern(bufp, string, size, pos, regs)
1373 regexp_t bufp;
1374 char *string;
1375 int size, pos;
1376 regexp_registers_t regs;
1377 {
1378   return re_match_pattern_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1379 }
1380 
re_search_pattern_2(bufp,string1,size1,string2,size2,pos,range,regs,mstop)1381 int re_search_pattern_2(bufp, string1, size1, string2, size2, pos, range, regs,
1382 		mstop)
1383 regexp_t bufp;
1384 char *string1, *string2;
1385 int size1, size2, pos, range, mstop;
1386 regexp_registers_t regs;
1387 {
1388   char *fastmap, *translate, *text, *partstart, *partend;
1389   int dir, ret;
1390   char anchor;
1391 
1392   assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
1393   assert(pos + range >= 0 && pos + range <= size1 + size2);
1394   assert(pos <= mstop);
1395 
1396   fastmap = bufp->fastmap;
1397   translate = bufp->translate;
1398   if (fastmap && !bufp->fastmap_accurate)
1399     re_compile_fastmap(bufp);
1400   anchor = bufp->anchor;
1401   if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1402     fastmap = NULL;
1403   if (range < 0)
1404     {
1405       dir = -1;
1406       range = -range;
1407     }
1408   else
1409     dir = 1;
1410   if (anchor == 2)
1411     if (pos != 0)
1412       return -1;
1413     else
1414       range = 0;
1415   for (; range >= 0; range--, pos += dir)
1416     {
1417       if (fastmap)
1418 	{
1419 	  if (dir == 1)
1420 	    { /* searching forwards */
1421 	      if (pos < size1)
1422 		{
1423 		  text = string1 + pos;
1424 		  if (pos + range > size1)
1425 		    partend = string1 + size1;
1426 		  else
1427 		    partend = string1 + pos + range;
1428 		}
1429 	      else
1430 		{
1431 		  text = string2 + pos - size1;
1432 		  partend = string2 + pos + range - size1;
1433 		}
1434 	      partstart = text;
1435 	      if (translate)
1436 		while (text != partend &&
1437 		       !fastmap[(unsigned char)
1438 				translate[(unsigned char)*text]])
1439 		  text++;
1440 	      else
1441 		while (text != partend && !fastmap[(unsigned char)*text])
1442 		  text++;
1443 	      pos += text - partstart;
1444 	      range -= text - partstart;
1445 	      if (pos == size1 + size2 && bufp->can_be_null == 0)
1446 		return -1;
1447 	    }
1448 	  else
1449 	    { /* searching backwards */
1450 	      if (pos <= size1)
1451 		{
1452 		  text = string1 + pos;
1453 		  partstart = string1 + pos - range;
1454 		}
1455 	      else
1456 		{
1457 		  text = string2 + pos - size1;
1458 		  if (range < pos - size1)
1459 		    partstart = string2 + pos - size1 - range;
1460 		  else
1461 		    partstart = string2;
1462 		}
1463 	      partend = text;
1464 	      if (translate)
1465 		while (text != partstart &&
1466 		       !fastmap[(unsigned char)
1467 				translate[(unsigned char)*text]])
1468 		  text--;
1469 	      else
1470 		while (text != partstart &&
1471 		       !fastmap[(unsigned char)*text])
1472 		  text--;
1473 	      pos -= partend - text;
1474 	      range -= partend - text;
1475 	    }
1476 	}
1477       if (anchor == 1)
1478 	{ /* anchored to begline */
1479 	  if (pos > 0 &&
1480 	      (pos <= size1 ? string1[pos - 1] :
1481 	       string2[pos - size1 - 1]) != '\n')
1482 	    continue;
1483 	}
1484       assert(pos >= 0 && pos <= size1 + size2);
1485       ret = re_match_pattern_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1486       if (ret >= 0)
1487 	return pos;
1488       if (ret == -2)
1489 	return -2;
1490     }
1491   return -1;
1492 }
1493 
re_search_pattern(bufp,string,size,startpos,range,regs)1494 int re_search_pattern(bufp, string, size, startpos, range, regs)
1495 regexp_t bufp;
1496 char *string;
1497 int size, startpos, range;
1498 regexp_registers_t regs;
1499 {
1500   return re_search_pattern_2(bufp, string, size, (char *)NULL, 0,
1501 		     startpos, range, regs, size);
1502 }
1503 
1504 /* 94/01/19 (cjk) hide these on NeXT */
1505 #if !defined(NeXT)
1506 static struct re_pattern_buffer re_comp_buf;
1507 
re_comp(s)1508 char *re_comp(s)
1509 char *s;
1510 {
1511   if (s == NULL)
1512     {
1513       if (!re_comp_buf.buffer)
1514 	return "Out of memory";
1515       return NULL;
1516     }
1517   if (!re_comp_buf.buffer)
1518     {
1519       /* the buffer will be allocated automatically */
1520       re_comp_buf.fastmap = malloc(256);
1521       re_comp_buf.translate = NULL;
1522     }
1523   return re_compile_pattern(s, strlen(s), &re_comp_buf);
1524 }
1525 
re_exec(s)1526 int re_exec(s)
1527 char *s;
1528 {
1529   int len = strlen(s);
1530 
1531   return re_search_pattern(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1532 }
1533 #endif /* !defined(NeXT) */
1534 
1535 #ifdef TEST_REGEXP
1536 
main()1537 int main()
1538 {
1539   char buf[500];
1540   char *cp;
1541   struct re_pattern_buffer exp;
1542   struct re_registers regs;
1543   int a,pos;
1544   char fastmap[256];
1545 
1546   exp.allocated = 0;
1547   exp.buffer = 0;
1548   exp.translate = NULL;
1549   exp.fastmap = fastmap;
1550 
1551   /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1552 
1553   while (1)
1554     {
1555       printf("Enter regexp:\n");
1556       gets(buf);
1557       cp=re_compile_pattern(buf, strlen(buf), &exp);
1558       if (cp)
1559 	{
1560 	  printf("Error: %s\n", cp);
1561 	  continue;
1562 	}
1563       re_compile_fastmap(&exp);
1564       printf("dump:\n");
1565       for (pos = 0; pos < exp.used;)
1566 	{
1567 	  printf("%d: ", pos);
1568 	  switch (exp.buffer[pos++])
1569 	    {
1570 	    case Cend:
1571 	      strcpy(buf, "end");
1572 	      break;
1573 	    case Cbol:
1574 	      strcpy(buf, "bol");
1575 	      break;
1576 	    case Ceol:
1577 	      strcpy(buf, "eol");
1578 	      break;
1579 	    case Cset:
1580 	      strcpy(buf, "set ");
1581 	      for (a = 0; a < 256/8; a++)
1582 		sprintf(buf+strlen(buf)," %02x",
1583 			(unsigned char)exp.buffer[pos++]);
1584 	      break;
1585 	    case Cexact:
1586 	      sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1587 		      (unsigned char)exp.buffer[pos]);
1588 	      pos++;
1589 	      break;
1590 	    case Canychar:
1591 	      strcpy(buf, "anychar");
1592 	      break;
1593 	    case Cstart_memory:
1594 	      sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1595 	      break;
1596 	    case Cend_memory:
1597 	      sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1598 	      break;
1599 	    case Cmatch_memory:
1600 	      sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1601 	      break;
1602 	    case Cjump:
1603 	    case Cdummy_failure_jump:
1604 	    case Cstar_jump:
1605 	    case Cfailure_jump:
1606 	    case Cupdate_failure_jump:
1607 	      a = (unsigned char)exp.buffer[pos++];
1608 	      a += (unsigned char)exp.buffer[pos++] << 8;
1609 	      a = (int)(short)a;
1610 	      switch (exp.buffer[pos-3])
1611 		{
1612 		case Cjump:
1613 		  cp = "jump";
1614 		  break;
1615 		case Cstar_jump:
1616 		  cp = "star_jump";
1617 		  break;
1618 		case Cfailure_jump:
1619 		  cp = "failure_jump";
1620 		  break;
1621 		case Cupdate_failure_jump:
1622 		  cp = "update_failure_jump";
1623 		  break;
1624 		case Cdummy_failure_jump:
1625 		  cp = "dummy_failure_jump";
1626 		  break;
1627 		default:
1628 		  cp = "unknown jump";
1629 		  break;
1630 		}
1631 	      sprintf(buf, "%s %d", cp, a + pos);
1632 	      break;
1633 	    case Cbegbuf:
1634 	      strcpy(buf,"begbuf");
1635 	      break;
1636 	    case Cendbuf:
1637 	      strcpy(buf,"endbuf");
1638 	      break;
1639 	    case Cwordbeg:
1640 	      strcpy(buf,"wordbeg");
1641 	      break;
1642 	    case Cwordend:
1643 	      strcpy(buf,"wordend");
1644 	      break;
1645 	    case Cwordbound:
1646 	      strcpy(buf,"wordbound");
1647 	      break;
1648 	    case Cnotwordbound:
1649 	      strcpy(buf,"notwordbound");
1650 	      break;
1651 	    default:
1652 	      sprintf(buf, "unknown code %d",
1653 		      (unsigned char)exp.buffer[pos - 1]);
1654 	      break;
1655 	    }
1656 	  printf("%s\n", buf);
1657 	}
1658       printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1659 	     exp.can_be_null, exp.uses_registers, exp.anchor);
1660 
1661       printf("fastmap:");
1662       for (a = 0; a < 256; a++)
1663 	if (exp.fastmap[a])
1664 	  printf(" %d", a);
1665       printf("\n");
1666       printf("Enter strings.  An empty line terminates.\n");
1667       while (fgets(buf, sizeof(buf), stdin))
1668 	{
1669 	  if (buf[0] == '\n')
1670 	    break;
1671 	  a = re_search_pattern(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
1672 	  printf("search returns %d\n", a);
1673 	  if (a != -1)
1674 	    {
1675 	      for (a = 0; a < RE_NREGS; a++)
1676 		{
1677 		  printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1678 		}
1679 	    }
1680 	}
1681     }
1682 }
1683 
1684 #endif /* TEST_REGEXP */
1685