1 /* regex2.c                                               -*- coding: utf-8; -*-
2  *
3  *   Copyright (c) 2010-2021  Takashi Kato <ktakashi@ymail.com>
4  *
5  *   Redistribution and use in source and binary forms, with or without
6  *   modification, are permitted provided that the following conditions
7  *   are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  *  $Id: $
29  */
30 #include <string.h>
31 #include <ctype.h>
32 #define LIBSAGITTARIUS_BODY
33 #include "sagittarius/private/pair.h"
34 #include "sagittarius/private/regex.h"
35 #include "sagittarius/private/error.h"
36 #include "sagittarius/private/port.h"
37 #include "sagittarius/private/string.h"
38 #include "sagittarius/private/symbol.h"
39 #include "sagittarius/private/cache.h"
40 
41 #include "shortnames.incl"
42 
43 /* #define DEBUG_REGEX 1 */
44 
45 static SgSymbol *constant_symbol_table[33] = {NULL};
46 
47 #define SYM_ALTER           	  	   (constant_symbol_table[0])
48 #define SYM_NON_GREEDY_REP  	  	   (constant_symbol_table[1])
49 #define SYM_GREEDY_REP      	  	   (constant_symbol_table[2])
50 #define SYM_CLOSE_PAREN     	  	   (constant_symbol_table[3])
51 #define SYM_VERTICAL_BAR    	  	   (constant_symbol_table[4])
52 #define SYM_QUESTION_MARK   	  	   (constant_symbol_table[5])
53 #define SYM_EVERYTHING      	  	   (constant_symbol_table[6])
54 #define SYM_END_ANCHOR      	  	   (constant_symbol_table[7])
55 #define SYM_INVERTED_CHAR_CLASS   	   (constant_symbol_table[8])
56 #define SYM_MODELESS_START_ANCHOR 	   (constant_symbol_table[9])
57 #define SYM_MODELESS_END_ANCHOR   	   (constant_symbol_table[10])
58 #define SYM_MODELESS_END_ANCHOR_NO_NEWLINE (constant_symbol_table[11])
59 #define SYM_START_ANCHOR      		   (constant_symbol_table[12])
60 #define SYM_BACKREF           		   (constant_symbol_table[13])
61 #define SYM_WORD_BOUNDARY     		   (constant_symbol_table[14])
62 #define SYM_NON_WORD_BOUNDARY 		   (constant_symbol_table[15])
63 #define SYM_BRANCH             		   (constant_symbol_table[16])
64 #define SYM_FLAGS              		   (constant_symbol_table[17])
65 #define SYM_OPEN_PAREN         		   (constant_symbol_table[18])
66 #define SYM_OPEN_PAREN_PAREN   		   (constant_symbol_table[19])
67 #define SYM_OPEN_PAREN_GREATER 		   (constant_symbol_table[20])
68 #define SYM_OPEN_PAREN_EQUAL 		   (constant_symbol_table[21])
69 #define SYM_OPEN_PAREN_LESS_EXCLAMATION    (constant_symbol_table[22])
70 #define SYM_OPEN_PAREN_COLON 	   	   (constant_symbol_table[23])
71 #define SYM_OPEN_PAREN_EXCLAMATION 	   (constant_symbol_table[24])
72 #define SYM_OPEN_PAREN_LESS_LETTER 	   (constant_symbol_table[25])
73 #define SYM_REGISTER   		  	   (constant_symbol_table[26])
74 #define SYM_STANDALONE 		  	   (constant_symbol_table[27])
75 #define SYM_LOOKAHEAD  		  	   (constant_symbol_table[28])
76 #define SYM_OPEN_PAREN_LESS_EQUAL 	   (constant_symbol_table[29])
77 #define SYM_SEQUENCE  	    	 	   (constant_symbol_table[30])
78 #define SYM_LOOKBHIND 	    	 	   (constant_symbol_table[31])
79 #define SYM_FLAGGED_SEQUENCE 	 	   (constant_symbol_table[32])
80 
81 /* convenient macros */
82 #define has(p, f) (((p)->flags & (f)) != 0)
83 
84 /* lexer_ctx_t is used to hold the regex string which is currently
85    lexed and to keep track of the lexer's state.
86 */
87 typedef struct lexer_ctx_rec_t
88 {
89   SgChar *str;
90   SgChar *ostr;			/* original string for error message */
91   long    len;
92   int     reg;
93   long    pos;
94   SgObject last_pos;
95   int     flags;
96   int     reg_num;
97   SgObject reg_names;
98 } lexer_ctx_t;
99 
100 /* convenient macro */
101 /* we don't use vm.h's PUSH and POP here */
102 #ifdef PUSH
103 # undef PUSH
104 #endif
105 #ifdef POP
106 # undef POP
107 #endif
108 #define PUSH(v, l) ((l) = Sg_Cons((v), (l)))
109 #define POP(l)     ((l) = SG_CDR(l))
110 
remove_qe_quoting(lexer_ctx_t * ctx)111 static void remove_qe_quoting(lexer_ctx_t *ctx)
112 {
113   const long plen = ctx->len;
114   int i = 0, j, inQuote = TRUE;
115   SgChar *newtemp;
116   while (i < plen - 1) {
117     if (ctx->str[i] != '\\') i += 1;
118     else if (ctx->str[i+1] != 'Q') i += 2;
119     else break;
120   }
121   if (i >= plen - 1) return;	/* no \Q sequence found */
122   j = i;
123   i += 2;
124   newtemp = SG_NEW_ATOMIC2(SgChar *, sizeof(SgChar)*(j+2*(plen-i)+2));
125   memcpy(newtemp, ctx->str, j*sizeof(SgChar));
126   while (i < plen) {
127     SgChar c = ctx->str[i++];
128     if (!isascii(c) || isalnum(c)) {
129       newtemp[j++] = c;
130     } else if (c != '\\') {
131       if (inQuote) newtemp[j++] = '\\';
132       newtemp[j++] = c;
133     } else if (inQuote) {
134       if (ctx->str[i] == 'E') {
135 	i++;
136 	inQuote = FALSE;
137       } else {
138 	newtemp[j++] = '\\';
139 	newtemp[j++] = '\\';
140       }
141     } else {
142       if (ctx->str[i] == 'Q') {
143 	i++;
144 	inQuote = TRUE;
145       } else {
146 	newtemp[j++] = c;
147 	if (i != plen) newtemp[j++] = ctx->str[i++];
148       }
149     }
150   }
151   ctx->len = j;
152   ctx->str = SG_NEW_ATOMIC2(SgChar *, sizeof(SgChar)*j);
153   memcpy(ctx->str, newtemp, sizeof(SgChar)*j);
154   newtemp = NULL;		/* gc friendliness */
155 }
156 
init_lexer(lexer_ctx_t * ctx,SgString * str,int flags)157 static void init_lexer(lexer_ctx_t *ctx, SgString *str, int flags)
158 {
159   ctx->ostr = ctx->str = SG_STRING_VALUE(str);
160   ctx->len = SG_STRING_SIZE(str);
161   ctx->reg = ctx->pos = 0;
162   ctx->last_pos = SG_NIL;
163   ctx->flags = flags;
164   ctx->reg_num = 1;		/* 0 is the whole matched string */
165   ctx->reg_names = SG_NIL;
166   if (!has(ctx, SG_LITERAL)) {
167     /* remove \Q \E quoting now */
168     remove_qe_quoting(ctx);
169   }
170 }
171 
172 /* error */
raise_syntax_error(lexer_ctx_t * ctx,long pos,const SgChar * str)173 static void raise_syntax_error(lexer_ctx_t *ctx, long pos, const SgChar *str)
174 {
175   /* TODO create regex parser error or so */
176   Sg_Error(UC("bad regex syntax in %s: %s, [posision %d]"),
177 	   ctx->ostr, str, pos);
178 }
179 
180 /* compile error. this is actually for pass3, but i put it here */
raise_compile_error(const SgChar * msg,SgObject irr)181 static void raise_compile_error(const SgChar *msg, SgObject irr)
182 {
183   if (SG_FALSEP(irr)) {
184     Sg_Error(msg);
185   } else {
186     Sg_Error(msg, irr);
187   }
188 }
189 
190 /* null sequence */
null_seq()191 static SgObject null_seq()
192 {
193   return SG_LIST1(SYM_SEQUENCE);
194 }
195 
196 /* lexer functions */
197 /*
198   Tests whether we're at the end of the regex string
199  */
200 #define END_OF_STRING_P(ctx) ((ctx)->len <= (ctx)->pos)
201 /*
202   Tests whether we're at the end of the regex string
203  */
204 #define LOOKING_AT_P(ctx, c) (!END_OF_STRING_P(ctx) && c == ctx->str[ctx->pos])
205 
206 static SgObject get_number(lexer_ctx_t *ctx, int radix, int maxlen,
207 			   int nowhilespace);
208 static SgChar next_char(lexer_ctx_t *ctx);
209 static SgChar next_char_non_extended(lexer_ctx_t *ctx);
210 /*
211    Create character from char-code number. number can be #f
212    which is interpreted as 0. error-pos is the position where
213    the corresponding number started within the regex string.
214  */
make_char_from_code(lexer_ctx_t * ctx,SgObject number,long error_pos)215 static SgObject make_char_from_code(lexer_ctx_t *ctx, SgObject number,
216 				    long error_pos)
217 {
218   int code;
219   if (SG_FALSEP(number)) code = 0;
220   else {
221     /* only look at right most eight bits in compliance with Perl */
222     code = 0xFF & SG_INT_VALUE(number);
223   }
224   if (code <= SG_CHAR_MAX) return SG_MAKE_CHAR(code);
225   raise_syntax_error(ctx, error_pos,
226 		     UC("No character of given code"));
227   return SG_UNDEF;		/* dummy */
228 }
229 
230 /*
231    Convert the characters(s) following a backslash into a token
232    which is returned. This function is to be called when the backslash
233    has already been consumed. Special character classes like \\W are
234    handled elsewhere.
235  */
unescape_char(lexer_ctx_t * ctx)236 static SgObject unescape_char(lexer_ctx_t *ctx)
237 {
238   SgChar chr, nc;
239   long error_pos;
240   SgObject n;
241   if (END_OF_STRING_P(ctx)) {
242     raise_syntax_error(ctx, -1, UC("String ends with backslash."));
243   }
244   chr = next_char_non_extended(ctx);
245   switch (chr) {
246     /* it's already resolved. */
247     /* case 'E': return SYM_VOID; */
248   case 'c':
249     /* \cx means control-x in Perl */
250     nc = next_char_non_extended(ctx);
251     if (nc == EOF) {
252       raise_syntax_error(ctx, ctx->pos,
253 			 UC("Character missing after '\\c'."));
254     }
255     return SG_MAKE_CHAR(Sg_CharUpCase(nc) | 0x40);
256   case 'x':
257     /* \x should be followed by hexadecimal char code, two digis or less */
258     error_pos = ctx->pos - 1;
259     n = get_number(ctx, 16, 2, TRUE);
260     return make_char_from_code(ctx, n, error_pos);
261   case 'u':
262     /* \u should be followed by hexadecimal char code, 4 digis or less */
263     error_pos = ctx->pos - 1;
264     n = get_number(ctx, 16, 4, TRUE);
265     return make_char_from_code(ctx, n, error_pos);
266   case 'U':
267     /* \U should be followed by hexadecimal char code, 8 digis or less */
268     error_pos = ctx->pos - 1;
269     n = get_number(ctx, 16, 8, TRUE);
270     return make_char_from_code(ctx, n, error_pos);
271   case '0': case '1': case '2': case '3': case '4':
272   case '5': case '6': case '7': case '8': case '9':
273     error_pos = ctx->pos - 1;
274     n = get_number(ctx, 8, 3, FALSE);
275     return make_char_from_code(ctx, n, error_pos);
276   case 't': return SG_MAKE_CHAR('\t');
277   case 'n': return SG_MAKE_CHAR('\n');
278   case 'r': return SG_MAKE_CHAR('\r');
279   case 'f': return SG_MAKE_CHAR('\f');
280   case 'b': return SG_MAKE_CHAR('\b');
281   case 'a': return SG_MAKE_CHAR(0x07);
282   case 'e': return SG_MAKE_CHAR(0x27);
283   default:
284     return SG_MAKE_CHAR(chr);
285   }
286 }
287 
288 /* I forgot to make ustrchr in unicode.c */
ustrchr(const SgChar * str,SgChar c,long len)289 static long ustrchr(const SgChar *str, SgChar c, long len)
290 {
291   long i;
292   for (i = 0; i < len; i++, str++) {
293     if (*str == c) return i;
294   }
295   return -1;
296 }
297 
298 /*
299   Returns the next character which is to be examined and updates the pos.
300   Does not respect extended mode.
301  */
next_char_non_extended(lexer_ctx_t * ctx)302 static SgChar next_char_non_extended(lexer_ctx_t *ctx)
303 {
304   if (END_OF_STRING_P(ctx)) return EOF;
305   return ctx->str[ctx->pos++];
306 }
307 
308 /*
309   Returns the next character which is to be examined and updates the pos.
310   Respects extended mode, i.e. white spece, comments, and also nested comments
311   are skipped if applicable.
312  */
next_char(lexer_ctx_t * ctx)313 static SgChar next_char(lexer_ctx_t *ctx)
314 {
315   SgChar nc = next_char_non_extended(ctx);
316   long last_pos;
317   while (1) {
318     last_pos = ctx->pos;
319     if (nc != EOF && nc == '(' && LOOKING_AT_P(ctx, '?')) {
320       ctx->pos++;
321       if (LOOKING_AT_P(ctx, '#')) {
322 	/* "(?#" must be a nested comment - so we have to search for the
323 	   closing parenthesis
324 	*/
325 	long error_pos = ctx->pos - 2;
326 	SgChar skip_char = nc;
327 	while (skip_char != EOF && skip_char != ')') {
328 	  skip_char = next_char_non_extended(ctx);
329 	  if (skip_char == EOF) {
330 	    raise_syntax_error(ctx, error_pos,
331 			       UC("Comment group not closed"));
332 	  }
333 	}
334 	nc = next_char_non_extended(ctx);
335       } else {
336 	/* undo */
337 	ctx->pos--;
338       }
339     }
340     if (has(ctx, SG_COMMENTS)) {
341       /* now - if we're in extended mode - we skip whitespace and comments;
342 	 repeat the following loop while we look at whitespace or #\#
343        */
344       while (nc != EOF && (nc == '#' || Sg_Ucs4WhiteSpaceP(nc))) {
345 	if (nc == '#') {
346 	  /* if we saw a comment marker skip util we're behinf \n */
347 	  SgChar skip_char = nc;
348 	  while(skip_char != EOF && skip_char != '\n') {
349 	    skip_char = next_char_non_extended(ctx);
350 	  }
351 	  nc = next_char_non_extended(ctx);
352 	} else {
353 	  /* ...otherwise (whitespace) skip until we see the next
354 	     non-whitespace character
355 	   */
356 	  SgChar skip_char = nc;
357 	  while (skip_char != EOF && Sg_Ucs4WhiteSpaceP(skip_char)) {
358 	    skip_char = next_char_non_extended(ctx);
359 	  }
360 	  nc = skip_char;
361 	}
362       }
363     }
364     /*
365       if the position has moved we have to repeat out tests because of cases
366       like /^a (?#xxx) (?#yyy) {3}c/x
367      */
368     if (ctx->pos <= last_pos) {
369       return nc;
370     }
371   }
372 }
373 /*
374    Tests whether the next token can start a valid sub-expression, i.e. a
375    stand-alone regex
376  */
start_of_subexpr_p(lexer_ctx_t * ctx)377 static int start_of_subexpr_p(lexer_ctx_t *ctx)
378 {
379   long pos = ctx->pos;
380   SgChar nc = next_char(ctx);
381   if (nc == EOF) return FALSE;
382   /* rest position */
383   ctx->pos = pos;
384   return !(nc == ')' || nc == '|');
385 }
386 
387 /*
388   TODO separate context of unicode, but how?
389  */
read_char_property(lexer_ctx_t * ctx,SgChar first)390 static SgObject read_char_property(lexer_ctx_t *ctx, SgChar first)
391 {
392   if (next_char_non_extended(ctx) != '{') {
393     /* one letter property */
394     /* well, what shall we do with one letter property? just a character */
395     ctx->pos--;
396     return SG_MAKE_CHAR(first);
397   } else {
398     /* a bit tricky. char property must start with 'In' or 'Is'. I have no idea
399        what the difference is. So for now we treat both the same.
400      */
401     SgObject es;
402     SgGloc *gloc;
403     long pos;
404     /* first check the property has 'Is' or 'In. */
405     if (ctx->len-ctx->pos <= 2 ||
406 	!(ctx->str[ctx->pos] == 'I' &&
407 	  (ctx->str[ctx->pos + 1] == 's' || ctx->str[ctx->pos + 1] == 'n'))) {
408       raise_syntax_error(ctx, ctx->pos,
409 			 UC("Invalid character property name."));
410     }
411 
412     pos = ustrchr(ctx->str+ctx->pos+2, '}', ctx->len-ctx->pos-2);
413     if (pos == -1) {
414       /* no closing '}' */
415       raise_syntax_error(ctx, ctx->pos,
416 			 UC("Character property does not have '}'"));
417     }
418     /* we convert property name with prefix 'char-set:' then we can look up
419        from builtin charset.
420      */
421     /* does not seem smart solution ... */
422     es = SG_MAKE_STRING("char-set:");
423     es = Sg_StringAppendC(SG_STRING(es), ctx->str+ctx->pos+2, pos);
424     es = Sg_StringDownCase(SG_STRING(es));
425     gloc = Sg_FindBinding(Sg_VM()->currentLibrary, Sg_Intern(es), SG_FALSE);
426     if (SG_FALSEP(gloc) || !SG_CHAR_SET_P(SG_GLOC_GET(gloc))) {
427       raise_syntax_error(ctx, ctx->pos,
428 			 UC("Given character property is not supported"));
429     }
430     ctx->pos += pos+3;
431     return SG_GLOC_GET(gloc);
432   }
433 }
434 
435 /* I'm not sure if this one should be in charset.c */
digit_to_int(SgChar ch,int radix)436 static int digit_to_int(SgChar ch, int radix)
437 {
438   if (ch < '0') return -1;
439   if (radix <= 10) {
440     if (ch < '0' + radix) return ch - '0';
441   } else {
442     if (ch <= '9') return ch - '0';
443     if (ch < 'A') return -1;
444     if (ch < 'A' + radix - 10) return ch - 'A' + 10;
445     if (ch < 'a') return -1;
446     if (ch < 'a' + radix - 10) return ch - 'a' + 10;
447   }
448   return -1;
449 }
450 
read_xdigit(lexer_ctx_t * ctx,int ndigits,char * buf,int * nread)451 static SgChar read_xdigit(lexer_ctx_t *ctx, int ndigits,
452 			  char *buf, int *nread)
453 {
454   int i, c, val = 0, dig;
455   for (i = 0; i < ndigits; i++) {
456     c = next_char_non_extended(ctx);
457     if (c == EOF) break;
458     dig = digit_to_int(c, 16);
459     if (dig < 0) {
460       ctx->pos--;
461       break;
462     }
463     buf[i] = (char)c;
464     val = val * 16 + dig;
465   }
466   *nread = i;
467   if (i < ndigits) return -1;
468   return (SgChar)val;
469 }
470 
read_charset_xdigits(lexer_ctx_t * ctx,int ndigs,int key)471 static SgChar read_charset_xdigits(lexer_ctx_t *ctx, int ndigs, int key)
472 {
473   char buf[8];			/* max 8 */
474   int nread;
475   SgChar r;
476   ASSERT(ndigs <= 8);
477   r = read_xdigit(ctx, ndigs, buf, &nread);
478   if (r == -1) {
479     raise_syntax_error(ctx, ctx->pos,
480 		       UC("Character class contains invalid escaped character")
481 		       );
482   }
483   return r;
484 }
485 
486 /*
487    char-set-difference and char-set-intersection,
488 
489    ugh, this is defined in Scheme as well
490    TODO implement this properly in C and expose it to Scheme world. */
sub_range(SgObject ranges,long from,long to)491 static SgObject sub_range(SgObject ranges, long from, long to)
492 {
493   SgObject h = SG_NIL, t = SG_NIL, cp;
494   SG_FOR_EACH(cp, ranges) {
495     long lo = SG_INT_VALUE(SG_CAAR(cp));
496     long hi = SG_INT_VALUE(SG_CDAR(cp));
497     if (lo <= from && from <= hi) {
498       if (lo <= to && to <= hi) {
499 	if (lo == from) {
500 	  if (to != hi) {
501 	    SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
502 	  }
503 	} else {
504 	  if (to == hi) {
505 	    SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
506 	  } else {
507 	    SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
508 	    SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
509 	  }
510 	}
511       } else {
512 	if (lo != from) {
513 	  SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
514 	}
515       }
516     } else {
517       if (lo <= to && to <= hi) {
518 	if (to != hi) {
519 	  SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
520 	}
521       } else {
522 	if (!(from < lo && hi < to)) {
523 	  SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(hi)));
524 	}
525       }
526     }
527   }
528   return h;
529 }
530 
cset_diff(SgObject base,SgObject cset)531 static SgObject cset_diff(SgObject base, SgObject cset)
532 {
533   SgObject ranges = Sg_CharSetRanges(base), sub = Sg_CharSetRanges(cset), cp;
534   SG_FOR_EACH(cp, sub) {
535     ranges = sub_range(ranges, SG_INT_VALUE(SG_CAAR(cp)),
536 		       SG_INT_VALUE(SG_CDAR(cp)));
537   }
538   base = Sg_MakeEmptyCharSet();
539   SG_FOR_EACH(cp, ranges) {
540     SgChar from = (SgChar)SG_INT_VALUE(SG_CAAR(cp));
541     SgChar to = (SgChar)SG_INT_VALUE(SG_CDAR(cp));
542     Sg_CharSetAddRange(SG_CHAR_SET(base), from, to);
543   }
544   return base;
545 }
546 
cset_intersect(SgObject x,SgObject y)547 static SgObject cset_intersect(SgObject x, SgObject y)
548 {
549   return cset_diff(x, Sg_CharSetComplement(Sg_CharSetCopy(y)));
550 }
551 
asciinise(lexer_ctx_t * ctx,SgObject cset)552 static SgObject asciinise(lexer_ctx_t *ctx, SgObject cset)
553 {
554   if ((ctx->flags & SG_UNICODE_CASE) == 0) {
555     return cset_intersect(cset, Sg_GetStandardCharSet(SG_CHAR_SET_ASCII));
556   }
557   /* unicode char */
558   return cset;
559 }
560 
get_defined_cset(lexer_ctx_t * ctx,int i)561 static SgObject get_defined_cset(lexer_ctx_t *ctx, int i)
562 {
563   return asciinise(ctx, Sg_GetStandardCharSet(i));
564 }
565 
566 
read_defined_charset(lexer_ctx_t * ctx)567 static SgObject read_defined_charset(lexer_ctx_t *ctx)
568 {
569   /* almost the same as read_char_property.
570   */
571   SgObject es;
572   SgGloc *gloc;
573   long pos;
574   SgObject lib, default_lib = SG_FALSE;
575   /* [[:name:]] thing */
576   if (ctx->len-ctx->pos < 2 ||
577       ctx->str[ctx->pos] != ':') {
578     raise_syntax_error(ctx, ctx->pos, UC("Invalid character set name."));
579   }
580   /* skip first ']' */
581   pos = ustrchr(ctx->str+ctx->pos, ']', ctx->len - ctx->pos);
582   if (pos == -1) {
583     /* no closing ':' */
584     raise_syntax_error(ctx, ctx->pos,
585 		       UC("Invalid charset name. ']' is missing"));
586   }
587   /* we convert property name with prefix 'char-set:' then we can look up
588      from builtin charset.
589   */
590   /* does not seem smart solution ... */
591   es = Sg_MakeEmptyString();
592   /* including ':' */
593   es = Sg_StringAppendC(SG_STRING(es), ctx->str+ctx->pos, pos);
594   lib = Sg_VM()->currentLibrary;
595  retry:
596   gloc = Sg_FindBinding(lib, Sg_Intern(es), SG_FALSE);
597   if (SG_FALSEP(gloc) || !SG_CHAR_SET_P(SG_GLOC_GET(gloc))) {
598     if (SG_FALSEP(default_lib)) {
599       lib = default_lib = Sg_FindLibrary(SG_INTERN("(sagittarius)"), FALSE);
600       goto retry;
601     }
602     raise_syntax_error(ctx, ctx->pos,
603 		       UC("Given character set is not supported"));
604   }
605   /* Sg_Printf(Sg_StandardErrorPort(), UC("%A\n"), es); */
606   ctx->pos += pos;
607   if (ctx->len <= ctx->pos || ctx->str[++ctx->pos] != ']') {
608     raise_syntax_error(ctx, ctx->pos,
609 		       UC("charset name is not closed by 2 ']'s"));
610   }
611   return asciinise(ctx, SG_GLOC_GET(gloc));
612 }
613 
read_char_set(lexer_ctx_t * ctx,int * complement_p)614 static SgObject read_char_set(lexer_ctx_t *ctx, int *complement_p)
615 {
616 #define REAL_BEGIN 1
617 #define CARET_BEGIN 2
618   int begin = REAL_BEGIN, complement = FALSE;
619   int lastchar = -1, inrange = FALSE, moreset_complement = FALSE;
620   SgCharSet *set = SG_CHAR_SET(Sg_MakeEmptyCharSet());
621   SgObject moreset;
622   SgObject chars = SG_NIL;
623   SgChar ch = 0;
624   long start_pos = ctx->pos;
625 
626   for (;;) {
627     ch = next_char_non_extended(ctx);
628     if (ch == EOF) goto err;
629     chars = Sg_Cons(SG_MAKE_CHAR(ch), chars);
630     if (begin == REAL_BEGIN && ch == '^') {
631       complement = TRUE;
632       begin = CARET_BEGIN;
633       continue;
634     }
635     if (begin && ch == ']') {
636       Sg_CharSetAddRange(set, ch, ch);
637       lastchar = ch;
638       begin = FALSE;
639       continue;
640     }
641     begin = FALSE;
642     switch (ch) {
643     case '-':
644       if (inrange) goto ordchar;
645       inrange = TRUE;
646       continue;
647     case ']':
648       if (inrange) {
649 	if (lastchar >= 0) {
650 	  Sg_CharSetAddRange(set, lastchar, lastchar);
651 	  Sg_CharSetAddRange(set, '-', '-');
652 	} else {
653 	  Sg_CharSetAddRange(set, '-', '-');
654 	}
655       }
656       break;
657     case '\\':
658       ch = next_char_non_extended(ctx);
659       if (ch == EOF) goto err;
660       chars = Sg_Cons(SG_MAKE_CHAR(ch), chars);
661       switch (ch) {
662       case 'a': ch = 7; goto ordchar;
663       case 'b': ch = 8; goto ordchar;
664       case 'n': ch = '\n'; goto ordchar;
665       case 'r': ch = '\r'; goto ordchar;
666       case 't': ch = '\t'; goto ordchar;
667       case 'f': ch = '\f'; goto ordchar;
668       case 'e': ch = 0x1b; goto ordchar;
669       case 'x':
670 	ch = read_charset_xdigits(ctx, 2, 'x'); goto ordchar;
671       case 'u':
672 	ch = read_charset_xdigits(ctx, 4, 'u'); goto ordchar;
673       case 'U':
674 	ch = read_charset_xdigits(ctx, 8, 'u'); goto ordchar;
675       case 'd':
676 	moreset_complement = FALSE;
677 	moreset = get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
678 	break;
679       case 'D':
680 	moreset_complement = TRUE;
681 	moreset = get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
682 	break;
683       case 's':
684 	moreset_complement = FALSE;
685 	moreset = get_defined_cset(ctx, SG_CHAR_SET_SPACE);
686 	break;
687       case 'S':
688 	moreset_complement = TRUE;
689 	moreset = get_defined_cset(ctx, SG_CHAR_SET_SPACE);
690 	break;
691       case 'w':
692 	moreset_complement = FALSE;
693 	moreset = get_defined_cset(ctx, SG_CHAR_SET_WORD);
694 	break;
695       case 'W':
696 	moreset_complement = TRUE;
697 	moreset = get_defined_cset(ctx, SG_CHAR_SET_WORD);
698 	break;
699       case 'p':
700 	moreset_complement = FALSE;
701 	moreset = read_char_property(ctx, ch);
702 	break;
703       case 'P':
704 	moreset_complement = TRUE;
705 	moreset = read_char_property(ctx, ch);
706 	break;
707       default: goto ordchar;
708       }
709       if (moreset_complement) {
710 	moreset = Sg_CharSetComplement(SG_CHAR_SET(Sg_CharSetCopy(SG_CHAR_SET(moreset))));
711       }
712       Sg_CharSetAdd(set, SG_CHAR_SET(moreset));
713       continue;
714     case '[':
715       moreset = read_defined_charset(ctx);
716       if (!SG_CHAR_SET_P(moreset)) goto err;
717       Sg_CharSetAdd(set, SG_CHAR_SET(moreset));
718       continue;
719     ordchar:
720     default:
721       if (inrange) {
722 	if (lastchar < 0) {
723 	  Sg_CharSetAddRange(set, '-', '-');
724 	  Sg_CharSetAddRange(set, ch, ch);
725 	  lastchar = ch;
726 	} else {
727 	  Sg_CharSetAddRange(set, lastchar, ch);
728 	  lastchar = -1;
729 	}
730 	inrange = FALSE;
731       } else {
732 	Sg_CharSetAddRange(set, ch, ch);
733 	lastchar = ch;
734       }
735       continue;
736     }
737     break;
738   }
739   if (complement_p) {
740     *complement_p = complement;
741     return SG_OBJ(set);
742   } else {
743     if (complement) Sg_CharSetComplement(set);
744     return SG_OBJ(set);
745   }
746 
747  err:
748   raise_syntax_error(ctx, start_pos -1,
749 		     UC("bad char-set spec in pattern"));
750   return SG_FALSE;
751 }
752 
unget_token(lexer_ctx_t * ctx)753 static void unget_token(lexer_ctx_t *ctx)
754 {
755   if (SG_NULLP(ctx->last_pos)) {
756     Sg_Error(UC("[internal error] No token to unget."));
757   }
758   ctx->pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
759   POP(ctx->last_pos);
760 }
761 
fail(lexer_ctx_t * ctx)762 static SgObject fail(lexer_ctx_t *ctx)
763 {
764   if (SG_NULLP(ctx->last_pos)) {
765     raise_syntax_error(ctx, -1,
766 		       UC("last-pos stack of lexer is empty"));
767   }
768   ctx->pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
769   POP(ctx->last_pos);
770   return SG_FALSE;
771 }
772 
get_number(lexer_ctx_t * ctx,int radix,int maxlen,int nowhilespace)773 static SgObject get_number(lexer_ctx_t *ctx, int radix, int maxlen,
774 			   int nowhilespace)
775 {
776   long i, size = ctx->len - ctx->pos, end, n = 0;
777   if (nowhilespace &&
778       Sg_Ucs4WhiteSpaceP(ctx->str[ctx->pos])) {
779     return SG_FALSE;
780   }
781   if (maxlen > 0) {
782     long end_pos = ctx->pos + maxlen;
783     if (end_pos < size) end = end_pos;
784     else end = size;
785   } else {
786     end = size;
787   }
788   for (i = 0; i < end; i++) {
789     if (((radix == 8 || radix == 10) && isdigit(ctx->str[ctx->pos + i])) ||
790 	(radix == 16 && isxdigit(ctx->str[ctx->pos + i]))) {
791       n = n*radix + digit_to_int(ctx->str[ctx->pos + i], radix);
792     } else {
793       if (i == 0) n = -1;
794       /* something else is here */
795       break;
796     }
797   }
798   if (n != -1) {
799     /* sanity */
800     ctx->pos += i;
801     return SG_MAKE_INT(n);
802   } else {
803     return SG_FALSE;
804   }
805 }
806 
try_number(lexer_ctx_t * ctx,int radix,int maxlen,int nowhilespace)807 static SgObject try_number(lexer_ctx_t *ctx, int radix, int maxlen,
808 			   int nowhilespace)
809 {
810   SgObject n;
811   PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
812   n = get_number(ctx, radix, maxlen, nowhilespace);
813   if (SG_FALSEP(n)) return fail(ctx);
814   return n;
815 }
816 
817 /*
818   Reads a sequence of modifiers (including #\\- to reverse their
819   meaning) and returns a corresponding list of "flag" tokens.
820  */
maybe_parse_flags(lexer_ctx_t * ctx)821 static SgObject maybe_parse_flags(lexer_ctx_t *ctx)
822 {
823   SgChar c = next_char_non_extended(ctx);
824   SgObject h = SG_NIL, t = SG_NIL;
825   int set = TRUE;
826   while (1) {
827     switch (c) {
828     case '-':
829       set = FALSE;
830       break;
831     case 'x':
832       if (set) {
833 	ctx->flags |= SG_COMMENTS;
834       } else {
835 	ctx->flags &= ~SG_COMMENTS;
836       }
837       break;
838     case 'i':
839       if (set) {
840 	ctx->flags |= SG_CASE_INSENSITIVE;
841 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
842       } else {
843 	ctx->flags &= ~SG_CASE_INSENSITIVE;
844 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
845       }
846       break;
847     case 'm':
848       /* 'm' flag is resolved during compile time. */
849       if (set) {
850 	ctx->flags |= SG_MULTILINE;
851 	/* SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE)); */
852       } else {
853 	ctx->flags &= ~SG_MULTILINE;
854 	/* SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE)); */
855       }
856       break;
857     case 's':
858       if (set) {
859 	ctx->flags |= SG_DOTALL;
860 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
861       } else {
862 	ctx->flags &= ~SG_DOTALL;
863 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
864       }
865       break;
866     case 'u':
867       if (set) {
868 	ctx->flags |= SG_UNICODE_CASE;
869 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
870       } else {
871 	ctx->flags &= ~SG_UNICODE_CASE;
872 	SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
873       }
874       break;
875     default: goto end;
876     }
877     c = next_char_non_extended(ctx);
878   }
879  end:
880   ctx->pos--;
881   return h;
882 }
883 
884 /*
885   Returns a list of two values (min max) if what the lexer is looking at can be
886   interpreted as a quantifier. Otherwise returns #f and resets the lexer to its
887   old position.
888 */
get_quantifier(lexer_ctx_t * ctx,int * standalone)889 static SgObject get_quantifier(lexer_ctx_t *ctx, int *standalone)
890 {
891   SgChar nc;
892   PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
893   nc = next_char(ctx);
894   switch (nc) {
895   case '*':
896     return SG_LIST2(SG_MAKE_INT(0), SG_FALSE); /* 0 or more times */
897   case '+':
898     return SG_LIST2(SG_MAKE_INT(1), SG_FALSE); /*  1 or more times */
899   case '?':
900     return SG_LIST2(SG_MAKE_INT(0), SG_MAKE_INT(1)); /* 0 or 1 */
901   case '{':
902     /* one of
903        {n}:   match exactly n times
904        {n,}:  match at least n times
905        {n,m}: match at least n but not more than m times
906        Note: anything not matching one of these patterns will be interpreted
907              literally - even whitespace isn't allowed.
908      */
909     {
910       SgObject num1 = get_number(ctx, 10, -1, TRUE), num2;
911       if (SG_FALSEP(num1)) {
912 	/* no number following left curly brace, so we treat it like a normal
913 	   character*/
914 	return fail(ctx);
915       } else {
916 	nc = next_char_non_extended(ctx);
917 	switch (nc) {
918 	case ',':
919 	  num2 = get_number(ctx, 10, -1, TRUE);
920 	  nc = next_char_non_extended(ctx);
921 	  switch (nc) {
922 	  case '}': return SG_LIST2(num1, num2); /* {n,} or {n,m} */
923 	  default: return fail(ctx);
924 	  }
925 	case '}': return SG_LIST2(num1, num1); /* {n} */
926 	default: return fail(ctx);
927 	}
928       }
929     }
930   default: return fail(ctx);
931   }
932 }
933 /*
934    Reads and returns the name in a named register group. It is assumed that the
935    starting #\< character has already been read. The closing #\> will also be
936    consumed.
937  */
parse_register_name_aux(lexer_ctx_t * ctx)938 static SgObject parse_register_name_aux(lexer_ctx_t *ctx)
939 {
940   /* we have to look for an ending > character now */
941   long end_name = ustrchr(ctx->str+ctx->pos, '>', ctx->len-ctx->pos), i;
942   SgObject s;
943   if (end_name < 0) {
944     raise_syntax_error(ctx, ctx->pos - 1,
945 		       UC("Opening #\\< in named group has no closing #\\>."));
946   }
947   s = Sg_ReserveString(end_name, 0);
948   for (i = 0; i < end_name; i++) {
949     SgChar c = ctx->str[ctx->pos + i];
950     if (isalnum(c) || c == '-') {
951       SG_STRING_VALUE_AT(s, i) = c;
952     } else {
953       raise_syntax_error(ctx, ctx->pos,
954 			 UC("Invalid character in named register group."));
955     }
956   }
957   /* advance lexer beyond "<name>" part */
958   ctx->pos += i + 1;
959   return Sg_Intern(s);
960 }
961 
962 /*
963    Returns and consumes the next token from the regex string or '#f
964  */
get_token(lexer_ctx_t * ctx,SgObject * ret)965 static SgObject get_token(lexer_ctx_t *ctx, SgObject *ret)
966 {
967   SgChar nc;
968   PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
969   nc = next_char(ctx);
970   if (nc != EOF) {
971     switch (nc) {
972       /* the each cases first - the following six characters always have a
973 	 special meaning and get translated into tokens immediately
974        */
975     case ')': return SYM_CLOSE_PAREN;
976     case '|':
977       /* as far as I know I've never seen this pattern, and this must be
978          error.*/
979       raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
980 			 UC("Please report this regex to the developper."));
981       return SG_UNDEF;		/* dummy */
982       /* return SYM_VERTICAL_BAR; */
983     case '?':
984       /* well this question mark must be error.
985 	 or should this be literal character?
986        */
987       /* return SYM_QUESTION_MARK; */
988       raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
989 			 UC("Quantifier('?') follows nothing in regex."));
990       return SG_UNDEF;		/* dummy */
991     case '.': return SYM_EVERYTHING;
992     case '^':
993       if (has(ctx, SG_MULTILINE)) return SYM_START_ANCHOR;
994       else return SYM_MODELESS_START_ANCHOR;
995     case '$':
996       if (has(ctx, SG_MULTILINE)) return SYM_END_ANCHOR;
997       else return SYM_MODELESS_END_ANCHOR;
998     case '+': case '*':
999       /* quantifiers will always be consumed by get_quantifier, they must not
1000 	 appear here */
1001       raise_syntax_error(ctx, ctx->pos - 1,
1002 			 UC("Quanrifier '+' or '*' not allowed."));
1003       return SG_FALSE;		/* dummy */
1004     case '{':
1005       /* left brace isn't a special character in it's own right but we must
1006 	 check if what follows might look like a quantifier.*/
1007       {
1008 	long here = ctx->pos;
1009 	SgObject last = ctx->last_pos;
1010 	unget_token(ctx);
1011 	if (!SG_FALSEP(get_quantifier(ctx, NULL))) {
1012 	  raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(last)),
1013 			     UC("Quanrifier not allowed"));
1014 	}
1015 	ctx->pos = here;
1016 	ctx->last_pos = last;
1017 	return SG_MAKE_CHAR(nc);
1018       }
1019     case '[':
1020       /* left bracket always starts */
1021       {
1022 	int comp = FALSE;
1023 	SgObject cs = read_char_set(ctx, &comp);
1024 	if (comp) {
1025 	  return SG_LIST2(SYM_INVERTED_CHAR_CLASS, cs);
1026 	} else {
1027 	  return cs;
1028 	}
1029       }
1030     case '\\':
1031       /* backslash might mean different things so we have to peek one char
1032 	 ahead */
1033       nc = next_char_non_extended(ctx);
1034       switch (nc) {
1035       case 'A': return SYM_MODELESS_START_ANCHOR;
1036       case 'Z': return SYM_MODELESS_END_ANCHOR;
1037       case 'z': return SYM_MODELESS_END_ANCHOR_NO_NEWLINE;
1038       case 'b': return SYM_WORD_BOUNDARY;
1039       case 'B': return SYM_NON_WORD_BOUNDARY;
1040       case 'k':
1041 	if (LOOKING_AT_P(ctx, '<')) {
1042 	  SgObject name, num = SG_FALSE;
1043 	  long pos = ctx->pos - 2;
1044 	  ctx->pos++;
1045 	  name = parse_register_name_aux(ctx);
1046 	  /* search backref name from context */
1047 	  /* (slot n1 n2 ...) */
1048 	  num = Sg_Assq(name, ctx->reg_names);
1049 	  if (SG_FALSEP(num)) {
1050 	    raise_syntax_error(ctx, pos,
1051 			       UC("Non defined named register is refered."));
1052 	  }
1053 	  num = SG_CDR(num);
1054 	  if (SG_NULLP(SG_CDR(num))) {
1055 	    return Sg_Cons(SYM_BACKREF, SG_CAR(num));
1056 	  } else {
1057 	    SgObject cp, h = SG_NIL, t = SG_NIL;
1058 	    SG_FOR_EACH(cp, num) {
1059 	      SG_APPEND1(h, t, Sg_Cons(SYM_BACKREF, SG_CAR(cp)));
1060 	    }
1061 	    /* (alternation (back-reference . n1) ...) */
1062 	    return Sg_Cons(SYM_ALTER, h);
1063 	  }
1064 	} else{
1065 	  return SG_MAKE_CHAR('k');
1066 	}
1067       case 'd': return get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
1068       case 'D': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1069 				get_defined_cset(ctx, SG_CHAR_SET_DIGIT));
1070       case 'w': return get_defined_cset(ctx, SG_CHAR_SET_WORD);
1071       case 'W': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1072 				get_defined_cset(ctx, SG_CHAR_SET_WORD));
1073       case 's': return get_defined_cset(ctx, SG_CHAR_SET_SPACE);
1074       case 'S': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1075 				get_defined_cset(ctx, SG_CHAR_SET_SPACE));
1076       case '1': case '2': case '3': case '4': case '5':
1077       case '6': case '7': case '8': case '9':
1078 	{
1079 	  long oldpos = --ctx->pos;
1080 	  SgObject num = get_number(ctx, 10, -1, FALSE);
1081 	  if (SG_INT_VALUE(num) > ctx->reg &&
1082 	      10 <= SG_INT_VALUE(num)) {
1083 	    /* \10 and higher are treaded as octal character codes if we haven't
1084 	       opened that much register groups yet. */
1085 	    ctx->pos = oldpos;
1086 	    return make_char_from_code(ctx,
1087 				       get_number(ctx, 8, 3, FALSE), oldpos);
1088 	  } else {
1089 	    return Sg_Cons(SYM_BACKREF, num);
1090 	  }
1091 	}
1092 	break;
1093       case '0':
1094 	/* this always means an octal character code */
1095 	{
1096 	  long oldpos = ctx->pos - 1;
1097 	  return make_char_from_code(ctx, get_number(ctx, 8, 3, FALSE), oldpos);
1098 	}
1099 	break;
1100       case 'P': case 'p':
1101 	/* might be a named property */
1102 	return read_char_property(ctx, nc);
1103       default:
1104 	ctx->pos--;
1105 	return unescape_char(ctx);
1106       }
1107     case '(':
1108       /* an open parenthesis might mean different thigs depending on what
1109 	 follows... */
1110       if (LOOKING_AT_P(ctx, '?')) {
1111 	/* this is the case '(?' (and probably more behind) */
1112 	SgObject flags;
1113 	ctx->pos++;
1114 	flags = maybe_parse_flags(ctx);
1115 	nc = next_char_non_extended(ctx);
1116 	/* modifiers are only allowed if a colon or closing parenthesis are
1117 	   following. */
1118 	if (!SG_NULLP(flags) && !(nc == ':' || nc == ')')) {
1119 	  raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
1120 			     UC("Sequence not recoginzed"));
1121 	}
1122 	switch (nc) {
1123 	case EOF: raise_syntax_error(ctx, -1,
1124 				     UC("End of string following '(?'."));
1125 	case ')':
1126 	  /* an empty group except for the flags */
1127 	  if (!SG_NULLP(flags)) return Sg_Cons(SYM_FLAGS, flags);
1128 	  return null_seq();
1129 	/* branch */
1130 	case '(': return SYM_OPEN_PAREN_PAREN;
1131 	/* standalone */
1132 	case '>': return SYM_OPEN_PAREN_GREATER;
1133 	  /* positive look-ahead */
1134 	case '=': return SYM_OPEN_PAREN_EQUAL;
1135 	  /* negative look-ahead */
1136 	case '!': return SYM_OPEN_PAREN_EXCLAMATION;
1137 	case ':':
1138 	  /* non capturing group - return flags if it not NULL*/
1139 	  if (ret) *ret = flags;
1140 	  return SYM_OPEN_PAREN_COLON;
1141 	case '<':
1142 	  /* might be look-behind assertion or a named group, so check next */
1143 	  nc = next_char_non_extended(ctx);
1144 	  if (isalpha(nc)) {
1145 	    /* we have encountered a named group. */
1146 	    ctx->pos--;
1147 	    return SYM_OPEN_PAREN_LESS_LETTER;
1148 	  } else {
1149 	    switch (nc) {
1150 	    case '=': return SYM_OPEN_PAREN_LESS_EQUAL; /* positive */
1151 	    case '!': return SYM_OPEN_PAREN_LESS_EXCLAMATION; /* negative */
1152 	    case ')':
1153 	      /* Perl allows "(?<" and treats it like a null string*/
1154 	      return null_seq();
1155 	    case EOF:
1156 	      raise_syntax_error(ctx, -1, UC("End of string following '(?<'."));
1157 	    default:
1158 	      raise_syntax_error(ctx, ctx->pos -1,
1159 				 UC("'(?<' is followed by illigal character."));
1160 	    }
1161 	  }
1162 	default:
1163 	  raise_syntax_error(ctx, ctx->pos -1,
1164 			     UC("'(?' is followed by illigal character."));
1165 	}
1166       } else {
1167 	/* if nc was not '?' (this is within the first switch, we've just seen
1168 	   an opening parenthesis and leave it like that*/
1169 	return SYM_OPEN_PAREN;
1170       }
1171     default:
1172       /* all other characters are their onw tokens */
1173       return SG_MAKE_CHAR(nc);
1174     }
1175   } else {
1176     /* we didn't get a character (this if the "else" branch form the first if),
1177        so we don't return a token but #f.
1178      */
1179     POP(ctx->last_pos);
1180     return SG_FALSE;
1181   }
1182 }
1183 
1184 /* parser functions*/
1185 static SgObject reg_expr(lexer_ctx_t *ctx);
1186 
1187 /*
1188   Parses and consumes a <group>.
1189   The productions are: <group> -> "(" <regex> ")"
1190                                   "(?:" <regex> ")"
1191 				  "(?>" <regex> ")"
1192 				  "(?<flags>:" <regex> ")"
1193 				  "(?=" <regex> ")"
1194 				  "(?!" <regex> ")"
1195 				  "(?<=" <regex> ")"
1196 				  "(?<!" <regex> ")"
1197 				  "(?(" <num> ")" <regex> ")"
1198 				  "(?(" <regex> ")" <regex> ")"
1199 				  "(?<name>" <regex> ")"
1200 				  <legal-token>
1201   where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
1202   Will return <parse-tree> or (<grouping-type> <parse-tree>) where
1203   <grouping-type> is one of six keywords.
1204  */
group(lexer_ctx_t * ctx)1205 static SgObject group(lexer_ctx_t *ctx)
1206 {
1207   SgObject flags = SG_NIL;
1208   int save = ctx->flags;
1209   SgObject open_token = get_token(ctx, &flags);
1210   SgObject ret = SG_UNDEF;
1211 
1212   if (SG_EQ(open_token, SYM_OPEN_PAREN_PAREN)) {
1213     /* special case for conditional regular expression; not that at this point
1214        we accespt a couple of illegal combinations which'll be sorted out later
1215        by the converter
1216      */
1217     long open_paren_pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
1218     /* check if what follows "(?(" is a number*/
1219     SgObject number = try_number(ctx, 10, -1, TRUE);
1220     if (!SG_FALSEP(number)) {
1221       /* condition is a number (i.e refers to a back-reference */
1222       SgObject inner_close_token = get_token(ctx, NULL);
1223       SgObject regexpr = reg_expr(ctx);
1224       SgObject close_token = get_token(ctx, NULL);
1225       if (!SG_EQ(inner_close_token, SYM_CLOSE_PAREN)) {
1226 	raise_syntax_error(ctx, open_paren_pos + 2,
1227 			   UC("Opening paren has no matching closing paren"));
1228       }
1229       if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1230 	raise_syntax_error(ctx, open_paren_pos,
1231 			   UC("Opening paren has no matching closing paren"));
1232       }
1233       if (!SG_PAIRP(regexpr) ||
1234 	  (SG_PAIRP(regexpr) && !SG_EQ(SG_CAR(regexpr), SYM_ALTER))) {
1235 	/* most definitely (?(1)aaa) pattern so make this
1236 	 (alternation $regexpr (sequence))*/
1237 	regexpr = SG_LIST3(SYM_ALTER, regexpr, SG_LIST1(SYM_SEQUENCE));
1238       }
1239       ret = SG_LIST3(SYM_BRANCH, number, regexpr);
1240       goto end_group;
1241     } else {
1242       /* condition must be a full regex (actually a look-behind or look-ahead);
1243 	 and comes a terrible kludge: instead of being cleanly separated from
1244 	 the lexer, the parser pushes back lexer by one position, thereby
1245 	 landing in the middle of the 'token' "(?(".
1246        */
1247       SgObject inner_reg_expr, regexpr, close_token;
1248       ctx->pos--;
1249       inner_reg_expr = group(ctx);
1250       regexpr = reg_expr(ctx);
1251       close_token = get_token(ctx, NULL);
1252       if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1253 	raise_syntax_error(ctx, open_paren_pos,
1254 			   UC("Opening paren has no matching closing paren."));
1255       }
1256       if (!(SG_PAIRP(inner_reg_expr) &&
1257 	    (SG_EQ(SG_CAR(inner_reg_expr), SYM_LOOKBHIND) ||
1258 	     SG_EQ(SG_CAR(inner_reg_expr), SYM_LOOKAHEAD)))) {
1259 	raise_syntax_error(ctx, open_paren_pos,
1260 			   UC("Branch test must be lookahead, look-behind or number"));
1261       }
1262       if (!SG_PAIRP(regexpr) ||
1263 	  (SG_PAIRP(regexpr) && !SG_EQ(SG_CAR(regexpr), SYM_ALTER))) {
1264 	/* most definitely (?(1)aaa) pattern so make this
1265 	 (alternation $regexpr (sequence))*/
1266 	regexpr = SG_LIST3(SYM_ALTER, regexpr, SG_LIST1(SYM_SEQUENCE));
1267       }
1268       ret = SG_LIST3(SYM_BRANCH, inner_reg_expr, regexpr);
1269       goto end_group;
1270     }
1271 
1272   } else if (SG_EQ(open_token, SYM_OPEN_PAREN) ||
1273 	     SG_EQ(open_token, SYM_OPEN_PAREN_EQUAL) ||
1274 	     SG_EQ(open_token, SYM_OPEN_PAREN_COLON) ||
1275 	     SG_EQ(open_token, SYM_OPEN_PAREN_GREATER) ||
1276 	     SG_EQ(open_token, SYM_OPEN_PAREN_EXCLAMATION) ||
1277 	     SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EQUAL) ||
1278 	     SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EXCLAMATION) ||
1279 	     SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1280     /* we saw one of the six token representing opening parentheses */
1281     int saved_reg_num = 0;
1282     long open_paren_pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
1283     SgObject register_name = (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER))
1284       ? parse_register_name_aux(ctx) : SG_FALSE;
1285     SgObject regexpr, close_token;
1286     if (SG_EQ(open_token, SYM_OPEN_PAREN) ||
1287 	SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1288       /* if this is the "(" <regex> ")" or "(?" <name> "" <regex> ")" production
1289 	 we have to increment the register counter of the lexer
1290        */
1291       ctx->reg++;
1292       saved_reg_num = ctx->reg_num++;
1293     }
1294     regexpr = reg_expr(ctx);
1295     close_token = get_token(ctx, NULL);
1296 
1297     if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1298       raise_syntax_error(ctx, open_paren_pos,
1299 			 UC("Opening paren has no matching closing paren."));
1300     }
1301     if (!SG_NULLP(flags)) {
1302       /* if the lexer has returned a list of flags this must have been the
1303 	 "(?:" <regex> ")" production
1304        */
1305       ret = SG_LIST3(SYM_FLAGGED_SEQUENCE, flags, regexpr);
1306       goto end_group;
1307     } else {
1308       if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1309 	/* make alist */
1310 	SgObject names = Sg_Assq(register_name, ctx->reg_names);
1311 	if (SG_FALSEP(names)) {
1312 	  PUSH(Sg_Cons(register_name, SG_LIST1(SG_MAKE_INT(saved_reg_num))),
1313 	       ctx->reg_names);
1314 	} else {
1315 	  /* push it */
1316 	  SG_SET_CDR(names, Sg_Cons(SG_MAKE_INT(saved_reg_num),
1317 				    SG_CDR(names)));
1318 	}
1319 	ret = SG_LIST4(SYM_REGISTER,
1320 		       SG_MAKE_INT(saved_reg_num),
1321 		       register_name,
1322 		       regexpr);
1323 	goto end_group;
1324       } else {
1325 	if (SG_EQ(open_token, SYM_OPEN_PAREN)) {
1326 	  ret = SG_LIST4(SYM_REGISTER, SG_MAKE_INT(saved_reg_num),
1327 			 SG_FALSE, regexpr);
1328 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_COLON)) {
1329 	  /* (?:...) does not create any group */
1330 	  ret = regexpr;
1331 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_GREATER)) {
1332 	  ret = SG_LIST2(SYM_STANDALONE, regexpr);
1333 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_EQUAL)) {
1334 	  ret = SG_LIST3(SYM_LOOKAHEAD, SG_TRUE, regexpr);
1335 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_EXCLAMATION)) {
1336 	  ret = SG_LIST3(SYM_LOOKAHEAD, SG_FALSE, regexpr);
1337 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EQUAL)) {
1338 	  ret = SG_LIST3(SYM_LOOKBHIND, SG_TRUE, regexpr);
1339 	} else if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EXCLAMATION)) {
1340 	  ret = SG_LIST3(SYM_LOOKBHIND, SG_FALSE, regexpr);
1341 	} else {
1342 	  ASSERT(FALSE);
1343 	  ret = SG_UNDEF;	/* dummy */
1344 	}
1345 	goto end_group;
1346       }
1347     }
1348   } else {
1349     /* This is the <lexical-token> production; <legal-token> is any token which
1350        passes start_of_subexpr_p (otherwise parsing had already stopped int the
1351        sequence.
1352      */
1353     ret = open_token;
1354   }
1355  end_group:
1356   ctx->flags = save;
1357   return ret;
1358 }
1359 
1360 /*
1361    Parses and consume a <greedy-quant>.
1362    The productions are: <greedy-quant> -> <group> | <group><quantifier>
1363    where <quantifier> is parsed by the lexer function get_quantifier.
1364    Will return <parse-tree> or (SYM_GREEDY_REP <min> <max> <parse-tree).
1365  */
greedy_quant(lexer_ctx_t * ctx)1366 static SgObject greedy_quant(lexer_ctx_t *ctx)
1367 {
1368   SgObject grp = group(ctx);
1369   int standalone = FALSE;
1370   SgObject token = get_quantifier(ctx, &standalone);
1371   if (!SG_FALSEP(token)) {
1372     /* if get_quantifier returned a true value it's the tow element list
1373        (<min> <max)
1374      */
1375     return SG_LIST4(SYM_GREEDY_REP, SG_CAR(token), SG_CADR(token), grp);
1376   }
1377   return grp;
1378 }
1379 
1380 /*
1381    Parses and consume a <quant>.
1382    The productions are: <quant> -> <greedy-quant> | <greedy-quant>"?".
1383    Will return the <parse-tree> returned by greedy_quant and optionally change
1384    SYM_GREEDY_REP to SYM_NON_GREEDY_REP.
1385  */
quant(lexer_ctx_t * ctx)1386 static SgObject quant(lexer_ctx_t *ctx)
1387 {
1388   SgObject greedy = greedy_quant(ctx);
1389   long pos = ctx->pos;
1390   SgChar nc = next_char(ctx);
1391   if (nc != EOF) {
1392     if (nc == '?') {
1393       SG_SET_CAR(greedy, SYM_NON_GREEDY_REP);
1394     } else if (nc == '+') {
1395       greedy = SG_LIST2(SYM_STANDALONE, greedy);
1396     } else {
1397       ctx->pos = pos;
1398     }
1399   }
1400   return greedy;
1401 }
1402 
1403 /*
1404   Parses and consumes a <seq>.
1405   The productions are: <sex> -> <quant> | <quant><seq>.
1406   Will return <parse-tree> or (sequence <parse-tree> <parse-tree>).
1407  */
make_string_from_two_char(SgObject c1,SgObject c2)1408 static SgObject make_string_from_two_char(SgObject c1, SgObject c2)
1409 {
1410   /* for the smarter way we should use, list->string, however it is slow */
1411   SgObject s = Sg_ReserveString(2, 0);
1412   ASSERT(SG_CHARP(c1) && SG_CHARP(c2));
1413   SG_STRING_VALUE_AT(s, 0) = SG_CHAR_VALUE(c1);
1414   SG_STRING_VALUE_AT(s, 1) = SG_CHAR_VALUE(c2);
1415   return s;
1416 }
sequence(lexer_ctx_t * ctx)1417 static SgObject sequence(lexer_ctx_t *ctx)
1418 {
1419   if (start_of_subexpr_p(ctx)) {
1420     SgObject quan = quant(ctx);
1421     if (start_of_subexpr_p(ctx)) {
1422       SgObject seq = sequence(ctx);
1423       int is_quant_char = SG_CHARP(quant);
1424       int is_seq_sequence = (SG_PAIRP(seq) && SG_EQ(SG_CAR(seq), SYM_SEQUENCE));
1425       if (is_seq_sequence && SG_CHARP(seq)) {
1426 	return make_string_from_two_char(seq, quan);
1427       } else if (is_quant_char && SG_STRINGP(seq)) {
1428 	SgChar ca[1];
1429 	ca[0] = SG_CHAR_VALUE(quan);
1430 	return Sg_StringAppendC(SG_STRING(seq), ca, 1);
1431       } else if (is_quant_char && is_seq_sequence && SG_CHARP(SG_CADR(seq))) {
1432 	if (SG_NULLP(SG_CDDR(seq))) {
1433 	  SG_SET_CDR(seq,
1434 		     Sg_Cons(make_string_from_two_char(SG_CADR(seq), quan),
1435 			     SG_CDDR(seq)));
1436 	  return seq;
1437 	} else {
1438 	  return make_string_from_two_char(SG_CADR(seq), quan);
1439 	}
1440       } else if (is_seq_sequence) {
1441 	/* if <seq> is also a 'sequence parse tree we merge both lists into one
1442 	   to avoid unneccessary consing*/
1443 	SG_SET_CDR(seq, Sg_Cons(quan, SG_CDR(seq)));
1444 	return seq;
1445       } else {
1446 	return SG_LIST3(SYM_SEQUENCE, quan, seq);
1447       }
1448     } else {
1449       return quan;
1450     }
1451   }
1452   return null_seq();
1453 }
1454 
1455 /*
1456    Parses and consumes a <regex>, a complete regular expression. The productions
1457 are: <regex> -> <seq> | <seq> "|" <regex>.
1458 Will return <parse-tree> or (SYM_ALTER <parse-tree> <parse-tree>).
1459  */
reg_expr(lexer_ctx_t * ctx)1460 static SgObject reg_expr(lexer_ctx_t *ctx)
1461 {
1462   long pos = ctx->pos;
1463   switch (next_char(ctx)) {
1464   case EOF:
1465     /* if we didn't get any token we return 'void which stands for
1466        "empty regular expression"
1467      */
1468     return null_seq();
1469   case '|':
1470     /* now check whether the expression started with a vertical bar,
1471        i.e. <seq> - the left alternation - is empty
1472      */
1473     return SG_LIST3(SYM_ALTER, null_seq(), reg_expr(ctx));
1474   default: {
1475     /* otherwise un-read the character we just saw and parse a <seq> plus
1476        the character following it
1477      */
1478     SgObject seq;
1479     ctx->pos = pos;
1480     seq = sequence(ctx);
1481     pos = ctx->pos;
1482     switch (next_char(ctx)) {
1483     case EOF:
1484       /* no further character, just a <seq> */
1485       return seq;
1486     case '|': {
1487       /* if the character was a vertical bar, this is an alternation and we
1488 	 have the second production
1489        */
1490       SgObject expr = reg_expr(ctx);
1491       if (SG_PAIRP(expr) &&
1492 	  SG_EQ(SG_CAR(expr), SYM_ALTER)) {
1493 	/* again we try to merge as above in SEQ */
1494 	SG_SET_CDR(expr, Sg_Cons(seq, SG_CDR(expr)));
1495 	return expr;
1496       } else {
1497 	return SG_LIST3(SYM_ALTER, seq, expr);
1498       }
1499     }
1500     default:
1501       /* a character which is not a vertical bar - this is either a syntax
1502 	 error or we're inside of a group and the next character is closing
1503 	 parenthesis; so we just un-read the character and let another function
1504 	 take care of it
1505        */
1506       ctx->pos = pos;
1507       return seq;
1508     }
1509   }
1510   }
1511 }
1512 
parse_string(lexer_ctx_t * ctx)1513 static SgObject parse_string(lexer_ctx_t *ctx)
1514 {
1515   SgObject r;
1516   if (has(ctx, SG_LITERAL)) {
1517     /* the whole input string is just literal */
1518     SgObject h = SG_NIL, t = SG_NIL;
1519     int i;
1520     SG_APPEND1(h, t, SYM_SEQUENCE);
1521     for (i = 0; i < ctx->len; i++) {
1522       SG_APPEND1(h, t, SG_MAKE_CHAR(ctx->str[i]));
1523     }
1524     ctx->pos = ctx->len;
1525     r = h;
1526   } else {
1527     r = reg_expr(ctx);
1528   }
1529   return SG_LIST4(SYM_REGISTER, SG_MAKE_INT(0), SG_FALSE, r);
1530 }
1531 
1532 /* optimization */
1533 /*
1534   cl-ppcre does convertion from s-expression tree to regex object to use generic
1535   methods.
1536   however on Sagittarius, we don't create such class and we do not use generic
1537   method(we can not!). So let's optimize AST directly and destructively.
1538 
1539   we do:
1540    - flattening nested sequence: (sequence a (sequence b) c) => (sequence a b c)
1541    - splits a repetition into constant and varying part: a{3,} -> a{3}a*
1542 */
1543 static SgObject optimize(SgObject ast, SgObject rest);
1544 
1545 /*
1546   a{3,} -> a{3}a*
1547   If the given regex is (abc|efg){3,}, then the back reference of $1 is "efg".
1548   So we can simple omit the first register tag if there is.
1549   (checked with Perl v5.10.1, with this one liner;
1550    $ perl -e 'my $s="abcabcabcefg"; $s=~/(abc|efg){3,}/; print $1."\n";'
1551    efg
1552   )
1553  */
maybe_split_repetition(SgObject ast,SgObject regex)1554 static SgObject maybe_split_repetition(SgObject ast, SgObject regex)
1555 {
1556   long minimum, maximum = -1;
1557   SgObject max = SG_CAR(SG_CDDR(ast));
1558   SgObject constant = SG_NIL;
1559   minimum = SG_INT_VALUE(SG_CADR(ast));
1560   if (!SG_FALSEP(max)) {
1561     maximum = SG_INT_VALUE(max);
1562     /* trivial case: don't repeat at all */
1563     if (maximum == 0) return null_seq();
1564     /* another trivial case "repeat" exactly once */
1565     if (minimum == 1 && maximum == 1) return SG_CADR(SG_CDDR(ast));
1566     /* well result would be exactly the same but make AST a bit shorter
1567        and hope this makes a bit of better memory usage. */
1568     /* if (minimum == 1 && maximum == 1) return ast; */
1569   }
1570   if (minimum > 0) {
1571     SgObject in = regex;
1572     /* if (SG_PAIRP(regex) && SG_EQ(SG_CAR(regex), SYM_REGISTER)) */
1573     /*   in = SG_CADR(SG_CDDR(regex)); */
1574     /* else in = regex; */
1575     constant = SG_LIST4(SG_CAR(ast), SG_CADR(ast), SG_CADR(ast), in);
1576   }
1577   if (!SG_FALSEP(max) && maximum == minimum) {
1578     /* no varying part needed bacuause min = max */
1579     return constant;
1580   } else {
1581     SgObject varying = SG_LIST4(SG_CAR(ast),
1582 				SG_MAKE_INT(0),
1583 				(SG_FALSEP(max)
1584 				 ? SG_FALSE : SG_MAKE_INT(maximum - minimum)),
1585 				regex);
1586     if (minimum == 0) return varying; /* min = 0, no constant part needed */
1587     else if (minimum == 1) {
1588       /* min = 1, constant part needs not repetition wrapped around */
1589       return SG_LIST3(SYM_SEQUENCE, regex, varying);
1590     } else {
1591       return SG_LIST3(SYM_SEQUENCE, constant, varying);
1592     }
1593   }
1594 }
1595 
optimize_seq(SgObject seq,SgObject rest)1596 static SgObject optimize_seq(SgObject seq, SgObject rest)
1597 {
1598   SgObject elt, tail, etype, opted;
1599   if (!SG_PAIRP(seq)) return seq;
1600   elt = SG_CAR(seq);
1601   tail = optimize_seq(SG_CDR(seq), rest);
1602   rest = SG_NULLP(tail) ? rest : tail;
1603   if (!SG_PAIRP(elt) || SG_EQ(SG_CAR(elt), SYM_INVERTED_CHAR_CLASS)) {
1604     if (SG_EQ(tail, SG_CDR(seq))) return seq;
1605     else return Sg_Cons(elt, tail);
1606   }
1607   etype = SG_CAR(elt);
1608   if (SG_EQ(etype, SYM_SEQUENCE)) {
1609     return Sg_Append2(optimize_seq(SG_CDR(elt), rest), tail);
1610   }
1611   opted = optimize(elt, rest);
1612   if (SG_EQ(elt, opted) && SG_EQ(tail, SG_CDR(seq))) return seq;
1613   else return Sg_Cons(opted, tail);
1614 }
1615 
optimize(SgObject ast,SgObject rest)1616 static SgObject optimize(SgObject ast, SgObject rest)
1617 {
1618   /* assume given AST is not literal list. so caller must copy original AST. */
1619   SgObject type, seq, seqo;
1620   if (!SG_PAIRP(ast)) return ast;
1621   type = SG_CAR(ast);
1622   /* we know after inverted-char-class it has only char-set. so just return */
1623   if (SG_EQ(type, SYM_INVERTED_CHAR_CLASS)) return ast;
1624 
1625   if (SG_EQ(type, SYM_GREEDY_REP) ||
1626       SG_EQ(type, SYM_NON_GREEDY_REP)) {
1627     return maybe_split_repetition(ast, optimize(SG_CADR(SG_CDDR(ast)), rest));
1628   }
1629 
1630   if (SG_EQ(type, SYM_ALTER)) {
1631     SgObject sp, sp2, e = SG_UNBOUND, h, t;
1632     SG_FOR_EACH(sp, SG_CDR(ast)) {
1633       e = optimize(SG_CAR(sp), rest);
1634       if (!SG_EQ(e, SG_CAR(sp))) break;
1635     }
1636     if (SG_NULLP(sp)) return ast;
1637     /* need to copy the spine */
1638     h = t = SG_NIL;
1639     SG_FOR_EACH(sp2, SG_CDR(ast)) {
1640       if (SG_EQ(sp, sp2)) { SG_APPEND1(h, t, e); break; }
1641       SG_APPEND1(h, t, SG_CAR(sp2));
1642     }
1643     SG_FOR_EACH(sp2, SG_CDR(sp2)) {
1644       SG_APPEND1(h, t, optimize(SG_CAR(sp2), rest));
1645     }
1646     return Sg_Cons(SYM_ALTER, h);
1647   }
1648   if (SG_EQ(type, SYM_SEQUENCE)) {
1649     seq = SG_CDR(ast);
1650     seqo = optimize_seq(seq, rest);
1651     if (SG_EQ(seq, seqo)) return ast;
1652     return Sg_Cons(type, seqo);
1653   }
1654 
1655   if (SG_EQ(type, SYM_REGISTER)) {
1656     /* (register n name ast) */
1657     SgObject n, name;
1658     if (Sg_Length(ast) != 4) {
1659       Sg_Error(UC("Invalid AST register: %S"), ast);
1660     }
1661     n = SG_CADR(ast);
1662     name = SG_CAR(SG_CDDR(ast));
1663     seq = SG_CADR(SG_CDDR(ast));
1664     seqo = optimize(seq, rest);
1665     /* Sg_Printf(Sg_StandardErrorPort(), UC("ast: %S\n"), seq); */
1666     /* Sg_Printf(Sg_StandardErrorPort(), UC("seq: %S\n"), seqo); */
1667     if (SG_EQ(seq, seqo)) return ast;
1668     return SG_LIST4(SYM_REGISTER, n, name, seqo);
1669   }
1670 
1671   seq = SG_CDR(ast);
1672   seqo = optimize(seq, rest);
1673   if (SG_EQ(seq, seqo)) return ast;
1674   return Sg_Cons(type, seqo);
1675 }
1676 
1677 #include "regex_priv.inc"
1678 
1679 typedef struct
1680 {
1681   inst_t *pc;			/* current pc */
1682   int     flags;		/* compile time flags */
1683   int     emitp;		/* flag for count or emit */
1684   int     codemax;		/* max code count */
1685   prog_t *prog;			/* building prog */
1686   inst_t *inst;
1687   long    index;		/* current inst index */
1688   int     extendedp;		/* extended regular expression or not */
1689   int     lookbehindp;
1690 } compile_ctx_t;
1691 
1692 static inst_arg_t null_arg = {0};
1693 static inst_t     null_inst = {0};
1694 
check_start_anchor(SgObject ast,int * modelessp)1695 static int check_start_anchor(SgObject ast, int *modelessp)
1696 {
1697   SgObject type;
1698   if (!SG_PAIRP(ast))
1699     type = ast;
1700   else
1701     type = SG_CAR(ast);
1702 
1703   if (SG_EQ(type, SYM_START_ANCHOR)) return TRUE;
1704   else if (SG_EQ(type, SYM_MODELESS_START_ANCHOR)) {
1705     if (modelessp) *modelessp = TRUE;
1706     return TRUE;
1707   }
1708   else return FALSE;
1709 }
1710 
emit(compile_ctx_t * ctx,unsigned char opcode,inst_arg_t arg)1711 static void emit(compile_ctx_t *ctx, unsigned char opcode,
1712 		 inst_arg_t arg)
1713 {
1714   if (ctx->emitp) {
1715     inst_t *i = &ctx->inst[ctx->index++];
1716     i->opcode = opcode;
1717     i->arg = arg;
1718     INST_FLAG_SET(i, ctx->flags);
1719     ctx->pc = ++i;
1720   } else {
1721     ctx->codemax++;
1722   }
1723 }
1724 
1725 #define emit2(ctx, inst1, inst2, arg)				\
1726   emit((ctx), (ctx)->negative ? (inst2) : (inst1), (arg))
1727 
1728 static void compile_rec(compile_ctx_t *ctx, SgObject ast, int lastp);
1729 
compile_seq(compile_ctx_t * ctx,SgObject seq,int lastp)1730 static void compile_seq(compile_ctx_t *ctx, SgObject seq, int lastp)
1731 {
1732   SgObject cp;
1733   if (ctx->emitp && ctx->lookbehindp) {
1734     seq = Sg_ReverseX(seq);
1735   }
1736   SG_FOR_EACH(cp, seq) {
1737     SgObject item = SG_CAR(cp);
1738     inst_arg_t arg;
1739     if (SG_CHARP(item)) {
1740       /* TODO we need to concat chars to string. but it must be done by
1741  	      optimization.*/
1742       arg.c = SG_CHAR_VALUE(item);
1743       emit(ctx, RX_CHAR, arg);
1744     } else {
1745       int p;
1746       p = lastp && SG_NULLP(SG_CDR(cp));
1747       compile_rec(ctx, item, p);
1748     }
1749   }
1750 }
1751 
compile_rep_seq(compile_ctx_t * ctx,SgObject seq,long count,int lastp)1752 static void compile_rep_seq(compile_ctx_t *ctx, SgObject seq,
1753 			    long count, int lastp)
1754 {
1755   SgObject h = SG_NIL, t = SG_NIL;
1756   /* I don't remenber why I needed to check this, but this causes an error with
1757      #/(?:aa?){n}/ pattern.
1758    */
1759   /* int seqp = (SG_PAIRP(seq) && SG_EQ(SG_CAR(seq), SYM_SEQUENCE)); */
1760   if (count <= 0) return;
1761   while (count-- > 0) {
1762     /* if (seqp) { */
1763     /*   SG_APPEND(h, t, Sg_CopyList(seq)); */
1764     /* } else */
1765     if (SG_PAIRP(seq)) {
1766       SG_APPEND1(h, t, Sg_CopyList(seq));
1767     } else {
1768       SG_APPEND1(h, t, seq);
1769     }
1770   }
1771   /* h is ((sequence ...) ...) so we can simply pass it to compile_seq
1772      TODO maybe we need to flatten */
1773   compile_seq(ctx, h, lastp);
1774 }
1775 
compile_min_max(compile_ctx_t * ctx,SgObject type,long count,SgObject item,int lastp)1776 static void compile_min_max(compile_ctx_t *ctx, SgObject type,
1777 			    long count, SgObject item, int lastp)
1778 {
1779   /* {m, n} pattern. it can be replaced like this.
1780      x{2,5} = xx(x(x(x)?)?)?
1781      VM: Instruction
1782      0: char x  ;; These parts are already emited by compile_rec
1783      1: char x  ;;
1784      2: split 3 8
1785      3: char x
1786      4: split 5 8
1787      5: char x
1788      6: split 7 8
1789      7: char x
1790      8: match
1791    */
1792   long i;
1793   SgObject h = SG_NIL, t = SG_NIL, cp;
1794   int nongreedyp = SG_EQ(type, SYM_NON_GREEDY_REP);
1795 
1796   for (i = 0; i < count; i++) {
1797     inst_t *pc1 = ctx->pc;
1798     emit(ctx, RX_SPLIT, null_arg);
1799     pc1->arg.pos.x = ctx->pc;
1800     compile_rec(ctx, item, lastp);
1801     /* save current pc to patch later */
1802     if (ctx->emitp) SG_APPEND1(h, t, pc1);
1803   }
1804   if (ctx->emitp) {
1805     SG_FOR_EACH(cp, h) {
1806       inst_t *pc = (inst_t*)SG_CAR(cp);
1807       pc->arg.pos.y = ctx->pc;
1808     }
1809     /* swap for non greedy */
1810     if (nongreedyp) {
1811       SG_FOR_EACH(cp, h) {
1812 	inst_t *pc1 = (inst_t*)SG_CAR(cp);
1813 	inst_t *pc2 = pc1->arg.pos.x;
1814 	pc1->arg.pos.x = pc1->arg.pos.y;
1815 	pc1->arg.pos.y = pc2;
1816       }
1817     }
1818   }
1819   return;
1820 }
1821 
calculate_flags(int flag,SgObject flags)1822 static int calculate_flags(int flag, SgObject flags)
1823 {
1824   SgObject cp;
1825   SG_FOR_EACH(cp, flags) {
1826     SgObject slot = SG_CAR(cp);
1827     ASSERT(SG_CHARP(SG_CAR(slot)));
1828     switch (SG_CHAR_VALUE(SG_CAR(slot))) {
1829     case 'i':
1830       if (SG_FALSEP(SG_CDR(slot)))
1831 	flag &= ~SG_CASE_INSENSITIVE;
1832       else
1833 	flag |= SG_CASE_INSENSITIVE;
1834       break;
1835     case 'm':
1836       if (SG_FALSEP(SG_CDR(slot)))
1837 	flag &= ~SG_MULTILINE;
1838       else
1839 	flag |= SG_MULTILINE;
1840       break;
1841     case 's':
1842       if (SG_FALSEP(SG_CDR(slot)))
1843 	flag &= ~SG_DOTALL;
1844       else
1845 	flag |= SG_DOTALL;
1846       break;
1847     case 'u':
1848       if (SG_FALSEP(SG_CDR(slot)))
1849 	flag &= ~SG_UNICODE_CASE;
1850       else
1851 	flag |= SG_UNICODE_CASE;
1852       break;
1853     }
1854   }
1855   return flag;
1856 }
1857 
compile_rec(compile_ctx_t * ctx,SgObject ast,int lastp)1858 static void compile_rec(compile_ctx_t *ctx, SgObject ast, int lastp)
1859 {
1860   SgObject type;
1861   inst_arg_t arg;
1862   /* first, deal with atom */
1863   if (!SG_PAIRP(ast)) {
1864     /* a char */
1865     if (SG_CHARP(ast)) {
1866       /* TODO maybe we can deal with case insensitive here */
1867       arg.c = SG_CHAR_VALUE(ast);
1868       emit(ctx, RX_CHAR, arg);
1869       return;
1870     }
1871     /* charset */
1872     if (SG_CHAR_SET_P(ast)) {
1873       arg.set = ast;
1874       emit(ctx, RX_SET, arg);
1875       return;
1876     }
1877     /* special stuff */
1878     if (SG_SYMBOLP(ast)) {
1879       if (SG_EQ(ast, SYM_EVERYTHING)) {
1880 	emit(ctx, RX_ANY, null_arg);
1881 	return;
1882       }
1883       if (SG_EQ(ast, SYM_START_ANCHOR) ||
1884 	  SG_EQ(ast, SYM_MODELESS_START_ANCHOR)) {
1885 	/* set flags */
1886 	/* TODO check compile time flag */
1887 	arg.flags = SG_EQ(ast, SYM_START_ANCHOR)
1888 	  ? EmptyBeginLine : EmptyBeginText;
1889 	emit(ctx, RX_EMPTY, arg);
1890 	return;
1891       }
1892       if (SG_EQ(ast, SYM_END_ANCHOR) ||
1893 	  SG_EQ(ast, SYM_MODELESS_END_ANCHOR) ||
1894 	  SG_EQ(ast, SYM_MODELESS_END_ANCHOR_NO_NEWLINE)) {
1895 	/* Gauche supports end-anchor as literal char '$' in some context.
1896 	   But we do as defact standard(Perl) way.*/
1897 	/* set flags */
1898 	/* TODO check compile time flag */
1899 	if (SG_EQ(ast, SYM_END_ANCHOR)) {
1900 	  arg.flags = EmptyEndLine;
1901 	} else if (SG_EQ(ast, SYM_MODELESS_END_ANCHOR)) {
1902 	  /* '$' without multiline mode flag is the same as \\Z */
1903 	  arg.flags = EmptyEndTextNoNewLine;
1904 	} else {
1905 	  arg.flags = EmptyEndText;
1906 	}
1907 	emit(ctx, RX_EMPTY, arg);
1908 	return;
1909       }
1910       if (SG_EQ(ast, SYM_WORD_BOUNDARY)) {
1911 	arg.flags = EmptyWordBoundary;
1912 	emit(ctx, RX_EMPTY, arg);
1913 	return;
1914       }
1915       if (SG_EQ(ast, SYM_NON_WORD_BOUNDARY)) {
1916 	arg.flags = EmptyNonWordBoundary;
1917 	emit(ctx, RX_EMPTY, arg);
1918 	return;
1919       }
1920       /* fallback */
1921     }
1922     Sg_Error(UC("[internal:regex] unrecognized AST item: %S"), ast);
1923   }
1924   /* structured node */
1925   type = SG_CAR(ast);
1926   /* do with simple ones */
1927   if (SG_EQ(type, SYM_SEQUENCE)) {
1928     /* we do not have any implicit sequence */
1929     compile_seq(ctx, SG_CDR(ast), lastp);
1930     return;
1931   }
1932 
1933   if (SG_EQ(type, SYM_FLAGGED_SEQUENCE)) {
1934     SgObject flags = SG_CADR(ast);
1935     SgObject seq = SG_CAR(SG_CDDR(ast));
1936     int flag = calculate_flags(ctx->flags, flags), save = ctx->flags;
1937     ctx->flags = arg.flags = flag;
1938     compile_rec(ctx, seq, lastp);
1939     ctx->flags = arg.flags = save;
1940     return;
1941   }
1942 
1943   if (SG_EQ(type, SYM_INVERTED_CHAR_CLASS)) {
1944     SgObject cs = SG_CADR(ast);
1945     ASSERT(SG_CHAR_SET_P(cs));
1946     arg.set = cs;
1947     emit(ctx, RX_NSET, arg);
1948     return;
1949   }
1950   if (SG_EQ(type, SYM_REGISTER)) {
1951     /* (register <number> <name> <ast>) */
1952     long grpno = SG_INT_VALUE(SG_CADR(ast));
1953     arg.n = 2*grpno;
1954     emit(ctx, RX_SAVE, arg);
1955     compile_rec(ctx, SG_CADR(SG_CDDR(ast)), lastp);
1956     arg.n = 2*grpno+1;
1957     emit(ctx, RX_SAVE, arg);
1958     return;
1959   }
1960 
1961   /*
1962     (alter (seq aa) (seq bb) (seq cc))
1963     ->
1964     0: split 1 8
1965     1: split 2 5
1966     2: char a
1967     3: char a
1968     4: jmp 7
1969     5: char b
1970     6: char b
1971     7: jmp 10
1972     8: char c
1973     9: char c
1974     10: match
1975     so we need to separate into two, cadr part and the rest.
1976     -> (alter (seq aa) (alter (seq bb) (seq cc)))
1977    */
1978   if (SG_EQ(type, SYM_ALTER)) {
1979     if (SG_PAIRP(SG_CDR(ast))) {
1980       inst_t *pc1 = ctx->pc, *pc2;
1981       emit(ctx, RX_SPLIT, null_arg);
1982       pc1->arg.pos.x = ctx->pc;
1983       compile_rec(ctx, SG_CADR(ast), lastp);
1984       pc2 = ctx->pc;
1985       emit(ctx, RX_JMP, null_arg);
1986       pc1->arg.pos.y = ctx->pc;
1987       if (Sg_Length((SG_CDDR(ast))) != 1) {
1988 	/* more than two */
1989 	compile_rec(ctx, Sg_Cons(SYM_ALTER, SG_CDDR(ast)), lastp);
1990       } else {
1991 	compile_rec(ctx, SG_CAR(SG_CDDR(ast)), lastp);
1992       }
1993       pc2->arg.pos.x = ctx->pc;
1994     } else {
1995       emit(ctx, RX_FAIL, null_arg);
1996     }
1997     return;
1998   }
1999 
2000   if (SG_EQ(type, SYM_GREEDY_REP) || SG_EQ(type, SYM_NON_GREEDY_REP)) {
2001     SgObject min = SG_CADR(ast), max = SG_CAR(SG_CDDR(ast));
2002     SgObject item = SG_CADR(SG_CDDR(ast));
2003     int multip = 0;
2004     inst_t *pc1, *pc2;
2005 
2006     if (SG_FALSEP(max) || SG_INT_VALUE(max) > 1)
2007       multip = TRUE;
2008     compile_rep_seq(ctx, item, SG_INT_VALUE(min), multip);
2009 
2010     if (SG_EQ(min, max)) return; /* well, it must match exact times */
2011     if (!SG_FALSEP(max)) {
2012       long count = SG_INT_VALUE(max) - SG_INT_VALUE(min);
2013       compile_min_max(ctx, type, count, item, lastp);
2014       return;
2015     }
2016     /* save current instruction position */
2017     pc1 = ctx->pc;
2018     emit(ctx, RX_SPLIT, null_arg);
2019     pc1->arg.pos.x = ctx->pc;
2020     compile_rec(ctx, item, FALSE);
2021     /* we've already resolved minimam match so let introduce jmp here */
2022     arg.pos.x = pc1;
2023     emit(ctx, RX_JMP, arg);
2024     pc1->arg.pos.y = ctx->pc;
2025     if (SG_EQ(type, SYM_NON_GREEDY_REP)) {
2026       pc2 = pc1->arg.pos.x;
2027       pc1->arg.pos.x = pc1->arg.pos.y;
2028       pc1->arg.pos.y = pc2;
2029     }
2030     return;
2031   }
2032 
2033   if (SG_EQ(type, SYM_BACKREF)) {
2034     SgObject num = SG_CDR(ast);
2035     ctx->extendedp = TRUE;
2036     if (SG_INTP(num)) {
2037       arg.index = SG_INT_VALUE(num);
2038       emit(ctx, RX_BREF, arg);
2039     } else {
2040       raise_compile_error(UC("invalid backreference number. %S"), num);
2041     }
2042     return;
2043   }
2044 
2045   if (SG_EQ(type, SYM_LOOKAHEAD) || SG_EQ(type, SYM_LOOKBHIND)) {
2046     SgObject neg = SG_CADR(ast);
2047     SgObject seq = SG_CAR(SG_CDDR(ast));
2048     inst_t *pc;
2049     int saved = ctx->lookbehindp;
2050     ctx->extendedp = TRUE;
2051     pc = ctx->pc;
2052     if (SG_EQ(type, SYM_LOOKAHEAD)) {
2053       emit(ctx, (SG_FALSEP(neg)) ? RX_NAHEAD : RX_AHEAD, null_arg);
2054     } else {
2055       emit(ctx, (SG_FALSEP(neg)) ? RX_NBEHIND : RX_BEHIND, null_arg);
2056       ctx->lookbehindp = TRUE;
2057     }
2058     /* Comment from Gauche regexp.c
2059        Assertions can check EOF even other regexps follow, so '$'
2060        in the last pos of this group should be treated as EOL.
2061        (?>$) as well. It is consistent with Perl and Oniguruma. */
2062     compile_rec(ctx, seq, FALSE);
2063     emit(ctx, RX_RESTORE, null_arg);
2064     pc->arg.pos.x = ctx->pc;
2065     ctx->lookbehindp = saved;
2066     return;
2067   }
2068 
2069   if (SG_EQ(type, SYM_STANDALONE)) {
2070     /* almost the same as look ahead/behind */
2071     SgObject seq = SG_CADR(ast);
2072     inst_t *pc;
2073     ctx->extendedp = TRUE;
2074     pc = ctx->pc;
2075     emit(ctx, RX_ONCE, null_arg);
2076     compile_rec(ctx, seq, FALSE);
2077     emit(ctx, RX_RESTORE, null_arg);
2078     pc->arg.pos.x = ctx->pc;
2079     return;
2080   }
2081 
2082   if (SG_EQ(type, SYM_BRANCH)) {
2083     SgObject cond = SG_CADR(ast);
2084     SgObject rest = SG_CAR(SG_CDDR(ast));
2085 
2086 #define NULL_SEQP(ast) (SG_PAIRP(ast) && SG_NULLP(SG_CDR(ast)))
2087 
2088     ctx->extendedp = TRUE;
2089     if (SG_INTP(cond)) {
2090       /* (branch n regexp)
2091 	 0: branch <n> 1 3
2092 	 1: <yes-pattern>
2093 	 2: jmp 4
2094 	 3: <no-pattern>
2095 	 4: rest
2096       */
2097       inst_t *pc = ctx->pc, *pc2;
2098       arg.cond.n = SG_INT_VALUE(cond) * 2;
2099       emit(ctx, RX_BRANCH, arg);
2100       pc->arg.cond.x = ctx->pc;
2101       if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2102 	/* check syntax */
2103 	if (!SG_EQ(SG_CAR(rest), SYM_ALTER)) {
2104 	  raise_compile_error(UC("branch has non alter regex."), ast);
2105 	  return;		/* dummy */
2106 	}
2107       }
2108       compile_rec(ctx, (!NULL_SEQP(rest) && SG_PAIRP(rest))
2109 		  ? SG_CADR(rest) : rest, lastp);
2110       pc2 = ctx->pc;
2111       emit(ctx, RX_JMP, null_arg);
2112       pc->arg.cond.y = ctx->pc;
2113       if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2114 	compile_rec(ctx, SG_CAR(SG_CDDR(rest)), lastp);
2115       } else {
2116 	compile_rec(ctx, SG_NIL, lastp);
2117       }
2118       pc2->arg.pos.x = ctx->pc;
2119     } else {
2120       /*
2121 	(branch assert regexp)
2122 	0: brancha 3 5
2123 	1: <assert>
2124 	2: restore  ;; this must be in assert
2125 	3: <yes-pattern>
2126 	4: jmp 6
2127 	5: <no-pattern>
2128 	6: rest
2129       */
2130       inst_t *pc = ctx->pc, *pc2;
2131       emit(ctx, RX_BRANCHA, null_arg);
2132       compile_rec(ctx, cond, lastp);
2133       pc->arg.cond.x = ctx->pc;
2134       if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2135 	/* check syntax */
2136 	if (!SG_EQ(SG_CAR(rest), SYM_ALTER)) {
2137 	  raise_compile_error(UC("branch has non alter regex."), ast);
2138 	  return;		/* dummy */
2139 	}
2140       }
2141       compile_rec(ctx, (!NULL_SEQP(rest) && SG_PAIRP(rest))
2142 		  ? SG_CADR(rest) : rest, lastp);
2143       pc2 = ctx->pc;
2144       emit(ctx, RX_JMP, null_arg);
2145       pc->arg.cond.y = ctx->pc;
2146       if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2147 	compile_rec(ctx, SG_CAR(SG_CDDR(rest)), lastp);
2148       } else {
2149 	emit(ctx, RX_FAIL, null_arg);
2150       }
2151       pc2->arg.pos.x = ctx->pc;
2152     }
2153     return;
2154   }
2155 
2156   Sg_Error(UC("unknown AST type: %S"), type);
2157 }
2158 
2159 /*
2160   We run compile_rec max 3 times.
2161   First time is to count instructions. Second time is for root and third time
2162   is for matchRoot. The second and third time could be only once if given
2163   regular expression starts with \A.
2164  */
compile(compile_ctx_t * ctx,SgObject ast)2165 static prog_t* compile(compile_ctx_t *ctx, SgObject ast)
2166 {
2167   int n, modeless = FALSE, offset = 0;
2168   prog_t *p;
2169   inst_t *match;
2170 
2171   ctx->pc = &null_inst;		/* put dummy */
2172   check_start_anchor(ast, &modeless);
2173   ctx->emitp = FALSE;
2174   compile_rec(ctx, ast, TRUE);
2175   n = ctx->codemax + 1;
2176 
2177   p = SG_NEW(prog_t);
2178   /* offset = (modeless) ? 0 : 3; */
2179   /* we need to add split, any and jmp */
2180   p->root = SG_NEW_ARRAY(inst_t, n + offset);
2181   p->rootLength = n + offset;
2182   ctx->prog = p;
2183   ctx->pc = &p->root[0];
2184   ctx->inst = p->root;
2185   ctx->index = 0;
2186   ctx->emitp = TRUE;
2187 
2188   compile_rec(ctx, ast, TRUE);
2189   /* last instruction must be RX_MATCH */
2190   match = &p->root[n+offset-1];
2191   match->opcode = RX_MATCH;
2192 
2193   return p;
2194 }
2195 
2196 
2197 /* compile takes 3 pass
2198    pass1: string->ast
2199    pass2: optimize
2200    pass3: code generation
2201  */
pattern_printer(SgObject self,SgPort * port,SgWriteContext * ctx)2202 static void pattern_printer(SgObject self, SgPort *port, SgWriteContext *ctx)
2203 {
2204   SgPattern *pattern = SG_PATTERN(self);
2205   long i, size = SG_STRING_SIZE(pattern->pattern);
2206   SG_PORT_LOCK_WRITE(port);
2207   Sg_PutzUnsafe(port, "#/");
2208   /* Sg_Printf(port, UC("#/%A/"), pattern->pattern); */
2209   for (i = 0; i < size;) {
2210     SgChar c = SG_STRING_VALUE_AT(pattern->pattern, i++);
2211     if (c == '\\') {
2212       Sg_PutcUnsafe(port, '\\');
2213       Sg_PutcUnsafe(port, SG_STRING_VALUE_AT(pattern->pattern, i++));
2214     } else if (c == '/') {
2215       Sg_PutcUnsafe(port, '\\');
2216       Sg_PutcUnsafe(port, c);
2217     } else {
2218       Sg_PutcUnsafe(port, c);
2219     }
2220   }
2221   Sg_PutcUnsafe(port, '/');
2222   /* flags */
2223   if (has(pattern, SG_COMMENTS)) {
2224     Sg_PutcUnsafe(port, 'x');
2225   }
2226   if (has(pattern, SG_CASE_INSENSITIVE)) {
2227     Sg_PutcUnsafe(port, 'i');
2228   }
2229   if (has(pattern, SG_MULTILINE)) {
2230     Sg_PutcUnsafe(port, 'm');
2231   }
2232   if (has(pattern, SG_DOTALL)) {
2233     Sg_PutcUnsafe(port, 's');
2234   }
2235   if (has(pattern, SG_UNICODE_CASE)) {
2236     Sg_PutcUnsafe(port, 'u');
2237   }
2238   SG_PORT_UNLOCK_WRITE(port);
2239 }
2240 
pattern_cache_reader(SgPort * port,SgReadCacheCtx * ctx)2241 static SgObject pattern_cache_reader(SgPort *port, SgReadCacheCtx *ctx)
2242 {
2243   /* assume next object is pattern */
2244   SgObject pattern = Sg_ReadCacheObject(port, ctx);
2245   SgObject flags = Sg_ReadCacheObject(port, ctx);
2246   if (SG_STRINGP(pattern) && SG_INTP(flags)) {
2247     return Sg_CompileRegex(pattern, (int)SG_INT_VALUE(flags), FALSE);
2248   } else {
2249     return SG_FALSE;
2250   }
2251 }
2252 
pattern_cache_scanner(SgObject obj,SgObject cbs,SgWriteCacheCtx * ctx)2253 static SgObject pattern_cache_scanner(SgObject obj, SgObject cbs,
2254 				      SgWriteCacheCtx *ctx)
2255 {
2256   /* since we don't have any compiler from AST, we can assume pattern
2257      has original pattern. */
2258   return Sg_WriteCacheScanRec(SG_PATTERN(obj)->pattern, cbs, ctx);;
2259 }
2260 
pattern_cache_writer(SgObject obj,SgPort * port,SgWriteCacheCtx * ctx)2261 static void pattern_cache_writer(SgObject obj, SgPort *port,
2262 				 SgWriteCacheCtx *ctx)
2263 {
2264   /* just put original pattern and flags*/
2265   Sg_WriteObjectCache(SG_PATTERN(obj)->pattern, port, ctx);
2266   Sg_WriteObjectCache(SG_MAKE_INT(SG_PATTERN(obj)->flags), port, ctx);
2267 }
2268 
2269 #define DEFINE_CLASS_WITH_CACHE SG_DEFINE_BUILTIN_CLASS_SIMPLE_WITH_CACHE
2270 
2271 DEFINE_CLASS_WITH_CACHE(Sg_PatternClass,
2272 			pattern_cache_reader,
2273 			pattern_cache_scanner,
2274 			pattern_cache_writer,
2275 			pattern_printer);
2276 
2277 static void unparse(SgObject reg, SgPort *out);
charset_print_ch(SgPort * out,SgChar ch,int firstp)2278 static void charset_print_ch(SgPort *out, SgChar ch, int firstp)
2279 {
2280   if (ch == '[' || ch == ']' || ch == '-' || (ch == '^' && firstp)) {
2281     Sg_Printf(out, UC("\\%c"), ch);
2282   } else if (ch < 0x20 || ch == 0x7f) {
2283     /* \xXX is enough for range of ascii */
2284     Sg_Printf(out, UC("\\x%02x"), ch);
2285   } else {
2286     switch (Sg_CharGeneralCategory(ch)) {
2287     case Mn: case Mc: case Me: case Cc: case Cf: case Cs: case Co: case Cn:
2288       if (ch < 0x10000) Sg_Printf(out, UC("\\u%04x"), ch);
2289       else              Sg_Printf(out, UC("\\U%08x"), ch);
2290       break;
2291     default:
2292       Sg_PutcUnsafe(out, ch);
2293     }
2294   }
2295 }
2296 
charset_to_regex(SgObject cs,int invertP,SgPort * out)2297 static void charset_to_regex(SgObject cs, int invertP, SgPort *out)
2298 {
2299   SgObject ranges = Sg_CharSetRanges(cs), cp;
2300   int firstp = TRUE;
2301 
2302   Sg_PutcUnsafe(out, '[');
2303   if (invertP) {
2304     Sg_PutcUnsafe(out, '^');
2305   }
2306   SG_FOR_EACH(cp, ranges) {
2307     SgObject cell = SG_CAR(cp);
2308     SgChar start = (SgChar)SG_INT_VALUE(SG_CAR(cell));
2309     SgChar end = (SgChar)SG_INT_VALUE(SG_CDR(cell));
2310     charset_print_ch(out, start, firstp);
2311     firstp = FALSE;
2312     if (start != end) {
2313       Sg_PutcUnsafe(out, '-');
2314       charset_print_ch(out, end, FALSE);
2315     }
2316   }
2317   Sg_PutcUnsafe(out, ']');
2318 }
2319 
unparse_seq(SgObject reg,SgPort * out)2320 static void unparse_seq(SgObject reg, SgPort *out)
2321 {
2322   SgObject cp;
2323   SG_FOR_EACH(cp, reg) {
2324     unparse(SG_CAR(cp), out);
2325   }
2326 }
2327 
unparse_between(const char * a,SgObject n,const char * b,SgPort * out)2328 static void unparse_between(const char *a, SgObject n, const char *b,
2329 			    SgPort *out)
2330 {
2331   Sg_PutzUnsafe(out, a);
2332   unparse_seq(n, out);
2333   Sg_PutzUnsafe(out, b);
2334 }
2335 
unparse_reg(SgObject reg,SgPort * out)2336 static void unparse_reg(SgObject reg, SgPort *out)
2337 {
2338   SgObject n, name, rest;
2339   if (SG_NULLP(SG_CDR(reg))) goto err;
2340   n = SG_CADR(reg);
2341   if (SG_NULLP(SG_CDDR(reg))) goto err;
2342   name = SG_CAR(SG_CDDR(reg));
2343   rest = SG_CDR(SG_CDDR(reg));
2344 
2345   if (SG_EQ(n, SG_MAKE_INT(0))) {
2346     if (!SG_FALSEP(name)) {
2347       Sg_Error(UC("toplevel group can't have name"), name);
2348     }
2349     unparse_seq(rest, out);
2350   } else if (!SG_FALSEP(name)) {
2351     Sg_PutzUnsafe(out, "(?<");
2352     Sg_Write(name, out, SG_WRITE_DISPLAY);
2353     Sg_PutcUnsafe(out, '>');
2354     unparse_seq(rest, out);
2355     Sg_PutcUnsafe(out, ')');
2356   } else {
2357     Sg_PutcUnsafe(out, '(');
2358     unparse_seq(rest, out);
2359     Sg_PutcUnsafe(out, ')');
2360   }
2361   return;
2362  err:
2363   Sg_Error(UC("invalid AST node %S [reg]"), reg);
2364 }
2365 
unparse_intersp(SgObject n,const char * sep,SgPort * out)2366 static void unparse_intersp(SgObject n, const char *sep, SgPort *out)
2367 {
2368   SgObject cp;
2369   if (SG_NULLP(n)) return;
2370   Sg_PutzUnsafe(out, "(?:");
2371   unparse(SG_CAR(n), out);
2372   SG_FOR_EACH(cp, SG_CDR(n)) {
2373     Sg_PutzUnsafe(out, sep);
2374     unparse(SG_CAR(cp), out);
2375   }
2376   Sg_PutcUnsafe(out, ')');
2377 }
2378 
unparse_rep(SgObject o,int greedyP,SgPort * out)2379 static void unparse_rep(SgObject o, int greedyP, SgPort *out)
2380 {
2381   /* must have 4 elements */
2382   SgObject min, max, ns, n = o;
2383   if (!SG_PAIRP(SG_CDR(n))) goto err;
2384   n = SG_CDR(n);
2385   min = SG_CAR(n);
2386   if (!SG_INTP(min) && !SG_FALSEP(min)) goto err;
2387   if (!SG_PAIRP(SG_CDR(n))) goto err;
2388   n = SG_CDR(n);
2389   max = SG_CAR(n);
2390   if (!SG_INTP(max) && !SG_FALSEP(max)) goto err;
2391   ns = SG_CDR(n);
2392 
2393   unparse_between("(?:", ns, ")", out);
2394   if (SG_FALSEP(max)) {
2395     if (SG_EQ(min, SG_MAKE_INT(0)) || SG_FALSEP(min)) {
2396       Sg_PutcUnsafe(out, '*');
2397     } else if (SG_EQ(min, SG_MAKE_INT(1))) {
2398       Sg_PutcUnsafe(out, '+');
2399     } else {
2400       Sg_Printf(out, UC("{%d,}"), SG_INT_VALUE(min));
2401     }
2402   } else if (SG_EQ(min, max)) {
2403     /* who write this? */
2404     if (SG_EQ(min, SG_MAKE_INT(0))) {
2405       Sg_PutzUnsafe(out, "{0}");
2406     } else if (!SG_EQ(min, SG_MAKE_INT(1))) {
2407       Sg_Printf(out, UC("{%d}"), SG_INT_VALUE(min));
2408     } /* if min == max == 1 then we don't have to write */
2409   } else if (SG_EQ(SG_MAKE_INT(0), min) && SG_EQ(SG_MAKE_INT(1), max)) {
2410     Sg_PutcUnsafe(out, '?');
2411   } else {
2412     if (SG_FALSEP(min)) {
2413       Sg_Printf(out, UC("{0,%d}"), SG_INT_VALUE(max));
2414     } else {
2415       Sg_Printf(out, UC("{%d,%d}"), SG_INT_VALUE(min), SG_INT_VALUE(max));
2416     }
2417   }
2418   if (!greedyP) {
2419     Sg_PutcUnsafe(out, '?');
2420   }
2421   return;
2422  err:
2423   Sg_Error(UC("invalid AST node %S [rep]"), o);
2424 }
2425 
unparse_look(SgObject o,const char * behind,SgPort * out)2426 static void unparse_look(SgObject o, const char *behind, SgPort *out)
2427 {
2428   SgObject n = o, assert, ns;
2429   if (!SG_PAIRP(SG_CDR(n))) goto err;
2430   n = SG_CDR(n);
2431   assert = SG_CAR(n);
2432   ns = SG_CDR(n);
2433 
2434   Sg_PutzUnsafe(out, "(?");
2435   Sg_PutzUnsafe(out, behind);
2436   unparse_between((SG_FALSEP(assert) ? "!" : "="), ns, ")", out);
2437   return;
2438  err:
2439   Sg_Error(UC("invalid AST node %S [look]"), o);
2440 }
2441 
unparse(SgObject n,SgPort * out)2442 static void unparse(SgObject n, SgPort *out)
2443 {
2444   if (SG_CHARP(n)) {
2445     SgChar c = SG_CHAR_VALUE(n);
2446     if (c == '.' || c == '^' || c == '$' || c == '(' || c == ')' ||
2447 	c == '{' || c == '}' || c == '[' || c == ']' || c == '\\' ||
2448 	c == '*' || c == '+' || c == '?' || c == '|') {
2449       Sg_PutcUnsafe(out, '\\');
2450     }
2451     Sg_PutcUnsafe(out, c);
2452   } else if (SG_CHAR_SET_P(n)) {
2453     charset_to_regex(n, FALSE, out);
2454   } else if (SG_EQ(n, SYM_EVERYTHING)) {
2455     Sg_PutcUnsafe(out, '.');
2456   } else if (SG_EQ(n, SYM_START_ANCHOR)) {
2457     Sg_PutcUnsafe(out, '^');
2458   } else if (SG_EQ(n, SYM_MODELESS_START_ANCHOR)) {
2459     Sg_PutzUnsafe(out, "\\A");
2460   } else if (SG_EQ(n, SYM_END_ANCHOR)) {
2461     Sg_PutcUnsafe(out, '$');
2462   } else if (SG_EQ(n, SYM_MODELESS_END_ANCHOR)) {
2463     Sg_PutzUnsafe(out, "\\Z");
2464   } else if (SG_EQ(n, SYM_MODELESS_END_ANCHOR_NO_NEWLINE)) {
2465     Sg_PutzUnsafe(out, "\\z");
2466   } else if (SG_EQ(n, SYM_WORD_BOUNDARY)) {
2467     Sg_PutzUnsafe(out, "\\b");
2468   } else if (SG_EQ(n, SYM_NON_WORD_BOUNDARY)) {
2469     Sg_PutzUnsafe(out, "\\B");
2470   } else if (SG_PAIRP(n)) {
2471     SgObject f = SG_CAR(n);
2472     if (SG_EQ(f, SYM_REGISTER)) {
2473       /* capture */
2474       unparse_reg(n, out);
2475     } else if (SG_EQ(f, SYM_SEQUENCE)) {
2476       unparse_seq(SG_CDR(n), out);
2477     } else if (SG_EQ(f, SYM_FLAGGED_SEQUENCE)) {
2478       SgObject flags, cp;
2479       if (SG_NULLP(SG_CDR(n))) goto err;
2480       flags = SG_CADR(n);
2481       if (!SG_PAIRP(flags)) goto err;
2482       Sg_PutzUnsafe(out, "(?");
2483       SG_FOR_EACH(cp, flags) {
2484 	SgObject flag = SG_CAR(cp);
2485 	if (!SG_PAIRP(flag)) goto err;
2486 	if (!SG_CHARP(SG_CAR(flag))) goto err;
2487 	if (SG_FALSEP(SG_CDR(flag))) {
2488 	  Sg_PutcUnsafe(out, '-');
2489 	}
2490 	Sg_PutcUnsafe(out, SG_CHAR_VALUE(SG_CAR(flag)));
2491       }
2492       unparse_between(":", SG_CDDR(n), ")", out);
2493     } else if (SG_EQ(f, SYM_ALTER)) {
2494       unparse_intersp(SG_CDR(n), "|", out);
2495     } else if (SG_EQ(f, SYM_STANDALONE)) {
2496       unparse_between("(?>", SG_CDR(n), ")", out);
2497     } else if (SG_EQ(f, SYM_GREEDY_REP)) {
2498       unparse_rep(n, TRUE, out);
2499     } else if (SG_EQ(f, SYM_NON_GREEDY_REP)) {
2500       unparse_rep(n, FALSE, out);
2501     } else if (SG_EQ(f, SYM_LOOKAHEAD)) {
2502       unparse_look(n, "", out);
2503     } else if (SG_EQ(f, SYM_LOOKBHIND)) {
2504       unparse_look(n, "<", out);
2505     } else if (SG_EQ(f, SYM_BACKREF)) {
2506       if (!SG_INTP(SG_CDR(n))) goto err;
2507       Sg_Printf(out, UC("\\%d"), SG_INT_VALUE(SG_CDR(n)));
2508     } else if (SG_EQ(f, SYM_BRANCH)) {
2509       /* check alter */
2510       SgObject name, ns, t = n, yes, no;
2511       if (!SG_PAIRP(SG_CDR(t))) goto err;
2512       name = SG_CADR(t);
2513       ns = SG_CDDR(t);
2514       if (!SG_PAIRP(ns) || !SG_PAIRP(SG_CAR(ns)) ||
2515 	  !SG_EQ(SG_CAAR(ns), SYM_ALTER)) {
2516 	Sg_Error(UC("branch must have alter. %S"), n);
2517       }
2518       /* (alternation (sequence ...) (sequence ...) */
2519       ns = SG_CAR(ns);
2520 
2521       Sg_PutzUnsafe(out, "(?");
2522       if (SG_INTP(name)) {
2523 	Sg_Printf(out, UC("(%d)"), SG_INT_VALUE(name));
2524       } else if (SG_PAIRP(name)) {
2525 	if (SG_EQ(SG_CAR(name), SYM_LOOKAHEAD)) {
2526 	  unparse_look(name, "", out);
2527 	} else if (SG_EQ(SG_CAR(name), SYM_LOOKBHIND)) {
2528 	  unparse_look(name, "<", out);
2529 	} else goto err;
2530       } else goto err;
2531       /* if the false condition contains (sequence) then
2532 	 we don't want to emit | */
2533       if (!SG_PAIRP(SG_CDR(ns))) goto err;
2534       yes = SG_CADR(ns);
2535       ns = SG_CDR(ns);
2536       if (!SG_PAIRP(SG_CDR(ns))) goto err;
2537       no = SG_CADR(ns);
2538 
2539       /* just emit yes */
2540       unparse(yes, out);
2541       if (SG_PAIRP(no) && !SG_NULLP(SG_CDR(no))) {
2542 	Sg_PutcUnsafe(out, '|');
2543 	unparse(no, out);
2544       }
2545 
2546     } else if (SG_EQ(f, SYM_INVERTED_CHAR_CLASS)) {
2547       if (!SG_PAIRP(SG_CDR(n))) goto err;
2548       if (!SG_CHAR_SET_P(SG_CADR(n))) goto err;
2549       charset_to_regex(SG_CADR(n), TRUE, out);
2550     } else goto err;
2551 
2552   } else {
2553   err:
2554     /* never happen but raise an error */
2555     Sg_Error(UC("invalid AST node %S [unparse]"), n);
2556   }
2557 }
2558 
unparse_ast(SgObject ast)2559 static SgObject unparse_ast(SgObject ast)
2560 {
2561   SgPort *out;
2562   SgStringPort tp;
2563   SgObject str;
2564   out = Sg_InitStringOutputPort(&tp, 0);
2565 
2566   unparse(ast, out);
2567 
2568   str = SG_STRING(Sg_GetStringFromStringPort(&tp));
2569   SG_CLEAN_STRING_PORT(&tp);
2570   return str;
2571 }
2572 
make_pattern(SgObject p,SgObject ast,int flags,int reg_num,SgObject names,prog_t * prog,compile_ctx_t * cctx)2573 static SgPattern* make_pattern(SgObject p, SgObject ast, int flags,
2574 			       int reg_num, SgObject names, prog_t *prog,
2575 			       compile_ctx_t *cctx)
2576 {
2577   SgPattern *pt = SG_NEW(SgPattern);
2578   SG_SET_CLASS(pt, SG_CLASS_PATTERN);
2579   pt->pattern = p;
2580   pt->ast = ast;
2581   pt->flags = flags;
2582   pt->groupCount = reg_num;
2583   pt->prog = prog;
2584   pt->extendedp = cctx->extendedp;
2585   pt->groupNames = names;
2586   return pt;
2587 }
2588 
count_register(SgObject ast,int * acc,SgObject * regs)2589 static void count_register(SgObject ast, int *acc, SgObject *regs)
2590 {
2591   if (SG_EQ(ast, SYM_REGISTER)) {
2592     (*acc)++;
2593     return;
2594   }
2595   if (SG_PAIRP(ast)) {
2596     if (SG_EQ(SG_CAR(ast), SYM_REGISTER)) {
2597       /* get name and capture number */
2598       SgObject n = SG_CADR(ast);
2599       SgObject name = SG_CAR(SG_CDDR(ast));
2600       if (!SG_FALSEP(name)) {
2601 	SgObject slot = Sg_Assq(name, *regs);
2602 	if (SG_FALSEP(slot)) {
2603 	  PUSH(SG_LIST2(name, n), *regs);
2604 	} else {
2605 	  SG_SET_CDR(slot, Sg_Cons(n, SG_CDR(slot)));
2606 	}
2607       }
2608     }
2609     count_register(SG_CAR(ast), acc, regs);
2610     count_register(SG_CDR(ast), acc, regs);
2611   }
2612   return;
2613 }
2614 
compile_regex_ast(SgString * pattern,SgObject ast,int flags)2615 static SgObject compile_regex_ast(SgString *pattern, SgObject ast, int flags)
2616 {
2617   SgPattern *p;
2618   prog_t *prog;
2619   compile_ctx_t cctx = {0};
2620   int reg_num = 0;
2621   SgObject regs = SG_NIL;
2622   /* optimize */
2623   ast = optimize(ast, SG_NIL);
2624   /* count group number */
2625   count_register(ast, &reg_num, &regs);
2626   /* compile */
2627   cctx.flags = flags;
2628   prog = compile(&cctx, ast);
2629 
2630   p = make_pattern(pattern ? pattern : unparse_ast(ast),
2631 		   ast, flags, reg_num, regs, prog, &cctx);
2632   return SG_OBJ(p);
2633 }
2634 
Sg_ParseCharSetString(SgString * s,int asciiP,long start,long end)2635 SgObject Sg_ParseCharSetString(SgString *s, int asciiP, long start, long end)
2636 {
2637   /* sanity check */
2638   lexer_ctx_t ctx;
2639   long size = SG_STRING_SIZE(s);
2640   SgObject r;
2641   SG_CHECK_START_END(start, end, size);
2642   if (size < 2) {
2643     Sg_Error(UC("invalid regex char-set string. %S"), s);
2644   }
2645   if (SG_STRING_VALUE_AT(s, start) != '[') {
2646     Sg_Error(UC("regex char-set must start with '['. %S[%d-%d]"),
2647 	     s, start, end);
2648   }
2649   if (SG_STRING_VALUE_AT(s, end-1) != ']') {
2650     Sg_Error(UC("regex char-set must end with ']'. %S[%d-%d]"), s, start, end);
2651   }
2652   init_lexer(&ctx, s, (asciiP) ? 0 : SG_UNICODE_CASE);
2653   /* init_lexer only initialise the context with whole string but
2654      we want to limit the range. */
2655   ctx.pos = start+1;
2656   ctx.len = end;
2657   r = read_char_set(&ctx, NULL);
2658   if (ctx.pos != ctx.len) {
2659     Sg_Error(UC("non finished charset string. %S[%d-%d]"), s, start, end);
2660   }
2661   return r;
2662 }
2663 
Sg_CharSetToRegexString(SgObject cset,int invertP)2664 SgObject Sg_CharSetToRegexString(SgObject cset, int invertP)
2665 {
2666   SgPort *out;
2667   SgStringPort tp;
2668   SgObject str;
2669 
2670   out = Sg_InitStringOutputPort(&tp, 0);
2671   charset_to_regex(cset, invertP, out);
2672   str = Sg_GetStringFromStringPort(&tp);
2673   SG_CLEAN_STRING_PORT(&tp);
2674   return str;
2675 }
2676 
Sg_CompileRegex(SgString * pattern,int flags,int parseOnly)2677 SgObject Sg_CompileRegex(SgString *pattern, int flags, int parseOnly)
2678 {
2679   SgObject ast;
2680   lexer_ctx_t ctx;
2681 
2682   init_lexer(&ctx, pattern, flags);
2683   ast = parse_string(&ctx);
2684   if (!END_OF_STRING_P(&ctx)) {
2685     raise_syntax_error(&ctx, ctx.pos,
2686 		       UC("Expected end of string."));
2687   }
2688   if (parseOnly) return ast;
2689   return compile_regex_ast(pattern, ast, flags);
2690 }
2691 
Sg_CompileRegexAST(SgObject ast,int flags)2692 SgObject Sg_CompileRegexAST(SgObject ast, int flags)
2693 {
2694   return compile_regex_ast(NULL, ast, flags);
2695 }
2696 
Sg_DumpRegex(SgPattern * pattern,SgObject port)2697 void Sg_DumpRegex(SgPattern *pattern, SgObject port)
2698 {
2699 #if 1
2700   int i;
2701   const int size = pattern->prog->rootLength;
2702   inst_t *start = &pattern->prog->root[0];
2703   Sg_Printf(port, UC("input regex : %S extended? %A\n"), pattern->pattern,
2704 	    SG_MAKE_BOOL(pattern->extendedp));
2705   Sg_Printf(port, UC(" group count: %d\n"), pattern->groupCount);
2706   for (i = 0; i < size; i++) {
2707     inst_t *inst = &pattern->prog->root[i];
2708     int op = INST_OPCODE(inst);
2709     switch (op) {
2710     case RX_ANY:
2711       Sg_Printf(port, UC("%3d: RX_ANY[%d]\n"), i, op);
2712       break;
2713     case RX_CHAR:
2714       Sg_Printf(port, UC("%3d: RX_CHAR[%d] %c\n"), i, op, inst->arg.c);
2715       break;
2716     case RX_SET:
2717       Sg_Printf(port, UC("%3d: RX_SET[%d] %S\n"), i, op, inst->arg.set);
2718       break;
2719     case RX_NSET:
2720       Sg_Printf(port, UC("%3d: RX_NSET[%d] %S\n"), i, op, inst->arg.set);
2721       break;
2722     case RX_SPLIT:
2723       Sg_Printf(port, UC("%3d: RX_SPLIT[%d] %d %d\n"),
2724 		i, op, inst->arg.pos.x - start, inst->arg.pos.y - start);
2725       break;
2726     case RX_JMP:
2727       Sg_Printf(port, UC("%3d: RX_JMP[%d] %d\n"),
2728 		i, op, inst->arg.pos.x - start);
2729       break;
2730     case RX_SAVE:
2731       Sg_Printf(port, UC("%3d: RX_SAVE[%d] %d\n"), i, op, inst->arg.n);
2732       break;
2733     case RX_EMPTY:
2734       Sg_Printf(port, UC("%3d: RX_EMPTY[%d] %x\n"), i, op, inst->arg.flags);
2735       break;
2736     case RX_FAIL:
2737       Sg_Printf(port, UC("%3d: RX_FAIL[%d]\n"), i, op);
2738       break;
2739     case RX_MATCH:
2740       Sg_Printf(port, UC("%3d: RX_MATCH[%d]\n"), i, op);
2741       break;
2742     case RX_BREF:
2743       Sg_Printf(port, UC("%3d: RX_BREF[%d] %d\n"), i, op, inst->arg.index);
2744       break;
2745     case RX_AHEAD:
2746     case RX_NAHEAD:
2747       Sg_Printf(port, UC("%3d: %s[%d] %d\n"), i,
2748 		op == RX_AHEAD ? UC("RX_AHEAD") : UC("RX_NAHEAD"), op,
2749 		inst->arg.pos.x - start);
2750       break;
2751     case RX_BEHIND:
2752     case RX_NBEHIND:
2753       Sg_Printf(port, UC("%3d: %s[%d] %d\n"), i,
2754 		op == RX_BEHIND ? UC("RX_BEHIND") : UC("RX_NBEHIND"), op,
2755 		inst->arg.pos.x - start);
2756       break;
2757     case RX_ONCE:
2758       Sg_Printf(port, UC("%3d: RX_ONCE[%d] %d\n"), i, op,
2759 		inst->arg.pos.x - start);
2760       break;
2761     case RX_RESTORE:
2762       Sg_Printf(port, UC("%3d: RX_RESTORE[%d] %d\n"), i, op, inst->arg.index);
2763       break;
2764     case RX_BRANCH:
2765       Sg_Printf(port, UC("%3d: RX_BRANCH[%d] %d %d %d\n"), i, op,
2766 		inst->arg.cond.n, inst->arg.cond.x-start,
2767 		inst->arg.cond.y-start);
2768       break;
2769     case RX_BRANCHA:
2770       Sg_Printf(port, UC("%3d: RX_BRANCHA[%d] %d %d\n"), i, op,
2771 		inst->arg.cond.x-start, inst->arg.cond.y-start);
2772       break;
2773     default:
2774       Sg_Printf(port, UC("%3d: ??? %d\n"), i, op);
2775       break;
2776     }
2777   }
2778 #endif
2779 }
2780 
2781 /* returns (compile-regex "str" flag) */
read_regex_string(SgPort * port)2782 static SgObject read_regex_string(SgPort *port)
2783 {
2784   SgPort *buf;
2785   SgStringPort tp;
2786   buf = Sg_InitStringOutputPort(&tp, -1);
2787   while(1) {
2788     SgChar c = Sg_GetcUnsafe(port);
2789     if (c == EOF) {
2790       Sg_ReadError(UC("unexpected end-of-file. (file %S, line %d)"),
2791 		   Sg_FileName(port), Sg_LineNo(port));
2792     }
2793     if (c == '\\') {
2794       /* escape. */
2795       Sg_PutcUnsafe(buf, c);
2796       Sg_PutcUnsafe(buf, Sg_GetcUnsafe(port));
2797     } else if (c == '/') {
2798       /* end mark */
2799       int flag = 0, add = 0;
2800       SgObject tmp;
2801     entry:
2802       c = Sg_PeekcUnsafe(port);
2803       switch (c) {
2804       case 'x':
2805 	add = SG_COMMENTS;
2806 	goto add_flag;
2807       case 'i':
2808 	add = SG_CASE_INSENSITIVE;
2809 	goto add_flag;
2810       case 'm':
2811 	add = SG_MULTILINE;
2812 	goto add_flag;
2813       case 's':
2814 	add = SG_DOTALL;
2815 	goto add_flag;
2816       case 'u':
2817 	add = SG_UNICODE_CASE;
2818 	goto add_flag;
2819       default:
2820 	tmp = Sg_GetStringFromStringPort(&tp);
2821 	SG_CLEAN_STRING_PORT(&tp);
2822 	return Sg_CompileRegex(tmp, flag, FALSE);
2823       }
2824     add_flag:
2825       flag |= add;
2826       Sg_GetcUnsafe(port);
2827       goto entry;
2828     } else {
2829       Sg_PutcUnsafe(buf, c);
2830     }
2831   }
2832 }
2833 
hash_slash_reader(SgObject * args,int argc,void * data_)2834 static SgObject hash_slash_reader(SgObject *args, int argc, void *data_)
2835 {
2836   SgPort *p;
2837 
2838   if (!SG_PORTP(args[0])) {
2839     Sg_WrongTypeOfArgumentViolation(SG_INTERN("#/-reader"),
2840 				    SG_MAKE_STRING("port"), args[0], SG_NIL);
2841   }
2842   p = SG_PORT(args[0]);
2843 
2844   return read_regex_string(p);
2845 }
2846 
2847 SG_DEFINE_SUBR(hash_slash_reader_stub, 3, 0, hash_slash_reader, SG_FALSE, NULL);
2848 
add_reader_macro(SgLibrary * lib)2849 static void add_reader_macro(SgLibrary *lib)
2850 {
2851   Sg_EnsureLibraryReadTable(lib);
2852   Sg_MakeDispatchMacroCharacter('#', FALSE, SG_LIBRARY_READTABLE(lib));
2853   Sg_SetDispatchMacroCharacter('#', '/', SG_OBJ(&hash_slash_reader_stub),
2854 			       SG_LIBRARY_READTABLE(lib));
2855 }
2856 
2857 /* abstract matcher class */
2858 SG_DEFINE_ABSTRACT_CLASS(Sg_MatcherClass, NULL);
2859 
2860 /* old interface */
Sg_RegexReplaceFirst(SgMatcher * m,SgObject replacement)2861 SgObject Sg_RegexReplaceFirst(SgMatcher *m, SgObject replacement)
2862 {
2863   return Sg_RegexReplace(m, replacement, 0);
2864 }
2865 
Sg_RegexMatcher(SgPattern * pattern,SgObject text,long start,long end)2866 SgMatcher* Sg_RegexMatcher(SgPattern *pattern, SgObject text,
2867 			   long start, long end)
2868 {
2869   if (SG_STRINGP(text)) {
2870     return Sg_RegexTextMatcher(pattern, SG_STRING(text), start, end);
2871   } else if (SG_BVECTORP(text)) {
2872     return Sg_RegexBinaryMatcher(pattern, SG_BVECTOR(text), start, end);
2873   } else {
2874     Sg_Error(UC("string or bytevector required as text: %S, %S"), pattern, text);
2875     return SG_UNDEF;		/* dummy */
2876   }
2877 }
2878 
Sg_RegexMatches(SgMatcher * m)2879 int Sg_RegexMatches(SgMatcher *m)
2880 {
2881   if (SG_TEXT_MATCHERP(m)) {
2882     return Sg_RegexTextMatches(SG_TEXT_MATCHER(m));
2883   } else if (SG_BINARY_MATCHERP(m)) {
2884     return Sg_RegexBinaryMatches(SG_BINARY_MATCHER(m));
2885   } else {
2886     Sg_Error(UC("matcher requires but god %S"), m);
2887     return FALSE;
2888   }
2889 }
Sg_RegexLookingAt(SgMatcher * m)2890 int Sg_RegexLookingAt(SgMatcher *m)
2891 {
2892   if (SG_TEXT_MATCHERP(m)) {
2893     return Sg_RegexTextLookingAt(SG_TEXT_MATCHER(m));
2894   } else if (SG_BINARY_MATCHERP(m)) {
2895     return Sg_RegexBinaryLookingAt(SG_BINARY_MATCHER(m));
2896   } else {
2897     Sg_Error(UC("matcher requires but god %S"), m);
2898     return FALSE;
2899   }
2900 }
Sg_RegexFind(SgMatcher * m,long start)2901 int Sg_RegexFind(SgMatcher *m, long start)
2902 {
2903   if (SG_TEXT_MATCHERP(m)) {
2904     return Sg_RegexTextFind(SG_TEXT_MATCHER(m), start);
2905   } else if (SG_BINARY_MATCHERP(m)) {
2906     return Sg_RegexBinaryFind(SG_BINARY_MATCHER(m), start);
2907   } else {
2908     Sg_Error(UC("matcher requires but god %S"), m);
2909     return FALSE;
2910   }
2911 }
2912 
Sg_RegexGroup(SgMatcher * m,SgObject groupOrName)2913 SgObject Sg_RegexGroup(SgMatcher *m, SgObject groupOrName)
2914 {
2915   if (SG_TEXT_MATCHERP(m)) {
2916     return Sg_RegexTextGroup(SG_TEXT_MATCHER(m), groupOrName);
2917   } else if (SG_BINARY_MATCHERP(m)) {
2918     return Sg_RegexBinaryGroup(SG_BINARY_MATCHER(m), groupOrName);
2919   } else {
2920     Sg_Error(UC("matcher requires but god %S"), m);
2921     return SG_UNDEF;
2922   }
2923 }
Sg_RegexGroupPosition(SgMatcher * m,SgObject groupOrName,int startP)2924 int Sg_RegexGroupPosition(SgMatcher *m, SgObject groupOrName, int startP)
2925 {
2926   if (SG_TEXT_MATCHERP(m)) {
2927     return Sg_RegexTextGroupPosition(SG_TEXT_MATCHER(m), groupOrName, startP);
2928   } else if (SG_BINARY_MATCHERP(m)) {
2929     return Sg_RegexBinaryGroupPosition(SG_BINARY_MATCHER(m),
2930 				       groupOrName, startP);
2931   } else {
2932     Sg_Error(UC("matcher requires but god %S"), m);
2933     return -1;
2934   }
2935 }
2936 
Sg_RegexReplaceAll(SgMatcher * m,SgObject replacement)2937 SgObject Sg_RegexReplaceAll(SgMatcher *m,
2938 			    SgObject replacement)
2939 {
2940   if (SG_TEXT_MATCHERP(m)) {
2941     return Sg_RegexTextReplaceAll(SG_TEXT_MATCHER(m), replacement);
2942   } else if (SG_BINARY_MATCHERP(m)) {
2943     return Sg_RegexBinaryReplaceAll(SG_BINARY_MATCHER(m), replacement);
2944   } else {
2945     Sg_Error(UC("matcher requires but god %S"), m);
2946     return SG_UNDEF;
2947   }
2948 }
Sg_RegexReplace(SgMatcher * m,SgObject replacement,long count)2949 SgObject Sg_RegexReplace(SgMatcher *m, SgObject replacement,
2950 			 long count)
2951 {
2952   if (SG_TEXT_MATCHERP(m)) {
2953     return Sg_RegexTextReplace(SG_TEXT_MATCHER(m), replacement, count);
2954   } else if (SG_BINARY_MATCHERP(m)) {
2955     return Sg_RegexBinaryReplace(SG_BINARY_MATCHER(m), replacement, count);
2956   } else {
2957     Sg_Error(UC("matcher requires but god %S"), m);
2958     return SG_UNDEF;
2959   }
2960 }
Sg_RegexCaptureCount(SgMatcher * m)2961 int Sg_RegexCaptureCount(SgMatcher *m)
2962 {
2963   if (SG_TEXT_MATCHERP(m)) {
2964     return Sg_RegexTextCaptureCount(SG_TEXT_MATCHER(m));
2965   } else if (SG_BINARY_MATCHERP(m)) {
2966     return Sg_RegexBinaryCaptureCount(SG_BINARY_MATCHER(m));
2967   } else {
2968     Sg_Error(UC("matcher requires but god %S"), m);
2969     return -1;
2970   }
2971 }
2972 
Sg_RegexAfter(SgMatcher * m)2973 SgObject Sg_RegexAfter(SgMatcher *m)
2974 {
2975   if (SG_TEXT_MATCHERP(m)) {
2976     return Sg_RegexTextAfter(SG_TEXT_MATCHER(m));
2977   } else if (SG_BINARY_MATCHERP(m)) {
2978     return Sg_RegexBinaryAfter(SG_BINARY_MATCHER(m));
2979   } else {
2980     Sg_Error(UC("matcher requires but god %S"), m);
2981     return SG_UNDEF;
2982   }
2983 }
Sg_RegexBefore(SgMatcher * m)2984 SgObject Sg_RegexBefore(SgMatcher *m)
2985 {
2986   if (SG_TEXT_MATCHERP(m)) {
2987     return Sg_RegexTextBefore(SG_TEXT_MATCHER(m));
2988   } else if (SG_BINARY_MATCHERP(m)) {
2989     return Sg_RegexBinaryBefore(SG_BINARY_MATCHER(m));
2990   } else {
2991     Sg_Error(UC("matcher requires but god %S"), m);
2992     return SG_UNDEF;
2993   }
2994 }
2995 
2996 SG_CDECL_BEGIN
2997 extern void Sg__Init_sagittarius_regex_impl();
2998 SG_CDECL_END
2999 
Sg__InitRegex()3000 void Sg__InitRegex()
3001 {
3002   SgLibrary *lib;
3003   Sg__Init_sagittarius_regex_impl();
3004   lib = SG_LIBRARY(Sg_FindLibrary(SG_INTERN("(sagittarius regex impl)"),
3005 				  FALSE));
3006 
3007   Sg_InitStaticClass(SG_CLASS_PATTERN, UC("<pattern>"), lib, NULL, 0);
3008   Sg_InitStaticClass(SG_CLASS_MATCHER, UC("<matcher>"), lib, NULL, 0);
3009 
3010   Sg_InitStaticClass(SG_CLASS_TEXT_MATCHER, UC("<text-matcher>"), lib, NULL, 0);
3011   Sg_InitStaticClass(SG_CLASS_BINARY_MATCHER, UC("<binary-matcher>"),
3012 		     lib, NULL, 0);
3013 
3014   add_reader_macro(lib);
3015 #define insert_binding(name, value)			\
3016   Sg_MakeBinding(lib, SG_INTERN(#name), SG_MAKE_INT(value), TRUE);
3017   insert_binding(CASE-INSENSITIVE, SG_CASE_INSENSITIVE);
3018   insert_binding(COMMENTS, SG_COMMENTS);
3019   insert_binding(MULTILINE, SG_MULTILINE);
3020   insert_binding(LITERAL, SG_LITERAL);
3021   insert_binding(DOTALL, SG_DOTALL);
3022   /* deprecated rename exported on Scheme */
3023   /* insert_binding(UNICODE-CASE, SG_UNICODE_CASE); */
3024   insert_binding(UNICODE,      SG_UNICODE_CASE);
3025 #undef insert_binding
3026 
3027   SYM_ALTER = SG_INTERN("alternation");
3028   SYM_NON_GREEDY_REP = SG_INTERN("non-greedy-repetition");
3029   SYM_GREEDY_REP = SG_INTERN("greedy-repetition");
3030   SYM_CLOSE_PAREN = SG_INTERN("close-paren");
3031   SYM_VERTICAL_BAR = SG_INTERN("vertical-bar");
3032   SYM_QUESTION_MARK = SG_INTERN("question-mark");
3033   SYM_EVERYTHING = SG_INTERN("everything");
3034   SYM_END_ANCHOR = SG_INTERN("end-anchor");
3035   SYM_INVERTED_CHAR_CLASS = SG_INTERN("inverted-char-class");
3036   SYM_MODELESS_START_ANCHOR = SG_INTERN("modeless-start-anchor");
3037   SYM_MODELESS_END_ANCHOR = SG_INTERN("modeless-end-anchor");
3038   SYM_MODELESS_END_ANCHOR_NO_NEWLINE
3039     = SG_INTERN("modeless-end-anchor-no-newline");
3040   SYM_START_ANCHOR = SG_INTERN("start-anchor");
3041   SYM_BACKREF = SG_INTERN("back-reference");
3042   SYM_WORD_BOUNDARY = SG_INTERN("word-boundary");
3043   SYM_NON_WORD_BOUNDARY = SG_INTERN("non-word-boundary");
3044   SYM_BRANCH = SG_INTERN("branch");
3045   SYM_FLAGS = SG_INTERN("flags");
3046   SYM_OPEN_PAREN = SG_INTERN("open-paren");
3047   SYM_OPEN_PAREN_PAREN = SG_INTERN("open-paren-paren");
3048   SYM_OPEN_PAREN_GREATER = SG_INTERN("open-paren-greater");
3049   SYM_OPEN_PAREN_EQUAL = SG_INTERN("open-paren-equal");
3050   SYM_OPEN_PAREN_LESS_EXCLAMATION = SG_INTERN("open-paren-less-exclamation");
3051   SYM_OPEN_PAREN_COLON = SG_INTERN("open-paren-colon");
3052   SYM_OPEN_PAREN_EXCLAMATION = SG_INTERN("open-paren-exclamation");
3053   SYM_OPEN_PAREN_LESS_LETTER = SG_INTERN("open-paren-less-letter");
3054   SYM_REGISTER = SG_INTERN("register");
3055   SYM_STANDALONE = SG_INTERN("standalone");
3056   SYM_LOOKAHEAD = SG_INTERN("lookahead");
3057   SYM_OPEN_PAREN_LESS_EQUAL = SG_INTERN("open-paren-less-equal");
3058   SYM_SEQUENCE = SG_INTERN("sequence");
3059   SYM_LOOKBHIND = SG_INTERN("lookbehind");
3060   SYM_FLAGGED_SEQUENCE = SG_INTERN("flagged-sequence");
3061 }
3062