1 /* regex2.c -*- coding: utf-8; -*-
2 *
3 * Copyright (c) 2010-2021 Takashi Kato <ktakashi@ymail.com>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $Id: $
29 */
30 #include <string.h>
31 #include <ctype.h>
32 #define LIBSAGITTARIUS_BODY
33 #include "sagittarius/private/pair.h"
34 #include "sagittarius/private/regex.h"
35 #include "sagittarius/private/error.h"
36 #include "sagittarius/private/port.h"
37 #include "sagittarius/private/string.h"
38 #include "sagittarius/private/symbol.h"
39 #include "sagittarius/private/cache.h"
40
41 #include "shortnames.incl"
42
43 /* #define DEBUG_REGEX 1 */
44
45 static SgSymbol *constant_symbol_table[33] = {NULL};
46
47 #define SYM_ALTER (constant_symbol_table[0])
48 #define SYM_NON_GREEDY_REP (constant_symbol_table[1])
49 #define SYM_GREEDY_REP (constant_symbol_table[2])
50 #define SYM_CLOSE_PAREN (constant_symbol_table[3])
51 #define SYM_VERTICAL_BAR (constant_symbol_table[4])
52 #define SYM_QUESTION_MARK (constant_symbol_table[5])
53 #define SYM_EVERYTHING (constant_symbol_table[6])
54 #define SYM_END_ANCHOR (constant_symbol_table[7])
55 #define SYM_INVERTED_CHAR_CLASS (constant_symbol_table[8])
56 #define SYM_MODELESS_START_ANCHOR (constant_symbol_table[9])
57 #define SYM_MODELESS_END_ANCHOR (constant_symbol_table[10])
58 #define SYM_MODELESS_END_ANCHOR_NO_NEWLINE (constant_symbol_table[11])
59 #define SYM_START_ANCHOR (constant_symbol_table[12])
60 #define SYM_BACKREF (constant_symbol_table[13])
61 #define SYM_WORD_BOUNDARY (constant_symbol_table[14])
62 #define SYM_NON_WORD_BOUNDARY (constant_symbol_table[15])
63 #define SYM_BRANCH (constant_symbol_table[16])
64 #define SYM_FLAGS (constant_symbol_table[17])
65 #define SYM_OPEN_PAREN (constant_symbol_table[18])
66 #define SYM_OPEN_PAREN_PAREN (constant_symbol_table[19])
67 #define SYM_OPEN_PAREN_GREATER (constant_symbol_table[20])
68 #define SYM_OPEN_PAREN_EQUAL (constant_symbol_table[21])
69 #define SYM_OPEN_PAREN_LESS_EXCLAMATION (constant_symbol_table[22])
70 #define SYM_OPEN_PAREN_COLON (constant_symbol_table[23])
71 #define SYM_OPEN_PAREN_EXCLAMATION (constant_symbol_table[24])
72 #define SYM_OPEN_PAREN_LESS_LETTER (constant_symbol_table[25])
73 #define SYM_REGISTER (constant_symbol_table[26])
74 #define SYM_STANDALONE (constant_symbol_table[27])
75 #define SYM_LOOKAHEAD (constant_symbol_table[28])
76 #define SYM_OPEN_PAREN_LESS_EQUAL (constant_symbol_table[29])
77 #define SYM_SEQUENCE (constant_symbol_table[30])
78 #define SYM_LOOKBHIND (constant_symbol_table[31])
79 #define SYM_FLAGGED_SEQUENCE (constant_symbol_table[32])
80
81 /* convenient macros */
82 #define has(p, f) (((p)->flags & (f)) != 0)
83
84 /* lexer_ctx_t is used to hold the regex string which is currently
85 lexed and to keep track of the lexer's state.
86 */
87 typedef struct lexer_ctx_rec_t
88 {
89 SgChar *str;
90 SgChar *ostr; /* original string for error message */
91 long len;
92 int reg;
93 long pos;
94 SgObject last_pos;
95 int flags;
96 int reg_num;
97 SgObject reg_names;
98 } lexer_ctx_t;
99
100 /* convenient macro */
101 /* we don't use vm.h's PUSH and POP here */
102 #ifdef PUSH
103 # undef PUSH
104 #endif
105 #ifdef POP
106 # undef POP
107 #endif
108 #define PUSH(v, l) ((l) = Sg_Cons((v), (l)))
109 #define POP(l) ((l) = SG_CDR(l))
110
remove_qe_quoting(lexer_ctx_t * ctx)111 static void remove_qe_quoting(lexer_ctx_t *ctx)
112 {
113 const long plen = ctx->len;
114 int i = 0, j, inQuote = TRUE;
115 SgChar *newtemp;
116 while (i < plen - 1) {
117 if (ctx->str[i] != '\\') i += 1;
118 else if (ctx->str[i+1] != 'Q') i += 2;
119 else break;
120 }
121 if (i >= plen - 1) return; /* no \Q sequence found */
122 j = i;
123 i += 2;
124 newtemp = SG_NEW_ATOMIC2(SgChar *, sizeof(SgChar)*(j+2*(plen-i)+2));
125 memcpy(newtemp, ctx->str, j*sizeof(SgChar));
126 while (i < plen) {
127 SgChar c = ctx->str[i++];
128 if (!isascii(c) || isalnum(c)) {
129 newtemp[j++] = c;
130 } else if (c != '\\') {
131 if (inQuote) newtemp[j++] = '\\';
132 newtemp[j++] = c;
133 } else if (inQuote) {
134 if (ctx->str[i] == 'E') {
135 i++;
136 inQuote = FALSE;
137 } else {
138 newtemp[j++] = '\\';
139 newtemp[j++] = '\\';
140 }
141 } else {
142 if (ctx->str[i] == 'Q') {
143 i++;
144 inQuote = TRUE;
145 } else {
146 newtemp[j++] = c;
147 if (i != plen) newtemp[j++] = ctx->str[i++];
148 }
149 }
150 }
151 ctx->len = j;
152 ctx->str = SG_NEW_ATOMIC2(SgChar *, sizeof(SgChar)*j);
153 memcpy(ctx->str, newtemp, sizeof(SgChar)*j);
154 newtemp = NULL; /* gc friendliness */
155 }
156
init_lexer(lexer_ctx_t * ctx,SgString * str,int flags)157 static void init_lexer(lexer_ctx_t *ctx, SgString *str, int flags)
158 {
159 ctx->ostr = ctx->str = SG_STRING_VALUE(str);
160 ctx->len = SG_STRING_SIZE(str);
161 ctx->reg = ctx->pos = 0;
162 ctx->last_pos = SG_NIL;
163 ctx->flags = flags;
164 ctx->reg_num = 1; /* 0 is the whole matched string */
165 ctx->reg_names = SG_NIL;
166 if (!has(ctx, SG_LITERAL)) {
167 /* remove \Q \E quoting now */
168 remove_qe_quoting(ctx);
169 }
170 }
171
172 /* error */
raise_syntax_error(lexer_ctx_t * ctx,long pos,const SgChar * str)173 static void raise_syntax_error(lexer_ctx_t *ctx, long pos, const SgChar *str)
174 {
175 /* TODO create regex parser error or so */
176 Sg_Error(UC("bad regex syntax in %s: %s, [posision %d]"),
177 ctx->ostr, str, pos);
178 }
179
180 /* compile error. this is actually for pass3, but i put it here */
raise_compile_error(const SgChar * msg,SgObject irr)181 static void raise_compile_error(const SgChar *msg, SgObject irr)
182 {
183 if (SG_FALSEP(irr)) {
184 Sg_Error(msg);
185 } else {
186 Sg_Error(msg, irr);
187 }
188 }
189
190 /* null sequence */
null_seq()191 static SgObject null_seq()
192 {
193 return SG_LIST1(SYM_SEQUENCE);
194 }
195
196 /* lexer functions */
197 /*
198 Tests whether we're at the end of the regex string
199 */
200 #define END_OF_STRING_P(ctx) ((ctx)->len <= (ctx)->pos)
201 /*
202 Tests whether we're at the end of the regex string
203 */
204 #define LOOKING_AT_P(ctx, c) (!END_OF_STRING_P(ctx) && c == ctx->str[ctx->pos])
205
206 static SgObject get_number(lexer_ctx_t *ctx, int radix, int maxlen,
207 int nowhilespace);
208 static SgChar next_char(lexer_ctx_t *ctx);
209 static SgChar next_char_non_extended(lexer_ctx_t *ctx);
210 /*
211 Create character from char-code number. number can be #f
212 which is interpreted as 0. error-pos is the position where
213 the corresponding number started within the regex string.
214 */
make_char_from_code(lexer_ctx_t * ctx,SgObject number,long error_pos)215 static SgObject make_char_from_code(lexer_ctx_t *ctx, SgObject number,
216 long error_pos)
217 {
218 int code;
219 if (SG_FALSEP(number)) code = 0;
220 else {
221 /* only look at right most eight bits in compliance with Perl */
222 code = 0xFF & SG_INT_VALUE(number);
223 }
224 if (code <= SG_CHAR_MAX) return SG_MAKE_CHAR(code);
225 raise_syntax_error(ctx, error_pos,
226 UC("No character of given code"));
227 return SG_UNDEF; /* dummy */
228 }
229
230 /*
231 Convert the characters(s) following a backslash into a token
232 which is returned. This function is to be called when the backslash
233 has already been consumed. Special character classes like \\W are
234 handled elsewhere.
235 */
unescape_char(lexer_ctx_t * ctx)236 static SgObject unescape_char(lexer_ctx_t *ctx)
237 {
238 SgChar chr, nc;
239 long error_pos;
240 SgObject n;
241 if (END_OF_STRING_P(ctx)) {
242 raise_syntax_error(ctx, -1, UC("String ends with backslash."));
243 }
244 chr = next_char_non_extended(ctx);
245 switch (chr) {
246 /* it's already resolved. */
247 /* case 'E': return SYM_VOID; */
248 case 'c':
249 /* \cx means control-x in Perl */
250 nc = next_char_non_extended(ctx);
251 if (nc == EOF) {
252 raise_syntax_error(ctx, ctx->pos,
253 UC("Character missing after '\\c'."));
254 }
255 return SG_MAKE_CHAR(Sg_CharUpCase(nc) | 0x40);
256 case 'x':
257 /* \x should be followed by hexadecimal char code, two digis or less */
258 error_pos = ctx->pos - 1;
259 n = get_number(ctx, 16, 2, TRUE);
260 return make_char_from_code(ctx, n, error_pos);
261 case 'u':
262 /* \u should be followed by hexadecimal char code, 4 digis or less */
263 error_pos = ctx->pos - 1;
264 n = get_number(ctx, 16, 4, TRUE);
265 return make_char_from_code(ctx, n, error_pos);
266 case 'U':
267 /* \U should be followed by hexadecimal char code, 8 digis or less */
268 error_pos = ctx->pos - 1;
269 n = get_number(ctx, 16, 8, TRUE);
270 return make_char_from_code(ctx, n, error_pos);
271 case '0': case '1': case '2': case '3': case '4':
272 case '5': case '6': case '7': case '8': case '9':
273 error_pos = ctx->pos - 1;
274 n = get_number(ctx, 8, 3, FALSE);
275 return make_char_from_code(ctx, n, error_pos);
276 case 't': return SG_MAKE_CHAR('\t');
277 case 'n': return SG_MAKE_CHAR('\n');
278 case 'r': return SG_MAKE_CHAR('\r');
279 case 'f': return SG_MAKE_CHAR('\f');
280 case 'b': return SG_MAKE_CHAR('\b');
281 case 'a': return SG_MAKE_CHAR(0x07);
282 case 'e': return SG_MAKE_CHAR(0x27);
283 default:
284 return SG_MAKE_CHAR(chr);
285 }
286 }
287
288 /* I forgot to make ustrchr in unicode.c */
ustrchr(const SgChar * str,SgChar c,long len)289 static long ustrchr(const SgChar *str, SgChar c, long len)
290 {
291 long i;
292 for (i = 0; i < len; i++, str++) {
293 if (*str == c) return i;
294 }
295 return -1;
296 }
297
298 /*
299 Returns the next character which is to be examined and updates the pos.
300 Does not respect extended mode.
301 */
next_char_non_extended(lexer_ctx_t * ctx)302 static SgChar next_char_non_extended(lexer_ctx_t *ctx)
303 {
304 if (END_OF_STRING_P(ctx)) return EOF;
305 return ctx->str[ctx->pos++];
306 }
307
308 /*
309 Returns the next character which is to be examined and updates the pos.
310 Respects extended mode, i.e. white spece, comments, and also nested comments
311 are skipped if applicable.
312 */
next_char(lexer_ctx_t * ctx)313 static SgChar next_char(lexer_ctx_t *ctx)
314 {
315 SgChar nc = next_char_non_extended(ctx);
316 long last_pos;
317 while (1) {
318 last_pos = ctx->pos;
319 if (nc != EOF && nc == '(' && LOOKING_AT_P(ctx, '?')) {
320 ctx->pos++;
321 if (LOOKING_AT_P(ctx, '#')) {
322 /* "(?#" must be a nested comment - so we have to search for the
323 closing parenthesis
324 */
325 long error_pos = ctx->pos - 2;
326 SgChar skip_char = nc;
327 while (skip_char != EOF && skip_char != ')') {
328 skip_char = next_char_non_extended(ctx);
329 if (skip_char == EOF) {
330 raise_syntax_error(ctx, error_pos,
331 UC("Comment group not closed"));
332 }
333 }
334 nc = next_char_non_extended(ctx);
335 } else {
336 /* undo */
337 ctx->pos--;
338 }
339 }
340 if (has(ctx, SG_COMMENTS)) {
341 /* now - if we're in extended mode - we skip whitespace and comments;
342 repeat the following loop while we look at whitespace or #\#
343 */
344 while (nc != EOF && (nc == '#' || Sg_Ucs4WhiteSpaceP(nc))) {
345 if (nc == '#') {
346 /* if we saw a comment marker skip util we're behinf \n */
347 SgChar skip_char = nc;
348 while(skip_char != EOF && skip_char != '\n') {
349 skip_char = next_char_non_extended(ctx);
350 }
351 nc = next_char_non_extended(ctx);
352 } else {
353 /* ...otherwise (whitespace) skip until we see the next
354 non-whitespace character
355 */
356 SgChar skip_char = nc;
357 while (skip_char != EOF && Sg_Ucs4WhiteSpaceP(skip_char)) {
358 skip_char = next_char_non_extended(ctx);
359 }
360 nc = skip_char;
361 }
362 }
363 }
364 /*
365 if the position has moved we have to repeat out tests because of cases
366 like /^a (?#xxx) (?#yyy) {3}c/x
367 */
368 if (ctx->pos <= last_pos) {
369 return nc;
370 }
371 }
372 }
373 /*
374 Tests whether the next token can start a valid sub-expression, i.e. a
375 stand-alone regex
376 */
start_of_subexpr_p(lexer_ctx_t * ctx)377 static int start_of_subexpr_p(lexer_ctx_t *ctx)
378 {
379 long pos = ctx->pos;
380 SgChar nc = next_char(ctx);
381 if (nc == EOF) return FALSE;
382 /* rest position */
383 ctx->pos = pos;
384 return !(nc == ')' || nc == '|');
385 }
386
387 /*
388 TODO separate context of unicode, but how?
389 */
read_char_property(lexer_ctx_t * ctx,SgChar first)390 static SgObject read_char_property(lexer_ctx_t *ctx, SgChar first)
391 {
392 if (next_char_non_extended(ctx) != '{') {
393 /* one letter property */
394 /* well, what shall we do with one letter property? just a character */
395 ctx->pos--;
396 return SG_MAKE_CHAR(first);
397 } else {
398 /* a bit tricky. char property must start with 'In' or 'Is'. I have no idea
399 what the difference is. So for now we treat both the same.
400 */
401 SgObject es;
402 SgGloc *gloc;
403 long pos;
404 /* first check the property has 'Is' or 'In. */
405 if (ctx->len-ctx->pos <= 2 ||
406 !(ctx->str[ctx->pos] == 'I' &&
407 (ctx->str[ctx->pos + 1] == 's' || ctx->str[ctx->pos + 1] == 'n'))) {
408 raise_syntax_error(ctx, ctx->pos,
409 UC("Invalid character property name."));
410 }
411
412 pos = ustrchr(ctx->str+ctx->pos+2, '}', ctx->len-ctx->pos-2);
413 if (pos == -1) {
414 /* no closing '}' */
415 raise_syntax_error(ctx, ctx->pos,
416 UC("Character property does not have '}'"));
417 }
418 /* we convert property name with prefix 'char-set:' then we can look up
419 from builtin charset.
420 */
421 /* does not seem smart solution ... */
422 es = SG_MAKE_STRING("char-set:");
423 es = Sg_StringAppendC(SG_STRING(es), ctx->str+ctx->pos+2, pos);
424 es = Sg_StringDownCase(SG_STRING(es));
425 gloc = Sg_FindBinding(Sg_VM()->currentLibrary, Sg_Intern(es), SG_FALSE);
426 if (SG_FALSEP(gloc) || !SG_CHAR_SET_P(SG_GLOC_GET(gloc))) {
427 raise_syntax_error(ctx, ctx->pos,
428 UC("Given character property is not supported"));
429 }
430 ctx->pos += pos+3;
431 return SG_GLOC_GET(gloc);
432 }
433 }
434
435 /* I'm not sure if this one should be in charset.c */
digit_to_int(SgChar ch,int radix)436 static int digit_to_int(SgChar ch, int radix)
437 {
438 if (ch < '0') return -1;
439 if (radix <= 10) {
440 if (ch < '0' + radix) return ch - '0';
441 } else {
442 if (ch <= '9') return ch - '0';
443 if (ch < 'A') return -1;
444 if (ch < 'A' + radix - 10) return ch - 'A' + 10;
445 if (ch < 'a') return -1;
446 if (ch < 'a' + radix - 10) return ch - 'a' + 10;
447 }
448 return -1;
449 }
450
read_xdigit(lexer_ctx_t * ctx,int ndigits,char * buf,int * nread)451 static SgChar read_xdigit(lexer_ctx_t *ctx, int ndigits,
452 char *buf, int *nread)
453 {
454 int i, c, val = 0, dig;
455 for (i = 0; i < ndigits; i++) {
456 c = next_char_non_extended(ctx);
457 if (c == EOF) break;
458 dig = digit_to_int(c, 16);
459 if (dig < 0) {
460 ctx->pos--;
461 break;
462 }
463 buf[i] = (char)c;
464 val = val * 16 + dig;
465 }
466 *nread = i;
467 if (i < ndigits) return -1;
468 return (SgChar)val;
469 }
470
read_charset_xdigits(lexer_ctx_t * ctx,int ndigs,int key)471 static SgChar read_charset_xdigits(lexer_ctx_t *ctx, int ndigs, int key)
472 {
473 char buf[8]; /* max 8 */
474 int nread;
475 SgChar r;
476 ASSERT(ndigs <= 8);
477 r = read_xdigit(ctx, ndigs, buf, &nread);
478 if (r == -1) {
479 raise_syntax_error(ctx, ctx->pos,
480 UC("Character class contains invalid escaped character")
481 );
482 }
483 return r;
484 }
485
486 /*
487 char-set-difference and char-set-intersection,
488
489 ugh, this is defined in Scheme as well
490 TODO implement this properly in C and expose it to Scheme world. */
sub_range(SgObject ranges,long from,long to)491 static SgObject sub_range(SgObject ranges, long from, long to)
492 {
493 SgObject h = SG_NIL, t = SG_NIL, cp;
494 SG_FOR_EACH(cp, ranges) {
495 long lo = SG_INT_VALUE(SG_CAAR(cp));
496 long hi = SG_INT_VALUE(SG_CDAR(cp));
497 if (lo <= from && from <= hi) {
498 if (lo <= to && to <= hi) {
499 if (lo == from) {
500 if (to != hi) {
501 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
502 }
503 } else {
504 if (to == hi) {
505 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
506 } else {
507 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
508 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
509 }
510 }
511 } else {
512 if (lo != from) {
513 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(from - 1)));
514 }
515 }
516 } else {
517 if (lo <= to && to <= hi) {
518 if (to != hi) {
519 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(to + 1), SG_MAKE_INT(hi)));
520 }
521 } else {
522 if (!(from < lo && hi < to)) {
523 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_INT(lo), SG_MAKE_INT(hi)));
524 }
525 }
526 }
527 }
528 return h;
529 }
530
cset_diff(SgObject base,SgObject cset)531 static SgObject cset_diff(SgObject base, SgObject cset)
532 {
533 SgObject ranges = Sg_CharSetRanges(base), sub = Sg_CharSetRanges(cset), cp;
534 SG_FOR_EACH(cp, sub) {
535 ranges = sub_range(ranges, SG_INT_VALUE(SG_CAAR(cp)),
536 SG_INT_VALUE(SG_CDAR(cp)));
537 }
538 base = Sg_MakeEmptyCharSet();
539 SG_FOR_EACH(cp, ranges) {
540 SgChar from = (SgChar)SG_INT_VALUE(SG_CAAR(cp));
541 SgChar to = (SgChar)SG_INT_VALUE(SG_CDAR(cp));
542 Sg_CharSetAddRange(SG_CHAR_SET(base), from, to);
543 }
544 return base;
545 }
546
cset_intersect(SgObject x,SgObject y)547 static SgObject cset_intersect(SgObject x, SgObject y)
548 {
549 return cset_diff(x, Sg_CharSetComplement(Sg_CharSetCopy(y)));
550 }
551
asciinise(lexer_ctx_t * ctx,SgObject cset)552 static SgObject asciinise(lexer_ctx_t *ctx, SgObject cset)
553 {
554 if ((ctx->flags & SG_UNICODE_CASE) == 0) {
555 return cset_intersect(cset, Sg_GetStandardCharSet(SG_CHAR_SET_ASCII));
556 }
557 /* unicode char */
558 return cset;
559 }
560
get_defined_cset(lexer_ctx_t * ctx,int i)561 static SgObject get_defined_cset(lexer_ctx_t *ctx, int i)
562 {
563 return asciinise(ctx, Sg_GetStandardCharSet(i));
564 }
565
566
read_defined_charset(lexer_ctx_t * ctx)567 static SgObject read_defined_charset(lexer_ctx_t *ctx)
568 {
569 /* almost the same as read_char_property.
570 */
571 SgObject es;
572 SgGloc *gloc;
573 long pos;
574 SgObject lib, default_lib = SG_FALSE;
575 /* [[:name:]] thing */
576 if (ctx->len-ctx->pos < 2 ||
577 ctx->str[ctx->pos] != ':') {
578 raise_syntax_error(ctx, ctx->pos, UC("Invalid character set name."));
579 }
580 /* skip first ']' */
581 pos = ustrchr(ctx->str+ctx->pos, ']', ctx->len - ctx->pos);
582 if (pos == -1) {
583 /* no closing ':' */
584 raise_syntax_error(ctx, ctx->pos,
585 UC("Invalid charset name. ']' is missing"));
586 }
587 /* we convert property name with prefix 'char-set:' then we can look up
588 from builtin charset.
589 */
590 /* does not seem smart solution ... */
591 es = Sg_MakeEmptyString();
592 /* including ':' */
593 es = Sg_StringAppendC(SG_STRING(es), ctx->str+ctx->pos, pos);
594 lib = Sg_VM()->currentLibrary;
595 retry:
596 gloc = Sg_FindBinding(lib, Sg_Intern(es), SG_FALSE);
597 if (SG_FALSEP(gloc) || !SG_CHAR_SET_P(SG_GLOC_GET(gloc))) {
598 if (SG_FALSEP(default_lib)) {
599 lib = default_lib = Sg_FindLibrary(SG_INTERN("(sagittarius)"), FALSE);
600 goto retry;
601 }
602 raise_syntax_error(ctx, ctx->pos,
603 UC("Given character set is not supported"));
604 }
605 /* Sg_Printf(Sg_StandardErrorPort(), UC("%A\n"), es); */
606 ctx->pos += pos;
607 if (ctx->len <= ctx->pos || ctx->str[++ctx->pos] != ']') {
608 raise_syntax_error(ctx, ctx->pos,
609 UC("charset name is not closed by 2 ']'s"));
610 }
611 return asciinise(ctx, SG_GLOC_GET(gloc));
612 }
613
read_char_set(lexer_ctx_t * ctx,int * complement_p)614 static SgObject read_char_set(lexer_ctx_t *ctx, int *complement_p)
615 {
616 #define REAL_BEGIN 1
617 #define CARET_BEGIN 2
618 int begin = REAL_BEGIN, complement = FALSE;
619 int lastchar = -1, inrange = FALSE, moreset_complement = FALSE;
620 SgCharSet *set = SG_CHAR_SET(Sg_MakeEmptyCharSet());
621 SgObject moreset;
622 SgObject chars = SG_NIL;
623 SgChar ch = 0;
624 long start_pos = ctx->pos;
625
626 for (;;) {
627 ch = next_char_non_extended(ctx);
628 if (ch == EOF) goto err;
629 chars = Sg_Cons(SG_MAKE_CHAR(ch), chars);
630 if (begin == REAL_BEGIN && ch == '^') {
631 complement = TRUE;
632 begin = CARET_BEGIN;
633 continue;
634 }
635 if (begin && ch == ']') {
636 Sg_CharSetAddRange(set, ch, ch);
637 lastchar = ch;
638 begin = FALSE;
639 continue;
640 }
641 begin = FALSE;
642 switch (ch) {
643 case '-':
644 if (inrange) goto ordchar;
645 inrange = TRUE;
646 continue;
647 case ']':
648 if (inrange) {
649 if (lastchar >= 0) {
650 Sg_CharSetAddRange(set, lastchar, lastchar);
651 Sg_CharSetAddRange(set, '-', '-');
652 } else {
653 Sg_CharSetAddRange(set, '-', '-');
654 }
655 }
656 break;
657 case '\\':
658 ch = next_char_non_extended(ctx);
659 if (ch == EOF) goto err;
660 chars = Sg_Cons(SG_MAKE_CHAR(ch), chars);
661 switch (ch) {
662 case 'a': ch = 7; goto ordchar;
663 case 'b': ch = 8; goto ordchar;
664 case 'n': ch = '\n'; goto ordchar;
665 case 'r': ch = '\r'; goto ordchar;
666 case 't': ch = '\t'; goto ordchar;
667 case 'f': ch = '\f'; goto ordchar;
668 case 'e': ch = 0x1b; goto ordchar;
669 case 'x':
670 ch = read_charset_xdigits(ctx, 2, 'x'); goto ordchar;
671 case 'u':
672 ch = read_charset_xdigits(ctx, 4, 'u'); goto ordchar;
673 case 'U':
674 ch = read_charset_xdigits(ctx, 8, 'u'); goto ordchar;
675 case 'd':
676 moreset_complement = FALSE;
677 moreset = get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
678 break;
679 case 'D':
680 moreset_complement = TRUE;
681 moreset = get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
682 break;
683 case 's':
684 moreset_complement = FALSE;
685 moreset = get_defined_cset(ctx, SG_CHAR_SET_SPACE);
686 break;
687 case 'S':
688 moreset_complement = TRUE;
689 moreset = get_defined_cset(ctx, SG_CHAR_SET_SPACE);
690 break;
691 case 'w':
692 moreset_complement = FALSE;
693 moreset = get_defined_cset(ctx, SG_CHAR_SET_WORD);
694 break;
695 case 'W':
696 moreset_complement = TRUE;
697 moreset = get_defined_cset(ctx, SG_CHAR_SET_WORD);
698 break;
699 case 'p':
700 moreset_complement = FALSE;
701 moreset = read_char_property(ctx, ch);
702 break;
703 case 'P':
704 moreset_complement = TRUE;
705 moreset = read_char_property(ctx, ch);
706 break;
707 default: goto ordchar;
708 }
709 if (moreset_complement) {
710 moreset = Sg_CharSetComplement(SG_CHAR_SET(Sg_CharSetCopy(SG_CHAR_SET(moreset))));
711 }
712 Sg_CharSetAdd(set, SG_CHAR_SET(moreset));
713 continue;
714 case '[':
715 moreset = read_defined_charset(ctx);
716 if (!SG_CHAR_SET_P(moreset)) goto err;
717 Sg_CharSetAdd(set, SG_CHAR_SET(moreset));
718 continue;
719 ordchar:
720 default:
721 if (inrange) {
722 if (lastchar < 0) {
723 Sg_CharSetAddRange(set, '-', '-');
724 Sg_CharSetAddRange(set, ch, ch);
725 lastchar = ch;
726 } else {
727 Sg_CharSetAddRange(set, lastchar, ch);
728 lastchar = -1;
729 }
730 inrange = FALSE;
731 } else {
732 Sg_CharSetAddRange(set, ch, ch);
733 lastchar = ch;
734 }
735 continue;
736 }
737 break;
738 }
739 if (complement_p) {
740 *complement_p = complement;
741 return SG_OBJ(set);
742 } else {
743 if (complement) Sg_CharSetComplement(set);
744 return SG_OBJ(set);
745 }
746
747 err:
748 raise_syntax_error(ctx, start_pos -1,
749 UC("bad char-set spec in pattern"));
750 return SG_FALSE;
751 }
752
unget_token(lexer_ctx_t * ctx)753 static void unget_token(lexer_ctx_t *ctx)
754 {
755 if (SG_NULLP(ctx->last_pos)) {
756 Sg_Error(UC("[internal error] No token to unget."));
757 }
758 ctx->pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
759 POP(ctx->last_pos);
760 }
761
fail(lexer_ctx_t * ctx)762 static SgObject fail(lexer_ctx_t *ctx)
763 {
764 if (SG_NULLP(ctx->last_pos)) {
765 raise_syntax_error(ctx, -1,
766 UC("last-pos stack of lexer is empty"));
767 }
768 ctx->pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
769 POP(ctx->last_pos);
770 return SG_FALSE;
771 }
772
get_number(lexer_ctx_t * ctx,int radix,int maxlen,int nowhilespace)773 static SgObject get_number(lexer_ctx_t *ctx, int radix, int maxlen,
774 int nowhilespace)
775 {
776 long i, size = ctx->len - ctx->pos, end, n = 0;
777 if (nowhilespace &&
778 Sg_Ucs4WhiteSpaceP(ctx->str[ctx->pos])) {
779 return SG_FALSE;
780 }
781 if (maxlen > 0) {
782 long end_pos = ctx->pos + maxlen;
783 if (end_pos < size) end = end_pos;
784 else end = size;
785 } else {
786 end = size;
787 }
788 for (i = 0; i < end; i++) {
789 if (((radix == 8 || radix == 10) && isdigit(ctx->str[ctx->pos + i])) ||
790 (radix == 16 && isxdigit(ctx->str[ctx->pos + i]))) {
791 n = n*radix + digit_to_int(ctx->str[ctx->pos + i], radix);
792 } else {
793 if (i == 0) n = -1;
794 /* something else is here */
795 break;
796 }
797 }
798 if (n != -1) {
799 /* sanity */
800 ctx->pos += i;
801 return SG_MAKE_INT(n);
802 } else {
803 return SG_FALSE;
804 }
805 }
806
try_number(lexer_ctx_t * ctx,int radix,int maxlen,int nowhilespace)807 static SgObject try_number(lexer_ctx_t *ctx, int radix, int maxlen,
808 int nowhilespace)
809 {
810 SgObject n;
811 PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
812 n = get_number(ctx, radix, maxlen, nowhilespace);
813 if (SG_FALSEP(n)) return fail(ctx);
814 return n;
815 }
816
817 /*
818 Reads a sequence of modifiers (including #\\- to reverse their
819 meaning) and returns a corresponding list of "flag" tokens.
820 */
maybe_parse_flags(lexer_ctx_t * ctx)821 static SgObject maybe_parse_flags(lexer_ctx_t *ctx)
822 {
823 SgChar c = next_char_non_extended(ctx);
824 SgObject h = SG_NIL, t = SG_NIL;
825 int set = TRUE;
826 while (1) {
827 switch (c) {
828 case '-':
829 set = FALSE;
830 break;
831 case 'x':
832 if (set) {
833 ctx->flags |= SG_COMMENTS;
834 } else {
835 ctx->flags &= ~SG_COMMENTS;
836 }
837 break;
838 case 'i':
839 if (set) {
840 ctx->flags |= SG_CASE_INSENSITIVE;
841 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
842 } else {
843 ctx->flags &= ~SG_CASE_INSENSITIVE;
844 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
845 }
846 break;
847 case 'm':
848 /* 'm' flag is resolved during compile time. */
849 if (set) {
850 ctx->flags |= SG_MULTILINE;
851 /* SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE)); */
852 } else {
853 ctx->flags &= ~SG_MULTILINE;
854 /* SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE)); */
855 }
856 break;
857 case 's':
858 if (set) {
859 ctx->flags |= SG_DOTALL;
860 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
861 } else {
862 ctx->flags &= ~SG_DOTALL;
863 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
864 }
865 break;
866 case 'u':
867 if (set) {
868 ctx->flags |= SG_UNICODE_CASE;
869 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_TRUE));
870 } else {
871 ctx->flags &= ~SG_UNICODE_CASE;
872 SG_APPEND1(h, t, Sg_Cons(SG_MAKE_CHAR(c), SG_FALSE));
873 }
874 break;
875 default: goto end;
876 }
877 c = next_char_non_extended(ctx);
878 }
879 end:
880 ctx->pos--;
881 return h;
882 }
883
884 /*
885 Returns a list of two values (min max) if what the lexer is looking at can be
886 interpreted as a quantifier. Otherwise returns #f and resets the lexer to its
887 old position.
888 */
get_quantifier(lexer_ctx_t * ctx,int * standalone)889 static SgObject get_quantifier(lexer_ctx_t *ctx, int *standalone)
890 {
891 SgChar nc;
892 PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
893 nc = next_char(ctx);
894 switch (nc) {
895 case '*':
896 return SG_LIST2(SG_MAKE_INT(0), SG_FALSE); /* 0 or more times */
897 case '+':
898 return SG_LIST2(SG_MAKE_INT(1), SG_FALSE); /* 1 or more times */
899 case '?':
900 return SG_LIST2(SG_MAKE_INT(0), SG_MAKE_INT(1)); /* 0 or 1 */
901 case '{':
902 /* one of
903 {n}: match exactly n times
904 {n,}: match at least n times
905 {n,m}: match at least n but not more than m times
906 Note: anything not matching one of these patterns will be interpreted
907 literally - even whitespace isn't allowed.
908 */
909 {
910 SgObject num1 = get_number(ctx, 10, -1, TRUE), num2;
911 if (SG_FALSEP(num1)) {
912 /* no number following left curly brace, so we treat it like a normal
913 character*/
914 return fail(ctx);
915 } else {
916 nc = next_char_non_extended(ctx);
917 switch (nc) {
918 case ',':
919 num2 = get_number(ctx, 10, -1, TRUE);
920 nc = next_char_non_extended(ctx);
921 switch (nc) {
922 case '}': return SG_LIST2(num1, num2); /* {n,} or {n,m} */
923 default: return fail(ctx);
924 }
925 case '}': return SG_LIST2(num1, num1); /* {n} */
926 default: return fail(ctx);
927 }
928 }
929 }
930 default: return fail(ctx);
931 }
932 }
933 /*
934 Reads and returns the name in a named register group. It is assumed that the
935 starting #\< character has already been read. The closing #\> will also be
936 consumed.
937 */
parse_register_name_aux(lexer_ctx_t * ctx)938 static SgObject parse_register_name_aux(lexer_ctx_t *ctx)
939 {
940 /* we have to look for an ending > character now */
941 long end_name = ustrchr(ctx->str+ctx->pos, '>', ctx->len-ctx->pos), i;
942 SgObject s;
943 if (end_name < 0) {
944 raise_syntax_error(ctx, ctx->pos - 1,
945 UC("Opening #\\< in named group has no closing #\\>."));
946 }
947 s = Sg_ReserveString(end_name, 0);
948 for (i = 0; i < end_name; i++) {
949 SgChar c = ctx->str[ctx->pos + i];
950 if (isalnum(c) || c == '-') {
951 SG_STRING_VALUE_AT(s, i) = c;
952 } else {
953 raise_syntax_error(ctx, ctx->pos,
954 UC("Invalid character in named register group."));
955 }
956 }
957 /* advance lexer beyond "<name>" part */
958 ctx->pos += i + 1;
959 return Sg_Intern(s);
960 }
961
962 /*
963 Returns and consumes the next token from the regex string or '#f
964 */
get_token(lexer_ctx_t * ctx,SgObject * ret)965 static SgObject get_token(lexer_ctx_t *ctx, SgObject *ret)
966 {
967 SgChar nc;
968 PUSH(SG_MAKE_INT(ctx->pos), ctx->last_pos);
969 nc = next_char(ctx);
970 if (nc != EOF) {
971 switch (nc) {
972 /* the each cases first - the following six characters always have a
973 special meaning and get translated into tokens immediately
974 */
975 case ')': return SYM_CLOSE_PAREN;
976 case '|':
977 /* as far as I know I've never seen this pattern, and this must be
978 error.*/
979 raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
980 UC("Please report this regex to the developper."));
981 return SG_UNDEF; /* dummy */
982 /* return SYM_VERTICAL_BAR; */
983 case '?':
984 /* well this question mark must be error.
985 or should this be literal character?
986 */
987 /* return SYM_QUESTION_MARK; */
988 raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
989 UC("Quantifier('?') follows nothing in regex."));
990 return SG_UNDEF; /* dummy */
991 case '.': return SYM_EVERYTHING;
992 case '^':
993 if (has(ctx, SG_MULTILINE)) return SYM_START_ANCHOR;
994 else return SYM_MODELESS_START_ANCHOR;
995 case '$':
996 if (has(ctx, SG_MULTILINE)) return SYM_END_ANCHOR;
997 else return SYM_MODELESS_END_ANCHOR;
998 case '+': case '*':
999 /* quantifiers will always be consumed by get_quantifier, they must not
1000 appear here */
1001 raise_syntax_error(ctx, ctx->pos - 1,
1002 UC("Quanrifier '+' or '*' not allowed."));
1003 return SG_FALSE; /* dummy */
1004 case '{':
1005 /* left brace isn't a special character in it's own right but we must
1006 check if what follows might look like a quantifier.*/
1007 {
1008 long here = ctx->pos;
1009 SgObject last = ctx->last_pos;
1010 unget_token(ctx);
1011 if (!SG_FALSEP(get_quantifier(ctx, NULL))) {
1012 raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(last)),
1013 UC("Quanrifier not allowed"));
1014 }
1015 ctx->pos = here;
1016 ctx->last_pos = last;
1017 return SG_MAKE_CHAR(nc);
1018 }
1019 case '[':
1020 /* left bracket always starts */
1021 {
1022 int comp = FALSE;
1023 SgObject cs = read_char_set(ctx, &comp);
1024 if (comp) {
1025 return SG_LIST2(SYM_INVERTED_CHAR_CLASS, cs);
1026 } else {
1027 return cs;
1028 }
1029 }
1030 case '\\':
1031 /* backslash might mean different things so we have to peek one char
1032 ahead */
1033 nc = next_char_non_extended(ctx);
1034 switch (nc) {
1035 case 'A': return SYM_MODELESS_START_ANCHOR;
1036 case 'Z': return SYM_MODELESS_END_ANCHOR;
1037 case 'z': return SYM_MODELESS_END_ANCHOR_NO_NEWLINE;
1038 case 'b': return SYM_WORD_BOUNDARY;
1039 case 'B': return SYM_NON_WORD_BOUNDARY;
1040 case 'k':
1041 if (LOOKING_AT_P(ctx, '<')) {
1042 SgObject name, num = SG_FALSE;
1043 long pos = ctx->pos - 2;
1044 ctx->pos++;
1045 name = parse_register_name_aux(ctx);
1046 /* search backref name from context */
1047 /* (slot n1 n2 ...) */
1048 num = Sg_Assq(name, ctx->reg_names);
1049 if (SG_FALSEP(num)) {
1050 raise_syntax_error(ctx, pos,
1051 UC("Non defined named register is refered."));
1052 }
1053 num = SG_CDR(num);
1054 if (SG_NULLP(SG_CDR(num))) {
1055 return Sg_Cons(SYM_BACKREF, SG_CAR(num));
1056 } else {
1057 SgObject cp, h = SG_NIL, t = SG_NIL;
1058 SG_FOR_EACH(cp, num) {
1059 SG_APPEND1(h, t, Sg_Cons(SYM_BACKREF, SG_CAR(cp)));
1060 }
1061 /* (alternation (back-reference . n1) ...) */
1062 return Sg_Cons(SYM_ALTER, h);
1063 }
1064 } else{
1065 return SG_MAKE_CHAR('k');
1066 }
1067 case 'd': return get_defined_cset(ctx, SG_CHAR_SET_DIGIT);
1068 case 'D': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1069 get_defined_cset(ctx, SG_CHAR_SET_DIGIT));
1070 case 'w': return get_defined_cset(ctx, SG_CHAR_SET_WORD);
1071 case 'W': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1072 get_defined_cset(ctx, SG_CHAR_SET_WORD));
1073 case 's': return get_defined_cset(ctx, SG_CHAR_SET_SPACE);
1074 case 'S': return SG_LIST2(SYM_INVERTED_CHAR_CLASS,
1075 get_defined_cset(ctx, SG_CHAR_SET_SPACE));
1076 case '1': case '2': case '3': case '4': case '5':
1077 case '6': case '7': case '8': case '9':
1078 {
1079 long oldpos = --ctx->pos;
1080 SgObject num = get_number(ctx, 10, -1, FALSE);
1081 if (SG_INT_VALUE(num) > ctx->reg &&
1082 10 <= SG_INT_VALUE(num)) {
1083 /* \10 and higher are treaded as octal character codes if we haven't
1084 opened that much register groups yet. */
1085 ctx->pos = oldpos;
1086 return make_char_from_code(ctx,
1087 get_number(ctx, 8, 3, FALSE), oldpos);
1088 } else {
1089 return Sg_Cons(SYM_BACKREF, num);
1090 }
1091 }
1092 break;
1093 case '0':
1094 /* this always means an octal character code */
1095 {
1096 long oldpos = ctx->pos - 1;
1097 return make_char_from_code(ctx, get_number(ctx, 8, 3, FALSE), oldpos);
1098 }
1099 break;
1100 case 'P': case 'p':
1101 /* might be a named property */
1102 return read_char_property(ctx, nc);
1103 default:
1104 ctx->pos--;
1105 return unescape_char(ctx);
1106 }
1107 case '(':
1108 /* an open parenthesis might mean different thigs depending on what
1109 follows... */
1110 if (LOOKING_AT_P(ctx, '?')) {
1111 /* this is the case '(?' (and probably more behind) */
1112 SgObject flags;
1113 ctx->pos++;
1114 flags = maybe_parse_flags(ctx);
1115 nc = next_char_non_extended(ctx);
1116 /* modifiers are only allowed if a colon or closing parenthesis are
1117 following. */
1118 if (!SG_NULLP(flags) && !(nc == ':' || nc == ')')) {
1119 raise_syntax_error(ctx, SG_INT_VALUE(SG_CAR(ctx->last_pos)),
1120 UC("Sequence not recoginzed"));
1121 }
1122 switch (nc) {
1123 case EOF: raise_syntax_error(ctx, -1,
1124 UC("End of string following '(?'."));
1125 case ')':
1126 /* an empty group except for the flags */
1127 if (!SG_NULLP(flags)) return Sg_Cons(SYM_FLAGS, flags);
1128 return null_seq();
1129 /* branch */
1130 case '(': return SYM_OPEN_PAREN_PAREN;
1131 /* standalone */
1132 case '>': return SYM_OPEN_PAREN_GREATER;
1133 /* positive look-ahead */
1134 case '=': return SYM_OPEN_PAREN_EQUAL;
1135 /* negative look-ahead */
1136 case '!': return SYM_OPEN_PAREN_EXCLAMATION;
1137 case ':':
1138 /* non capturing group - return flags if it not NULL*/
1139 if (ret) *ret = flags;
1140 return SYM_OPEN_PAREN_COLON;
1141 case '<':
1142 /* might be look-behind assertion or a named group, so check next */
1143 nc = next_char_non_extended(ctx);
1144 if (isalpha(nc)) {
1145 /* we have encountered a named group. */
1146 ctx->pos--;
1147 return SYM_OPEN_PAREN_LESS_LETTER;
1148 } else {
1149 switch (nc) {
1150 case '=': return SYM_OPEN_PAREN_LESS_EQUAL; /* positive */
1151 case '!': return SYM_OPEN_PAREN_LESS_EXCLAMATION; /* negative */
1152 case ')':
1153 /* Perl allows "(?<" and treats it like a null string*/
1154 return null_seq();
1155 case EOF:
1156 raise_syntax_error(ctx, -1, UC("End of string following '(?<'."));
1157 default:
1158 raise_syntax_error(ctx, ctx->pos -1,
1159 UC("'(?<' is followed by illigal character."));
1160 }
1161 }
1162 default:
1163 raise_syntax_error(ctx, ctx->pos -1,
1164 UC("'(?' is followed by illigal character."));
1165 }
1166 } else {
1167 /* if nc was not '?' (this is within the first switch, we've just seen
1168 an opening parenthesis and leave it like that*/
1169 return SYM_OPEN_PAREN;
1170 }
1171 default:
1172 /* all other characters are their onw tokens */
1173 return SG_MAKE_CHAR(nc);
1174 }
1175 } else {
1176 /* we didn't get a character (this if the "else" branch form the first if),
1177 so we don't return a token but #f.
1178 */
1179 POP(ctx->last_pos);
1180 return SG_FALSE;
1181 }
1182 }
1183
1184 /* parser functions*/
1185 static SgObject reg_expr(lexer_ctx_t *ctx);
1186
1187 /*
1188 Parses and consumes a <group>.
1189 The productions are: <group> -> "(" <regex> ")"
1190 "(?:" <regex> ")"
1191 "(?>" <regex> ")"
1192 "(?<flags>:" <regex> ")"
1193 "(?=" <regex> ")"
1194 "(?!" <regex> ")"
1195 "(?<=" <regex> ")"
1196 "(?<!" <regex> ")"
1197 "(?(" <num> ")" <regex> ")"
1198 "(?(" <regex> ")" <regex> ")"
1199 "(?<name>" <regex> ")"
1200 <legal-token>
1201 where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
1202 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
1203 <grouping-type> is one of six keywords.
1204 */
group(lexer_ctx_t * ctx)1205 static SgObject group(lexer_ctx_t *ctx)
1206 {
1207 SgObject flags = SG_NIL;
1208 int save = ctx->flags;
1209 SgObject open_token = get_token(ctx, &flags);
1210 SgObject ret = SG_UNDEF;
1211
1212 if (SG_EQ(open_token, SYM_OPEN_PAREN_PAREN)) {
1213 /* special case for conditional regular expression; not that at this point
1214 we accespt a couple of illegal combinations which'll be sorted out later
1215 by the converter
1216 */
1217 long open_paren_pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
1218 /* check if what follows "(?(" is a number*/
1219 SgObject number = try_number(ctx, 10, -1, TRUE);
1220 if (!SG_FALSEP(number)) {
1221 /* condition is a number (i.e refers to a back-reference */
1222 SgObject inner_close_token = get_token(ctx, NULL);
1223 SgObject regexpr = reg_expr(ctx);
1224 SgObject close_token = get_token(ctx, NULL);
1225 if (!SG_EQ(inner_close_token, SYM_CLOSE_PAREN)) {
1226 raise_syntax_error(ctx, open_paren_pos + 2,
1227 UC("Opening paren has no matching closing paren"));
1228 }
1229 if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1230 raise_syntax_error(ctx, open_paren_pos,
1231 UC("Opening paren has no matching closing paren"));
1232 }
1233 if (!SG_PAIRP(regexpr) ||
1234 (SG_PAIRP(regexpr) && !SG_EQ(SG_CAR(regexpr), SYM_ALTER))) {
1235 /* most definitely (?(1)aaa) pattern so make this
1236 (alternation $regexpr (sequence))*/
1237 regexpr = SG_LIST3(SYM_ALTER, regexpr, SG_LIST1(SYM_SEQUENCE));
1238 }
1239 ret = SG_LIST3(SYM_BRANCH, number, regexpr);
1240 goto end_group;
1241 } else {
1242 /* condition must be a full regex (actually a look-behind or look-ahead);
1243 and comes a terrible kludge: instead of being cleanly separated from
1244 the lexer, the parser pushes back lexer by one position, thereby
1245 landing in the middle of the 'token' "(?(".
1246 */
1247 SgObject inner_reg_expr, regexpr, close_token;
1248 ctx->pos--;
1249 inner_reg_expr = group(ctx);
1250 regexpr = reg_expr(ctx);
1251 close_token = get_token(ctx, NULL);
1252 if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1253 raise_syntax_error(ctx, open_paren_pos,
1254 UC("Opening paren has no matching closing paren."));
1255 }
1256 if (!(SG_PAIRP(inner_reg_expr) &&
1257 (SG_EQ(SG_CAR(inner_reg_expr), SYM_LOOKBHIND) ||
1258 SG_EQ(SG_CAR(inner_reg_expr), SYM_LOOKAHEAD)))) {
1259 raise_syntax_error(ctx, open_paren_pos,
1260 UC("Branch test must be lookahead, look-behind or number"));
1261 }
1262 if (!SG_PAIRP(regexpr) ||
1263 (SG_PAIRP(regexpr) && !SG_EQ(SG_CAR(regexpr), SYM_ALTER))) {
1264 /* most definitely (?(1)aaa) pattern so make this
1265 (alternation $regexpr (sequence))*/
1266 regexpr = SG_LIST3(SYM_ALTER, regexpr, SG_LIST1(SYM_SEQUENCE));
1267 }
1268 ret = SG_LIST3(SYM_BRANCH, inner_reg_expr, regexpr);
1269 goto end_group;
1270 }
1271
1272 } else if (SG_EQ(open_token, SYM_OPEN_PAREN) ||
1273 SG_EQ(open_token, SYM_OPEN_PAREN_EQUAL) ||
1274 SG_EQ(open_token, SYM_OPEN_PAREN_COLON) ||
1275 SG_EQ(open_token, SYM_OPEN_PAREN_GREATER) ||
1276 SG_EQ(open_token, SYM_OPEN_PAREN_EXCLAMATION) ||
1277 SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EQUAL) ||
1278 SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EXCLAMATION) ||
1279 SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1280 /* we saw one of the six token representing opening parentheses */
1281 int saved_reg_num = 0;
1282 long open_paren_pos = SG_INT_VALUE(SG_CAR(ctx->last_pos));
1283 SgObject register_name = (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER))
1284 ? parse_register_name_aux(ctx) : SG_FALSE;
1285 SgObject regexpr, close_token;
1286 if (SG_EQ(open_token, SYM_OPEN_PAREN) ||
1287 SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1288 /* if this is the "(" <regex> ")" or "(?" <name> "" <regex> ")" production
1289 we have to increment the register counter of the lexer
1290 */
1291 ctx->reg++;
1292 saved_reg_num = ctx->reg_num++;
1293 }
1294 regexpr = reg_expr(ctx);
1295 close_token = get_token(ctx, NULL);
1296
1297 if (!SG_EQ(close_token, SYM_CLOSE_PAREN)) {
1298 raise_syntax_error(ctx, open_paren_pos,
1299 UC("Opening paren has no matching closing paren."));
1300 }
1301 if (!SG_NULLP(flags)) {
1302 /* if the lexer has returned a list of flags this must have been the
1303 "(?:" <regex> ")" production
1304 */
1305 ret = SG_LIST3(SYM_FLAGGED_SEQUENCE, flags, regexpr);
1306 goto end_group;
1307 } else {
1308 if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_LETTER)) {
1309 /* make alist */
1310 SgObject names = Sg_Assq(register_name, ctx->reg_names);
1311 if (SG_FALSEP(names)) {
1312 PUSH(Sg_Cons(register_name, SG_LIST1(SG_MAKE_INT(saved_reg_num))),
1313 ctx->reg_names);
1314 } else {
1315 /* push it */
1316 SG_SET_CDR(names, Sg_Cons(SG_MAKE_INT(saved_reg_num),
1317 SG_CDR(names)));
1318 }
1319 ret = SG_LIST4(SYM_REGISTER,
1320 SG_MAKE_INT(saved_reg_num),
1321 register_name,
1322 regexpr);
1323 goto end_group;
1324 } else {
1325 if (SG_EQ(open_token, SYM_OPEN_PAREN)) {
1326 ret = SG_LIST4(SYM_REGISTER, SG_MAKE_INT(saved_reg_num),
1327 SG_FALSE, regexpr);
1328 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_COLON)) {
1329 /* (?:...) does not create any group */
1330 ret = regexpr;
1331 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_GREATER)) {
1332 ret = SG_LIST2(SYM_STANDALONE, regexpr);
1333 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_EQUAL)) {
1334 ret = SG_LIST3(SYM_LOOKAHEAD, SG_TRUE, regexpr);
1335 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_EXCLAMATION)) {
1336 ret = SG_LIST3(SYM_LOOKAHEAD, SG_FALSE, regexpr);
1337 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EQUAL)) {
1338 ret = SG_LIST3(SYM_LOOKBHIND, SG_TRUE, regexpr);
1339 } else if (SG_EQ(open_token, SYM_OPEN_PAREN_LESS_EXCLAMATION)) {
1340 ret = SG_LIST3(SYM_LOOKBHIND, SG_FALSE, regexpr);
1341 } else {
1342 ASSERT(FALSE);
1343 ret = SG_UNDEF; /* dummy */
1344 }
1345 goto end_group;
1346 }
1347 }
1348 } else {
1349 /* This is the <lexical-token> production; <legal-token> is any token which
1350 passes start_of_subexpr_p (otherwise parsing had already stopped int the
1351 sequence.
1352 */
1353 ret = open_token;
1354 }
1355 end_group:
1356 ctx->flags = save;
1357 return ret;
1358 }
1359
1360 /*
1361 Parses and consume a <greedy-quant>.
1362 The productions are: <greedy-quant> -> <group> | <group><quantifier>
1363 where <quantifier> is parsed by the lexer function get_quantifier.
1364 Will return <parse-tree> or (SYM_GREEDY_REP <min> <max> <parse-tree).
1365 */
greedy_quant(lexer_ctx_t * ctx)1366 static SgObject greedy_quant(lexer_ctx_t *ctx)
1367 {
1368 SgObject grp = group(ctx);
1369 int standalone = FALSE;
1370 SgObject token = get_quantifier(ctx, &standalone);
1371 if (!SG_FALSEP(token)) {
1372 /* if get_quantifier returned a true value it's the tow element list
1373 (<min> <max)
1374 */
1375 return SG_LIST4(SYM_GREEDY_REP, SG_CAR(token), SG_CADR(token), grp);
1376 }
1377 return grp;
1378 }
1379
1380 /*
1381 Parses and consume a <quant>.
1382 The productions are: <quant> -> <greedy-quant> | <greedy-quant>"?".
1383 Will return the <parse-tree> returned by greedy_quant and optionally change
1384 SYM_GREEDY_REP to SYM_NON_GREEDY_REP.
1385 */
quant(lexer_ctx_t * ctx)1386 static SgObject quant(lexer_ctx_t *ctx)
1387 {
1388 SgObject greedy = greedy_quant(ctx);
1389 long pos = ctx->pos;
1390 SgChar nc = next_char(ctx);
1391 if (nc != EOF) {
1392 if (nc == '?') {
1393 SG_SET_CAR(greedy, SYM_NON_GREEDY_REP);
1394 } else if (nc == '+') {
1395 greedy = SG_LIST2(SYM_STANDALONE, greedy);
1396 } else {
1397 ctx->pos = pos;
1398 }
1399 }
1400 return greedy;
1401 }
1402
1403 /*
1404 Parses and consumes a <seq>.
1405 The productions are: <sex> -> <quant> | <quant><seq>.
1406 Will return <parse-tree> or (sequence <parse-tree> <parse-tree>).
1407 */
make_string_from_two_char(SgObject c1,SgObject c2)1408 static SgObject make_string_from_two_char(SgObject c1, SgObject c2)
1409 {
1410 /* for the smarter way we should use, list->string, however it is slow */
1411 SgObject s = Sg_ReserveString(2, 0);
1412 ASSERT(SG_CHARP(c1) && SG_CHARP(c2));
1413 SG_STRING_VALUE_AT(s, 0) = SG_CHAR_VALUE(c1);
1414 SG_STRING_VALUE_AT(s, 1) = SG_CHAR_VALUE(c2);
1415 return s;
1416 }
sequence(lexer_ctx_t * ctx)1417 static SgObject sequence(lexer_ctx_t *ctx)
1418 {
1419 if (start_of_subexpr_p(ctx)) {
1420 SgObject quan = quant(ctx);
1421 if (start_of_subexpr_p(ctx)) {
1422 SgObject seq = sequence(ctx);
1423 int is_quant_char = SG_CHARP(quant);
1424 int is_seq_sequence = (SG_PAIRP(seq) && SG_EQ(SG_CAR(seq), SYM_SEQUENCE));
1425 if (is_seq_sequence && SG_CHARP(seq)) {
1426 return make_string_from_two_char(seq, quan);
1427 } else if (is_quant_char && SG_STRINGP(seq)) {
1428 SgChar ca[1];
1429 ca[0] = SG_CHAR_VALUE(quan);
1430 return Sg_StringAppendC(SG_STRING(seq), ca, 1);
1431 } else if (is_quant_char && is_seq_sequence && SG_CHARP(SG_CADR(seq))) {
1432 if (SG_NULLP(SG_CDDR(seq))) {
1433 SG_SET_CDR(seq,
1434 Sg_Cons(make_string_from_two_char(SG_CADR(seq), quan),
1435 SG_CDDR(seq)));
1436 return seq;
1437 } else {
1438 return make_string_from_two_char(SG_CADR(seq), quan);
1439 }
1440 } else if (is_seq_sequence) {
1441 /* if <seq> is also a 'sequence parse tree we merge both lists into one
1442 to avoid unneccessary consing*/
1443 SG_SET_CDR(seq, Sg_Cons(quan, SG_CDR(seq)));
1444 return seq;
1445 } else {
1446 return SG_LIST3(SYM_SEQUENCE, quan, seq);
1447 }
1448 } else {
1449 return quan;
1450 }
1451 }
1452 return null_seq();
1453 }
1454
1455 /*
1456 Parses and consumes a <regex>, a complete regular expression. The productions
1457 are: <regex> -> <seq> | <seq> "|" <regex>.
1458 Will return <parse-tree> or (SYM_ALTER <parse-tree> <parse-tree>).
1459 */
reg_expr(lexer_ctx_t * ctx)1460 static SgObject reg_expr(lexer_ctx_t *ctx)
1461 {
1462 long pos = ctx->pos;
1463 switch (next_char(ctx)) {
1464 case EOF:
1465 /* if we didn't get any token we return 'void which stands for
1466 "empty regular expression"
1467 */
1468 return null_seq();
1469 case '|':
1470 /* now check whether the expression started with a vertical bar,
1471 i.e. <seq> - the left alternation - is empty
1472 */
1473 return SG_LIST3(SYM_ALTER, null_seq(), reg_expr(ctx));
1474 default: {
1475 /* otherwise un-read the character we just saw and parse a <seq> plus
1476 the character following it
1477 */
1478 SgObject seq;
1479 ctx->pos = pos;
1480 seq = sequence(ctx);
1481 pos = ctx->pos;
1482 switch (next_char(ctx)) {
1483 case EOF:
1484 /* no further character, just a <seq> */
1485 return seq;
1486 case '|': {
1487 /* if the character was a vertical bar, this is an alternation and we
1488 have the second production
1489 */
1490 SgObject expr = reg_expr(ctx);
1491 if (SG_PAIRP(expr) &&
1492 SG_EQ(SG_CAR(expr), SYM_ALTER)) {
1493 /* again we try to merge as above in SEQ */
1494 SG_SET_CDR(expr, Sg_Cons(seq, SG_CDR(expr)));
1495 return expr;
1496 } else {
1497 return SG_LIST3(SYM_ALTER, seq, expr);
1498 }
1499 }
1500 default:
1501 /* a character which is not a vertical bar - this is either a syntax
1502 error or we're inside of a group and the next character is closing
1503 parenthesis; so we just un-read the character and let another function
1504 take care of it
1505 */
1506 ctx->pos = pos;
1507 return seq;
1508 }
1509 }
1510 }
1511 }
1512
parse_string(lexer_ctx_t * ctx)1513 static SgObject parse_string(lexer_ctx_t *ctx)
1514 {
1515 SgObject r;
1516 if (has(ctx, SG_LITERAL)) {
1517 /* the whole input string is just literal */
1518 SgObject h = SG_NIL, t = SG_NIL;
1519 int i;
1520 SG_APPEND1(h, t, SYM_SEQUENCE);
1521 for (i = 0; i < ctx->len; i++) {
1522 SG_APPEND1(h, t, SG_MAKE_CHAR(ctx->str[i]));
1523 }
1524 ctx->pos = ctx->len;
1525 r = h;
1526 } else {
1527 r = reg_expr(ctx);
1528 }
1529 return SG_LIST4(SYM_REGISTER, SG_MAKE_INT(0), SG_FALSE, r);
1530 }
1531
1532 /* optimization */
1533 /*
1534 cl-ppcre does convertion from s-expression tree to regex object to use generic
1535 methods.
1536 however on Sagittarius, we don't create such class and we do not use generic
1537 method(we can not!). So let's optimize AST directly and destructively.
1538
1539 we do:
1540 - flattening nested sequence: (sequence a (sequence b) c) => (sequence a b c)
1541 - splits a repetition into constant and varying part: a{3,} -> a{3}a*
1542 */
1543 static SgObject optimize(SgObject ast, SgObject rest);
1544
1545 /*
1546 a{3,} -> a{3}a*
1547 If the given regex is (abc|efg){3,}, then the back reference of $1 is "efg".
1548 So we can simple omit the first register tag if there is.
1549 (checked with Perl v5.10.1, with this one liner;
1550 $ perl -e 'my $s="abcabcabcefg"; $s=~/(abc|efg){3,}/; print $1."\n";'
1551 efg
1552 )
1553 */
maybe_split_repetition(SgObject ast,SgObject regex)1554 static SgObject maybe_split_repetition(SgObject ast, SgObject regex)
1555 {
1556 long minimum, maximum = -1;
1557 SgObject max = SG_CAR(SG_CDDR(ast));
1558 SgObject constant = SG_NIL;
1559 minimum = SG_INT_VALUE(SG_CADR(ast));
1560 if (!SG_FALSEP(max)) {
1561 maximum = SG_INT_VALUE(max);
1562 /* trivial case: don't repeat at all */
1563 if (maximum == 0) return null_seq();
1564 /* another trivial case "repeat" exactly once */
1565 if (minimum == 1 && maximum == 1) return SG_CADR(SG_CDDR(ast));
1566 /* well result would be exactly the same but make AST a bit shorter
1567 and hope this makes a bit of better memory usage. */
1568 /* if (minimum == 1 && maximum == 1) return ast; */
1569 }
1570 if (minimum > 0) {
1571 SgObject in = regex;
1572 /* if (SG_PAIRP(regex) && SG_EQ(SG_CAR(regex), SYM_REGISTER)) */
1573 /* in = SG_CADR(SG_CDDR(regex)); */
1574 /* else in = regex; */
1575 constant = SG_LIST4(SG_CAR(ast), SG_CADR(ast), SG_CADR(ast), in);
1576 }
1577 if (!SG_FALSEP(max) && maximum == minimum) {
1578 /* no varying part needed bacuause min = max */
1579 return constant;
1580 } else {
1581 SgObject varying = SG_LIST4(SG_CAR(ast),
1582 SG_MAKE_INT(0),
1583 (SG_FALSEP(max)
1584 ? SG_FALSE : SG_MAKE_INT(maximum - minimum)),
1585 regex);
1586 if (minimum == 0) return varying; /* min = 0, no constant part needed */
1587 else if (minimum == 1) {
1588 /* min = 1, constant part needs not repetition wrapped around */
1589 return SG_LIST3(SYM_SEQUENCE, regex, varying);
1590 } else {
1591 return SG_LIST3(SYM_SEQUENCE, constant, varying);
1592 }
1593 }
1594 }
1595
optimize_seq(SgObject seq,SgObject rest)1596 static SgObject optimize_seq(SgObject seq, SgObject rest)
1597 {
1598 SgObject elt, tail, etype, opted;
1599 if (!SG_PAIRP(seq)) return seq;
1600 elt = SG_CAR(seq);
1601 tail = optimize_seq(SG_CDR(seq), rest);
1602 rest = SG_NULLP(tail) ? rest : tail;
1603 if (!SG_PAIRP(elt) || SG_EQ(SG_CAR(elt), SYM_INVERTED_CHAR_CLASS)) {
1604 if (SG_EQ(tail, SG_CDR(seq))) return seq;
1605 else return Sg_Cons(elt, tail);
1606 }
1607 etype = SG_CAR(elt);
1608 if (SG_EQ(etype, SYM_SEQUENCE)) {
1609 return Sg_Append2(optimize_seq(SG_CDR(elt), rest), tail);
1610 }
1611 opted = optimize(elt, rest);
1612 if (SG_EQ(elt, opted) && SG_EQ(tail, SG_CDR(seq))) return seq;
1613 else return Sg_Cons(opted, tail);
1614 }
1615
optimize(SgObject ast,SgObject rest)1616 static SgObject optimize(SgObject ast, SgObject rest)
1617 {
1618 /* assume given AST is not literal list. so caller must copy original AST. */
1619 SgObject type, seq, seqo;
1620 if (!SG_PAIRP(ast)) return ast;
1621 type = SG_CAR(ast);
1622 /* we know after inverted-char-class it has only char-set. so just return */
1623 if (SG_EQ(type, SYM_INVERTED_CHAR_CLASS)) return ast;
1624
1625 if (SG_EQ(type, SYM_GREEDY_REP) ||
1626 SG_EQ(type, SYM_NON_GREEDY_REP)) {
1627 return maybe_split_repetition(ast, optimize(SG_CADR(SG_CDDR(ast)), rest));
1628 }
1629
1630 if (SG_EQ(type, SYM_ALTER)) {
1631 SgObject sp, sp2, e = SG_UNBOUND, h, t;
1632 SG_FOR_EACH(sp, SG_CDR(ast)) {
1633 e = optimize(SG_CAR(sp), rest);
1634 if (!SG_EQ(e, SG_CAR(sp))) break;
1635 }
1636 if (SG_NULLP(sp)) return ast;
1637 /* need to copy the spine */
1638 h = t = SG_NIL;
1639 SG_FOR_EACH(sp2, SG_CDR(ast)) {
1640 if (SG_EQ(sp, sp2)) { SG_APPEND1(h, t, e); break; }
1641 SG_APPEND1(h, t, SG_CAR(sp2));
1642 }
1643 SG_FOR_EACH(sp2, SG_CDR(sp2)) {
1644 SG_APPEND1(h, t, optimize(SG_CAR(sp2), rest));
1645 }
1646 return Sg_Cons(SYM_ALTER, h);
1647 }
1648 if (SG_EQ(type, SYM_SEQUENCE)) {
1649 seq = SG_CDR(ast);
1650 seqo = optimize_seq(seq, rest);
1651 if (SG_EQ(seq, seqo)) return ast;
1652 return Sg_Cons(type, seqo);
1653 }
1654
1655 if (SG_EQ(type, SYM_REGISTER)) {
1656 /* (register n name ast) */
1657 SgObject n, name;
1658 if (Sg_Length(ast) != 4) {
1659 Sg_Error(UC("Invalid AST register: %S"), ast);
1660 }
1661 n = SG_CADR(ast);
1662 name = SG_CAR(SG_CDDR(ast));
1663 seq = SG_CADR(SG_CDDR(ast));
1664 seqo = optimize(seq, rest);
1665 /* Sg_Printf(Sg_StandardErrorPort(), UC("ast: %S\n"), seq); */
1666 /* Sg_Printf(Sg_StandardErrorPort(), UC("seq: %S\n"), seqo); */
1667 if (SG_EQ(seq, seqo)) return ast;
1668 return SG_LIST4(SYM_REGISTER, n, name, seqo);
1669 }
1670
1671 seq = SG_CDR(ast);
1672 seqo = optimize(seq, rest);
1673 if (SG_EQ(seq, seqo)) return ast;
1674 return Sg_Cons(type, seqo);
1675 }
1676
1677 #include "regex_priv.inc"
1678
1679 typedef struct
1680 {
1681 inst_t *pc; /* current pc */
1682 int flags; /* compile time flags */
1683 int emitp; /* flag for count or emit */
1684 int codemax; /* max code count */
1685 prog_t *prog; /* building prog */
1686 inst_t *inst;
1687 long index; /* current inst index */
1688 int extendedp; /* extended regular expression or not */
1689 int lookbehindp;
1690 } compile_ctx_t;
1691
1692 static inst_arg_t null_arg = {0};
1693 static inst_t null_inst = {0};
1694
check_start_anchor(SgObject ast,int * modelessp)1695 static int check_start_anchor(SgObject ast, int *modelessp)
1696 {
1697 SgObject type;
1698 if (!SG_PAIRP(ast))
1699 type = ast;
1700 else
1701 type = SG_CAR(ast);
1702
1703 if (SG_EQ(type, SYM_START_ANCHOR)) return TRUE;
1704 else if (SG_EQ(type, SYM_MODELESS_START_ANCHOR)) {
1705 if (modelessp) *modelessp = TRUE;
1706 return TRUE;
1707 }
1708 else return FALSE;
1709 }
1710
emit(compile_ctx_t * ctx,unsigned char opcode,inst_arg_t arg)1711 static void emit(compile_ctx_t *ctx, unsigned char opcode,
1712 inst_arg_t arg)
1713 {
1714 if (ctx->emitp) {
1715 inst_t *i = &ctx->inst[ctx->index++];
1716 i->opcode = opcode;
1717 i->arg = arg;
1718 INST_FLAG_SET(i, ctx->flags);
1719 ctx->pc = ++i;
1720 } else {
1721 ctx->codemax++;
1722 }
1723 }
1724
1725 #define emit2(ctx, inst1, inst2, arg) \
1726 emit((ctx), (ctx)->negative ? (inst2) : (inst1), (arg))
1727
1728 static void compile_rec(compile_ctx_t *ctx, SgObject ast, int lastp);
1729
compile_seq(compile_ctx_t * ctx,SgObject seq,int lastp)1730 static void compile_seq(compile_ctx_t *ctx, SgObject seq, int lastp)
1731 {
1732 SgObject cp;
1733 if (ctx->emitp && ctx->lookbehindp) {
1734 seq = Sg_ReverseX(seq);
1735 }
1736 SG_FOR_EACH(cp, seq) {
1737 SgObject item = SG_CAR(cp);
1738 inst_arg_t arg;
1739 if (SG_CHARP(item)) {
1740 /* TODO we need to concat chars to string. but it must be done by
1741 optimization.*/
1742 arg.c = SG_CHAR_VALUE(item);
1743 emit(ctx, RX_CHAR, arg);
1744 } else {
1745 int p;
1746 p = lastp && SG_NULLP(SG_CDR(cp));
1747 compile_rec(ctx, item, p);
1748 }
1749 }
1750 }
1751
compile_rep_seq(compile_ctx_t * ctx,SgObject seq,long count,int lastp)1752 static void compile_rep_seq(compile_ctx_t *ctx, SgObject seq,
1753 long count, int lastp)
1754 {
1755 SgObject h = SG_NIL, t = SG_NIL;
1756 /* I don't remenber why I needed to check this, but this causes an error with
1757 #/(?:aa?){n}/ pattern.
1758 */
1759 /* int seqp = (SG_PAIRP(seq) && SG_EQ(SG_CAR(seq), SYM_SEQUENCE)); */
1760 if (count <= 0) return;
1761 while (count-- > 0) {
1762 /* if (seqp) { */
1763 /* SG_APPEND(h, t, Sg_CopyList(seq)); */
1764 /* } else */
1765 if (SG_PAIRP(seq)) {
1766 SG_APPEND1(h, t, Sg_CopyList(seq));
1767 } else {
1768 SG_APPEND1(h, t, seq);
1769 }
1770 }
1771 /* h is ((sequence ...) ...) so we can simply pass it to compile_seq
1772 TODO maybe we need to flatten */
1773 compile_seq(ctx, h, lastp);
1774 }
1775
compile_min_max(compile_ctx_t * ctx,SgObject type,long count,SgObject item,int lastp)1776 static void compile_min_max(compile_ctx_t *ctx, SgObject type,
1777 long count, SgObject item, int lastp)
1778 {
1779 /* {m, n} pattern. it can be replaced like this.
1780 x{2,5} = xx(x(x(x)?)?)?
1781 VM: Instruction
1782 0: char x ;; These parts are already emited by compile_rec
1783 1: char x ;;
1784 2: split 3 8
1785 3: char x
1786 4: split 5 8
1787 5: char x
1788 6: split 7 8
1789 7: char x
1790 8: match
1791 */
1792 long i;
1793 SgObject h = SG_NIL, t = SG_NIL, cp;
1794 int nongreedyp = SG_EQ(type, SYM_NON_GREEDY_REP);
1795
1796 for (i = 0; i < count; i++) {
1797 inst_t *pc1 = ctx->pc;
1798 emit(ctx, RX_SPLIT, null_arg);
1799 pc1->arg.pos.x = ctx->pc;
1800 compile_rec(ctx, item, lastp);
1801 /* save current pc to patch later */
1802 if (ctx->emitp) SG_APPEND1(h, t, pc1);
1803 }
1804 if (ctx->emitp) {
1805 SG_FOR_EACH(cp, h) {
1806 inst_t *pc = (inst_t*)SG_CAR(cp);
1807 pc->arg.pos.y = ctx->pc;
1808 }
1809 /* swap for non greedy */
1810 if (nongreedyp) {
1811 SG_FOR_EACH(cp, h) {
1812 inst_t *pc1 = (inst_t*)SG_CAR(cp);
1813 inst_t *pc2 = pc1->arg.pos.x;
1814 pc1->arg.pos.x = pc1->arg.pos.y;
1815 pc1->arg.pos.y = pc2;
1816 }
1817 }
1818 }
1819 return;
1820 }
1821
calculate_flags(int flag,SgObject flags)1822 static int calculate_flags(int flag, SgObject flags)
1823 {
1824 SgObject cp;
1825 SG_FOR_EACH(cp, flags) {
1826 SgObject slot = SG_CAR(cp);
1827 ASSERT(SG_CHARP(SG_CAR(slot)));
1828 switch (SG_CHAR_VALUE(SG_CAR(slot))) {
1829 case 'i':
1830 if (SG_FALSEP(SG_CDR(slot)))
1831 flag &= ~SG_CASE_INSENSITIVE;
1832 else
1833 flag |= SG_CASE_INSENSITIVE;
1834 break;
1835 case 'm':
1836 if (SG_FALSEP(SG_CDR(slot)))
1837 flag &= ~SG_MULTILINE;
1838 else
1839 flag |= SG_MULTILINE;
1840 break;
1841 case 's':
1842 if (SG_FALSEP(SG_CDR(slot)))
1843 flag &= ~SG_DOTALL;
1844 else
1845 flag |= SG_DOTALL;
1846 break;
1847 case 'u':
1848 if (SG_FALSEP(SG_CDR(slot)))
1849 flag &= ~SG_UNICODE_CASE;
1850 else
1851 flag |= SG_UNICODE_CASE;
1852 break;
1853 }
1854 }
1855 return flag;
1856 }
1857
compile_rec(compile_ctx_t * ctx,SgObject ast,int lastp)1858 static void compile_rec(compile_ctx_t *ctx, SgObject ast, int lastp)
1859 {
1860 SgObject type;
1861 inst_arg_t arg;
1862 /* first, deal with atom */
1863 if (!SG_PAIRP(ast)) {
1864 /* a char */
1865 if (SG_CHARP(ast)) {
1866 /* TODO maybe we can deal with case insensitive here */
1867 arg.c = SG_CHAR_VALUE(ast);
1868 emit(ctx, RX_CHAR, arg);
1869 return;
1870 }
1871 /* charset */
1872 if (SG_CHAR_SET_P(ast)) {
1873 arg.set = ast;
1874 emit(ctx, RX_SET, arg);
1875 return;
1876 }
1877 /* special stuff */
1878 if (SG_SYMBOLP(ast)) {
1879 if (SG_EQ(ast, SYM_EVERYTHING)) {
1880 emit(ctx, RX_ANY, null_arg);
1881 return;
1882 }
1883 if (SG_EQ(ast, SYM_START_ANCHOR) ||
1884 SG_EQ(ast, SYM_MODELESS_START_ANCHOR)) {
1885 /* set flags */
1886 /* TODO check compile time flag */
1887 arg.flags = SG_EQ(ast, SYM_START_ANCHOR)
1888 ? EmptyBeginLine : EmptyBeginText;
1889 emit(ctx, RX_EMPTY, arg);
1890 return;
1891 }
1892 if (SG_EQ(ast, SYM_END_ANCHOR) ||
1893 SG_EQ(ast, SYM_MODELESS_END_ANCHOR) ||
1894 SG_EQ(ast, SYM_MODELESS_END_ANCHOR_NO_NEWLINE)) {
1895 /* Gauche supports end-anchor as literal char '$' in some context.
1896 But we do as defact standard(Perl) way.*/
1897 /* set flags */
1898 /* TODO check compile time flag */
1899 if (SG_EQ(ast, SYM_END_ANCHOR)) {
1900 arg.flags = EmptyEndLine;
1901 } else if (SG_EQ(ast, SYM_MODELESS_END_ANCHOR)) {
1902 /* '$' without multiline mode flag is the same as \\Z */
1903 arg.flags = EmptyEndTextNoNewLine;
1904 } else {
1905 arg.flags = EmptyEndText;
1906 }
1907 emit(ctx, RX_EMPTY, arg);
1908 return;
1909 }
1910 if (SG_EQ(ast, SYM_WORD_BOUNDARY)) {
1911 arg.flags = EmptyWordBoundary;
1912 emit(ctx, RX_EMPTY, arg);
1913 return;
1914 }
1915 if (SG_EQ(ast, SYM_NON_WORD_BOUNDARY)) {
1916 arg.flags = EmptyNonWordBoundary;
1917 emit(ctx, RX_EMPTY, arg);
1918 return;
1919 }
1920 /* fallback */
1921 }
1922 Sg_Error(UC("[internal:regex] unrecognized AST item: %S"), ast);
1923 }
1924 /* structured node */
1925 type = SG_CAR(ast);
1926 /* do with simple ones */
1927 if (SG_EQ(type, SYM_SEQUENCE)) {
1928 /* we do not have any implicit sequence */
1929 compile_seq(ctx, SG_CDR(ast), lastp);
1930 return;
1931 }
1932
1933 if (SG_EQ(type, SYM_FLAGGED_SEQUENCE)) {
1934 SgObject flags = SG_CADR(ast);
1935 SgObject seq = SG_CAR(SG_CDDR(ast));
1936 int flag = calculate_flags(ctx->flags, flags), save = ctx->flags;
1937 ctx->flags = arg.flags = flag;
1938 compile_rec(ctx, seq, lastp);
1939 ctx->flags = arg.flags = save;
1940 return;
1941 }
1942
1943 if (SG_EQ(type, SYM_INVERTED_CHAR_CLASS)) {
1944 SgObject cs = SG_CADR(ast);
1945 ASSERT(SG_CHAR_SET_P(cs));
1946 arg.set = cs;
1947 emit(ctx, RX_NSET, arg);
1948 return;
1949 }
1950 if (SG_EQ(type, SYM_REGISTER)) {
1951 /* (register <number> <name> <ast>) */
1952 long grpno = SG_INT_VALUE(SG_CADR(ast));
1953 arg.n = 2*grpno;
1954 emit(ctx, RX_SAVE, arg);
1955 compile_rec(ctx, SG_CADR(SG_CDDR(ast)), lastp);
1956 arg.n = 2*grpno+1;
1957 emit(ctx, RX_SAVE, arg);
1958 return;
1959 }
1960
1961 /*
1962 (alter (seq aa) (seq bb) (seq cc))
1963 ->
1964 0: split 1 8
1965 1: split 2 5
1966 2: char a
1967 3: char a
1968 4: jmp 7
1969 5: char b
1970 6: char b
1971 7: jmp 10
1972 8: char c
1973 9: char c
1974 10: match
1975 so we need to separate into two, cadr part and the rest.
1976 -> (alter (seq aa) (alter (seq bb) (seq cc)))
1977 */
1978 if (SG_EQ(type, SYM_ALTER)) {
1979 if (SG_PAIRP(SG_CDR(ast))) {
1980 inst_t *pc1 = ctx->pc, *pc2;
1981 emit(ctx, RX_SPLIT, null_arg);
1982 pc1->arg.pos.x = ctx->pc;
1983 compile_rec(ctx, SG_CADR(ast), lastp);
1984 pc2 = ctx->pc;
1985 emit(ctx, RX_JMP, null_arg);
1986 pc1->arg.pos.y = ctx->pc;
1987 if (Sg_Length((SG_CDDR(ast))) != 1) {
1988 /* more than two */
1989 compile_rec(ctx, Sg_Cons(SYM_ALTER, SG_CDDR(ast)), lastp);
1990 } else {
1991 compile_rec(ctx, SG_CAR(SG_CDDR(ast)), lastp);
1992 }
1993 pc2->arg.pos.x = ctx->pc;
1994 } else {
1995 emit(ctx, RX_FAIL, null_arg);
1996 }
1997 return;
1998 }
1999
2000 if (SG_EQ(type, SYM_GREEDY_REP) || SG_EQ(type, SYM_NON_GREEDY_REP)) {
2001 SgObject min = SG_CADR(ast), max = SG_CAR(SG_CDDR(ast));
2002 SgObject item = SG_CADR(SG_CDDR(ast));
2003 int multip = 0;
2004 inst_t *pc1, *pc2;
2005
2006 if (SG_FALSEP(max) || SG_INT_VALUE(max) > 1)
2007 multip = TRUE;
2008 compile_rep_seq(ctx, item, SG_INT_VALUE(min), multip);
2009
2010 if (SG_EQ(min, max)) return; /* well, it must match exact times */
2011 if (!SG_FALSEP(max)) {
2012 long count = SG_INT_VALUE(max) - SG_INT_VALUE(min);
2013 compile_min_max(ctx, type, count, item, lastp);
2014 return;
2015 }
2016 /* save current instruction position */
2017 pc1 = ctx->pc;
2018 emit(ctx, RX_SPLIT, null_arg);
2019 pc1->arg.pos.x = ctx->pc;
2020 compile_rec(ctx, item, FALSE);
2021 /* we've already resolved minimam match so let introduce jmp here */
2022 arg.pos.x = pc1;
2023 emit(ctx, RX_JMP, arg);
2024 pc1->arg.pos.y = ctx->pc;
2025 if (SG_EQ(type, SYM_NON_GREEDY_REP)) {
2026 pc2 = pc1->arg.pos.x;
2027 pc1->arg.pos.x = pc1->arg.pos.y;
2028 pc1->arg.pos.y = pc2;
2029 }
2030 return;
2031 }
2032
2033 if (SG_EQ(type, SYM_BACKREF)) {
2034 SgObject num = SG_CDR(ast);
2035 ctx->extendedp = TRUE;
2036 if (SG_INTP(num)) {
2037 arg.index = SG_INT_VALUE(num);
2038 emit(ctx, RX_BREF, arg);
2039 } else {
2040 raise_compile_error(UC("invalid backreference number. %S"), num);
2041 }
2042 return;
2043 }
2044
2045 if (SG_EQ(type, SYM_LOOKAHEAD) || SG_EQ(type, SYM_LOOKBHIND)) {
2046 SgObject neg = SG_CADR(ast);
2047 SgObject seq = SG_CAR(SG_CDDR(ast));
2048 inst_t *pc;
2049 int saved = ctx->lookbehindp;
2050 ctx->extendedp = TRUE;
2051 pc = ctx->pc;
2052 if (SG_EQ(type, SYM_LOOKAHEAD)) {
2053 emit(ctx, (SG_FALSEP(neg)) ? RX_NAHEAD : RX_AHEAD, null_arg);
2054 } else {
2055 emit(ctx, (SG_FALSEP(neg)) ? RX_NBEHIND : RX_BEHIND, null_arg);
2056 ctx->lookbehindp = TRUE;
2057 }
2058 /* Comment from Gauche regexp.c
2059 Assertions can check EOF even other regexps follow, so '$'
2060 in the last pos of this group should be treated as EOL.
2061 (?>$) as well. It is consistent with Perl and Oniguruma. */
2062 compile_rec(ctx, seq, FALSE);
2063 emit(ctx, RX_RESTORE, null_arg);
2064 pc->arg.pos.x = ctx->pc;
2065 ctx->lookbehindp = saved;
2066 return;
2067 }
2068
2069 if (SG_EQ(type, SYM_STANDALONE)) {
2070 /* almost the same as look ahead/behind */
2071 SgObject seq = SG_CADR(ast);
2072 inst_t *pc;
2073 ctx->extendedp = TRUE;
2074 pc = ctx->pc;
2075 emit(ctx, RX_ONCE, null_arg);
2076 compile_rec(ctx, seq, FALSE);
2077 emit(ctx, RX_RESTORE, null_arg);
2078 pc->arg.pos.x = ctx->pc;
2079 return;
2080 }
2081
2082 if (SG_EQ(type, SYM_BRANCH)) {
2083 SgObject cond = SG_CADR(ast);
2084 SgObject rest = SG_CAR(SG_CDDR(ast));
2085
2086 #define NULL_SEQP(ast) (SG_PAIRP(ast) && SG_NULLP(SG_CDR(ast)))
2087
2088 ctx->extendedp = TRUE;
2089 if (SG_INTP(cond)) {
2090 /* (branch n regexp)
2091 0: branch <n> 1 3
2092 1: <yes-pattern>
2093 2: jmp 4
2094 3: <no-pattern>
2095 4: rest
2096 */
2097 inst_t *pc = ctx->pc, *pc2;
2098 arg.cond.n = SG_INT_VALUE(cond) * 2;
2099 emit(ctx, RX_BRANCH, arg);
2100 pc->arg.cond.x = ctx->pc;
2101 if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2102 /* check syntax */
2103 if (!SG_EQ(SG_CAR(rest), SYM_ALTER)) {
2104 raise_compile_error(UC("branch has non alter regex."), ast);
2105 return; /* dummy */
2106 }
2107 }
2108 compile_rec(ctx, (!NULL_SEQP(rest) && SG_PAIRP(rest))
2109 ? SG_CADR(rest) : rest, lastp);
2110 pc2 = ctx->pc;
2111 emit(ctx, RX_JMP, null_arg);
2112 pc->arg.cond.y = ctx->pc;
2113 if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2114 compile_rec(ctx, SG_CAR(SG_CDDR(rest)), lastp);
2115 } else {
2116 compile_rec(ctx, SG_NIL, lastp);
2117 }
2118 pc2->arg.pos.x = ctx->pc;
2119 } else {
2120 /*
2121 (branch assert regexp)
2122 0: brancha 3 5
2123 1: <assert>
2124 2: restore ;; this must be in assert
2125 3: <yes-pattern>
2126 4: jmp 6
2127 5: <no-pattern>
2128 6: rest
2129 */
2130 inst_t *pc = ctx->pc, *pc2;
2131 emit(ctx, RX_BRANCHA, null_arg);
2132 compile_rec(ctx, cond, lastp);
2133 pc->arg.cond.x = ctx->pc;
2134 if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2135 /* check syntax */
2136 if (!SG_EQ(SG_CAR(rest), SYM_ALTER)) {
2137 raise_compile_error(UC("branch has non alter regex."), ast);
2138 return; /* dummy */
2139 }
2140 }
2141 compile_rec(ctx, (!NULL_SEQP(rest) && SG_PAIRP(rest))
2142 ? SG_CADR(rest) : rest, lastp);
2143 pc2 = ctx->pc;
2144 emit(ctx, RX_JMP, null_arg);
2145 pc->arg.cond.y = ctx->pc;
2146 if (!NULL_SEQP(rest) && SG_PAIRP(rest)) {
2147 compile_rec(ctx, SG_CAR(SG_CDDR(rest)), lastp);
2148 } else {
2149 emit(ctx, RX_FAIL, null_arg);
2150 }
2151 pc2->arg.pos.x = ctx->pc;
2152 }
2153 return;
2154 }
2155
2156 Sg_Error(UC("unknown AST type: %S"), type);
2157 }
2158
2159 /*
2160 We run compile_rec max 3 times.
2161 First time is to count instructions. Second time is for root and third time
2162 is for matchRoot. The second and third time could be only once if given
2163 regular expression starts with \A.
2164 */
compile(compile_ctx_t * ctx,SgObject ast)2165 static prog_t* compile(compile_ctx_t *ctx, SgObject ast)
2166 {
2167 int n, modeless = FALSE, offset = 0;
2168 prog_t *p;
2169 inst_t *match;
2170
2171 ctx->pc = &null_inst; /* put dummy */
2172 check_start_anchor(ast, &modeless);
2173 ctx->emitp = FALSE;
2174 compile_rec(ctx, ast, TRUE);
2175 n = ctx->codemax + 1;
2176
2177 p = SG_NEW(prog_t);
2178 /* offset = (modeless) ? 0 : 3; */
2179 /* we need to add split, any and jmp */
2180 p->root = SG_NEW_ARRAY(inst_t, n + offset);
2181 p->rootLength = n + offset;
2182 ctx->prog = p;
2183 ctx->pc = &p->root[0];
2184 ctx->inst = p->root;
2185 ctx->index = 0;
2186 ctx->emitp = TRUE;
2187
2188 compile_rec(ctx, ast, TRUE);
2189 /* last instruction must be RX_MATCH */
2190 match = &p->root[n+offset-1];
2191 match->opcode = RX_MATCH;
2192
2193 return p;
2194 }
2195
2196
2197 /* compile takes 3 pass
2198 pass1: string->ast
2199 pass2: optimize
2200 pass3: code generation
2201 */
pattern_printer(SgObject self,SgPort * port,SgWriteContext * ctx)2202 static void pattern_printer(SgObject self, SgPort *port, SgWriteContext *ctx)
2203 {
2204 SgPattern *pattern = SG_PATTERN(self);
2205 long i, size = SG_STRING_SIZE(pattern->pattern);
2206 SG_PORT_LOCK_WRITE(port);
2207 Sg_PutzUnsafe(port, "#/");
2208 /* Sg_Printf(port, UC("#/%A/"), pattern->pattern); */
2209 for (i = 0; i < size;) {
2210 SgChar c = SG_STRING_VALUE_AT(pattern->pattern, i++);
2211 if (c == '\\') {
2212 Sg_PutcUnsafe(port, '\\');
2213 Sg_PutcUnsafe(port, SG_STRING_VALUE_AT(pattern->pattern, i++));
2214 } else if (c == '/') {
2215 Sg_PutcUnsafe(port, '\\');
2216 Sg_PutcUnsafe(port, c);
2217 } else {
2218 Sg_PutcUnsafe(port, c);
2219 }
2220 }
2221 Sg_PutcUnsafe(port, '/');
2222 /* flags */
2223 if (has(pattern, SG_COMMENTS)) {
2224 Sg_PutcUnsafe(port, 'x');
2225 }
2226 if (has(pattern, SG_CASE_INSENSITIVE)) {
2227 Sg_PutcUnsafe(port, 'i');
2228 }
2229 if (has(pattern, SG_MULTILINE)) {
2230 Sg_PutcUnsafe(port, 'm');
2231 }
2232 if (has(pattern, SG_DOTALL)) {
2233 Sg_PutcUnsafe(port, 's');
2234 }
2235 if (has(pattern, SG_UNICODE_CASE)) {
2236 Sg_PutcUnsafe(port, 'u');
2237 }
2238 SG_PORT_UNLOCK_WRITE(port);
2239 }
2240
pattern_cache_reader(SgPort * port,SgReadCacheCtx * ctx)2241 static SgObject pattern_cache_reader(SgPort *port, SgReadCacheCtx *ctx)
2242 {
2243 /* assume next object is pattern */
2244 SgObject pattern = Sg_ReadCacheObject(port, ctx);
2245 SgObject flags = Sg_ReadCacheObject(port, ctx);
2246 if (SG_STRINGP(pattern) && SG_INTP(flags)) {
2247 return Sg_CompileRegex(pattern, (int)SG_INT_VALUE(flags), FALSE);
2248 } else {
2249 return SG_FALSE;
2250 }
2251 }
2252
pattern_cache_scanner(SgObject obj,SgObject cbs,SgWriteCacheCtx * ctx)2253 static SgObject pattern_cache_scanner(SgObject obj, SgObject cbs,
2254 SgWriteCacheCtx *ctx)
2255 {
2256 /* since we don't have any compiler from AST, we can assume pattern
2257 has original pattern. */
2258 return Sg_WriteCacheScanRec(SG_PATTERN(obj)->pattern, cbs, ctx);;
2259 }
2260
pattern_cache_writer(SgObject obj,SgPort * port,SgWriteCacheCtx * ctx)2261 static void pattern_cache_writer(SgObject obj, SgPort *port,
2262 SgWriteCacheCtx *ctx)
2263 {
2264 /* just put original pattern and flags*/
2265 Sg_WriteObjectCache(SG_PATTERN(obj)->pattern, port, ctx);
2266 Sg_WriteObjectCache(SG_MAKE_INT(SG_PATTERN(obj)->flags), port, ctx);
2267 }
2268
2269 #define DEFINE_CLASS_WITH_CACHE SG_DEFINE_BUILTIN_CLASS_SIMPLE_WITH_CACHE
2270
2271 DEFINE_CLASS_WITH_CACHE(Sg_PatternClass,
2272 pattern_cache_reader,
2273 pattern_cache_scanner,
2274 pattern_cache_writer,
2275 pattern_printer);
2276
2277 static void unparse(SgObject reg, SgPort *out);
charset_print_ch(SgPort * out,SgChar ch,int firstp)2278 static void charset_print_ch(SgPort *out, SgChar ch, int firstp)
2279 {
2280 if (ch == '[' || ch == ']' || ch == '-' || (ch == '^' && firstp)) {
2281 Sg_Printf(out, UC("\\%c"), ch);
2282 } else if (ch < 0x20 || ch == 0x7f) {
2283 /* \xXX is enough for range of ascii */
2284 Sg_Printf(out, UC("\\x%02x"), ch);
2285 } else {
2286 switch (Sg_CharGeneralCategory(ch)) {
2287 case Mn: case Mc: case Me: case Cc: case Cf: case Cs: case Co: case Cn:
2288 if (ch < 0x10000) Sg_Printf(out, UC("\\u%04x"), ch);
2289 else Sg_Printf(out, UC("\\U%08x"), ch);
2290 break;
2291 default:
2292 Sg_PutcUnsafe(out, ch);
2293 }
2294 }
2295 }
2296
charset_to_regex(SgObject cs,int invertP,SgPort * out)2297 static void charset_to_regex(SgObject cs, int invertP, SgPort *out)
2298 {
2299 SgObject ranges = Sg_CharSetRanges(cs), cp;
2300 int firstp = TRUE;
2301
2302 Sg_PutcUnsafe(out, '[');
2303 if (invertP) {
2304 Sg_PutcUnsafe(out, '^');
2305 }
2306 SG_FOR_EACH(cp, ranges) {
2307 SgObject cell = SG_CAR(cp);
2308 SgChar start = (SgChar)SG_INT_VALUE(SG_CAR(cell));
2309 SgChar end = (SgChar)SG_INT_VALUE(SG_CDR(cell));
2310 charset_print_ch(out, start, firstp);
2311 firstp = FALSE;
2312 if (start != end) {
2313 Sg_PutcUnsafe(out, '-');
2314 charset_print_ch(out, end, FALSE);
2315 }
2316 }
2317 Sg_PutcUnsafe(out, ']');
2318 }
2319
unparse_seq(SgObject reg,SgPort * out)2320 static void unparse_seq(SgObject reg, SgPort *out)
2321 {
2322 SgObject cp;
2323 SG_FOR_EACH(cp, reg) {
2324 unparse(SG_CAR(cp), out);
2325 }
2326 }
2327
unparse_between(const char * a,SgObject n,const char * b,SgPort * out)2328 static void unparse_between(const char *a, SgObject n, const char *b,
2329 SgPort *out)
2330 {
2331 Sg_PutzUnsafe(out, a);
2332 unparse_seq(n, out);
2333 Sg_PutzUnsafe(out, b);
2334 }
2335
unparse_reg(SgObject reg,SgPort * out)2336 static void unparse_reg(SgObject reg, SgPort *out)
2337 {
2338 SgObject n, name, rest;
2339 if (SG_NULLP(SG_CDR(reg))) goto err;
2340 n = SG_CADR(reg);
2341 if (SG_NULLP(SG_CDDR(reg))) goto err;
2342 name = SG_CAR(SG_CDDR(reg));
2343 rest = SG_CDR(SG_CDDR(reg));
2344
2345 if (SG_EQ(n, SG_MAKE_INT(0))) {
2346 if (!SG_FALSEP(name)) {
2347 Sg_Error(UC("toplevel group can't have name"), name);
2348 }
2349 unparse_seq(rest, out);
2350 } else if (!SG_FALSEP(name)) {
2351 Sg_PutzUnsafe(out, "(?<");
2352 Sg_Write(name, out, SG_WRITE_DISPLAY);
2353 Sg_PutcUnsafe(out, '>');
2354 unparse_seq(rest, out);
2355 Sg_PutcUnsafe(out, ')');
2356 } else {
2357 Sg_PutcUnsafe(out, '(');
2358 unparse_seq(rest, out);
2359 Sg_PutcUnsafe(out, ')');
2360 }
2361 return;
2362 err:
2363 Sg_Error(UC("invalid AST node %S [reg]"), reg);
2364 }
2365
unparse_intersp(SgObject n,const char * sep,SgPort * out)2366 static void unparse_intersp(SgObject n, const char *sep, SgPort *out)
2367 {
2368 SgObject cp;
2369 if (SG_NULLP(n)) return;
2370 Sg_PutzUnsafe(out, "(?:");
2371 unparse(SG_CAR(n), out);
2372 SG_FOR_EACH(cp, SG_CDR(n)) {
2373 Sg_PutzUnsafe(out, sep);
2374 unparse(SG_CAR(cp), out);
2375 }
2376 Sg_PutcUnsafe(out, ')');
2377 }
2378
unparse_rep(SgObject o,int greedyP,SgPort * out)2379 static void unparse_rep(SgObject o, int greedyP, SgPort *out)
2380 {
2381 /* must have 4 elements */
2382 SgObject min, max, ns, n = o;
2383 if (!SG_PAIRP(SG_CDR(n))) goto err;
2384 n = SG_CDR(n);
2385 min = SG_CAR(n);
2386 if (!SG_INTP(min) && !SG_FALSEP(min)) goto err;
2387 if (!SG_PAIRP(SG_CDR(n))) goto err;
2388 n = SG_CDR(n);
2389 max = SG_CAR(n);
2390 if (!SG_INTP(max) && !SG_FALSEP(max)) goto err;
2391 ns = SG_CDR(n);
2392
2393 unparse_between("(?:", ns, ")", out);
2394 if (SG_FALSEP(max)) {
2395 if (SG_EQ(min, SG_MAKE_INT(0)) || SG_FALSEP(min)) {
2396 Sg_PutcUnsafe(out, '*');
2397 } else if (SG_EQ(min, SG_MAKE_INT(1))) {
2398 Sg_PutcUnsafe(out, '+');
2399 } else {
2400 Sg_Printf(out, UC("{%d,}"), SG_INT_VALUE(min));
2401 }
2402 } else if (SG_EQ(min, max)) {
2403 /* who write this? */
2404 if (SG_EQ(min, SG_MAKE_INT(0))) {
2405 Sg_PutzUnsafe(out, "{0}");
2406 } else if (!SG_EQ(min, SG_MAKE_INT(1))) {
2407 Sg_Printf(out, UC("{%d}"), SG_INT_VALUE(min));
2408 } /* if min == max == 1 then we don't have to write */
2409 } else if (SG_EQ(SG_MAKE_INT(0), min) && SG_EQ(SG_MAKE_INT(1), max)) {
2410 Sg_PutcUnsafe(out, '?');
2411 } else {
2412 if (SG_FALSEP(min)) {
2413 Sg_Printf(out, UC("{0,%d}"), SG_INT_VALUE(max));
2414 } else {
2415 Sg_Printf(out, UC("{%d,%d}"), SG_INT_VALUE(min), SG_INT_VALUE(max));
2416 }
2417 }
2418 if (!greedyP) {
2419 Sg_PutcUnsafe(out, '?');
2420 }
2421 return;
2422 err:
2423 Sg_Error(UC("invalid AST node %S [rep]"), o);
2424 }
2425
unparse_look(SgObject o,const char * behind,SgPort * out)2426 static void unparse_look(SgObject o, const char *behind, SgPort *out)
2427 {
2428 SgObject n = o, assert, ns;
2429 if (!SG_PAIRP(SG_CDR(n))) goto err;
2430 n = SG_CDR(n);
2431 assert = SG_CAR(n);
2432 ns = SG_CDR(n);
2433
2434 Sg_PutzUnsafe(out, "(?");
2435 Sg_PutzUnsafe(out, behind);
2436 unparse_between((SG_FALSEP(assert) ? "!" : "="), ns, ")", out);
2437 return;
2438 err:
2439 Sg_Error(UC("invalid AST node %S [look]"), o);
2440 }
2441
unparse(SgObject n,SgPort * out)2442 static void unparse(SgObject n, SgPort *out)
2443 {
2444 if (SG_CHARP(n)) {
2445 SgChar c = SG_CHAR_VALUE(n);
2446 if (c == '.' || c == '^' || c == '$' || c == '(' || c == ')' ||
2447 c == '{' || c == '}' || c == '[' || c == ']' || c == '\\' ||
2448 c == '*' || c == '+' || c == '?' || c == '|') {
2449 Sg_PutcUnsafe(out, '\\');
2450 }
2451 Sg_PutcUnsafe(out, c);
2452 } else if (SG_CHAR_SET_P(n)) {
2453 charset_to_regex(n, FALSE, out);
2454 } else if (SG_EQ(n, SYM_EVERYTHING)) {
2455 Sg_PutcUnsafe(out, '.');
2456 } else if (SG_EQ(n, SYM_START_ANCHOR)) {
2457 Sg_PutcUnsafe(out, '^');
2458 } else if (SG_EQ(n, SYM_MODELESS_START_ANCHOR)) {
2459 Sg_PutzUnsafe(out, "\\A");
2460 } else if (SG_EQ(n, SYM_END_ANCHOR)) {
2461 Sg_PutcUnsafe(out, '$');
2462 } else if (SG_EQ(n, SYM_MODELESS_END_ANCHOR)) {
2463 Sg_PutzUnsafe(out, "\\Z");
2464 } else if (SG_EQ(n, SYM_MODELESS_END_ANCHOR_NO_NEWLINE)) {
2465 Sg_PutzUnsafe(out, "\\z");
2466 } else if (SG_EQ(n, SYM_WORD_BOUNDARY)) {
2467 Sg_PutzUnsafe(out, "\\b");
2468 } else if (SG_EQ(n, SYM_NON_WORD_BOUNDARY)) {
2469 Sg_PutzUnsafe(out, "\\B");
2470 } else if (SG_PAIRP(n)) {
2471 SgObject f = SG_CAR(n);
2472 if (SG_EQ(f, SYM_REGISTER)) {
2473 /* capture */
2474 unparse_reg(n, out);
2475 } else if (SG_EQ(f, SYM_SEQUENCE)) {
2476 unparse_seq(SG_CDR(n), out);
2477 } else if (SG_EQ(f, SYM_FLAGGED_SEQUENCE)) {
2478 SgObject flags, cp;
2479 if (SG_NULLP(SG_CDR(n))) goto err;
2480 flags = SG_CADR(n);
2481 if (!SG_PAIRP(flags)) goto err;
2482 Sg_PutzUnsafe(out, "(?");
2483 SG_FOR_EACH(cp, flags) {
2484 SgObject flag = SG_CAR(cp);
2485 if (!SG_PAIRP(flag)) goto err;
2486 if (!SG_CHARP(SG_CAR(flag))) goto err;
2487 if (SG_FALSEP(SG_CDR(flag))) {
2488 Sg_PutcUnsafe(out, '-');
2489 }
2490 Sg_PutcUnsafe(out, SG_CHAR_VALUE(SG_CAR(flag)));
2491 }
2492 unparse_between(":", SG_CDDR(n), ")", out);
2493 } else if (SG_EQ(f, SYM_ALTER)) {
2494 unparse_intersp(SG_CDR(n), "|", out);
2495 } else if (SG_EQ(f, SYM_STANDALONE)) {
2496 unparse_between("(?>", SG_CDR(n), ")", out);
2497 } else if (SG_EQ(f, SYM_GREEDY_REP)) {
2498 unparse_rep(n, TRUE, out);
2499 } else if (SG_EQ(f, SYM_NON_GREEDY_REP)) {
2500 unparse_rep(n, FALSE, out);
2501 } else if (SG_EQ(f, SYM_LOOKAHEAD)) {
2502 unparse_look(n, "", out);
2503 } else if (SG_EQ(f, SYM_LOOKBHIND)) {
2504 unparse_look(n, "<", out);
2505 } else if (SG_EQ(f, SYM_BACKREF)) {
2506 if (!SG_INTP(SG_CDR(n))) goto err;
2507 Sg_Printf(out, UC("\\%d"), SG_INT_VALUE(SG_CDR(n)));
2508 } else if (SG_EQ(f, SYM_BRANCH)) {
2509 /* check alter */
2510 SgObject name, ns, t = n, yes, no;
2511 if (!SG_PAIRP(SG_CDR(t))) goto err;
2512 name = SG_CADR(t);
2513 ns = SG_CDDR(t);
2514 if (!SG_PAIRP(ns) || !SG_PAIRP(SG_CAR(ns)) ||
2515 !SG_EQ(SG_CAAR(ns), SYM_ALTER)) {
2516 Sg_Error(UC("branch must have alter. %S"), n);
2517 }
2518 /* (alternation (sequence ...) (sequence ...) */
2519 ns = SG_CAR(ns);
2520
2521 Sg_PutzUnsafe(out, "(?");
2522 if (SG_INTP(name)) {
2523 Sg_Printf(out, UC("(%d)"), SG_INT_VALUE(name));
2524 } else if (SG_PAIRP(name)) {
2525 if (SG_EQ(SG_CAR(name), SYM_LOOKAHEAD)) {
2526 unparse_look(name, "", out);
2527 } else if (SG_EQ(SG_CAR(name), SYM_LOOKBHIND)) {
2528 unparse_look(name, "<", out);
2529 } else goto err;
2530 } else goto err;
2531 /* if the false condition contains (sequence) then
2532 we don't want to emit | */
2533 if (!SG_PAIRP(SG_CDR(ns))) goto err;
2534 yes = SG_CADR(ns);
2535 ns = SG_CDR(ns);
2536 if (!SG_PAIRP(SG_CDR(ns))) goto err;
2537 no = SG_CADR(ns);
2538
2539 /* just emit yes */
2540 unparse(yes, out);
2541 if (SG_PAIRP(no) && !SG_NULLP(SG_CDR(no))) {
2542 Sg_PutcUnsafe(out, '|');
2543 unparse(no, out);
2544 }
2545
2546 } else if (SG_EQ(f, SYM_INVERTED_CHAR_CLASS)) {
2547 if (!SG_PAIRP(SG_CDR(n))) goto err;
2548 if (!SG_CHAR_SET_P(SG_CADR(n))) goto err;
2549 charset_to_regex(SG_CADR(n), TRUE, out);
2550 } else goto err;
2551
2552 } else {
2553 err:
2554 /* never happen but raise an error */
2555 Sg_Error(UC("invalid AST node %S [unparse]"), n);
2556 }
2557 }
2558
unparse_ast(SgObject ast)2559 static SgObject unparse_ast(SgObject ast)
2560 {
2561 SgPort *out;
2562 SgStringPort tp;
2563 SgObject str;
2564 out = Sg_InitStringOutputPort(&tp, 0);
2565
2566 unparse(ast, out);
2567
2568 str = SG_STRING(Sg_GetStringFromStringPort(&tp));
2569 SG_CLEAN_STRING_PORT(&tp);
2570 return str;
2571 }
2572
make_pattern(SgObject p,SgObject ast,int flags,int reg_num,SgObject names,prog_t * prog,compile_ctx_t * cctx)2573 static SgPattern* make_pattern(SgObject p, SgObject ast, int flags,
2574 int reg_num, SgObject names, prog_t *prog,
2575 compile_ctx_t *cctx)
2576 {
2577 SgPattern *pt = SG_NEW(SgPattern);
2578 SG_SET_CLASS(pt, SG_CLASS_PATTERN);
2579 pt->pattern = p;
2580 pt->ast = ast;
2581 pt->flags = flags;
2582 pt->groupCount = reg_num;
2583 pt->prog = prog;
2584 pt->extendedp = cctx->extendedp;
2585 pt->groupNames = names;
2586 return pt;
2587 }
2588
count_register(SgObject ast,int * acc,SgObject * regs)2589 static void count_register(SgObject ast, int *acc, SgObject *regs)
2590 {
2591 if (SG_EQ(ast, SYM_REGISTER)) {
2592 (*acc)++;
2593 return;
2594 }
2595 if (SG_PAIRP(ast)) {
2596 if (SG_EQ(SG_CAR(ast), SYM_REGISTER)) {
2597 /* get name and capture number */
2598 SgObject n = SG_CADR(ast);
2599 SgObject name = SG_CAR(SG_CDDR(ast));
2600 if (!SG_FALSEP(name)) {
2601 SgObject slot = Sg_Assq(name, *regs);
2602 if (SG_FALSEP(slot)) {
2603 PUSH(SG_LIST2(name, n), *regs);
2604 } else {
2605 SG_SET_CDR(slot, Sg_Cons(n, SG_CDR(slot)));
2606 }
2607 }
2608 }
2609 count_register(SG_CAR(ast), acc, regs);
2610 count_register(SG_CDR(ast), acc, regs);
2611 }
2612 return;
2613 }
2614
compile_regex_ast(SgString * pattern,SgObject ast,int flags)2615 static SgObject compile_regex_ast(SgString *pattern, SgObject ast, int flags)
2616 {
2617 SgPattern *p;
2618 prog_t *prog;
2619 compile_ctx_t cctx = {0};
2620 int reg_num = 0;
2621 SgObject regs = SG_NIL;
2622 /* optimize */
2623 ast = optimize(ast, SG_NIL);
2624 /* count group number */
2625 count_register(ast, ®_num, ®s);
2626 /* compile */
2627 cctx.flags = flags;
2628 prog = compile(&cctx, ast);
2629
2630 p = make_pattern(pattern ? pattern : unparse_ast(ast),
2631 ast, flags, reg_num, regs, prog, &cctx);
2632 return SG_OBJ(p);
2633 }
2634
Sg_ParseCharSetString(SgString * s,int asciiP,long start,long end)2635 SgObject Sg_ParseCharSetString(SgString *s, int asciiP, long start, long end)
2636 {
2637 /* sanity check */
2638 lexer_ctx_t ctx;
2639 long size = SG_STRING_SIZE(s);
2640 SgObject r;
2641 SG_CHECK_START_END(start, end, size);
2642 if (size < 2) {
2643 Sg_Error(UC("invalid regex char-set string. %S"), s);
2644 }
2645 if (SG_STRING_VALUE_AT(s, start) != '[') {
2646 Sg_Error(UC("regex char-set must start with '['. %S[%d-%d]"),
2647 s, start, end);
2648 }
2649 if (SG_STRING_VALUE_AT(s, end-1) != ']') {
2650 Sg_Error(UC("regex char-set must end with ']'. %S[%d-%d]"), s, start, end);
2651 }
2652 init_lexer(&ctx, s, (asciiP) ? 0 : SG_UNICODE_CASE);
2653 /* init_lexer only initialise the context with whole string but
2654 we want to limit the range. */
2655 ctx.pos = start+1;
2656 ctx.len = end;
2657 r = read_char_set(&ctx, NULL);
2658 if (ctx.pos != ctx.len) {
2659 Sg_Error(UC("non finished charset string. %S[%d-%d]"), s, start, end);
2660 }
2661 return r;
2662 }
2663
Sg_CharSetToRegexString(SgObject cset,int invertP)2664 SgObject Sg_CharSetToRegexString(SgObject cset, int invertP)
2665 {
2666 SgPort *out;
2667 SgStringPort tp;
2668 SgObject str;
2669
2670 out = Sg_InitStringOutputPort(&tp, 0);
2671 charset_to_regex(cset, invertP, out);
2672 str = Sg_GetStringFromStringPort(&tp);
2673 SG_CLEAN_STRING_PORT(&tp);
2674 return str;
2675 }
2676
Sg_CompileRegex(SgString * pattern,int flags,int parseOnly)2677 SgObject Sg_CompileRegex(SgString *pattern, int flags, int parseOnly)
2678 {
2679 SgObject ast;
2680 lexer_ctx_t ctx;
2681
2682 init_lexer(&ctx, pattern, flags);
2683 ast = parse_string(&ctx);
2684 if (!END_OF_STRING_P(&ctx)) {
2685 raise_syntax_error(&ctx, ctx.pos,
2686 UC("Expected end of string."));
2687 }
2688 if (parseOnly) return ast;
2689 return compile_regex_ast(pattern, ast, flags);
2690 }
2691
Sg_CompileRegexAST(SgObject ast,int flags)2692 SgObject Sg_CompileRegexAST(SgObject ast, int flags)
2693 {
2694 return compile_regex_ast(NULL, ast, flags);
2695 }
2696
Sg_DumpRegex(SgPattern * pattern,SgObject port)2697 void Sg_DumpRegex(SgPattern *pattern, SgObject port)
2698 {
2699 #if 1
2700 int i;
2701 const int size = pattern->prog->rootLength;
2702 inst_t *start = &pattern->prog->root[0];
2703 Sg_Printf(port, UC("input regex : %S extended? %A\n"), pattern->pattern,
2704 SG_MAKE_BOOL(pattern->extendedp));
2705 Sg_Printf(port, UC(" group count: %d\n"), pattern->groupCount);
2706 for (i = 0; i < size; i++) {
2707 inst_t *inst = &pattern->prog->root[i];
2708 int op = INST_OPCODE(inst);
2709 switch (op) {
2710 case RX_ANY:
2711 Sg_Printf(port, UC("%3d: RX_ANY[%d]\n"), i, op);
2712 break;
2713 case RX_CHAR:
2714 Sg_Printf(port, UC("%3d: RX_CHAR[%d] %c\n"), i, op, inst->arg.c);
2715 break;
2716 case RX_SET:
2717 Sg_Printf(port, UC("%3d: RX_SET[%d] %S\n"), i, op, inst->arg.set);
2718 break;
2719 case RX_NSET:
2720 Sg_Printf(port, UC("%3d: RX_NSET[%d] %S\n"), i, op, inst->arg.set);
2721 break;
2722 case RX_SPLIT:
2723 Sg_Printf(port, UC("%3d: RX_SPLIT[%d] %d %d\n"),
2724 i, op, inst->arg.pos.x - start, inst->arg.pos.y - start);
2725 break;
2726 case RX_JMP:
2727 Sg_Printf(port, UC("%3d: RX_JMP[%d] %d\n"),
2728 i, op, inst->arg.pos.x - start);
2729 break;
2730 case RX_SAVE:
2731 Sg_Printf(port, UC("%3d: RX_SAVE[%d] %d\n"), i, op, inst->arg.n);
2732 break;
2733 case RX_EMPTY:
2734 Sg_Printf(port, UC("%3d: RX_EMPTY[%d] %x\n"), i, op, inst->arg.flags);
2735 break;
2736 case RX_FAIL:
2737 Sg_Printf(port, UC("%3d: RX_FAIL[%d]\n"), i, op);
2738 break;
2739 case RX_MATCH:
2740 Sg_Printf(port, UC("%3d: RX_MATCH[%d]\n"), i, op);
2741 break;
2742 case RX_BREF:
2743 Sg_Printf(port, UC("%3d: RX_BREF[%d] %d\n"), i, op, inst->arg.index);
2744 break;
2745 case RX_AHEAD:
2746 case RX_NAHEAD:
2747 Sg_Printf(port, UC("%3d: %s[%d] %d\n"), i,
2748 op == RX_AHEAD ? UC("RX_AHEAD") : UC("RX_NAHEAD"), op,
2749 inst->arg.pos.x - start);
2750 break;
2751 case RX_BEHIND:
2752 case RX_NBEHIND:
2753 Sg_Printf(port, UC("%3d: %s[%d] %d\n"), i,
2754 op == RX_BEHIND ? UC("RX_BEHIND") : UC("RX_NBEHIND"), op,
2755 inst->arg.pos.x - start);
2756 break;
2757 case RX_ONCE:
2758 Sg_Printf(port, UC("%3d: RX_ONCE[%d] %d\n"), i, op,
2759 inst->arg.pos.x - start);
2760 break;
2761 case RX_RESTORE:
2762 Sg_Printf(port, UC("%3d: RX_RESTORE[%d] %d\n"), i, op, inst->arg.index);
2763 break;
2764 case RX_BRANCH:
2765 Sg_Printf(port, UC("%3d: RX_BRANCH[%d] %d %d %d\n"), i, op,
2766 inst->arg.cond.n, inst->arg.cond.x-start,
2767 inst->arg.cond.y-start);
2768 break;
2769 case RX_BRANCHA:
2770 Sg_Printf(port, UC("%3d: RX_BRANCHA[%d] %d %d\n"), i, op,
2771 inst->arg.cond.x-start, inst->arg.cond.y-start);
2772 break;
2773 default:
2774 Sg_Printf(port, UC("%3d: ??? %d\n"), i, op);
2775 break;
2776 }
2777 }
2778 #endif
2779 }
2780
2781 /* returns (compile-regex "str" flag) */
read_regex_string(SgPort * port)2782 static SgObject read_regex_string(SgPort *port)
2783 {
2784 SgPort *buf;
2785 SgStringPort tp;
2786 buf = Sg_InitStringOutputPort(&tp, -1);
2787 while(1) {
2788 SgChar c = Sg_GetcUnsafe(port);
2789 if (c == EOF) {
2790 Sg_ReadError(UC("unexpected end-of-file. (file %S, line %d)"),
2791 Sg_FileName(port), Sg_LineNo(port));
2792 }
2793 if (c == '\\') {
2794 /* escape. */
2795 Sg_PutcUnsafe(buf, c);
2796 Sg_PutcUnsafe(buf, Sg_GetcUnsafe(port));
2797 } else if (c == '/') {
2798 /* end mark */
2799 int flag = 0, add = 0;
2800 SgObject tmp;
2801 entry:
2802 c = Sg_PeekcUnsafe(port);
2803 switch (c) {
2804 case 'x':
2805 add = SG_COMMENTS;
2806 goto add_flag;
2807 case 'i':
2808 add = SG_CASE_INSENSITIVE;
2809 goto add_flag;
2810 case 'm':
2811 add = SG_MULTILINE;
2812 goto add_flag;
2813 case 's':
2814 add = SG_DOTALL;
2815 goto add_flag;
2816 case 'u':
2817 add = SG_UNICODE_CASE;
2818 goto add_flag;
2819 default:
2820 tmp = Sg_GetStringFromStringPort(&tp);
2821 SG_CLEAN_STRING_PORT(&tp);
2822 return Sg_CompileRegex(tmp, flag, FALSE);
2823 }
2824 add_flag:
2825 flag |= add;
2826 Sg_GetcUnsafe(port);
2827 goto entry;
2828 } else {
2829 Sg_PutcUnsafe(buf, c);
2830 }
2831 }
2832 }
2833
hash_slash_reader(SgObject * args,int argc,void * data_)2834 static SgObject hash_slash_reader(SgObject *args, int argc, void *data_)
2835 {
2836 SgPort *p;
2837
2838 if (!SG_PORTP(args[0])) {
2839 Sg_WrongTypeOfArgumentViolation(SG_INTERN("#/-reader"),
2840 SG_MAKE_STRING("port"), args[0], SG_NIL);
2841 }
2842 p = SG_PORT(args[0]);
2843
2844 return read_regex_string(p);
2845 }
2846
2847 SG_DEFINE_SUBR(hash_slash_reader_stub, 3, 0, hash_slash_reader, SG_FALSE, NULL);
2848
add_reader_macro(SgLibrary * lib)2849 static void add_reader_macro(SgLibrary *lib)
2850 {
2851 Sg_EnsureLibraryReadTable(lib);
2852 Sg_MakeDispatchMacroCharacter('#', FALSE, SG_LIBRARY_READTABLE(lib));
2853 Sg_SetDispatchMacroCharacter('#', '/', SG_OBJ(&hash_slash_reader_stub),
2854 SG_LIBRARY_READTABLE(lib));
2855 }
2856
2857 /* abstract matcher class */
2858 SG_DEFINE_ABSTRACT_CLASS(Sg_MatcherClass, NULL);
2859
2860 /* old interface */
Sg_RegexReplaceFirst(SgMatcher * m,SgObject replacement)2861 SgObject Sg_RegexReplaceFirst(SgMatcher *m, SgObject replacement)
2862 {
2863 return Sg_RegexReplace(m, replacement, 0);
2864 }
2865
Sg_RegexMatcher(SgPattern * pattern,SgObject text,long start,long end)2866 SgMatcher* Sg_RegexMatcher(SgPattern *pattern, SgObject text,
2867 long start, long end)
2868 {
2869 if (SG_STRINGP(text)) {
2870 return Sg_RegexTextMatcher(pattern, SG_STRING(text), start, end);
2871 } else if (SG_BVECTORP(text)) {
2872 return Sg_RegexBinaryMatcher(pattern, SG_BVECTOR(text), start, end);
2873 } else {
2874 Sg_Error(UC("string or bytevector required as text: %S, %S"), pattern, text);
2875 return SG_UNDEF; /* dummy */
2876 }
2877 }
2878
Sg_RegexMatches(SgMatcher * m)2879 int Sg_RegexMatches(SgMatcher *m)
2880 {
2881 if (SG_TEXT_MATCHERP(m)) {
2882 return Sg_RegexTextMatches(SG_TEXT_MATCHER(m));
2883 } else if (SG_BINARY_MATCHERP(m)) {
2884 return Sg_RegexBinaryMatches(SG_BINARY_MATCHER(m));
2885 } else {
2886 Sg_Error(UC("matcher requires but god %S"), m);
2887 return FALSE;
2888 }
2889 }
Sg_RegexLookingAt(SgMatcher * m)2890 int Sg_RegexLookingAt(SgMatcher *m)
2891 {
2892 if (SG_TEXT_MATCHERP(m)) {
2893 return Sg_RegexTextLookingAt(SG_TEXT_MATCHER(m));
2894 } else if (SG_BINARY_MATCHERP(m)) {
2895 return Sg_RegexBinaryLookingAt(SG_BINARY_MATCHER(m));
2896 } else {
2897 Sg_Error(UC("matcher requires but god %S"), m);
2898 return FALSE;
2899 }
2900 }
Sg_RegexFind(SgMatcher * m,long start)2901 int Sg_RegexFind(SgMatcher *m, long start)
2902 {
2903 if (SG_TEXT_MATCHERP(m)) {
2904 return Sg_RegexTextFind(SG_TEXT_MATCHER(m), start);
2905 } else if (SG_BINARY_MATCHERP(m)) {
2906 return Sg_RegexBinaryFind(SG_BINARY_MATCHER(m), start);
2907 } else {
2908 Sg_Error(UC("matcher requires but god %S"), m);
2909 return FALSE;
2910 }
2911 }
2912
Sg_RegexGroup(SgMatcher * m,SgObject groupOrName)2913 SgObject Sg_RegexGroup(SgMatcher *m, SgObject groupOrName)
2914 {
2915 if (SG_TEXT_MATCHERP(m)) {
2916 return Sg_RegexTextGroup(SG_TEXT_MATCHER(m), groupOrName);
2917 } else if (SG_BINARY_MATCHERP(m)) {
2918 return Sg_RegexBinaryGroup(SG_BINARY_MATCHER(m), groupOrName);
2919 } else {
2920 Sg_Error(UC("matcher requires but god %S"), m);
2921 return SG_UNDEF;
2922 }
2923 }
Sg_RegexGroupPosition(SgMatcher * m,SgObject groupOrName,int startP)2924 int Sg_RegexGroupPosition(SgMatcher *m, SgObject groupOrName, int startP)
2925 {
2926 if (SG_TEXT_MATCHERP(m)) {
2927 return Sg_RegexTextGroupPosition(SG_TEXT_MATCHER(m), groupOrName, startP);
2928 } else if (SG_BINARY_MATCHERP(m)) {
2929 return Sg_RegexBinaryGroupPosition(SG_BINARY_MATCHER(m),
2930 groupOrName, startP);
2931 } else {
2932 Sg_Error(UC("matcher requires but god %S"), m);
2933 return -1;
2934 }
2935 }
2936
Sg_RegexReplaceAll(SgMatcher * m,SgObject replacement)2937 SgObject Sg_RegexReplaceAll(SgMatcher *m,
2938 SgObject replacement)
2939 {
2940 if (SG_TEXT_MATCHERP(m)) {
2941 return Sg_RegexTextReplaceAll(SG_TEXT_MATCHER(m), replacement);
2942 } else if (SG_BINARY_MATCHERP(m)) {
2943 return Sg_RegexBinaryReplaceAll(SG_BINARY_MATCHER(m), replacement);
2944 } else {
2945 Sg_Error(UC("matcher requires but god %S"), m);
2946 return SG_UNDEF;
2947 }
2948 }
Sg_RegexReplace(SgMatcher * m,SgObject replacement,long count)2949 SgObject Sg_RegexReplace(SgMatcher *m, SgObject replacement,
2950 long count)
2951 {
2952 if (SG_TEXT_MATCHERP(m)) {
2953 return Sg_RegexTextReplace(SG_TEXT_MATCHER(m), replacement, count);
2954 } else if (SG_BINARY_MATCHERP(m)) {
2955 return Sg_RegexBinaryReplace(SG_BINARY_MATCHER(m), replacement, count);
2956 } else {
2957 Sg_Error(UC("matcher requires but god %S"), m);
2958 return SG_UNDEF;
2959 }
2960 }
Sg_RegexCaptureCount(SgMatcher * m)2961 int Sg_RegexCaptureCount(SgMatcher *m)
2962 {
2963 if (SG_TEXT_MATCHERP(m)) {
2964 return Sg_RegexTextCaptureCount(SG_TEXT_MATCHER(m));
2965 } else if (SG_BINARY_MATCHERP(m)) {
2966 return Sg_RegexBinaryCaptureCount(SG_BINARY_MATCHER(m));
2967 } else {
2968 Sg_Error(UC("matcher requires but god %S"), m);
2969 return -1;
2970 }
2971 }
2972
Sg_RegexAfter(SgMatcher * m)2973 SgObject Sg_RegexAfter(SgMatcher *m)
2974 {
2975 if (SG_TEXT_MATCHERP(m)) {
2976 return Sg_RegexTextAfter(SG_TEXT_MATCHER(m));
2977 } else if (SG_BINARY_MATCHERP(m)) {
2978 return Sg_RegexBinaryAfter(SG_BINARY_MATCHER(m));
2979 } else {
2980 Sg_Error(UC("matcher requires but god %S"), m);
2981 return SG_UNDEF;
2982 }
2983 }
Sg_RegexBefore(SgMatcher * m)2984 SgObject Sg_RegexBefore(SgMatcher *m)
2985 {
2986 if (SG_TEXT_MATCHERP(m)) {
2987 return Sg_RegexTextBefore(SG_TEXT_MATCHER(m));
2988 } else if (SG_BINARY_MATCHERP(m)) {
2989 return Sg_RegexBinaryBefore(SG_BINARY_MATCHER(m));
2990 } else {
2991 Sg_Error(UC("matcher requires but god %S"), m);
2992 return SG_UNDEF;
2993 }
2994 }
2995
2996 SG_CDECL_BEGIN
2997 extern void Sg__Init_sagittarius_regex_impl();
2998 SG_CDECL_END
2999
Sg__InitRegex()3000 void Sg__InitRegex()
3001 {
3002 SgLibrary *lib;
3003 Sg__Init_sagittarius_regex_impl();
3004 lib = SG_LIBRARY(Sg_FindLibrary(SG_INTERN("(sagittarius regex impl)"),
3005 FALSE));
3006
3007 Sg_InitStaticClass(SG_CLASS_PATTERN, UC("<pattern>"), lib, NULL, 0);
3008 Sg_InitStaticClass(SG_CLASS_MATCHER, UC("<matcher>"), lib, NULL, 0);
3009
3010 Sg_InitStaticClass(SG_CLASS_TEXT_MATCHER, UC("<text-matcher>"), lib, NULL, 0);
3011 Sg_InitStaticClass(SG_CLASS_BINARY_MATCHER, UC("<binary-matcher>"),
3012 lib, NULL, 0);
3013
3014 add_reader_macro(lib);
3015 #define insert_binding(name, value) \
3016 Sg_MakeBinding(lib, SG_INTERN(#name), SG_MAKE_INT(value), TRUE);
3017 insert_binding(CASE-INSENSITIVE, SG_CASE_INSENSITIVE);
3018 insert_binding(COMMENTS, SG_COMMENTS);
3019 insert_binding(MULTILINE, SG_MULTILINE);
3020 insert_binding(LITERAL, SG_LITERAL);
3021 insert_binding(DOTALL, SG_DOTALL);
3022 /* deprecated rename exported on Scheme */
3023 /* insert_binding(UNICODE-CASE, SG_UNICODE_CASE); */
3024 insert_binding(UNICODE, SG_UNICODE_CASE);
3025 #undef insert_binding
3026
3027 SYM_ALTER = SG_INTERN("alternation");
3028 SYM_NON_GREEDY_REP = SG_INTERN("non-greedy-repetition");
3029 SYM_GREEDY_REP = SG_INTERN("greedy-repetition");
3030 SYM_CLOSE_PAREN = SG_INTERN("close-paren");
3031 SYM_VERTICAL_BAR = SG_INTERN("vertical-bar");
3032 SYM_QUESTION_MARK = SG_INTERN("question-mark");
3033 SYM_EVERYTHING = SG_INTERN("everything");
3034 SYM_END_ANCHOR = SG_INTERN("end-anchor");
3035 SYM_INVERTED_CHAR_CLASS = SG_INTERN("inverted-char-class");
3036 SYM_MODELESS_START_ANCHOR = SG_INTERN("modeless-start-anchor");
3037 SYM_MODELESS_END_ANCHOR = SG_INTERN("modeless-end-anchor");
3038 SYM_MODELESS_END_ANCHOR_NO_NEWLINE
3039 = SG_INTERN("modeless-end-anchor-no-newline");
3040 SYM_START_ANCHOR = SG_INTERN("start-anchor");
3041 SYM_BACKREF = SG_INTERN("back-reference");
3042 SYM_WORD_BOUNDARY = SG_INTERN("word-boundary");
3043 SYM_NON_WORD_BOUNDARY = SG_INTERN("non-word-boundary");
3044 SYM_BRANCH = SG_INTERN("branch");
3045 SYM_FLAGS = SG_INTERN("flags");
3046 SYM_OPEN_PAREN = SG_INTERN("open-paren");
3047 SYM_OPEN_PAREN_PAREN = SG_INTERN("open-paren-paren");
3048 SYM_OPEN_PAREN_GREATER = SG_INTERN("open-paren-greater");
3049 SYM_OPEN_PAREN_EQUAL = SG_INTERN("open-paren-equal");
3050 SYM_OPEN_PAREN_LESS_EXCLAMATION = SG_INTERN("open-paren-less-exclamation");
3051 SYM_OPEN_PAREN_COLON = SG_INTERN("open-paren-colon");
3052 SYM_OPEN_PAREN_EXCLAMATION = SG_INTERN("open-paren-exclamation");
3053 SYM_OPEN_PAREN_LESS_LETTER = SG_INTERN("open-paren-less-letter");
3054 SYM_REGISTER = SG_INTERN("register");
3055 SYM_STANDALONE = SG_INTERN("standalone");
3056 SYM_LOOKAHEAD = SG_INTERN("lookahead");
3057 SYM_OPEN_PAREN_LESS_EQUAL = SG_INTERN("open-paren-less-equal");
3058 SYM_SEQUENCE = SG_INTERN("sequence");
3059 SYM_LOOKBHIND = SG_INTERN("lookbehind");
3060 SYM_FLAGGED_SEQUENCE = SG_INTERN("flagged-sequence");
3061 }
3062