1 /*
2 * regexp.c - regular expression
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 * Copyright (c) 2019-2020 Duy Nguyen <pclouds@gmail.com>
6 * Copyright (c) 2006 Rui Ueyama, All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * 3. Neither the name of the authors nor the names of its contributors
20 * may be used to endorse or promote products derived from this
21 * software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include <setjmp.h>
37 #include <ctype.h>
38 #define LIBGAUCHE_BODY
39 #include "gauche.h"
40 #include "gauche/regexp.h"
41 #include "gauche/class.h"
42 #include "gauche/priv/builtin-syms.h"
43 #include "gauche/priv/charP.h"
44 #include "gauche/priv/stringP.h"
45
46 /* I don't like to reinvent wheels, so I looked for a regexp implementation
47 * that can handle multibyte encodings and not bound to Unicode.
48 * Without assuming Unicode it'll be difficult to define character classes
49 * correctly, but there are domains that you don't want to do native
50 * charset <-> UTF-8 each time for regexp match, trading correctness of
51 * character classes.
52 *
53 * The most recent version of famous Henry Spencer's regex is found in Tcl
54 * 8.3, that supports wide characters (the state machine seems to work
55 * with UCS-4, but the internal tables seem to be set up for UCS-2 only).
56 * Tcl does UTF-8 <-> UCS-2 conversion in order to do regexp match.
57 *
58 * Lots of variants of Spencer's old regex code is floating around, such
59 * as http://arglist.com/regex/ and the one in BSD. They don't support
60 * multibyte strings, as far as I know.
61 *
62 * Another popular package is PCRE. PCRE 3.4 has UTF-8 support, but only
63 * experimentally.
64 *
65 * None seems to satisfy my criteria.
66 *
67 * So I reluctantly started to write my own. I don't think I can beat
68 * those guys, and am willing to grab someone's code anytime if it's suitable
69 * for my purpose and under a license like BSD one.
70 */
71
72 /*
73 * The idea here is to match string without converting mb <-> char as
74 * much as possible. Actually, the conversion is done only when we see
75 * large character sets.
76 *
77 * The engine is a sort of NFA, by keeping state information for backtrack
78 * in C stack. It'll bust the C stack if you try to match something like
79 * (..)* with a long input string (there's a code to check the stack size
80 * and aborts matching when the recursion goes too deep).
81 * A possible fix is to check if recursion level exceeds some limit,
82 * then save the C stack into heap (as in the C-stack-copying continuation
83 * does) and reuse the stack area.
84 */
85
86 /* Instructions. `RL' suffix indicates that the instruction moves the
87 current position pointer right to left. These instructions are
88 used within lookbehind assertion. */
89 enum {
90 #define DEF_RE_INSN(name, _) SCM_CPP_CAT(RE_, name),
91 #include "gauche/regexp_insn.h"
92 #undef DEF_RE_INSN
93 RE_NUM_INSN
94 };
95
96 /* maximum # of {n,m}-type limited repeat count */
97 #define MAX_LIMITED_REPEAT 255
98
99 /* internal regexp flag. */
100 #define SCM_REGEXP_BOL_ANCHORED (1L<<2) /* The regexp beginning is anchored
101 by ^.*/
102 #define SCM_REGEXP_SIMPLE_PREFIX (1L<<3) /* The regexp begins with a repeating
103 character or charset, e.g. #/a+b/.
104 See is_simple_prefixed() below. */
105
106 /* AST - the first pass of regexp compiler creates intermediate AST.
107 * Alternatively, you can provide AST directly to the regexp compiler,
108 * using Scm_RegCompFromAST().
109 *
110 * <ast> : (<element> ...)
111 *
112 * <element> : <clause> ; special clause
113 * | <item> ; matches <item>
114 *
115 * <item> : <char> ; matches char
116 * | <char-set> ; matches char set
117 * | (comp . <char-set>) ; matches complement of char set
118 * | any ; matches any char
119 * | bos | eos ; beginning/end of string assertion
120 * | bol | eol ; beginning/end of line assertion
121 * | bow | eow | wb | nwb ; word-boundary/negative word boundary assertion
122 *
123 * <clause> : (seq . <ast>) ; sequence
124 * | (seq-uncase . <ast>) ; sequence (case insensitive match)
125 * | (seq-case . <ast>) ; sequence (case sensitive match)
126 * | (alt . <ast>) ; alternative
127 * | (rep <m> <n> . <ast>) ; repetition at least <m> up to <n> (greedy)
128 * ; <n> may be `#f'
129 * | (rep-min <m> <n> . <ast>)
130 * ; repetition at least <m> up to <n> (lazy)
131 * ; <n> may be `#f'
132 * | (rep-while <m> <n> . <ast>)
133 * ; like rep, but no backtrack
134 * | (<integer> <symbol> . <ast>)
135 * ; capturing group. <symbol> may be #f.
136 * | (cpat <condition> (<ast>) (<ast>))
137 * ; conditional expression
138 * | (backref . <integer>) ; backreference by group number
139 * | (backref . <symbol>) ; backreference by name
140 * | (once . <ast>) ; standalone pattern. no backtrack
141 * | (assert . <asst>) ; positive lookahead assertion
142 * | (nassert . <asst>) ; negative lookahead assertion
143 *
144 * <condition> : <integer> ; (?(1)yes|no) style conditional expression
145 * | (assert . <asst>) ; (?(?=condition)...) or (?(?<=condition)...)
146 * | (nassert . <asst>) ; (?(?!condition)...) or (?(?<!condition)...)
147 *
148 * <asst> : <ast>
149 * | (lookbehind . <ast>)
150 *
151 * For seq-uncase, items inside <ast> has to be prepared for case-insensitive
152 * match, i.e. chars have to be downcased and char-sets have to be
153 * case-folded.
154 */
155
156 /* NB: regexp printer is defined in libobj.scm */
157 static int regexp_compare(ScmObj x, ScmObj y, int equalp);
158
159 SCM_DEFINE_BUILTIN_CLASS(Scm_RegexpClass,
160 NULL, regexp_compare, NULL, NULL,
161 SCM_CLASS_DEFAULT_CPL);
162 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_RegMatchClass, NULL);
163
make_regexp(void)164 static ScmRegexp *make_regexp(void)
165 {
166 ScmRegexp *rx = SCM_NEW(ScmRegexp);
167 SCM_SET_CLASS(rx, SCM_CLASS_REGEXP);
168 rx->code = NULL;
169 rx->numCodes = 0;
170 rx->numGroups = 0;
171 rx->numSets = 0;
172 rx->sets = NULL;
173 rx->grpNames = SCM_NIL;
174 rx->mustMatch = NULL;
175 rx->flags = 0;
176 rx->pattern = SCM_FALSE;
177 rx->ast = SCM_FALSE;
178 return rx;
179 }
180
regexp_compare(ScmObj x,ScmObj y,int equalp)181 static int regexp_compare(ScmObj x, ScmObj y, int equalp)
182 {
183 if (!equalp) {
184 Scm_Error("cannot compare regexps: %S and %S", x, y);
185 }
186 ScmRegexp *rx = SCM_REGEXP(x);
187 ScmRegexp *ry = SCM_REGEXP(y);
188
189 if ((rx->numCodes != ry->numCodes)
190 || (rx->numGroups != ry->numGroups)
191 || (rx->numSets != ry->numSets)
192 || !Scm_EqualP(rx->grpNames, ry->grpNames)
193 || (rx->flags != ry->flags)) {
194 return 1;
195 } else {
196 /* we compare bytecode. */
197 for (int i=0; i<rx->numCodes; i++) {
198 if (rx->code[i] != ry->code[i]) return 1;
199 }
200 for (int i=0; i<rx->numSets; i++) {
201 if (rx->sets[i] == ry->sets[i]) continue;
202 if (!Scm_CharSetEq(rx->sets[i], ry->sets[i])) return 1;
203 }
204 }
205 return 0;
206 }
207
208 #ifndef CHAR_MAX
209 #define CHAR_MAX 256
210 #endif
211
212 #define REGEXP_OFFSET_MAX 65535
213
214 /*=======================================================================
215 * Compiler
216 */
217
218 /* 3-pass compiler.
219 *
220 * pass 1: parses the pattern and creates an AST.
221 * pass 2: optimize on AST.
222 * pass 3: byte code generation.
223 */
224
225 /* compiler state information */
226 typedef struct regcomp_ctx_rec {
227 ScmRegexp *rx; /* the building regexp */
228 ScmObj pattern; /* original pattern or AST for diag msg */
229 int casefoldp; /* TRUE if case-folding match */
230 int lookbehindp; /* TRUE if lookbehind assertion match */
231 ScmPort *ipat; /* [pass1] string port for pattern */
232 ScmObj sets; /* [pass1] list of charsets */
233 int grpcount; /* [pass1] group count */
234 unsigned char *code; /* [pass3] code being built */
235 int codep; /* [pass3] front of code generation */
236 int emitp; /* [pass3] am I generating code? */
237 int codemax; /* [pass3] max codep */
238 } regcomp_ctx;
239
240 /* If we run from pass1, input string should be passed to PATTERN.
241 If we run from pass2, PATTERN must be NULL and we take AST from RX. */
rc_ctx_init(regcomp_ctx * ctx,ScmRegexp * rx,ScmString * pattern)242 static void rc_ctx_init(regcomp_ctx *ctx,
243 ScmRegexp *rx,
244 ScmString *pattern)
245 {
246 ctx->rx = rx;
247 if (pattern) {
248 ctx->pattern = Scm_CopyStringWithFlags(pattern,
249 SCM_STRING_IMMUTABLE,
250 SCM_STRING_IMMUTABLE);
251 ctx->ipat = SCM_PORT(Scm_MakeInputStringPort(pattern, FALSE));
252 } else {
253 ctx->pattern = rx->ast;
254 ctx->ipat = NULL;
255 }
256 ctx->casefoldp = FALSE;
257 ctx->lookbehindp = FALSE;
258 ctx->sets = SCM_NIL;
259 ctx->grpcount = 0;
260 ctx->code = NULL;
261 ctx->codep = 0;
262 ctx->emitp = FALSE;
263 ctx->codemax = 1;
264 }
265
266 static ScmObj rc_charset(regcomp_ctx *ctx);
267 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs);
268 static ScmObj rc1_rep(regcomp_ctx *ctx, ScmObj greedy,
269 ScmObj lazy, ScmObj atom);
270 static ScmObj rc1_read_integer(regcomp_ctx *ctx);
271 static ScmObj rc1_group_name(regcomp_ctx *ctx);
272 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx);
273 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx);
274 static ScmObj rc1_lex_xdigits(ScmPort *port, int key);
275
276 /*----------------------------------------------------------------
277 * pass1 - parser
278 */
279
280 /* EBNF Syntax of Gauche's regexp.
281 * This is a rough sketch. The condition of BOL/EOL ("^" and "$"), for
282 * example, has to be dealt with context information.
283 * To follow the convention, "{" and "}" token that don't appear to
284 * consist of the "limited repetition" syntax are regarded as literal
285 * characters.
286 *
287 * <re> :
288 * | <re> <alt>
289 *
290 * <alt> :
291 * | <item>
292 * | <alt> "|" <item>
293 *
294 * <item> : <atom> "*"
295 * | <atom> "+"
296 * | <atom> "?"
297 * | <atom> "{" <n> ("," <m>?)? "}"
298 * | <atom> "*?"
299 * | <atom> "+?"
300 * | <atom> "??"
301 * | <atom> "{" <n> ("," <m>?)? "}?"
302 * | <atom> "*+"
303 * | <atom> "++"
304 * | <atom> "?+"
305 * | <atom>
306 *
307 * <atom> : a normal char, an escaped char, or a char-set
308 * | "\\" <integer> ;; backreference
309 * | "\\k<" <string> ">" ;; backreference to named group
310 * | "(" <re> ")" ;; grouping w/ capturing
311 * | "(?:" <re> ")" ;; grouping w/o capturing
312 * | "(?i:" <re> ")" ;; grouping w/o capturing (case insensitive)
313 * | "(?-i:" <re> ")" ;; grouping w/o capturing (case sensitive)
314 * | "(?<" <string> ">" <re> ")" ;; named capturing group
315 * | "(?>" <re> ")" ;; standalone pattern
316 * | "(?=" <re> ")" ;; positive lookahead assertion
317 * | "(?!" <re> ")" ;; negative lookahead assertion
318 * | "(?<=" <re> ")" ;; positive lookbehind assertion
319 * | "(?<!" <re> ")" ;; negative lookbehind assertion
320 * | "(?("cond")"yes-pattern"|"no-pattern")"
321 * | "(?("cond")"yes-pattern")" ;; conditional pattern
322 *
323 */
324 /* TODO: It'd be nicer to have a dedicated condition to throw a parse error. */
325
326 /* Lexer */
rc1_lex(regcomp_ctx * ctx)327 static ScmObj rc1_lex(regcomp_ctx *ctx)
328 {
329 ScmObj cs;
330
331 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
332 if (ch == SCM_CHAR_INVALID) return SCM_EOF;
333 switch (ch) {
334 case '(': return rc1_lex_open_paren(ctx);
335 case ')': return SCM_SYM_CLOSE_PAREN;
336 case '|': return SCM_SYM_ALT;
337 case '^': return SCM_SYM_BOL;
338 case '.': return SCM_SYM_ANY;
339 case '$': return SCM_SYM_EOL;
340 case '[': return rc_charset(ctx);
341 case '{': return rc1_lex_minmax(ctx);
342 case '+': return rc1_rep(ctx, SCM_SYM_PLUS, SCM_SYM_PLUSQ, SCM_SYM_PLUSP);
343 case '*': return rc1_rep(ctx, SCM_SYM_STAR, SCM_SYM_STARQ, SCM_SYM_STARP);
344 case '?': return rc1_rep(ctx, SCM_SYM_QUESTION, SCM_SYM_QUESTIONQ, SCM_SYM_QUESTIONP);
345 case '\\':
346 ch = Scm_GetcUnsafe(ctx->ipat);
347 if (ch == SCM_CHAR_INVALID) {
348 Scm_Error("stray backslash at the end of pattern: %S",
349 ctx->pattern);
350 }
351 switch (ch) {
352 case 'a': return SCM_MAKE_CHAR(0x07);
353 case 'n': return SCM_MAKE_CHAR('\n');
354 case 'r': return SCM_MAKE_CHAR('\r');
355 case 't': return SCM_MAKE_CHAR('\t');
356 case 'f': return SCM_MAKE_CHAR('\f');
357 case 'e': return SCM_MAKE_CHAR(0x1b);
358 case 'b': return SCM_SYM_WB;
359 case 'B': return SCM_SYM_NWB;
360 case 'x': case 'u': case 'U':
361 return rc1_lex_xdigits(ctx->ipat, ch);
362 case 'd':
363 cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_DIGIT);
364 rc_register_charset(ctx, SCM_CHAR_SET(cs));
365 return cs;
366 case 'D':
367 cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_DIGIT);
368 rc_register_charset(ctx, SCM_CHAR_SET(cs));
369 return cs;
370 case 'w':
371 cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WORD);
372 rc_register_charset(ctx, SCM_CHAR_SET(cs));
373 return cs;
374 case 'W':
375 cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WORD);
376 rc_register_charset(ctx, SCM_CHAR_SET(cs));
377 return cs;
378 case 's':
379 cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WHITESPACE);
380 rc_register_charset(ctx, SCM_CHAR_SET(cs));
381 return cs;
382 case 'S':
383 cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WHITESPACE);
384 rc_register_charset(ctx, SCM_CHAR_SET(cs));
385 return cs;
386 case 'p': case 'P':
387 cs = Scm_GetStandardCharSet(Scm_CharSetParseCategory(ctx->ipat, ch));
388
389 rc_register_charset(ctx, SCM_CHAR_SET(cs));
390 return cs;
391 case '0': case '1': case '2': case '3': case '4':
392 case '5': case '6': case '7': case '8': case '9':
393 Scm_UngetcUnsafe(ch, ctx->ipat);
394 return Scm_Cons(SCM_SYM_BACKREF, rc1_read_integer(ctx));
395 case 'k':
396 if (Scm_GetcUnsafe(ctx->ipat) != '<') {
397 Scm_Error("\\k must be followed by '<': %S", ctx->pattern);
398 }
399 ScmObj name = rc1_group_name(ctx);
400 if (SCM_FALSEP(name)) {
401 Scm_Error("malformed backreference found in regexp %S", ctx->pattern);
402 }
403 return Scm_Cons(SCM_SYM_BACKREF, name);
404 }
405 /*FALLTHROUGH*/
406 default:
407 if (ctx->casefoldp) ch = SCM_CHAR_DOWNCASE(ch);
408 return SCM_MAKE_CHAR(ch);
409 }
410 /*NOTREACHED*/
411 }
412
413 /* Read \x, \u, \U escape sequence in the regexp spec.
414 This may read-ahead some hexdigit characters. In such case, it
415 returns SEQ node. */
rc1_lex_xdigits(ScmPort * port,int key)416 static ScmObj rc1_lex_xdigits(ScmPort *port, int key)
417 {
418 ScmDString buf;
419
420 Scm_DStringInit(&buf);
421 ScmObj bad = Scm_ReadXdigitsFromPort(port, key, 0, FALSE, &buf);
422 if (SCM_STRINGP(bad)) {
423 /* skip chars to the end of regexp, so that the reader will read
424 after the erroneous string */
425 for (;;) {
426 int c;
427 SCM_GETC(c, port);
428 if (c == EOF || c == '/') break;
429 if (c == '\\') SCM_GETC(c, port);
430 }
431 /* construct an error message */
432 Scm_ReadError(port,
433 "Bad '\\%c' escape sequence in a regexp literal: \\%c%A",
434 key, key, bad);
435 return SCM_UNDEFINED; /* dummy */
436 } else {
437 ScmSmallInt size, len;
438 const char *chars = Scm_DStringPeek(&buf, &size, &len);
439 if (len == 1) {
440 ScmChar ch;
441 SCM_CHAR_GET(chars, ch);
442 return SCM_MAKE_CHAR(ch);
443 } else {
444 ScmObj h = SCM_NIL, t = SCM_NIL;
445 SCM_APPEND1(h, t, SCM_SYM_SEQ);
446 while (len-- > 0) {
447 ScmChar ch;
448 SCM_CHAR_GET(chars, ch);
449 chars += SCM_CHAR_NBYTES(ch);
450 SCM_APPEND1(h, t, SCM_MAKE_CHAR(ch));
451 }
452 return h;
453 }
454 }
455 }
456
457 /* Called after '+', '*' or '?' is read, and check if there's a
458 following '?' (lazy quantifier) or '+' (atomic expression) */
rc1_rep(regcomp_ctx * ctx,ScmObj greedy,ScmObj lazy,ScmObj atom)459 static ScmObj rc1_rep(regcomp_ctx *ctx, ScmObj greedy,
460 ScmObj lazy, ScmObj atom)
461 {
462 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
463 if (ch == '?') return lazy;
464 if (ch == '+') return atom;
465 Scm_UngetcUnsafe(ch, ctx->ipat);
466 return greedy;
467 }
468
469 /* Reads '('-sequence - either one of "(", "(?:", "(?i:", "(?-i:",
470 "(?=", "(?<=", "(?<!", "(?<name>", "(?(condition)|). The leading
471 "(" has already been read. */
rc1_lex_open_paren(regcomp_ctx * ctx)472 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx)
473 {
474 ScmObj pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
475 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
476 if (ch != '?') {
477 Scm_UngetcUnsafe(ch, ctx->ipat);
478 return SCM_SYM_OPEN_PAREN;
479 }
480 ch = Scm_GetcUnsafe(ctx->ipat);
481 switch (ch) {
482 case ':': return SCM_SYM_SEQ;
483 case '>': return SCM_SYM_ONCE;
484 case '=': return SCM_SYM_ASSERT;
485 case '!': return SCM_SYM_NASSERT;
486 case '(': return SCM_SYM_CPAT;
487 case '<': {
488 ch = Scm_GetcUnsafe(ctx->ipat);
489 if (ch == '=') return SCM_SYM_LOOKBEHIND;
490 if (ch == '!') return SCM_SYM_NLOOKBEHIND;
491 Scm_UngetcUnsafe(ch, ctx->ipat);
492 ScmObj name = rc1_group_name(ctx);
493 if (!SCM_FALSEP(name)) return Scm_Cons(SCM_SYM_REG, name);
494 break;
495 }
496 case 'i':
497 ch = Scm_GetcUnsafe(ctx->ipat);
498 if (ch == ':') return SCM_SYM_SEQ_UNCASE;
499 break;
500 case '-':
501 ch = Scm_GetcUnsafe(ctx->ipat);
502 if (ch == 'i') {
503 ch = Scm_GetcUnsafe(ctx->ipat);
504 if (ch == ':') return SCM_SYM_SEQ_CASE;
505 }
506 break;
507 }
508 /* fail. */
509 Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
510 return SCM_SYM_OPEN_PAREN;
511 }
512
rc1_read_integer(regcomp_ctx * ctx)513 static ScmObj rc1_read_integer(regcomp_ctx *ctx)
514 {
515 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
516 if (!isdigit(ch)) {
517 Scm_Error("number expected, but got '%c'", ch);
518 }
519 ScmDString ds;
520 Scm_DStringInit(&ds);
521 do {
522 Scm_DStringPutc(&ds, ch);
523 ch = Scm_GetcUnsafe(ctx->ipat);
524 } while (ch != SCM_CHAR_INVALID && isdigit(ch));
525 if (ch != SCM_CHAR_INVALID) {
526 Scm_UngetcUnsafe(ch, ctx->ipat);
527 }
528 ScmObj r = Scm_StringToNumber(SCM_STRING(Scm_DStringGet(&ds, 0)), 10, 0);
529 if (SCM_BIGNUMP(r)) {
530 Scm_Error("number too big: %S", r);
531 }
532 SCM_ASSERT(SCM_INTP(r));
533 return r;
534 }
535
rc1_group_name(regcomp_ctx * ctx)536 static ScmObj rc1_group_name(regcomp_ctx *ctx)
537 {
538 ScmDString ds;
539 Scm_DStringInit(&ds);
540
541 for (;;) {
542 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
543 if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
544 if (ch == '>') {
545 return Scm_Intern(SCM_STRING(Scm_DStringGet(&ds, 0)));
546 }
547 if (ch == '\\') {
548 ch = Scm_GetcUnsafe(ctx->ipat);
549 if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
550 /* fall through */
551 }
552 Scm_DStringPutc(&ds, ch);
553 }
554 return SCM_UNDEFINED; /* dummy */
555 }
556
557 /* Reads {n,m}-type repeat syntax. The leading "{" has been read.
558 If the character sequence doesn't consist of valid syntax, rollback
559 to the ordinary character sequence.
560 If successfully parsed, returns (rep <n> . <m>) where
561 <n> == <m> if the pattern is "{n}" (exact count), or
562 <m> == #f if the pattern is "{n,}" (minimum count), or
563 <m> == integer if the pattern is "{n,m}" (limited count).
564 If the pattern is followed by '?', rep-min is used instead.
565 */
rc1_lex_minmax(regcomp_ctx * ctx)566 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx)
567 {
568 int rep_min = -1, rep_max = -1, exact = FALSE;
569 ScmObj type = SCM_SYM_REP; /* default is greedy */
570 ScmObj pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
571
572 for (;;) {
573 int ch = Scm_GetcUnsafe(ctx->ipat);
574 if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
575 if (rep_min < 0) {
576 rep_min = (ch - '0');
577 } else {
578 rep_min = rep_min*10 + (ch - '0');
579 }
580 } else if (ch == ',') {
581 /* NB: The following line makes us to treat {,m} as {0,m}.
582 Oniguruma does so. Perl doesn't. Strictly speaking they're
583 incompatible to each other (if Perl code /a{,3}/ expects
584 to match "a{,3}", it needs to be written as /a\{,3}/ in
585 Oniguruma). Let's take Oniguruma now. */
586 if (rep_min < 0) rep_min = 0;
587 break;
588 } else if (ch == '}') {
589 exact = TRUE;
590 break;
591 } else {
592 goto bad_min_max;
593 }
594 }
595 if (rep_min < 0) goto bad_min_max;
596 if (rep_min > MAX_LIMITED_REPEAT) goto out_of_range;
597 if (!exact) {
598 for (;;) {
599 int ch = Scm_GetcUnsafe(ctx->ipat);
600 if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
601 if (rep_max < 0) {
602 rep_max = (ch - '0');
603 } else {
604 rep_max = rep_max*10 + (ch - '0');
605 }
606 } else if (ch == '}') {
607 break;
608 } else {
609 goto bad_min_max;
610 }
611 }
612 if (rep_max > MAX_LIMITED_REPEAT) goto out_of_range;
613 if (rep_max >= 0 && rep_max < rep_min) {
614 Scm_Error("{n,m}-syntax requires n <= m: %S", ctx->pattern);
615 }
616 }
617
618 ScmObj m;
619 if (exact) m = SCM_MAKE_INT(rep_min);
620 else if (rep_max < 0) m = SCM_FALSE;
621 else m = SCM_MAKE_INT(rep_max);
622
623 int ch = Scm_GetcUnsafe(ctx->ipat);
624 if (ch == '?') type = SCM_SYM_REP_MIN;
625 else Scm_UngetcUnsafe(ch, ctx->ipat);
626 return Scm_Cons(type, Scm_Cons(SCM_MAKE_INT(rep_min), m));
627
628 out_of_range:
629 Scm_Error("{n,m}-syntax can accept up to %d count: %S",
630 MAX_LIMITED_REPEAT, ctx->pattern);
631 /*NOTREACHED*/
632 bad_min_max:
633 /* back up */
634 Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
635 return SCM_MAKE_CHAR('{');
636 }
637
rc1_fold_alts(regcomp_ctx * ctx SCM_UNUSED,ScmObj alts)638 static ScmObj rc1_fold_alts(regcomp_ctx *ctx SCM_UNUSED, ScmObj alts)
639 {
640 ScmObj r = SCM_NIL, ap;
641 SCM_FOR_EACH(ap, alts) {
642 ScmObj alt = SCM_CAR(ap);
643 if (SCM_PAIRP(alt) && SCM_NULLP(SCM_CDR(alt))) {
644 r = Scm_Cons(SCM_CAR(alt), r);
645 } else {
646 r = Scm_Cons(Scm_Cons(SCM_SYM_SEQ, alt), r);
647 }
648 }
649 return Scm_Cons(SCM_SYM_ALT, r);
650 }
651
652 static ScmObj rc1_parse(regcomp_ctx *, int, ScmObj);
653
rc1_lex_conditional_pattern(regcomp_ctx * ctx,int bolp,ScmObj grps)654 static ScmObj rc1_lex_conditional_pattern(regcomp_ctx *ctx, int bolp,
655 ScmObj grps)
656 {
657 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
658 if (ch == SCM_CHAR_INVALID)
659 goto error;
660 if (isdigit(ch)) {
661 Scm_UngetcUnsafe(ch, ctx->ipat);
662 ScmObj r = rc1_read_integer(ctx);
663 if (!SCM_EQ(rc1_lex(ctx), SCM_SYM_CLOSE_PAREN))
664 goto error;
665 return r;
666 }
667 if (ch == '?') {
668 ch = Scm_GetcUnsafe(ctx->ipat);
669 if (ch == '=')
670 return Scm_Cons(SCM_SYM_ASSERT, rc1_parse(ctx, bolp, grps));
671 if (ch == '!')
672 return Scm_Cons(SCM_SYM_NASSERT, rc1_parse(ctx, bolp, grps));
673 if (ch == '<') {
674 ScmObj type;
675 ch = Scm_GetcUnsafe(ctx->ipat);
676 if (ch == '=') {
677 type = SCM_SYM_ASSERT;
678 } else if (ch == '!') {
679 type = SCM_SYM_NASSERT;
680 } else {
681 Scm_Error("unknown switch condition (?>%c...) in regexp %S",
682 ch, ctx->pattern);
683 type = SCM_SYM_NASSERT; /* dummy */
684 }
685 ScmObj item = rc1_parse(ctx, bolp, grps);
686 return SCM_LIST2(type, Scm_Cons(SCM_SYM_LOOKBEHIND, item));
687 }
688 /* fallthru */
689 }
690 Scm_Error("unknown switch condition (?%c...) in regexp %S", ch, ctx->pattern);
691
692 error:
693 Scm_Error("unterminated conditional pattern in regexp %S", ctx->pattern);
694 return SCM_UNDEFINED; /* dummy */
695 }
696
697 /* Parser */
698 /* Groups represents the current nestings of parentheses, including both
699 capturing groups and non-capturing parens. It works like a stack, where
700 the leftmost item is for the innermost open paren. The value is a
701 group number for capturing groups, and #f for others. */
rc1_parse(regcomp_ctx * ctx,int bolp,ScmObj groups)702 static ScmObj rc1_parse(regcomp_ctx *ctx, int bolp, ScmObj groups)
703 {
704 ScmObj stack = SCM_NIL, alts = SCM_NIL;
705 int bolpsave = bolp;
706
707 #define TOPP() (SCM_NULLP(groups))
708 #define PUSH(elt) (stack = Scm_Cons((elt), stack))
709 #define PUSH1(elt) (stack = Scm_Cons((elt), SCM_CDR(stack)))
710
711 for (;;) {
712 ScmObj token = rc1_lex(ctx);
713 ScmObj item;
714 if (SCM_EOFP(token)) {
715 if (!TOPP()) {
716 Scm_Error("unterminated grouping in regexp %S", ctx->pattern);
717 }
718 break;
719 }
720 if (SCM_EQ(token, SCM_SYM_CLOSE_PAREN)) {
721 if (TOPP()) {
722 Scm_Error("extra close parenthesis in regexp %S", ctx->pattern);
723 }
724 groups = SCM_CDR(groups);
725 break;
726 }
727 if (SCM_EQ(token, SCM_SYM_BOL)) {
728 if (bolp) {
729 PUSH(SCM_SYM_BOL);
730 bolp = FALSE;
731 continue;
732 } else {
733 token = SCM_MAKE_CHAR('^');
734 }
735 /*FALLTHROUGH*/
736 }
737 if (SCM_EQ(token, SCM_SYM_ALT)) {
738 alts = Scm_Cons(Scm_ReverseX(stack), alts);
739 stack = SCM_NIL;
740 bolp = bolpsave;
741 continue;
742 }
743 if (SCM_EQ(token, SCM_SYM_OPEN_PAREN)) {
744 int grpno = ++ctx->grpcount;
745 item = rc1_parse(ctx, bolp, Scm_Cons(SCM_MAKE_INT(grpno), groups));
746 PUSH(Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(SCM_FALSE, item)));
747 bolp = FALSE;
748 continue;
749 }
750 if (SCM_EQ(token, SCM_SYM_SEQ)) {
751 item = rc1_parse(ctx, bolp, Scm_Cons(SCM_FALSE, groups));
752 PUSH(Scm_Cons(SCM_SYM_SEQ, item));
753 bolp = FALSE;
754 continue;
755 }
756 if (SCM_EQ(token, SCM_SYM_SEQ_UNCASE) || SCM_EQ(token, SCM_SYM_SEQ_CASE)) {
757 int oldflag = ctx->casefoldp;
758 ctx->casefoldp = SCM_EQ(token, SCM_SYM_SEQ_UNCASE);
759 item = rc1_parse(ctx, bolp, Scm_Cons(SCM_FALSE, groups));
760 PUSH(Scm_Cons(token, item));
761 ctx->casefoldp = oldflag;
762 bolp = FALSE;
763 continue;
764 }
765 if (SCM_EQ(token, SCM_SYM_ONCE)) {
766 /* (?>re) can have BOL/EOL.*/
767 item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
768 PUSH(Scm_Cons(SCM_SYM_ONCE, item));
769 bolp = FALSE;
770 continue;
771 }
772 if (SCM_EQ(token, SCM_SYM_ASSERT) || SCM_EQ(token, SCM_SYM_NASSERT)) {
773 /* (?=re) and (?!re) can have BOL/EOL.*/
774 item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
775 PUSH(Scm_Cons(token, item));
776 continue;
777 }
778 if (SCM_EQ(token, SCM_SYM_LOOKBEHIND) || SCM_EQ(token, SCM_SYM_NLOOKBEHIND)) {
779 /* "(?<=a)" => (assert (lookbehind a))
780 "(?<!a)" => (nassert (lookbehind a)) */
781 item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
782 PUSH(SCM_LIST2(SCM_EQ(token, SCM_SYM_LOOKBEHIND)? SCM_SYM_ASSERT : SCM_SYM_NASSERT,
783 Scm_Cons(SCM_SYM_LOOKBEHIND, item)));
784 continue;
785 }
786 if (SCM_EQ(token, SCM_SYM_STAR) || SCM_EQ(token, SCM_SYM_STARP)) {
787 /* "x*" => (rep 0 #f x)
788 "x*+" => (once (rep 0 #f x)) */
789 if (SCM_NULLP(stack)) goto synerr;
790 item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(0), SCM_FALSE, SCM_CAR(stack));
791 PUSH1(SCM_EQ(token, SCM_SYM_STAR) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
792 bolp = FALSE;
793 continue;
794 }
795 if (SCM_EQ(token, SCM_SYM_STARQ)) {
796 /* "x*?" => (rep-min 0 #f x) */
797 if (SCM_NULLP(stack)) goto synerr;
798 item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(0), SCM_FALSE, SCM_CAR(stack));
799 PUSH1(item);
800 bolp = FALSE;
801 continue;
802 }
803 if (SCM_EQ(token, SCM_SYM_PLUS) || SCM_EQ(token, SCM_SYM_PLUSP)) {
804 /* "x+" => (rep 1 #f x))
805 "x++" => (once (rep 1 #f x)) */
806 if (SCM_NULLP(stack)) goto synerr;
807 item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(1), SCM_FALSE, SCM_CAR(stack));
808 PUSH1(SCM_EQ(token, SCM_SYM_PLUS) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
809 bolp = FALSE;
810 continue;
811 }
812 if (SCM_EQ(token, SCM_SYM_PLUSQ)) {
813 /* "x+?" => (rep-min 1 #f x) */
814 if (SCM_NULLP(stack)) goto synerr;
815 item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(1), SCM_FALSE, SCM_CAR(stack));
816 PUSH1(item);
817 bolp = FALSE;
818 continue;
819 }
820 if (SCM_EQ(token, SCM_SYM_QUESTION) || SCM_EQ(token, SCM_SYM_QUESTIONP)) {
821 /* "x?" => (rep 0 1 x)
822 "x?+" => (once (rep 0 1 x)) */
823 if (SCM_NULLP(stack)) goto synerr;
824 item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(0), SCM_MAKE_INT(1), SCM_CAR(stack));
825 PUSH1(SCM_EQ(token, SCM_SYM_QUESTION) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
826 bolp = FALSE;
827 continue;
828 }
829 if (SCM_EQ(token, SCM_SYM_QUESTIONQ)) {
830 /* "x??" => (rep-min 0 1 x) */
831 if (SCM_NULLP(stack)) goto synerr;
832 item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(0), SCM_MAKE_INT(1), SCM_CAR(stack));
833 PUSH1(item);
834 bolp = FALSE;
835 continue;
836 }
837 if (SCM_EQ(token, SCM_SYM_CPAT)) {
838 ScmObj cond, ypat, npat;
839 ScmObj new_groups = Scm_Cons(SCM_FALSE, groups);
840 cond = rc1_lex_conditional_pattern(ctx, bolp, new_groups);
841 item = rc1_parse(ctx, bolp, new_groups);
842 if (SCM_PAIRP(item) && SCM_PAIRP(SCM_CAR(item))
843 && SCM_EQ(SCM_CAAR(item), SCM_SYM_ALT)) {
844 ScmObj elt = SCM_CAR(item);
845 if (Scm_Length(elt) > 3) {
846 Scm_Error("Conditional pattern contains too much branches: %S",
847 ctx->pattern);
848 }
849 ypat = SCM_LIST1(SCM_CADR(elt));
850 npat = SCM_CDDR(elt);
851 } else {
852 ypat = item;
853 npat = SCM_NIL;
854 }
855 PUSH(SCM_LIST4(SCM_SYM_CPAT, cond, ypat, npat));
856 bolp = FALSE;
857 continue;
858 }
859 if (SCM_PAIRP(token) && SCM_EQ(SCM_CAR(token), SCM_SYM_REG)) {
860 /* "(?P<name>x)" => (<integer> name . <ast>)) */
861 int grpno = ++ctx->grpcount;
862 ScmObj name = SCM_CDR(token);
863 item = rc1_parse(ctx, bolp, Scm_Cons(SCM_MAKE_INT(grpno), groups));
864 PUSH(Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(name, item)));
865 ctx->rx->grpNames = Scm_Acons(name, SCM_MAKE_INT(grpno), ctx->rx->grpNames);
866 bolp = FALSE;
867 continue;
868 }
869 if (SCM_PAIRP(token) &&
870 (SCM_EQ(SCM_CAR(token), SCM_SYM_REP) ||
871 SCM_EQ(SCM_CAR(token), SCM_SYM_REP_MIN))) {
872 /* "x{n}" => (rep n n x)
873 "x{n,}" => (rep n #f x)
874 "x{n,m}" => (rep n m x)
875 "x{n,}?" => (rep-min n #t x)
876 "x{n,m}?" => (rep-min n m x) */
877 if (SCM_NULLP(stack)) goto synerr;
878 item = SCM_LIST4(SCM_CAR(token), SCM_CADR(token), SCM_CDDR(token), SCM_CAR(stack));
879 PUSH1(item);
880 bolp = FALSE;
881 continue;
882 }
883 if (SCM_PAIRP(token) && SCM_EQ(SCM_CAR(token), SCM_SYM_BACKREF)) {
884 ScmObj h = SCM_NIL, t = SCM_NIL;
885 ScmObj ref = SCM_CDR(token);
886 if (SCM_INTP(ref)) {
887 int grpno = SCM_INT_VALUE(ref);
888 if (ctx->grpcount < grpno
889 || !SCM_FALSEP(Scm_Memv(SCM_MAKE_INT(grpno), groups))) {
890 Scm_Error("Backreference \\%d refers to an unfinished group.",
891 grpno);
892 }
893 PUSH(token);
894 bolp = FALSE;
895 continue;
896 }
897
898 SCM_ASSERT(SCM_SYMBOLP(ref));
899 ScmObj ep;
900 SCM_FOR_EACH(ep, ctx->rx->grpNames) {
901 if (!SCM_EQ(SCM_CAAR(ep), ref)) continue;
902 SCM_APPEND1(h, t, Scm_Cons(SCM_SYM_BACKREF, SCM_CDAR(ep)));
903 }
904 if (SCM_NULLP(h)) goto synerr;
905
906 PUSH(SCM_NULLP(SCM_CDR(h))? SCM_CAR(h) : Scm_Cons(SCM_SYM_ALT, h));
907 bolp = FALSE;
908 continue;
909 }
910 PUSH(token);
911 bolp = FALSE;
912 continue;
913 }
914 if (SCM_NULLP(alts)) {
915 return Scm_ReverseX(stack);
916 } else {
917 alts = Scm_Cons(Scm_ReverseX(stack), alts);
918 return SCM_LIST1(rc1_fold_alts(ctx, alts));
919 }
920 synerr:
921 Scm_Error("bad regexp syntax in %S", ctx->pattern);
922 return SCM_UNDEFINED; /* dummy */
923 #undef PUSH
924 #undef PUSH1
925 }
926
rc1(regcomp_ctx * ctx)927 static ScmObj rc1(regcomp_ctx *ctx)
928 {
929 ScmObj ast = rc1_parse(ctx, TRUE, SCM_NIL);
930 if (ctx->casefoldp) {
931 ast = SCM_LIST3(SCM_MAKE_INT(0), SCM_FALSE,
932 Scm_Cons(SCM_SYM_SEQ_UNCASE, ast));
933 } else {
934 ast = Scm_Cons(SCM_MAKE_INT(0), Scm_Cons(SCM_FALSE, ast));
935 }
936 int ngrp = ctx->grpcount + 1;
937 ctx->rx->numGroups = ngrp;
938 return ast;
939 }
940
rc_casefold(ScmObj * set,int complement)941 static void rc_casefold(ScmObj *set, int complement)
942 {
943 int set1 = SCM_CHAR_SET_LOWER;
944 int set2 = SCM_CHAR_SET_UPPER;
945 int set3 = SCM_CHAR_SET_TITLE;
946 int set4 = SCM_CHAR_SET_LC;
947 if (complement) {
948 set1 = -set1;
949 set2 = -set2;
950 set3 = -set3;
951 set4 = -set4;
952 }
953
954 ScmCharSet* cur = SCM_CHAR_SET(*set);
955 /* NB: Scm_CharSetEq can be slow. This is only called during regexp
956 compilation, so we expect this isn't too bad, but if this ever gets
957 a bottle-neck, we can add an extra field in ScmCharSet to indicate
958 that it's one of the predefined charset. */
959 if (SCM_CHAR_SET_LARGE_P(cur) &&
960 (Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set1))) ||
961 Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set2))) ||
962 Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set3))))) {
963 *set = Scm_MakeEmptyCharSet();
964 Scm_CharSetAdd(SCM_CHAR_SET(*set),
965 SCM_CHAR_SET(Scm_GetStandardCharSet(set4)));
966 }
967
968 Scm_CharSetCaseFold(SCM_CHAR_SET(*set));
969 }
970
971 /* character range */
rc_charset(regcomp_ctx * ctx)972 static ScmObj rc_charset(regcomp_ctx *ctx)
973 {
974 int complement;
975 ScmObj set = Scm_CharSetRead(ctx->ipat, &complement, FALSE, TRUE);
976 if (!SCM_CHAR_SET_P(set)) {
977 Scm_Error("bad charset spec in pattern: %S", ctx->pattern);
978 }
979 if (ctx->casefoldp) {
980 rc_casefold(&set, complement);
981 }
982 Scm_CharSetFreezeX(SCM_CHAR_SET(set));
983 rc_register_charset(ctx, SCM_CHAR_SET(set));
984 if (complement) {
985 return Scm_Cons(SCM_SYM_COMP, SCM_OBJ(set));
986 } else {
987 return SCM_OBJ(set);
988 }
989 }
990
991 /* Remember charset so that we can construct charset vector later */
rc_register_charset(regcomp_ctx * ctx,ScmCharSet * cs)992 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs)
993 {
994 if (SCM_FALSEP(Scm_Memq(SCM_OBJ(cs), ctx->sets))) {
995 ctx->sets = Scm_Cons(SCM_OBJ(cs), ctx->sets);
996 }
997 }
998
999 /* An interlude between pass1 and pass2. From the information of
1000 parser context, build a charset vector. */
rc_setup_charsets(ScmRegexp * rx,regcomp_ctx * ctx)1001 static void rc_setup_charsets(ScmRegexp *rx, regcomp_ctx *ctx)
1002 {
1003 rx->numSets = Scm_Length(ctx->sets);
1004 rx->sets = SCM_NEW_ARRAY(ScmCharSet*, rx->numSets);
1005 ScmObj cp = Scm_Reverse(ctx->sets);
1006 for (int i=0; !SCM_NULLP(cp); cp = SCM_CDR(cp)) {
1007 rx->sets[i++] = SCM_CHAR_SET(SCM_CAR(cp));
1008 }
1009 }
1010
1011 /*-------------------------------------------------------------
1012 * pass 2: optimizer
1013 *
1014 * - flattening nested sequences: (seq a (seq b) c) => (seq a b c)
1015 * - introduces short-cut construct for certain cases.
1016 * (... (rep <m> <n> #\a) #\b ...)
1017 * => (... (rep-while <m> <n> #\a) #\b ...)
1018 */
1019 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest);
1020 static int is_distinct(ScmObj x, ScmObj y);
1021
rc2_optimize_seq(ScmObj seq,ScmObj rest)1022 static ScmObj rc2_optimize_seq(ScmObj seq, ScmObj rest)
1023 {
1024 if (!SCM_PAIRP(seq)) return seq;
1025 ScmObj opted;
1026 ScmObj elt = SCM_CAR(seq);
1027 ScmObj tail = rc2_optimize_seq(SCM_CDR(seq), rest);
1028 rest = SCM_NULLP(tail)? rest : tail;
1029 if (!SCM_PAIRP(elt) || SCM_EQ(SCM_CAR(elt), SCM_SYM_COMP)) {
1030 if (SCM_EQ(tail, SCM_CDR(seq))) return seq;
1031 else return Scm_Cons(elt, tail);
1032 }
1033 ScmObj etype = SCM_CAR(elt);
1034 if (SCM_EQ(etype, SCM_SYM_SEQ)) {
1035 return Scm_Append2(rc2_optimize_seq(SCM_CDR(elt), rest), tail);
1036 }
1037 if (SCM_EQ(etype, SCM_SYM_REP)) {
1038 /* If the head of repeating sequence and the beginning of the
1039 following sequence are distinct, like #/\s*foo/, the branch
1040 becomes deterministic (i.e. we don't need backtrack). */
1041 ScmObj repbody = rc2_optimize_seq(SCM_CDR(SCM_CDDR(elt)), rest);
1042 if (SCM_NULLP(repbody)) {
1043 /* This is the case that an empy string (?:) is repeated.
1044 it always matches, so we collapse entire thing to (). */
1045 return tail;
1046 }
1047 if (SCM_NULLP(rest) || is_distinct(SCM_CAR(repbody), SCM_CAR(rest))) {
1048 ScmObj elt2 = Scm_Append2(SCM_LIST3(SCM_SYM_REP_WHILE, SCM_CADR(elt),
1049 SCM_CAR(SCM_CDDR(elt))),
1050 repbody);
1051 return Scm_Cons(elt2, tail);
1052 }
1053 if (SCM_EQ(repbody, SCM_CDR(SCM_CDDR(elt)))) opted = elt;
1054 else opted = Scm_Append2(SCM_LIST3(SCM_SYM_REP, SCM_CADR(elt),
1055 SCM_CAR(SCM_CDDR(elt))),
1056 repbody);
1057 } else {
1058 opted = rc2_optimize(elt, rest);
1059 }
1060 if (SCM_EQ(elt, opted) && SCM_EQ(tail, SCM_CDR(seq))) return seq;
1061 else return Scm_Cons(opted, tail);
1062 }
1063
rc2_optimize(ScmObj ast,ScmObj rest)1064 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest)
1065 {
1066 if (!SCM_PAIRP(ast)) return ast;
1067 ScmObj seq, seqo;
1068 ScmObj type = SCM_CAR(ast);
1069 if (SCM_EQ(type, SCM_SYM_COMP)) return ast;
1070 if (SCM_EQ(type, SCM_SYM_LOOKBEHIND)) return ast;
1071
1072 if (SCM_EQ(type, SCM_SYM_ALT)) {
1073 ScmObj sp, sp2, e = SCM_UNBOUND, h, t;
1074 SCM_FOR_EACH(sp, SCM_CDR(ast)) {
1075 e = rc2_optimize(SCM_CAR(sp), rest);
1076 if (!SCM_EQ(e, SCM_CAR(sp))) break;
1077 }
1078 if (SCM_NULLP(sp)) return ast;
1079 /* need to copy the spine */
1080 h = t = SCM_NIL;
1081 SCM_FOR_EACH(sp2, SCM_CDR(ast)) {
1082 if (SCM_EQ(sp, sp2)) { SCM_APPEND1(h, t, e); break; }
1083 SCM_APPEND1(h, t, SCM_CAR(sp2));
1084 }
1085 SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
1086 SCM_APPEND1(h, t, rc2_optimize(SCM_CAR(sp2), rest));
1087 }
1088 return Scm_Cons(SCM_SYM_ALT, h);
1089 }
1090 if (SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)
1091 || SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1092 seq = SCM_CADR(SCM_CDDR(ast));
1093 seqo = rc2_optimize_seq(seq, rest);
1094 if (SCM_EQ(seq, seqo)) return ast;
1095 return SCM_LIST4(type, SCM_CADR(ast), SCM_CAR(SCM_CDDR(ast)), seqo);
1096 }
1097 seq = SCM_CDR(ast);
1098 seqo = rc2_optimize_seq(seq, rest);
1099 if (SCM_EQ(seq, seqo)) return ast;
1100 return Scm_Cons(type, seqo);
1101 }
1102
is_distinct(ScmObj x,ScmObj y)1103 static int is_distinct(ScmObj x, ScmObj y)
1104 {
1105 if (SCM_PAIRP(x)) {
1106 ScmObj carx = SCM_CAR(x);
1107 if (SCM_EQ(carx, SCM_SYM_COMP)) {
1108 SCM_ASSERT(SCM_CHAR_SET_P(SCM_CDR(x)));
1109 if (SCM_CHARP(y) || SCM_CHAR_SET_P(y)) {
1110 return !is_distinct(SCM_CDR(x), y);
1111 }
1112 return FALSE;
1113 }
1114 if (SCM_INTP(carx)) {
1115 if (SCM_PAIRP(SCM_CDDR(x))) {
1116 return is_distinct(SCM_CAR(SCM_CDDR(x)), y);
1117 }
1118 }
1119 if (SCM_EQ(carx, SCM_SYM_SEQ_UNCASE)
1120 || SCM_EQ(carx, SCM_SYM_SEQ_CASE)) {
1121 if (SCM_PAIRP(SCM_CDR(x))) {
1122 return is_distinct(SCM_CADR(x), y);
1123 }
1124 }
1125 return FALSE;
1126 }
1127 if (SCM_CHARP(x)) {
1128 if (SCM_CHARP(y)) return !SCM_EQ(x, y);
1129 return is_distinct(y, x);
1130 }
1131 if (SCM_CHAR_SET_P(x)) {
1132 if (SCM_CHARP(y)) {
1133 return !Scm_CharSetContains(SCM_CHAR_SET(x), SCM_CHAR_VALUE(y));
1134 }
1135 if (SCM_CHAR_SET_P(y)) {
1136 ScmObj ccs = Scm_CharSetCopy(SCM_CHAR_SET(y));
1137 ccs = Scm_CharSetComplement(SCM_CHAR_SET(ccs));
1138 return Scm_CharSetLE(SCM_CHAR_SET(x), SCM_CHAR_SET(ccs));
1139 }
1140 return is_distinct(y, x);
1141 }
1142 return FALSE;
1143 }
1144
Scm_RegOptimizeAST(ScmObj ast)1145 ScmObj Scm_RegOptimizeAST(ScmObj ast)
1146 {
1147 return rc2_optimize(ast, SCM_NIL);
1148 }
1149
1150 /*-------------------------------------------------------------
1151 * pass 3 - code generation
1152 * This pass actually called twice; the first run counts
1153 * the size of the bytecode, and the second run fills
1154 * the bytecode. EMITP == FALSE for the first, EMITP == TRUE
1155 * for the second.
1156 * LASTP indicates this call is dealing with the last part of
1157 * the compiled tree, thus need to deal with EOL marker.
1158 */
1159
1160 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp);
1161
1162 /* Util function for pass3, to get an index of the charset vector
1163 * for the given charset.
1164 */
rc3_charset_index(ScmRegexp * rx,ScmObj cs)1165 static int rc3_charset_index(ScmRegexp *rx, ScmObj cs)
1166 {
1167 for (int i=0; i<rx->numSets; i++)
1168 if (cs == SCM_OBJ(rx->sets[i])) return i;
1169 Scm_Panic("rc3_charset_index: can't be here");
1170 return 0; /* dummy */
1171 }
1172
rc3_emit(regcomp_ctx * ctx,char code)1173 static void rc3_emit(regcomp_ctx *ctx, char code)
1174 {
1175 if (ctx->emitp) {
1176 SCM_ASSERT(ctx->codep < ctx->codemax);
1177 ctx->code[ctx->codep++] = code;
1178 } else {
1179 ctx->codemax++;
1180 }
1181 }
1182
rc3_emit_offset(regcomp_ctx * ctx,int offset)1183 static void rc3_emit_offset(regcomp_ctx *ctx, int offset)
1184 {
1185 if (offset > REGEXP_OFFSET_MAX) {
1186 Scm_Error("regexp too large. consider splitting it up: %50.1S",
1187 SCM_OBJ(ctx->rx));
1188 }
1189
1190 if (ctx->emitp) {
1191 SCM_ASSERT(ctx->codep < ctx->codemax-1);
1192 ctx->code[ctx->codep++] = (offset>>8) & 0xff;
1193 ctx->code[ctx->codep++] = offset & 0xff;
1194 } else {
1195 ctx->codemax+=2;
1196 }
1197 }
1198
rc3_fill_offset(regcomp_ctx * ctx,int codep,int offset)1199 static void rc3_fill_offset(regcomp_ctx *ctx, int codep, int offset)
1200 {
1201 if (offset > REGEXP_OFFSET_MAX) {
1202 Scm_Error("regexp too large. consider splitting it up: %50.1S",
1203 SCM_OBJ(ctx->rx));
1204 }
1205
1206 if (ctx->emitp) {
1207 SCM_ASSERT(codep < ctx->codemax-1);
1208 ctx->code[codep] = (offset >> 8) & 0xff;
1209 ctx->code[codep+1] = offset & 0xff;
1210 }
1211 }
1212
1213 #define EMIT4(cond, insn1, insn2, insn3, insn4) \
1214 rc3_emit(ctx, (cond)? (!ctx->lookbehindp)? insn1 : insn2 \
1215 : (!ctx->lookbehindp)? insn3 : insn4)
1216
rc3_seq(regcomp_ctx * ctx,ScmObj seq,int lastp)1217 static void rc3_seq(regcomp_ctx *ctx, ScmObj seq, int lastp)
1218 {
1219 ScmObj cp;
1220
1221 if (ctx->lookbehindp) seq = Scm_Reverse(seq);
1222
1223 SCM_FOR_EACH(cp, seq) {
1224 ScmObj item = SCM_CAR(cp);
1225
1226 /* concatenate literal character sequence */
1227 if (SCM_CHARP(item)) {
1228 ScmObj h = SCM_NIL, t = SCM_NIL;
1229 int nrun = 0;
1230 char chbuf[SCM_CHAR_MAX_BYTES];
1231
1232 do {
1233 ScmChar ch = SCM_CHAR_VALUE(item);
1234 nrun += SCM_CHAR_NBYTES(ch);
1235 SCM_APPEND1(h, t, item);
1236 cp = SCM_CDR(cp);
1237 if (SCM_NULLP(cp)) break;
1238 item = SCM_CAR(cp);
1239 } while (SCM_CHARP(item) && nrun < CHAR_MAX);
1240 if (ctx->lookbehindp) h = Scm_ReverseX(h);
1241 if (nrun == 1) {
1242 EMIT4(!ctx->casefoldp, RE_MATCH1, RE_MATCH1_RL, RE_MATCH1_CI, RE_MATCH1_CI_RL);
1243 rc3_emit(ctx, (char)SCM_CHAR_VALUE(SCM_CAR(h)));
1244 } else {
1245 EMIT4(!ctx->casefoldp, RE_MATCH, RE_MATCH_RL, RE_MATCH_CI, RE_MATCH_CI_RL);
1246 rc3_emit(ctx, (char)nrun);
1247 ScmObj ht;
1248 SCM_FOR_EACH(ht, h) {
1249 ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(ht));
1250 int nb = SCM_CHAR_NBYTES(ch);
1251 SCM_CHAR_PUT(chbuf, ch);
1252 for (int i = 0; i < nb; i++) rc3_emit(ctx, chbuf[i]);
1253 }
1254 }
1255 if (SCM_NULLP(cp)) break;
1256 cp = Scm_Cons(item, cp); /* pushback */
1257 } else {
1258 int p;
1259 if (ctx->lookbehindp) p = lastp && SCM_EQ(cp, seq);
1260 else p = lastp && SCM_NULLP(SCM_CDR(cp));
1261 rc3_rec(ctx, item, p);
1262 }
1263 }
1264 }
1265
rc3_seq_rep(regcomp_ctx * ctx,ScmObj seq,int count,int lastp)1266 static void rc3_seq_rep(regcomp_ctx *ctx, ScmObj seq, int count, int lastp)
1267 {
1268 ScmObj h = SCM_NIL, t = SCM_NIL;
1269 if (count <= 0) return;
1270 while (count-- > 0) {
1271 SCM_APPEND(h, t, Scm_CopyList(seq));
1272 }
1273 rc3_seq(ctx, h, lastp);
1274 }
1275
rc3_minmax(regcomp_ctx * ctx,ScmObj type,int count,ScmObj item,int lastp SCM_UNUSED)1276 static void rc3_minmax(regcomp_ctx *ctx, ScmObj type, int count,
1277 ScmObj item, int lastp SCM_UNUSED)
1278 {
1279 /* (rep <n> . <x>)
1280 TRY #01
1281 JUMP #11
1282 #01: TRY #02
1283 JUMP #12
1284 :
1285 #0<n>:JUMP #1<N>
1286 #11: <X>
1287 #12: <X>
1288 :
1289 #1<n>:<X>
1290 #1<N>:
1291
1292 (rep-min <n> . <x>)
1293 TRY #01
1294 JUMP #1N
1295 #01: TRY #02
1296 JUMP #1n
1297 :
1298 #0<n>:TRY #11
1299 JUMP #12
1300 #11: <X>
1301 #12: <X>
1302 :
1303 #1<n>:<X>
1304 #1<N>:
1305 */
1306 ScmObj jlist = SCM_NIL;
1307 int j0 = 0, jn;
1308 int greedy = SCM_EQ(type, SCM_SYM_REP);
1309
1310 /* first part - TRYs and JUMPs
1311 j0 is used to patch the label #0k
1312 the destination of jumps to be patched are linked to jlist */
1313 for (int n=0; n<count; n++) {
1314 if (n>0) rc3_fill_offset(ctx, j0, ctx->codep);
1315 rc3_emit(ctx, RE_TRY);
1316 if (ctx->emitp) j0 = ctx->codep;
1317 rc3_emit_offset(ctx, 0); /* to be patched */
1318 rc3_emit(ctx, RE_JUMP);
1319 if (ctx->emitp) {
1320 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1321 }
1322 rc3_emit_offset(ctx, 0); /* to be patched */
1323 }
1324 rc3_fill_offset(ctx, j0, ctx->codep); /* patch #0n */
1325 /* finishing the first part.
1326 for non-greedy match, we need one more TRY. */
1327 if (greedy) {
1328 rc3_emit(ctx, RE_JUMP);
1329 jn = ctx->codep;
1330 rc3_emit_offset(ctx, 0); /* to be patched */
1331 } else {
1332 rc3_emit(ctx, RE_TRY);
1333 jn = ctx->codep;
1334 rc3_emit_offset(ctx, 0); /* to be patched */
1335 rc3_emit(ctx, RE_JUMP);
1336 if (ctx->emitp) {
1337 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1338 }
1339 rc3_emit_offset(ctx, 0); /* to be patched */
1340 rc3_fill_offset(ctx, jn, ctx->codep);
1341 }
1342 if (ctx->emitp && greedy) jlist = Scm_ReverseX(jlist);
1343 for (int n=0; n<count; n++) {
1344 if (ctx->emitp) {
1345 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)),
1346 ctx->codep);
1347 }
1348 rc3_seq(ctx, item, FALSE);
1349 if (ctx->emitp) jlist = SCM_CDR(jlist);
1350 }
1351 if (greedy) {
1352 /* the last JUMP to #1N */
1353 rc3_fill_offset(ctx, jn, ctx->codep);
1354 } else {
1355 /* the first JUMP to #1N */
1356 if (ctx->emitp) {
1357 SCM_ASSERT(SCM_PAIRP(jlist));
1358 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)), ctx->codep);
1359 }
1360 }
1361 }
1362
rc3_rec(regcomp_ctx * ctx,ScmObj ast,int lastp)1363 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp)
1364 {
1365 ScmRegexp *rx = ctx->rx;
1366
1367 /* first, deal with atoms */
1368 if (!SCM_PAIRP(ast)) {
1369 /* a char */
1370 if (SCM_CHARP(ast)) {
1371 char chbuf[SCM_CHAR_MAX_BYTES];
1372 ScmChar ch = SCM_CHAR_VALUE(ast);
1373 int nb = SCM_CHAR_NBYTES(ch);
1374 SCM_CHAR_PUT(chbuf, ch);
1375 if (nb == 1) {
1376 EMIT4(!ctx->casefoldp, RE_MATCH1, RE_MATCH1_RL, RE_MATCH1_CI, RE_MATCH1_CI_RL);
1377 rc3_emit(ctx, chbuf[0]);
1378 } else {
1379 EMIT4(!ctx->casefoldp, RE_MATCH, RE_MATCH_RL, RE_MATCH_CI, RE_MATCH_CI_RL);
1380 rc3_emit(ctx, nb);
1381 for (int i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
1382 }
1383 return;
1384 }
1385 /* charset */
1386 if (SCM_CHAR_SET_P(ast)) {
1387 EMIT4(SCM_CHAR_SET_LARGE_P(ast),
1388 RE_SET, RE_SET_RL, RE_SET1, RE_SET1_RL);
1389 rc3_emit(ctx, rc3_charset_index(rx, ast));
1390 return;
1391 }
1392 /* special stuff */
1393 if (SCM_SYMBOLP(ast)) {
1394 if (SCM_EQ(ast, SCM_SYM_ANY)) {
1395 rc3_emit(ctx, ctx->lookbehindp?RE_ANY_RL:RE_ANY);
1396 return;
1397 }
1398 if (SCM_EQ(ast, SCM_SYM_BOS)) {
1399 rc3_emit(ctx, RE_BOS);
1400 return;
1401 }
1402 if (SCM_EQ(ast, SCM_SYM_EOS)) {
1403 rc3_emit(ctx, RE_EOS);
1404 return;
1405 }
1406 if (SCM_EQ(ast, SCM_SYM_BOL)) {
1407 rc3_emit(ctx, RE_BOL);
1408 return;
1409 }
1410 if (SCM_EQ(ast, SCM_SYM_EOL)) {
1411 if (lastp || (ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) {
1412 rc3_emit(ctx, RE_EOL);
1413 } else {
1414 rc3_emit(ctx, ctx->lookbehindp? RE_MATCH1_RL:RE_MATCH1);
1415 rc3_emit(ctx, '$');
1416 }
1417 return;
1418 }
1419 if (SCM_EQ(ast, SCM_SYM_WB)) {
1420 rc3_emit(ctx, RE_WB);
1421 return;
1422 }
1423 if (SCM_EQ(ast, SCM_SYM_BOW)) {
1424 rc3_emit(ctx, RE_BOW);
1425 return;
1426 }
1427 if (SCM_EQ(ast, SCM_SYM_EOW)) {
1428 rc3_emit(ctx, RE_EOW);
1429 return;
1430 }
1431 if (SCM_EQ(ast, SCM_SYM_NWB)) {
1432 rc3_emit(ctx, RE_NWB);
1433 return;
1434 }
1435 if (SCM_EQ(ast, SCM_SYM_BOG)) {
1436 rc3_emit(ctx, RE_BOG);
1437 return;
1438 }
1439 if (SCM_EQ(ast, SCM_SYM_EOG)) {
1440 rc3_emit(ctx, RE_EOG);
1441 return;
1442 }
1443 /* fallback */
1444 }
1445 Scm_Error("internal error in regexp compilation: unrecognized AST item: %S", ast);
1446 }
1447
1448 /* now we have a structured node */
1449 ScmObj type = SCM_CAR(ast);
1450 if (SCM_EQ(type, SCM_SYM_COMP)) {
1451 ScmObj cs = SCM_CDR(ast);
1452 SCM_ASSERT(SCM_CHAR_SET_P(cs));
1453 EMIT4(SCM_CHAR_SET_LARGE_P(cs),
1454 RE_NSET, RE_NSET_RL, RE_NSET1, RE_NSET1_RL);
1455 rc3_emit(ctx, rc3_charset_index(rx, cs));
1456 return;
1457 }
1458 if (SCM_EQ(type, SCM_SYM_SEQ)) {
1459 rc3_seq(ctx, SCM_CDR(ast), lastp);
1460 return;
1461 }
1462 if (SCM_INTP(type)) {
1463 /* (<integer> <name> . <ast>) */
1464 int grpno = SCM_INT_VALUE(SCM_CAR(ast));
1465 rc3_emit(ctx, ctx->lookbehindp?RE_BEGIN_RL:RE_BEGIN);
1466 rc3_emit(ctx, grpno);
1467 rc3_seq(ctx, SCM_CDDR(ast), lastp);
1468 rc3_emit(ctx, ctx->lookbehindp?RE_END_RL:RE_END);
1469 rc3_emit(ctx, grpno);
1470 return;
1471 }
1472 if (SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1473 int oldcase = ctx->casefoldp;
1474 ctx->casefoldp = SCM_EQ(type, SCM_SYM_SEQ_UNCASE);
1475 rc3_seq(ctx, SCM_CDR(ast), lastp);
1476 ctx->casefoldp = oldcase;
1477 return;
1478 }
1479 if (SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1480 /* here we have an opportunity to generate an optimized code.
1481 for now, we only check elem is a single item case, but we can
1482 do better. */
1483 /* (rep-while m n . elem) */
1484 ScmObj m = SCM_CADR(ast), n = SCM_CAR(SCM_CDDR(ast));
1485 ScmObj elem = SCM_CDR(SCM_CDDR(ast));
1486 if (SCM_FALSEP(n) && SCM_PAIRP(elem) && SCM_NULLP(SCM_CDR(elem))) {
1487 /* (rep-while m #f elem1) */
1488 ScmObj elem1 = SCM_CAR(elem);
1489 if (SCM_EQ(elem1, SCM_SYM_ANY) && !ctx->lookbehindp) {
1490 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1491 rc3_emit(ctx, RE_ANYR);
1492 return;
1493 }
1494 if (SCM_CHARP(elem1) && !ctx->lookbehindp) {
1495 ScmChar ch = SCM_CHAR_VALUE(elem1);
1496 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1497 int n = SCM_CHAR_NBYTES(ch);
1498 if (n == 1) {
1499 rc3_emit(ctx, RE_MATCH1R);
1500 rc3_emit(ctx, (char)ch);
1501 } else {
1502 char chbuf[SCM_CHAR_MAX_BYTES];
1503 SCM_CHAR_PUT(chbuf, ch);
1504 rc3_emit(ctx, RE_MATCHR);
1505 rc3_emit(ctx, (char)n); /* we know it's never overflow */
1506 for (int i=0; i < n; i++) rc3_emit(ctx, chbuf[i]);
1507 }
1508 return;
1509 }
1510 if (SCM_CHAR_SET_P(elem1)) {
1511 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1512 EMIT4(SCM_CHAR_SET_LARGE_P(elem1),
1513 RE_SETR, RE_SETR_RL, RE_SET1R, RE_SET1R_RL);
1514 rc3_emit(ctx, rc3_charset_index(rx, elem1));
1515 return;
1516 }
1517 if (SCM_PAIRP(elem1)&&SCM_EQ(SCM_CAR(elem1), SCM_SYM_COMP)) {
1518 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1519 ScmObj cs = SCM_CDR(elem1);
1520 SCM_ASSERT(SCM_CHAR_SET_P(cs));
1521 EMIT4(!ctx->lookbehindp, RE_NSETR, RE_NSETR_RL, RE_NSET1R, RE_NSET1R_RL);
1522 rc3_emit(ctx, rc3_charset_index(rx, cs));
1523 return;
1524 }
1525 }
1526 /* fallthrough to rep */
1527 type = SCM_SYM_REP;
1528 }
1529 if (SCM_EQ(type, SCM_SYM_ONCE) && ctx->lookbehindp) {
1530 /* [Rui] I couldn't make a decision about the behavior of standalone
1531 pattern (?>re) within a lookbehind assertion ((?<=re) or
1532 (?<!re)). It raises an error for now. */
1533 Scm_Error("standalone pattern in lookbehind assertion is not supported: %S",
1534 ctx->pattern);
1535 }
1536 if (SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)
1537 || SCM_EQ(type, SCM_SYM_ONCE)) {
1538 int ocodep = ctx->codep;
1539 int op = SCM_EQ(type, SCM_SYM_ASSERT) ? RE_ASSERT :
1540 SCM_EQ(type, SCM_SYM_NASSERT) ? RE_NASSERT : RE_ONCE;
1541 rc3_emit(ctx, op);
1542 rc3_emit_offset(ctx, 0); /* will be patched */
1543 /* Assertions can check EOF even other regexps follow, so '$'
1544 in the last pos of this group should be treated as EOL.
1545 (?>$) as well. It is consistent with Perl and Oniguruma. */
1546 rc3_seq(ctx, SCM_CDR(ast), TRUE);
1547 rc3_emit(ctx, RE_SUCCESS);
1548 rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1549 return;
1550 }
1551 if (SCM_EQ(type, SCM_SYM_ALT)) {
1552 /* TRY #1
1553 <alt0>
1554 JUMP next
1555 #1: TRY #2
1556 <alt1>
1557 JUMP next
1558 :
1559 :
1560 TRY next
1561 <altN>
1562 next:
1563 */
1564 ScmObj clause;
1565 ScmObj jumps = SCM_NIL;
1566 int patchp;
1567
1568 if (SCM_PAIRP(SCM_CDR(ast))) {
1569 for (clause = SCM_CDR(ast);
1570 SCM_PAIRP(SCM_CDR(clause));
1571 clause = SCM_CDR(clause)) {
1572 rc3_emit(ctx, RE_TRY);
1573 patchp = ctx->codep;
1574 rc3_emit_offset(ctx, 0); /* will be patched */
1575 rc3_rec(ctx, SCM_CAR(clause), lastp);
1576 rc3_emit(ctx, RE_JUMP);
1577 if (ctx->emitp) {
1578 jumps = Scm_Cons(SCM_MAKE_INT(ctx->codep), jumps);
1579 }
1580 rc3_emit_offset(ctx, 0); /* will be patched */
1581 rc3_fill_offset(ctx, patchp, ctx->codep);
1582 }
1583 rc3_rec(ctx, SCM_CAR(clause), lastp);
1584 if (ctx->emitp) {
1585 SCM_FOR_EACH(jumps, jumps) {
1586 patchp = SCM_INT_VALUE(SCM_CAR(jumps));
1587 rc3_fill_offset(ctx, patchp, ctx->codep);
1588 }
1589 }
1590 } else {
1591 /* NB: alternation without any choices won't appear from the
1592 parsed AST, but the caller can pass in a programatically
1593 constructed AST. It fails unconditionally, since we have
1594 no possible choice. */
1595 rc3_emit(ctx, RE_FAIL);
1596 }
1597 return;
1598 }
1599 if (SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)) {
1600 ScmObj min = SCM_CADR(ast), max = SCM_CAR(SCM_CDDR(ast));
1601 ScmObj item = SCM_CDR(SCM_CDDR(ast));
1602 int multip = 0;
1603
1604 if (SCM_FALSEP(max) || SCM_INT_VALUE(max) > 1)
1605 multip = TRUE;
1606 rc3_seq_rep(ctx, item, SCM_INT_VALUE(min), multip);
1607
1608 if (SCM_EQ(min, max)) {
1609 /* (rep <m> <m> <x>)
1610 <x>
1611 : (m-1 times)
1612 */
1613 return;
1614 }
1615 if (!SCM_FALSEP(max)) {
1616 int count = SCM_INT_VALUE(max) - SCM_INT_VALUE(min);
1617 rc3_minmax(ctx, type, count, item, lastp);
1618 return;
1619 }
1620 if (SCM_EQ(type, SCM_SYM_REP)) {
1621 /* (rep <m> #f <x>)
1622 <x>
1623 : (m-1 times)
1624 rep: TRY next
1625 <x>
1626 JUMP rep
1627 next:
1628 */
1629 int ocodep = ctx->codep;
1630 rc3_emit(ctx, RE_TRY);
1631 rc3_emit_offset(ctx, 0); /* will be patched */
1632 rc3_seq(ctx, item, FALSE);
1633 rc3_emit(ctx, RE_JUMP);
1634 rc3_emit_offset(ctx, ocodep);
1635 rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1636 return;
1637 }
1638 if (SCM_EQ(type, SCM_SYM_REP_MIN)) {
1639 /* (rep-min <m> #f <x>)
1640 <x>
1641 : (m-1 times)
1642 rep: TRY seq
1643 JUMP next
1644 seq: <seq>
1645 JUMP rep
1646 next:
1647 */
1648 int ocodep1 = ctx->codep, ocodep2;
1649 rc3_emit(ctx, RE_TRY);
1650 rc3_emit_offset(ctx, 0); /* will be patched */
1651 ocodep2 = ctx->codep;
1652 rc3_emit(ctx, RE_JUMP);
1653 rc3_emit_offset(ctx, 0); /* will be patched */
1654 rc3_fill_offset(ctx, ocodep1+1, ctx->codep);
1655 rc3_seq(ctx, item, FALSE);
1656 rc3_emit(ctx, RE_JUMP);
1657 rc3_emit_offset(ctx, ocodep1);
1658 rc3_fill_offset(ctx, ocodep2+1, ctx->codep);
1659 return;
1660 }
1661 }
1662 if (SCM_EQ(type, SCM_SYM_LOOKBEHIND)) {
1663 int oldval = ctx->lookbehindp;
1664 ctx->lookbehindp = TRUE;
1665 rc3_seq(ctx, SCM_CDR(ast), lastp);
1666 ctx->lookbehindp = oldval;
1667 return;
1668 }
1669 if (SCM_EQ(type, SCM_SYM_BACKREF)) {
1670 SCM_ASSERT(SCM_INTP(SCM_CDR(ast)));
1671 EMIT4(!ctx->casefoldp, RE_BACKREF, RE_BACKREF_RL, RE_BACKREF_CI, RE_BACKREF_CI_RL);
1672 rc3_emit(ctx, (char)SCM_INT_VALUE(SCM_CDR(ast)));
1673 return;
1674 }
1675 if (SCM_EQ(type, SCM_SYM_CPAT)) {
1676 /* (cpat <n> <yes-pattern> <no-pattern>)
1677 CPAT <n> #1
1678 <yes-pattern>
1679 JUMP #2
1680 #1: <no-pattern>
1681 #2:
1682
1683 (cpat <assert> <yes-pattern> <no-pattern>)
1684 CPATA #1 #2
1685 <assert>
1686 SUCCESS
1687 #1: <yes-pattern>
1688 JUMP #3
1689 #2: <no-pattern>
1690 #3:
1691 */
1692 ScmObj cond = SCM_CADR(ast);
1693 ScmObj ypat = SCM_CAR(SCM_CDDR(ast));
1694 ScmObj npat = SCM_CADR(SCM_CDDR(ast));
1695 if (SCM_INTP(cond)) {
1696 rc3_emit(ctx, RE_CPAT);
1697 rc3_emit(ctx, (char)SCM_INT_VALUE(cond));
1698 int ocodep1 = ctx->codep;
1699 rc3_emit_offset(ctx, 0); /* will be patched */
1700 rc3_seq(ctx, ypat, lastp);
1701 rc3_emit(ctx, RE_JUMP);
1702 int ocodep2 = ctx->codep;
1703 rc3_emit_offset(ctx, 0); /* will be patched */
1704 rc3_fill_offset(ctx, ocodep1, ctx->codep);
1705 rc3_seq(ctx, npat, lastp);
1706 rc3_fill_offset(ctx, ocodep2, ctx->codep);
1707 } else {
1708 SCM_ASSERT(SCM_EQ(SCM_CAR(cond), SCM_SYM_ASSERT)
1709 || SCM_EQ(SCM_CAR(cond), SCM_SYM_NASSERT));
1710 rc3_emit(ctx, RE_CPATA);
1711 int ocodep1 = ctx->codep;
1712 rc3_emit_offset(ctx, 0); /* will be patched */
1713 int ocodep2 = ctx->codep;
1714 rc3_emit_offset(ctx, 0); /* will be patched */
1715 rc3_rec(ctx, cond, lastp);
1716 rc3_emit(ctx, RE_SUCCESS);
1717 rc3_fill_offset(ctx, ocodep1, ctx->codep);
1718 rc3_seq(ctx, ypat, lastp);
1719 rc3_emit(ctx, RE_JUMP);
1720 int ocodep3 = ctx->codep;
1721 rc3_emit_offset(ctx, 0); /* will be patched */
1722 rc3_fill_offset(ctx, ocodep2, ctx->codep);
1723 rc3_seq(ctx, npat, lastp);
1724 rc3_fill_offset(ctx, ocodep3, ctx->codep);
1725 }
1726 return;
1727 }
1728 Scm_Error("internal error in regexp compilation: bad node: %S", ast);
1729 }
1730
is_atom_anchored(ScmObj ast,ScmObj atom)1731 static int is_atom_anchored(ScmObj ast, ScmObj atom)
1732 {
1733 if (!SCM_PAIRP(ast)) {
1734 if (SCM_EQ(ast, atom)) return TRUE;
1735 else return FALSE;
1736 }
1737 ScmObj type = SCM_CAR(ast);
1738 if (SCM_INTP(type)) {
1739 if (!SCM_PAIRP(SCM_CDDR(ast))) return FALSE;
1740 return is_atom_anchored(SCM_CAR(SCM_CDDR(ast)), atom);
1741 } else if (SCM_EQ(type, SCM_SYM_SEQ)
1742 || SCM_EQ(type, SCM_SYM_SEQ_UNCASE)
1743 || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1744 if (!SCM_PAIRP(SCM_CDR(ast))) return FALSE;
1745 return is_atom_anchored(SCM_CADR(ast), atom);
1746 }
1747 if (SCM_EQ(type, SCM_SYM_ALT)) {
1748 ScmObj ap;
1749 SCM_FOR_EACH(ap, SCM_CDR(ast)) {
1750 if (!is_atom_anchored(SCM_CAR(ap), atom)) return FALSE;
1751 }
1752 return TRUE;
1753 }
1754 return FALSE;
1755 }
1756
1757 /* Aux function for is_simple_prefixed.
1758 Returns TRUE if AST is <char>, <char-set>, or (comp . <char-set>)*/
is_char_or_charset(ScmObj ast)1759 static int is_char_or_charset(ScmObj ast)
1760 {
1761 if (SCM_CHARP(ast) || SCM_CHAR_SET_P(ast)
1762 || (SCM_PAIRP(ast)
1763 && SCM_EQ(SCM_CAR(ast), SCM_SYM_COMP)
1764 && SCM_CHAR_SET_P(SCM_CDR(ast)))) {
1765 return TRUE;
1766 } else {
1767 return FALSE;
1768 }
1769 }
1770
1771 /* Returns TRUE iff ast has a form #/A+B/ where A is a char or charset,
1772 and B begins with distinct charset from A (B may be empty).
1773 After optimization, the AST begins with (rep-while 1 #f A).
1774 If so, we can greatly optimize the failure case.
1775 Suppose if we try input s against #/A+B/ and find it fail. Then
1776 we can skip prefix of s as far as it matches #/A/. */
is_simple_prefixed(ScmObj ast)1777 static int is_simple_prefixed(ScmObj ast)
1778 {
1779 if (!SCM_PAIRP(ast)) return FALSE;
1780 ScmObj car = SCM_CAR(ast);
1781 if (SCM_EQ(car, SCM_SYM_REP_WHILE)) {
1782 if (SCM_EQ(SCM_CADR(ast), SCM_MAKE_INT(1))
1783 && SCM_FALSEP(SCM_CAR(SCM_CDDR(ast)))) {
1784 ScmObj body = SCM_CDR(SCM_CDDR(ast));
1785 if (SCM_PAIRP(body) && SCM_NULLP(SCM_CDR(body))) {
1786 return is_char_or_charset(SCM_CAR(body));
1787 }
1788 }
1789 return FALSE;
1790 } else if (SCM_EQ(car, SCM_SYM_SEQ)) { /* TODO: handle uncase */
1791 if (SCM_PAIRP(SCM_CDR(ast))) {
1792 return is_simple_prefixed(SCM_CADR(ast));
1793 }
1794 return FALSE;
1795 } else if (SCM_INTP(car)) {
1796 ScmObj s = SCM_CDDR(ast);
1797 if (SCM_PAIRP(s)) return is_simple_prefixed(SCM_CAR(s));
1798 }
1799 return FALSE;
1800 }
1801
1802
1803 /* returns lookahead set. modifies the first arg. */
merge_laset(ScmObj la1,ScmObj la2)1804 static ScmObj merge_laset(ScmObj la1, ScmObj la2)
1805 {
1806 if (SCM_CHAR_SET_P(la1) && SCM_CHAR_SET_P(la2)) {
1807 return Scm_CharSetAdd(SCM_CHAR_SET(la1),
1808 SCM_CHAR_SET(la2));
1809 } else {
1810 return SCM_FALSE;
1811 }
1812 }
1813
1814 static ScmObj calculate_lasetn(ScmObj ast);
1815
1816 /* returns lookahead set. returned charset is fresh.
1817 TODO: We can also take advantage of wb and nwb condition to
1818 skip the input. */
calculate_laset(ScmObj head,ScmObj rest)1819 static ScmObj calculate_laset(ScmObj head, ScmObj rest)
1820 {
1821 if (!SCM_PAIRP(head)) {
1822 if (SCM_CHARP(head)) {
1823 return Scm_CharSetAddRange(SCM_CHAR_SET(Scm_MakeEmptyCharSet()),
1824 SCM_CHAR_VALUE(head),
1825 SCM_CHAR_VALUE(head));
1826 } else if (SCM_CHAR_SET_P(head)) {
1827 return Scm_CharSetCopy(SCM_CHAR_SET(head));
1828 }
1829 return SCM_FALSE;
1830 }
1831 ScmObj head_car = SCM_CAR(head);
1832
1833 if (SCM_EQ(head_car, SCM_SYM_COMP)) {
1834 SCM_ASSERT(SCM_CHAR_SET_P(SCM_CDR(head)));
1835 ScmObj cs = Scm_CharSetCopy(SCM_CHAR_SET(SCM_CDR(head)));
1836 return Scm_CharSetComplement(SCM_CHAR_SET(cs));
1837 } else if (SCM_EQ(head_car, SCM_SYM_SEQ)||SCM_EQ(head_car, SCM_SYM_ONCE)) {
1838 return calculate_lasetn(SCM_CDR(head));
1839 } else if (SCM_EQ(head_car, SCM_SYM_ALT)) {
1840 ScmObj choices = SCM_CDR(head);
1841 if (!SCM_PAIRP(choices)) return SCM_FALSE;
1842 ScmObj r = calculate_laset(SCM_CAR(choices), SCM_NIL);
1843 choices = SCM_CDR(choices);
1844 while (!SCM_FALSEP(r) && SCM_PAIRP(choices)) {
1845 r = merge_laset(r, calculate_laset(SCM_CAR(choices), SCM_NIL));
1846 choices = SCM_CDR(choices);
1847 }
1848 return r;
1849 } else if (SCM_EQ(head_car, SCM_SYM_REP)
1850 || SCM_EQ(head_car, SCM_SYM_REP_WHILE)
1851 || SCM_EQ(head_car, SCM_SYM_REP_MIN)) {
1852 SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)) && SCM_PAIRP(SCM_CDDR(head)));
1853 if (SCM_EQ(SCM_CADR(head), SCM_MAKE_INT(0))) {
1854 return merge_laset(calculate_lasetn(SCM_CDR(SCM_CDDR(head))),
1855 calculate_lasetn(rest));
1856 } else {
1857 return calculate_lasetn(SCM_CDR(SCM_CDDR(head)));
1858 }
1859 } else if (SCM_INTP(head_car)) {
1860 SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)));
1861 return calculate_lasetn(SCM_CDDR(head));
1862 } else {
1863 return SCM_FALSE;
1864 }
1865 }
1866
calculate_lasetn(ScmObj ast)1867 static ScmObj calculate_lasetn(ScmObj ast)
1868 {
1869 if (!SCM_PAIRP(ast)) return SCM_FALSE;
1870 else return calculate_laset(SCM_CAR(ast), SCM_CDR(ast));
1871 }
1872
1873 /* pass 3 */
rc3(regcomp_ctx * ctx,ScmObj ast)1874 static ScmObj rc3(regcomp_ctx *ctx, ScmObj ast)
1875 {
1876 /* set flags and laset */
1877 if (is_atom_anchored(ast, SCM_SYM_BOS)
1878 || (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)
1879 && is_atom_anchored(ast, SCM_SYM_BOL))) {
1880 ctx->rx->flags |= SCM_REGEXP_BOL_ANCHORED;
1881 }
1882 else if (is_simple_prefixed(ast)) ctx->rx->flags |= SCM_REGEXP_SIMPLE_PREFIX;
1883 ctx->rx->laset = calculate_laset(ast, SCM_NIL);
1884
1885 /* pass 3-1 : count # of insns */
1886 ctx->codemax = 1;
1887 ctx->emitp = FALSE;
1888 rc3_rec(ctx, ast, TRUE);
1889
1890 /* pass 3-2 : code generation */
1891 ctx->code = SCM_NEW_ATOMIC2(unsigned char *, ctx->codemax);
1892 ctx->emitp = TRUE;
1893 rc3_rec(ctx, ast, TRUE);
1894 rc3_emit(ctx, RE_SUCCESS);
1895 ctx->rx->code = ctx->code;
1896 ctx->rx->numCodes = ctx->codep;
1897
1898 ctx->rx->ast = ast;
1899 return SCM_OBJ(ctx->rx);
1900 }
1901
1902 /* For debug */
Scm_RegDump(ScmRegexp * rx)1903 void Scm_RegDump(ScmRegexp *rx)
1904 {
1905 static const char *opnames[] = {
1906 #define DEF_RE_INSN(name, _) #name,
1907 #include "gauche/regexp_insn.h"
1908 #undef DEF_RE_INSN
1909 };
1910
1911 static enum {
1912 OP_none,
1913 OP_octet,
1914 OP_string,
1915 OP_cset,
1916 OP_group,
1917 OP_offset2,
1918 OP_offset1_2,
1919 OP_offset2_2
1920 } optypes[] = {
1921 #define DEF_RE_INSN(_, optype) optype,
1922 #include "gauche/regexp_insn.h"
1923 #undef DEF_RE_INSN
1924 };
1925
1926 Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x", rx, rx->flags);
1927 if (rx->flags&SCM_REGEXP_BOL_ANCHORED)
1928 Scm_Printf(SCM_CUROUT, ",BOL_ANCHORED");
1929 if (rx->flags&SCM_REGEXP_SIMPLE_PREFIX)
1930 Scm_Printf(SCM_CUROUT, ",SIMPLE_PREFIX");
1931 Scm_Printf(SCM_CUROUT, ")\n");
1932 Scm_Printf(SCM_CUROUT, " laset = %S\n", rx->laset);
1933 Scm_Printf(SCM_CUROUT, " must = ");
1934 if (rx->mustMatch) {
1935 Scm_Printf(SCM_CUROUT, "%S\n", rx->mustMatch);
1936 } else {
1937 Scm_Printf(SCM_CUROUT, "(none)\n");
1938 }
1939
1940 int end = rx->numCodes;
1941 for (int codep = 0; codep < end; codep++) {
1942 int code = rx->code[codep];
1943 int optype = optypes[code];
1944 Scm_Printf(SCM_CUROUT, "%4d ", codep);
1945 switch (optype) {
1946 case OP_none:
1947 Scm_Printf(SCM_CUROUT, "%s\n", opnames[code]);
1948 break;
1949 case OP_octet:
1950 Scm_Printf(SCM_CUROUT, "%s 0x%02x '%c'\n",
1951 opnames[code],
1952 rx->code[codep+1],
1953 rx->code[codep+1]);
1954 codep++;
1955 break;
1956 case OP_string:
1957 {
1958 u_int numchars = (u_int)rx->code[++codep];
1959 u_int i;
1960 Scm_Printf(SCM_CUROUT, "%s(%3d) '",
1961 opnames[code], numchars);
1962 for (i=0; i< numchars; i++)
1963 Scm_Printf(SCM_CUROUT, "%c", rx->code[++codep]);
1964 Scm_Printf(SCM_CUROUT, "'\n");
1965 }
1966 break;
1967 case OP_cset:
1968 Scm_Printf(SCM_CUROUT, "%s %d %S\n",
1969 opnames[code],
1970 rx->code[codep+1],
1971 rx->sets[rx->code[codep+1]]);
1972 codep++;
1973 break;
1974 case OP_group:
1975 Scm_Printf(SCM_CUROUT, "%s %d\n",
1976 opnames[code],
1977 rx->code[codep+1]);
1978 codep++;
1979 break;
1980 case OP_offset2:
1981 Scm_Printf(SCM_CUROUT, "%s %d\n",
1982 opnames[code],
1983 (rx->code[codep+1])*256 + rx->code[codep+2]);
1984 codep += 2;
1985 break;
1986 case OP_offset1_2:
1987 Scm_Printf(SCM_CUROUT, "%s %d %d\n",
1988 opnames[code],
1989 rx->code[codep+1],
1990 rx->code[codep+2]*256 + rx->code[codep+3]);
1991 codep += 3;
1992 break;
1993 case OP_offset2_2:
1994 Scm_Printf(SCM_CUROUT, "%s %d %d\n",
1995 opnames[code],
1996 rx->code[codep+1]*256 + rx->code[codep+2],
1997 rx->code[codep+3]*256 + rx->code[codep+4]);
1998 codep += 4;
1999 break;
2000 }
2001 }
2002 }
2003
2004 /* Helper routine to be used for compilation from AST.
2005 Traverses AST to reorder groups and collect charsets.
2006 Note that the native regcomp path doesn't use these fns.
2007 Only the AST provided from outside is processed. */
2008 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq);
2009
rc_setup_context(regcomp_ctx * ctx,ScmObj ast)2010 static ScmObj rc_setup_context(regcomp_ctx *ctx, ScmObj ast)
2011 {
2012 if (!SCM_PAIRP(ast)) {
2013 if (SCM_CHARP(ast)) return ast;
2014 if (SCM_CHAR_SET_P(ast)) {
2015 rc_register_charset(ctx, SCM_CHAR_SET(ast));
2016 return ast;
2017 }
2018 if (SCM_EQ(ast, SCM_SYM_BOS) || SCM_EQ(ast, SCM_SYM_EOS)
2019 || SCM_EQ(ast, SCM_SYM_BOL) || SCM_EQ(ast, SCM_SYM_EOL)
2020 || SCM_EQ(ast, SCM_SYM_WB) || SCM_EQ(ast, SCM_SYM_NWB)
2021 || SCM_EQ(ast, SCM_SYM_BOW) || SCM_EQ(ast, SCM_SYM_EOW)
2022 || SCM_EQ(ast, SCM_SYM_BOG) || SCM_EQ(ast, SCM_SYM_EOG)
2023 || SCM_EQ(ast, SCM_SYM_ANY)) {
2024 return ast;
2025 }
2026 goto badast;
2027 }
2028 ScmObj type = SCM_CAR(ast);
2029 if (SCM_INTP(type)) {
2030 int grpno = ctx->grpcount++;
2031 ScmObj prevno = type, name = SCM_CADR(ast), body = SCM_CDDR(ast);
2032 ScmObj rest = rc_setup_context_seq(ctx, body);
2033 if (SCM_SYMBOLP(name)) {
2034 ctx->rx->grpNames = Scm_Acons(name, SCM_MAKE_INT(grpno),
2035 ctx->rx->grpNames);
2036 }
2037 if (SCM_INT_VALUE(prevno) == grpno && SCM_EQ(body, rest)) {
2038 return ast;
2039 } else {
2040 return Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(name, rest));
2041 }
2042 }
2043 if (SCM_EQ(type, SCM_SYM_COMP)) {
2044 if (!SCM_CHAR_SET_P(SCM_CDR(ast))) goto badast;
2045 rc_register_charset(ctx, SCM_CHAR_SET(SCM_CDR(ast)));
2046 return ast;
2047 }
2048 if (SCM_EQ(type, SCM_SYM_BACKREF)) {
2049 if (!SCM_INTP(SCM_CDR(ast))) goto badast;
2050 return ast;
2051 }
2052 if (SCM_EQ(type, SCM_SYM_CPAT)) {
2053 if (!SCM_PAIRP(SCM_CDR(ast))
2054 || !SCM_PAIRP(SCM_CDDR(ast))
2055 || !SCM_PAIRP(SCM_CDR(SCM_CDDR(ast)))
2056 || !SCM_NULLP(SCM_CDDR(SCM_CDDR(ast))))
2057 goto badast;
2058 ScmObj cond = SCM_CADR(ast);
2059 ScmObj then = SCM_CAR(SCM_CDDR(ast));
2060 ScmObj alt = SCM_CADR(SCM_CDDR(ast));
2061 if (SCM_PAIRP(cond)) {
2062 if (!SCM_EQ(SCM_CAR(cond), SCM_SYM_ASSERT)
2063 && !SCM_EQ(SCM_CAR(cond), SCM_SYM_NASSERT)) goto badast;
2064 cond = rc_setup_context(ctx, cond);
2065 } else if (!SCM_INTP(cond)) {
2066 goto badast;
2067 }
2068 then = rc_setup_context_seq(ctx, then);
2069 if (!SCM_FALSEP(alt))
2070 alt = rc_setup_context_seq(ctx, alt);
2071 if (SCM_EQ(cond, SCM_CADR(ast))
2072 && SCM_EQ(then, SCM_CAR(SCM_CDDR(ast)))
2073 && SCM_EQ(alt, SCM_CADR(SCM_CDDR(ast)))) return ast;
2074 else return SCM_LIST4(type, cond, then, alt);
2075 }
2076 if (SCM_EQ(type, SCM_SYM_SEQ) || SCM_EQ(type, SCM_SYM_ALT)
2077 || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)
2078 || SCM_EQ(type, SCM_SYM_ONCE) || SCM_EQ(type, SCM_SYM_LOOKBEHIND)
2079 || SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)) {
2080 ScmObj rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
2081 if (SCM_EQ(SCM_CDR(ast), rest)) return ast;
2082 else return Scm_Cons(type, rest);
2083 }
2084 if (SCM_EQ(type, SCM_SYM_REP_WHILE) || SCM_EQ(type, SCM_SYM_REP)
2085 || SCM_EQ(type, SCM_SYM_REP_MIN)) {
2086 if (!SCM_PAIRP(SCM_CDR(ast)) || !SCM_PAIRP(SCM_CDDR(ast)))
2087 goto badast;
2088 ScmObj m = SCM_CADR(ast);
2089 ScmObj n = SCM_CAR(SCM_CDDR(ast));
2090 ScmObj item = SCM_CDR(SCM_CDDR(ast));
2091 if (!SCM_INTP(m) || SCM_INT_VALUE(m) < 0) goto badast;
2092 if (!SCM_FALSEP(n) && (!SCM_INTP(n) || SCM_INT_VALUE(m) < 0))
2093 goto badast;
2094 ScmObj rest = rc_setup_context_seq(ctx, item);
2095 if (SCM_EQ(item, rest)) return ast;
2096 else return SCM_LIST4(type, m, n, rest);
2097 }
2098 badast:
2099 Scm_Error("invalid regexp AST: %S", ast);
2100 return SCM_UNDEFINED; /* dummy */
2101 }
2102
rc_setup_context_seq(regcomp_ctx * ctx,ScmObj seq)2103 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq)
2104 {
2105 ScmObj sp, sp2, obj = SCM_NIL, head = SCM_NIL, tail = SCM_NIL;
2106 SCM_FOR_EACH(sp, seq) {
2107 obj = rc_setup_context(ctx, SCM_CAR(sp));
2108 if (!SCM_EQ(obj, SCM_CAR(sp))) break;
2109 }
2110 if (SCM_NULLP(sp)) return seq;
2111 /* we need to copy the spine */
2112 SCM_FOR_EACH(sp2, seq) {
2113 if (SCM_EQ(sp2, sp)) break;
2114 SCM_APPEND1(head, tail, SCM_CAR(sp2));
2115 }
2116 SCM_APPEND1(head, tail, obj);
2117 SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
2118 SCM_APPEND1(head, tail, rc_setup_context(ctx, SCM_CAR(sp2)));
2119 }
2120 return head;
2121 }
2122
2123 /*--------------------------------------------------------------
2124 * Compiler entry point
2125 */
Scm_RegComp(ScmString * pattern,int flags)2126 ScmObj Scm_RegComp(ScmString *pattern, int flags)
2127 {
2128 if (SCM_STRING_INCOMPLETE_P(pattern)) {
2129 Scm_Error("incomplete string is not allowed: %S", pattern);
2130 }
2131
2132 ScmRegexp *rx = make_regexp();
2133 regcomp_ctx cctx;
2134 rc_ctx_init(&cctx, rx, pattern);
2135 cctx.casefoldp = flags & SCM_REGEXP_CASE_FOLD;
2136 rx->flags |= (flags & SCM_REGEXP_CASE_FOLD);
2137 rx->flags |= (flags & SCM_REGEXP_MULTI_LINE);
2138
2139 /* pass 1 : parse regexp spec */
2140 ScmObj ast = rc1(&cctx);
2141 rc_setup_charsets(rx, &cctx);
2142 if (flags & SCM_REGEXP_PARSE_ONLY) return ast;
2143
2144 /* pass 2 : optimization */
2145 ast = rc2_optimize(ast, SCM_NIL);
2146
2147 /* pass 3 : generate bytecode */
2148 return rc3(&cctx, ast);
2149 }
2150
2151 /* alternative entry that compiles from AST */
2152 #if GAUCHE_API_VERSION >= 1000
Scm_RegCompFromAST(ScmObj ast,int flags)2153 ScmObj Scm_RegCompFromAST(ScmObj ast, int flags)
2154 #else /* GAUCHE_API_VERSION < 1000 */
2155 ScmObj Scm_RegCompFromAST(ScmObj ast)
2156 {
2157 return Scm_RegCompFromAST2(ast, 0);
2158 }
2159
2160 ScmObj Scm_RegCompFromAST2(ScmObj ast, int flags)
2161 #endif /* GAUCHE_API_VERSION < 1000 */
2162 {
2163 ScmRegexp *rx = make_regexp();
2164 regcomp_ctx cctx;
2165 rc_ctx_init(&cctx, rx, NULL);
2166 rx->flags |= (flags & SCM_REGEXP_MULTI_LINE);
2167
2168 /* prepare some context */
2169 if (!SCM_PAIRP(ast) || !SCM_INTP(SCM_CAR(ast))) {
2170 /* ensure the entire AST is in a group #0 */
2171 ast = SCM_LIST3(SCM_MAKE_INT(0), SCM_FALSE, ast);
2172 }
2173 ast = rc_setup_context(&cctx, ast);
2174 rc_setup_charsets(rx, &cctx);
2175 rx->numGroups = cctx.grpcount;
2176
2177 /* pass 3 */
2178 return rc3(&cctx, ast);
2179 }
2180
2181 /*=======================================================================
2182 * Matcher
2183 */
2184
2185 /* For now, I use C-stack directly to keep information for backtrack,
2186 * i.e. anytime I should try something I recursively call rex_rec().
2187 * It may run out the stack space if regexp requires deep recursion.
2188 *
2189 * Rex_rec doesn't return as long as match succeeds. At the end of
2190 * code, it longjmp's to the start of matcher.
2191 *
2192 * My preliminary test showed that using C-stack & longjmp is faster than
2193 * allocating and maintaining the stack by myself. Further test is required
2194 * for practical case, though.
2195 */
2196
2197 struct match_ctx {
2198 ScmRegexp *rx;
2199 const unsigned char *codehead; /* start of code */
2200 const char *input; /* start of input */
2201 const char *stop; /* end of input */
2202 const char *last;
2203 struct ScmRegMatchSub **matches;
2204 void *begin_stack; /* C stack pointer the match began from. */
2205 sigjmp_buf *cont;
2206 ScmObj grapheme_predicate;
2207 };
2208
2209 #define MAX_STACK_USAGE 0x100000
2210
match_ci(const char ** input,const unsigned char ** code,int length)2211 static int match_ci(const char **input, const unsigned char **code, int length)
2212 {
2213 do {
2214 ScmChar inch, c;
2215 SCM_CHAR_GET(*input, inch);
2216 int csize = SCM_CHAR_NBYTES(inch);
2217 *input += csize;
2218 SCM_CHAR_GET(*code, c);
2219 *code += SCM_CHAR_NBYTES(c);
2220 if (Scm_CharDowncase(inch) != c)
2221 return FALSE;
2222 length -= csize;
2223 } while (length > 0);
2224 return TRUE;
2225 }
2226
2227 /* Check if input points to the word boundary. For now, I consider
2228 all multibyte chars word-constituent. */
is_word_constituent(unsigned char b)2229 static int is_word_constituent(unsigned char b)
2230 {
2231 if (b >= 128) return TRUE;
2232 if (b >= '0' && b <= '9') return TRUE;
2233 if (b >= 'A' && b <= 'Z') return TRUE;
2234 if (b >= 'a' && b <= 'z') return TRUE;
2235 return FALSE;
2236 }
2237
is_word_boundary(struct match_ctx * ctx,const char * input,unsigned int code)2238 static int is_word_boundary(struct match_ctx *ctx, const char *input, unsigned int code)
2239 {
2240 const char *prevp;
2241
2242 if ((code == RE_BOW || code == RE_WB) && input == ctx->input) return TRUE;
2243 if ((code == RE_EOW || code == RE_WB) && input == ctx->stop) return TRUE;
2244 unsigned char nextb = (unsigned char)*input;
2245 SCM_CHAR_BACKWARD(input, ctx->input, prevp);
2246 SCM_ASSERT(prevp != NULL);
2247 unsigned char prevb = (unsigned char)*prevp;
2248 if ((code == RE_BOW || code == RE_WB)
2249 && is_word_constituent(nextb) && !is_word_constituent(prevb)) {
2250 return TRUE;
2251 }
2252 if ((code == RE_EOW || code == RE_WB)
2253 && !is_word_constituent(nextb) && is_word_constituent(prevb)) {
2254 return TRUE;
2255 }
2256 return FALSE;
2257 }
2258
is_beginning_of_line(struct match_ctx * ctx,const char * input)2259 static int is_beginning_of_line(struct match_ctx *ctx, const char *input)
2260 {
2261 if (input == ctx->input) return TRUE;
2262 if (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) return FALSE;
2263
2264 const char *prevp;
2265 SCM_CHAR_BACKWARD(input, ctx->input, prevp);
2266 SCM_ASSERT(prevp != NULL);
2267
2268 unsigned char prevb = (unsigned char)*prevp;
2269 if (prevb == '\n' || prevb == '\r') return TRUE;
2270
2271 return FALSE;
2272 }
2273
is_end_of_line(struct match_ctx * ctx,const char * input)2274 static int is_end_of_line(struct match_ctx *ctx, const char *input)
2275 {
2276 if (input == ctx->stop) return TRUE;
2277 if (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) return FALSE;
2278
2279 unsigned char nextb = (unsigned char)*input;
2280 if (nextb == '\n' || nextb == '\r') return TRUE;
2281
2282 return FALSE;
2283 }
2284
is_grapheme_boundary(struct match_ctx * ctx,const char * input,unsigned int code)2285 static int is_grapheme_boundary(struct match_ctx *ctx,
2286 const char *input,
2287 unsigned int code)
2288 {
2289 if (input == ctx->input) return code == RE_BOG;
2290 if (input == ctx->stop) return code == RE_EOG;
2291
2292 if (ctx->grapheme_predicate == SCM_UNDEFINED) {
2293 ScmObj make_predicate = SCM_UNDEFINED;
2294 /*
2295 * So far only gauche.regexp.sre can produce AST grapheme
2296 * nodes, so we're sure it's already loaded.
2297 */
2298 SCM_BIND_PROC(make_predicate, "make-grapheme-predicate",
2299 SCM_FIND_MODULE("gauche.regexp.sre", 0));
2300
2301 ScmObj str = Scm_MakeString(ctx->input,
2302 ctx->stop - ctx->input, -1,
2303 SCM_STRING_IMMUTABLE);
2304 ctx->grapheme_predicate = Scm_ApplyRec1(make_predicate, str);
2305 }
2306
2307 /*
2308 * Using small cursors essentially puts a limit on input string's
2309 * length. But regex should not be run on very long strings to
2310 * begin with, so this should be fine.
2311 */
2312 ScmObj cursor = SCM_MAKE_STRING_CURSOR_SMALL(input - ctx->input);
2313 ScmObj result = Scm_ApplyRec1(ctx->grapheme_predicate, cursor);
2314 return !SCM_FALSEP(result);
2315 }
2316
rex_rec(const unsigned char * code,const char * input,struct match_ctx * ctx)2317 static void rex_rec(const unsigned char *code,
2318 const char *input,
2319 struct match_ctx *ctx)
2320 {
2321 int param;
2322 ScmChar ch;
2323 ScmCharSet *cset;
2324 const char *bpos;
2325
2326 /* TODO: here we assume C-stack grows downward; need to check by
2327 configure */
2328 if ((char*)&cset < (char*)ctx->begin_stack - MAX_STACK_USAGE) {
2329 Scm_Error("Ran out of stack during matching regexp %S. "
2330 "Too many retries?", ctx->rx);
2331 }
2332
2333 for (;;) {
2334 switch(*code++) {
2335 case RE_MATCH:
2336 param = *code++;
2337 if (ctx->stop - input < param) return;
2338 while (param-- > 0) {
2339 if (*code++ != (unsigned char)*input++) return;
2340 }
2341 continue;
2342 case RE_MATCH_RL:
2343 param = *code++;
2344 if (input - param < ctx->input) return;
2345 bpos = input = input - param;
2346 while (param-- > 0) {
2347 if (*code++ != (unsigned char)*bpos++) return;
2348 }
2349 continue;
2350 case RE_MATCH1:
2351 if (ctx->stop == input) return;
2352 if (*code++ != (unsigned char)*input++) return;
2353 continue;
2354 case RE_MATCH1_RL:
2355 if (ctx->input == input) return;
2356 if (*code++ != (unsigned char)*--input) return;
2357 continue;
2358 case RE_MATCH_CI:
2359 param = *code++;
2360 if (ctx->stop - input < param) return;
2361 if (!match_ci(&input, &code, param)) return;
2362 continue;
2363 case RE_MATCH_CI_RL:
2364 param = *code++;
2365 if (input - param < ctx->input) return;
2366 bpos = input = input - param;
2367 if (!match_ci(&bpos, &code, param)) return;
2368 continue;
2369 case RE_MATCH1_CI:
2370 if (ctx->stop == input) return;
2371 param = (unsigned char)*input++;
2372 if (SCM_CHAR_NFOLLOWS(param)!=0
2373 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
2374 return;
2375 }
2376 continue;
2377 case RE_MATCH1_CI_RL:
2378 if (ctx->input == input) return;
2379 param = (unsigned char)*--input;
2380 if (SCM_CHAR_NFOLLOWS(param)!=0
2381 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
2382 return;
2383 }
2384 continue;
2385 case RE_ANY:
2386 if (ctx->stop == input) return;
2387 input += SCM_CHAR_NFOLLOWS(*input) + 1;
2388 continue;
2389 case RE_ANY_RL:
2390 if (ctx->input == input) return;
2391 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2392 input = bpos;
2393 continue;
2394 case RE_TRY:
2395 rex_rec(code+2, input, ctx);
2396 code = ctx->codehead + code[0]*256 + code[1];
2397 continue;
2398 case RE_JUMP:
2399 code = ctx->codehead + code[0]*256 + code[1];
2400 continue;
2401 case RE_SET1:
2402 if (ctx->stop == input) return;
2403 if ((unsigned char)*input >= 128) return;
2404 if (!Scm_CharSetContains(ctx->rx->sets[*code++], *input)) return;
2405 input++;
2406 continue;
2407 case RE_SET1_RL:
2408 if (ctx->input == input) return;
2409 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2410 if ((unsigned char)*bpos >= 128) return;
2411 if (!Scm_CharSetContains(ctx->rx->sets[*code++], *bpos)) return;
2412 input = bpos;
2413 continue;
2414 case RE_NSET1:
2415 if (ctx->stop == input) return;
2416 if ((unsigned char)*input < 128) {
2417 if (Scm_CharSetContains(ctx->rx->sets[*code++], *input))
2418 return;
2419 input++;
2420 } else {
2421 code++;
2422 input += SCM_CHAR_NFOLLOWS((unsigned char)*input) + 1;
2423 }
2424 continue;
2425 case RE_NSET1_RL:
2426 if (ctx->input == input) return;
2427 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2428 if ((unsigned char)*bpos < 128) {
2429 if (Scm_CharSetContains(ctx->rx->sets[*code++], *bpos))
2430 return;
2431 }
2432 input = bpos;
2433 continue;
2434 case RE_SET:
2435 if (ctx->stop == input) return;
2436 SCM_CHAR_GET(input, ch);
2437 cset = ctx->rx->sets[*code++];
2438 if (!Scm_CharSetContains(cset, ch)) return;
2439 input += SCM_CHAR_NBYTES(ch);
2440 continue;
2441 case RE_SET_RL:
2442 if (ctx->input == input) return;
2443 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2444 SCM_CHAR_GET(bpos, ch);
2445 cset = ctx->rx->sets[*code++];
2446 if (!Scm_CharSetContains(cset, ch)) return;
2447 input = bpos;
2448 continue;
2449 case RE_NSET:
2450 if (ctx->stop == input) return;
2451 SCM_CHAR_GET(input, ch);
2452 cset = ctx->rx->sets[*code++];
2453 if (Scm_CharSetContains(cset, ch)) return;
2454 input += SCM_CHAR_NBYTES(ch);
2455 continue;
2456 case RE_NSET_RL:
2457 if (ctx->input == input) return;
2458 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2459 SCM_CHAR_GET(bpos, ch);
2460 cset = ctx->rx->sets[*code++];
2461 if (Scm_CharSetContains(cset, ch)) return;
2462 input = bpos;
2463 continue;
2464 case RE_BEGIN: {
2465 int grpno = *code++;
2466 const char *opos = ctx->matches[grpno]->startp;
2467 const char *oend = ctx->matches[grpno]->endp;
2468 ctx->matches[grpno]->startp = input;
2469 rex_rec(code, input, ctx);
2470 ctx->matches[grpno]->startp = opos;
2471 ctx->matches[grpno]->endp = oend;
2472 return;
2473 }
2474 case RE_BEGIN_RL: {
2475 int grpno = *code++;
2476 const char *opos = ctx->matches[grpno]->endp;
2477 ctx->matches[grpno]->endp = input;
2478 rex_rec(code, input, ctx);
2479 ctx->matches[grpno]->endp = opos;
2480 return;
2481 }
2482 case RE_END: {
2483 int grpno = *code++;
2484 ctx->matches[grpno]->endp = input;
2485 continue;
2486 }
2487 case RE_END_RL: {
2488 int grpno = *code++;
2489 ctx->matches[grpno]->startp = input;
2490 continue;
2491 }
2492 case RE_BOS:
2493 if (input != ctx->input) return;
2494 continue;
2495 case RE_EOS:
2496 if (input != ctx->stop) return;
2497 continue;
2498 case RE_BOL:
2499 if (!is_beginning_of_line(ctx, input)) return;
2500 continue;
2501 case RE_EOL:
2502 if (!is_end_of_line(ctx, input)) return;
2503 continue;
2504 case RE_WB: case RE_BOW: case RE_EOW:
2505 if (!is_word_boundary(ctx, input, code[-1])) return;
2506 continue;
2507 case RE_NWB:
2508 if (is_word_boundary(ctx, input, RE_WB)) return;
2509 continue;
2510 case RE_BOG: case RE_EOG:
2511 if (!is_grapheme_boundary(ctx, input, code[-1])) return;
2512 continue;
2513 case RE_SUCCESS:
2514 ctx->last = input;
2515 siglongjmp(*ctx->cont, 1);
2516 /*NOTREACHED*/
2517 case RE_FAIL:
2518 return;
2519 case RE_SET1R:
2520 cset = ctx->rx->sets[*code++];
2521 for (;;) {
2522 if (ctx->stop <= input) break;
2523 if ((unsigned char)*input >= 128) break;
2524 if (!Scm_CharSetContains(cset, *input)) break;
2525 input++;
2526 }
2527 continue;
2528 case RE_SET1R_RL:
2529 cset = ctx->rx->sets[*code++];
2530 for (;;) {
2531 if (input == ctx->input) break;
2532 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2533 if ((unsigned char)*bpos >= 128) break;
2534 if (!Scm_CharSetContains(cset, *bpos)) break;
2535 input = bpos;
2536 }
2537 continue;
2538 case RE_NSET1R:
2539 cset = ctx->rx->sets[*code++];
2540 for (;;) {
2541 if (ctx->stop <= input) break;
2542 if ((unsigned char)*input < 128 ) {
2543 if (Scm_CharSetContains(cset, *input)) break;
2544 input++;
2545 } else {
2546 input+=SCM_CHAR_NFOLLOWS(*input)+1;
2547 }
2548 }
2549 continue;
2550 case RE_NSET1R_RL:
2551 cset = ctx->rx->sets[*code++];
2552 for (;;) {
2553 if (ctx->input == input) break;
2554 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2555 if ((unsigned char)*bpos < 128) {
2556 if (Scm_CharSetContains(cset, *bpos)) break;
2557 }
2558 input = bpos;
2559 }
2560 continue;
2561 case RE_SETR:
2562 cset = ctx->rx->sets[*code++];
2563 for (;;) {
2564 if (ctx->stop <= input) break;
2565 SCM_CHAR_GET(input, ch);
2566 if (!Scm_CharSetContains(cset, ch)) break;
2567 input += SCM_CHAR_NBYTES(ch);
2568 }
2569 continue;
2570 case RE_SETR_RL:
2571 cset = ctx->rx->sets[*code++];
2572 for (;;) {
2573 if (ctx->input == input) break;
2574 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2575 SCM_CHAR_GET(bpos, ch);
2576 if (!Scm_CharSetContains(cset, ch)) break;
2577 input = bpos;
2578 }
2579 continue;
2580 case RE_NSETR:
2581 cset = ctx->rx->sets[*code++];
2582 for (;;) {
2583 if (ctx->stop <= input) break;
2584 SCM_CHAR_GET(input, ch);
2585 if (Scm_CharSetContains(cset, ch)) break;
2586 input += SCM_CHAR_NBYTES(ch);
2587 }
2588 continue;
2589 case RE_NSETR_RL:
2590 cset = ctx->rx->sets[*code++];
2591 for (;;) {
2592 if (ctx->input == input) break;
2593 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2594 SCM_CHAR_GET(bpos, ch);
2595 if (Scm_CharSetContains(cset, ch)) break;
2596 input = bpos;
2597 }
2598 continue;
2599 case RE_MATCH1R:
2600 for (;;) {
2601 if (ctx->stop <= input) break;
2602 if ((unsigned char)*input >= 128) break;
2603 if (*code != (unsigned char)*input) break;
2604 input++;
2605 }
2606 code++;
2607 continue;
2608 case RE_MATCHR:
2609 param = *code++;
2610 for (;;) {
2611 if (ctx->stop < input + param) break;
2612 const unsigned char *str = code;
2613 const unsigned char *ip = (const unsigned char*)input;
2614 for (int i = 0; i < param; i++) {
2615 if (*str++ != (unsigned char)*ip++) goto matchr_out;
2616 }
2617 input = (const char *)ip;
2618 }
2619 matchr_out:
2620 code += param;
2621 continue;
2622 case RE_ANYR:
2623 for (;;) {
2624 if (ctx->stop <= input) break;
2625 input += SCM_CHAR_NFOLLOWS(*input) + 1;
2626 }
2627 continue;
2628 case RE_CPAT: {
2629 int grpno = *code++;
2630 if (ctx->matches[grpno]->startp) code += 2;
2631 else code = ctx->codehead + code[0]*256 + code[1];
2632 continue;
2633 }
2634 case RE_CPATA: {
2635 sigjmp_buf cont, *ocont = ctx->cont;
2636 ctx->cont = &cont;
2637 if (sigsetjmp(cont, FALSE) == 0) {
2638 rex_rec(code+4, input, ctx);
2639 ctx->cont = ocont;
2640 code = ctx->codehead + code[2]*256 + code[3];
2641 continue;
2642 }
2643 code = ctx->codehead + code[0]*256 + code[1];
2644 ctx->cont = ocont;
2645 continue;
2646 }
2647 case RE_BACKREF: {
2648 int grpno = *code++;
2649 const char *match = ctx->matches[grpno]->startp;
2650 const char *end = ctx->matches[grpno]->endp;
2651 if (!match || !end) return;
2652 while (match < end) {
2653 if (*input++ != *match++) return;
2654 }
2655 continue;
2656 }
2657 case RE_BACKREF_RL: {
2658 int grpno = *code++, len;
2659 const char *match = ctx->matches[grpno]->startp;
2660 const char *end = ctx->matches[grpno]->endp;
2661 if (!match || !end) return;
2662 len = (int)(end - match);
2663 if (input - len < ctx->input) return;
2664 bpos = input = input - len;
2665 while (len-- > 0) {
2666 if (*match++ != (unsigned char)*bpos++) return;
2667 }
2668 continue;
2669 }
2670 case RE_BACKREF_CI: {
2671 int grpno = *code++;
2672 const char *match = ctx->matches[grpno]->startp;
2673 const char *end = ctx->matches[grpno]->endp;
2674 int i = 0;
2675 ScmChar cx, cy;
2676 if (!match || !end) return;
2677 while (match+i < end) {
2678 if (input == ctx->stop) return;
2679 SCM_CHAR_GET(input+i, cx);
2680 SCM_CHAR_GET(match+i, cy);
2681 if (SCM_CHAR_UPCASE(cx) != SCM_CHAR_UPCASE(cy))
2682 return;
2683 i += SCM_CHAR_NBYTES(cx);
2684 }
2685 input += i;
2686 continue;
2687 }
2688 case RE_BACKREF_CI_RL: {
2689 int grpno = *code++, i = 0, len;
2690 const char *match = ctx->matches[grpno]->startp;
2691 const char *end = ctx->matches[grpno]->endp;
2692 ScmChar cx, cy;
2693 if (!match || !end) return;
2694
2695 len = (int)(end - match);
2696 if (input - len < ctx->input) return;
2697 bpos = input = input - len;
2698 while (match+i < end) {
2699 if (bpos == ctx->stop) return;
2700 SCM_CHAR_GET(bpos+i, cx);
2701 SCM_CHAR_GET(match+i, cy);
2702 if (SCM_CHAR_UPCASE(cx) != SCM_CHAR_UPCASE(cy))
2703 return;
2704 i += SCM_CHAR_NBYTES(cx);
2705 }
2706 continue;
2707 }
2708 case RE_ONCE: case RE_ASSERT: {
2709 sigjmp_buf cont, *ocont = ctx->cont;
2710 ctx->cont = &cont;
2711 if (sigsetjmp(cont, FALSE) == 0) {
2712 rex_rec(code+2, input, ctx);
2713 ctx->cont = ocont;
2714 return;
2715 }
2716 if (code[-1] == RE_ONCE) input = ctx->last;
2717 code = ctx->codehead + code[0]*256 + code[1];
2718 ctx->cont = ocont;
2719 continue;
2720 }
2721 case RE_NASSERT: {
2722 sigjmp_buf cont, *ocont = ctx->cont;
2723 ctx->cont = &cont;
2724 if (sigsetjmp(cont, FALSE) == 0) {
2725 rex_rec(code+2, input, ctx);
2726 code = ctx->codehead + code[0]*256 + code[1];
2727 ctx->cont = ocont;
2728 continue;
2729 }
2730 ctx->cont = ocont;
2731 return;
2732 }
2733 default:
2734 /* shouldn't be here */
2735 Scm_Error("regexp implementation seems broken");
2736 }
2737 }
2738 }
2739
make_match(ScmRegexp * rx,ScmString * orig,struct match_ctx * ctx)2740 static ScmObj make_match(ScmRegexp *rx, ScmString *orig,
2741 struct match_ctx *ctx)
2742 {
2743 ScmRegMatch *rm = SCM_NEW(ScmRegMatch);
2744 SCM_SET_CLASS(rm, SCM_CLASS_REGMATCH);
2745 rm->numMatches = rx->numGroups;
2746 rm->grpNames = rx->grpNames;
2747 /* we keep information of original string separately, instead of
2748 keeping a pointer to orig; For orig may be destructively modified,
2749 but its elements are not. */
2750 const ScmStringBody *origb = SCM_STRING_BODY(orig);
2751 rm->input = SCM_STRING_BODY_START(origb);
2752 rm->inputLen = SCM_STRING_BODY_LENGTH(origb);
2753 rm->inputSize = SCM_STRING_BODY_SIZE(origb);
2754 rm->matches = ctx->matches;
2755 return SCM_OBJ(rm);
2756 }
2757
rex(ScmRegexp * rx,ScmString * orig,const char * orig_start,const char * start,const char * end)2758 static ScmObj rex(ScmRegexp *rx, ScmString *orig,
2759 const char *orig_start,
2760 const char *start, const char *end)
2761 {
2762 struct match_ctx ctx;
2763 sigjmp_buf cont;
2764
2765 ctx.rx = rx;
2766 ctx.codehead = rx->code;
2767 ctx.input = orig_start;
2768 ctx.stop = end;
2769 ctx.begin_stack = (void*)&ctx;
2770 ctx.cont = &cont;
2771 ctx.matches = SCM_NEW_ARRAY(struct ScmRegMatchSub *, rx->numGroups);
2772 ctx.grapheme_predicate = SCM_UNDEFINED;
2773
2774 for (int i = 0; i < rx->numGroups; i++) {
2775 ctx.matches[i] = SCM_NEW(struct ScmRegMatchSub);
2776 ctx.matches[i]->start = -1;
2777 ctx.matches[i]->length = -1;
2778 ctx.matches[i]->after = -1;
2779 ctx.matches[i]->startp = NULL;
2780 ctx.matches[i]->endp = NULL;
2781 }
2782
2783 if (sigsetjmp(cont, FALSE) == 0) {
2784 rex_rec(ctx.codehead, start, &ctx);
2785 return SCM_FALSE;
2786 }
2787 return make_match(rx, orig, &ctx);
2788 }
2789
2790 /* advance start pointer while the character matches (skip_match=TRUE) or does
2791 not match (skip_match=FALSE), until start pointer hits limit. */
skip_input(const char * start,const char * limit,ScmObj laset,int skip_match)2792 static inline const char *skip_input(const char *start, const char *limit,
2793 ScmObj laset, int skip_match)
2794 {
2795 while (start <= limit) {
2796 ScmChar ch;
2797 SCM_CHAR_GET(start, ch);
2798 if (Scm_CharSetContains(SCM_CHAR_SET(laset), ch)) {
2799 if (!skip_match) return start;
2800 } else {
2801 if (skip_match) return start;
2802 }
2803 start += SCM_CHAR_NFOLLOWS(*start)+1;
2804 }
2805 return limit;
2806 }
2807
2808 /*----------------------------------------------------------------------
2809 * entry point
2810 */
Scm_RegExec(ScmRegexp * rx,ScmString * str,ScmObj start_scm,ScmObj end_scm)2811 ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str, ScmObj start_scm, ScmObj end_scm)
2812 {
2813 const ScmStringBody *b = SCM_STRING_BODY(str);
2814 const char *orig_start = SCM_STRING_BODY_START(b);
2815 const char *start;
2816 const char *end;
2817 const ScmStringBody *mb = rx->mustMatch? SCM_STRING_BODY(rx->mustMatch) : NULL;
2818 int mustMatchLen = mb? SCM_STRING_BODY_SIZE(mb) : 0;
2819 const char *start_limit;
2820
2821 if (SCM_STRING_INCOMPLETE_P(str)) {
2822 Scm_Error("incomplete string is not allowed: %S", str);
2823 }
2824 if (!SCM_UNBOUNDP(start_scm) && !SCM_UNDEFINEDP(start_scm)) {
2825 if (!SCM_INTEGERP(start_scm)) {
2826 Scm_TypeError("start", "exact integer required but got %S", start_scm);
2827 }
2828 int value = Scm_GetInteger(start_scm);
2829 if (value < 0 || value >= SCM_STRING_BODY_LENGTH(b)) {
2830 Scm_Error("invalid start parameter: %S", start_scm);
2831 }
2832 while (value--) {
2833 orig_start += SCM_CHAR_NFOLLOWS(*orig_start) + 1;
2834 }
2835 }
2836 start = orig_start;
2837 end = SCM_STRING_BODY_START(b);
2838 if (!SCM_UNBOUNDP(end_scm) && !SCM_UNDEFINEDP(end_scm)) {
2839 if (!SCM_INTEGERP(end_scm)) {
2840 Scm_TypeError("end", "exact integer required but got %S", end_scm);
2841 }
2842 int value = Scm_GetInteger(end_scm);
2843 if (value < 0 || value > SCM_STRING_BODY_LENGTH(b)) {
2844 Scm_Error("invalid end parameter: %S", end_scm);
2845 }
2846 while (value--) {
2847 end += SCM_CHAR_NFOLLOWS(*end) + 1;
2848 }
2849 if (end < start) {
2850 Scm_Error("invalid end parameter: %S", end_scm);
2851 }
2852 } else {
2853 end += SCM_STRING_BODY_SIZE(b);
2854 }
2855 start_limit = end - mustMatchLen;
2856 #if 0
2857 /* Disabled for now; we need to use more heuristics to determine
2858 when we should apply mustMatch. For example, if the regexp
2859 begins with BOL assertion and constant string, then it would be
2860 faster to go for rex directly. */
2861 if (rx->mustMatch) {
2862 /* Prescreening. If the input string doesn't contain mustMatch
2863 string, it can't match the entire expression. */
2864 if (SCM_FALSEP(Scm_StringScan(str, rx->mustMatch,
2865 SCM_STRING_SCAN_INDEX))) {
2866 return SCM_FALSE;
2867 }
2868 }
2869 #endif
2870 /* short cut : if rx matches only at the beginning of the string,
2871 we only run from the beginning of the string */
2872 if (rx->flags & SCM_REGEXP_BOL_ANCHORED) {
2873 return rex(rx, str, orig_start, start, end);
2874 }
2875
2876 /* if we have lookahead-set, we may be able to skip input efficiently. */
2877 if (!SCM_FALSEP(rx->laset)) {
2878 if (rx->flags & SCM_REGEXP_SIMPLE_PREFIX) {
2879 while (start <= start_limit) {
2880 ScmObj r = rex(rx, str, orig_start, start, end);
2881 if (!SCM_FALSEP(r)) return r;
2882 const char *next = skip_input(start, start_limit, rx->laset,
2883 TRUE);
2884 if (start != next) start = next;
2885 else start = next + SCM_CHAR_NFOLLOWS(*start) + 1;
2886 }
2887 } else {
2888 while (start <= start_limit) {
2889 start = skip_input(start, start_limit, rx->laset, FALSE);
2890 ScmObj r = rex(rx, str, orig_start, start, end);
2891 if (!SCM_FALSEP(r)) return r;
2892 start += SCM_CHAR_NFOLLOWS(*start)+1;
2893 }
2894 }
2895 return SCM_FALSE;
2896 }
2897
2898 /* normal matching */
2899 while (start <= start_limit) {
2900 ScmObj r = rex(rx, str, orig_start, start, end);
2901 if (!SCM_FALSEP(r)) return r;
2902 start += SCM_CHAR_NFOLLOWS(*start)+1;
2903 }
2904 return SCM_FALSE;
2905 }
2906
2907 /*=======================================================================
2908 * Retrieving matches
2909 */
2910
2911 /* We calculate string length and position (in characters) lazily.
2912 * The match routine only sets ScmRegMatchSub's startp and endp
2913 * fields, and leaves start, length, and after fields -1.
2914 * When the submatch is retrieved we calculate values of those fields.
2915 *
2916 * Note that, even such retrieval functions mutate the state of
2917 * submatch objects, we don't need mutex to avoid race condition
2918 * in MT environment. The state transition is one way (-1 to a
2919 * fixed value) and idempotent, so there's no problem if more than
2920 * one thread try to change the fields.
2921 *
2922 * The three parameters, start, length, and after, indicates the
2923 * # of characters. Character counting is expensive, so we try
2924 * to avoid calling Scm_MBLen as much as possible. If other two
2925 * values are known, we just subtract them from the inputLen.
2926 *
2927 * |<-------original string------------>|
2928 * | |<---matched substr -->| |
2929 * | | | |
2930 * |<---->|<-------------------->|<---->|
2931 * |start length after|
2932 * |<---------------------------------->|
2933 * inputLen
2934 */
2935
2936 /* We want to avoid unnecessary character counting as much as
2937 possible. */
2938
2939 #define MSUB_BEFORE_SIZE(rm, sub) ((int)((sub)->startp - (rm)->input))
2940 #define MSUB_SIZE(rm, sub) ((int)((sub)->endp - (sub)->startp))
2941 #define MSUB_AFTER_SIZE(rm, sub) ((int)((rm)->input + (rm)->inputSize - (sub)->endp))
2942
2943 #define MSUB_BEFORE_LENGTH(rm, sub) \
2944 Scm_MBLen((rm)->input, (sub)->startp)
2945 #define MSUB_LENGTH(rm, sub) \
2946 Scm_MBLen((sub)->startp, (sub)->endp)
2947 #define MSUB_AFTER_LENGTH(rm, sub) \
2948 Scm_MBLen((sub)->endp, (rm)->input + (rm)->inputSize)
2949
2950 #define UNCOUNTED(rm, sub) \
2951 (((sub)->start >= 0 ? 0 : MSUB_BEFORE_SIZE(rm, sub)) \
2952 + ((sub)->length >= 0 ? 0 : MSUB_SIZE(rm, sub)) \
2953 + ((sub)->after >= 0 ? 0 : MSUB_AFTER_SIZE(rm, sub)))
2954
regmatch_count_start(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2955 static void regmatch_count_start(ScmRegMatch *rm,
2956 struct ScmRegMatchSub *sub)
2957 {
2958 if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2959 sub->start = MSUB_BEFORE_SIZE(rm, sub);
2960 } else if (UNCOUNTED(rm, sub) / 2 > MSUB_BEFORE_SIZE(rm, sub)) {
2961 sub->start = MSUB_BEFORE_LENGTH(rm, sub);
2962 } else {
2963 if (sub->length < 0) sub->length = MSUB_LENGTH(rm, sub);
2964 if (sub->after < 0) sub->after = MSUB_AFTER_LENGTH(rm, sub);
2965 sub->start = rm->inputLen - sub->after - sub->length;
2966 }
2967 }
2968
regmatch_count_length(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2969 static void regmatch_count_length(ScmRegMatch *rm,
2970 struct ScmRegMatchSub *sub)
2971 {
2972 if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2973 sub->length = MSUB_SIZE(rm, sub);
2974 } else if (UNCOUNTED(rm, sub) / 2 > MSUB_SIZE(rm, sub)) {
2975 sub->length = MSUB_LENGTH(rm, sub);
2976 } else {
2977 if (sub->start < 0) sub->start = MSUB_BEFORE_LENGTH(rm, sub);
2978 if (sub->after < 0) sub->after = MSUB_AFTER_LENGTH(rm, sub);
2979 sub->length = rm->inputLen - sub->start - sub->after;
2980 }
2981 }
2982
regmatch_count_after(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2983 static void regmatch_count_after(ScmRegMatch *rm,
2984 struct ScmRegMatchSub *sub)
2985 {
2986 if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2987 sub->after = MSUB_AFTER_SIZE(rm, sub);
2988 } else if (UNCOUNTED(rm, sub) / 2 > MSUB_AFTER_SIZE(rm, sub)) {
2989 sub->after = MSUB_AFTER_LENGTH(rm, sub);
2990 } else {
2991 if (sub->start < 0) sub->start = MSUB_BEFORE_LENGTH(rm, sub);
2992 if (sub->length < 0) sub->length = MSUB_LENGTH(rm, sub);
2993 sub->after = rm->inputLen - sub->start - sub->length;
2994 }
2995 }
2996
regmatch_ref(ScmRegMatch * rm,ScmObj obj)2997 static struct ScmRegMatchSub *regmatch_ref(ScmRegMatch *rm, ScmObj obj)
2998 {
2999 struct ScmRegMatchSub *sub = NULL;
3000 if (SCM_INTP(obj)) {
3001 int i = SCM_INT_VALUE(obj);
3002 if (i < 0 || i >= rm->numMatches)
3003 Scm_Error("submatch index out of range: %d", i);
3004 sub = rm->matches[i];
3005 if (!sub->startp || !sub->endp) return NULL;
3006 return sub;
3007 }
3008 if (SCM_SYMBOLP(obj)) {
3009 ScmObj ep;
3010 SCM_FOR_EACH(ep, rm->grpNames) {
3011 if (!SCM_EQ(obj, SCM_CAAR(ep))) continue;
3012 sub = rm->matches[SCM_INT_VALUE(SCM_CDAR(ep))];
3013 if (!sub->startp || !sub->endp) continue;
3014 return sub;
3015 }
3016 if (sub != NULL) {
3017 if (sub->startp && sub->endp) return sub;
3018 else return NULL;
3019 }
3020 Scm_Error("named submatch not found: %S", obj);
3021 }
3022 Scm_Error("integer or symbol expected, but got %S", obj);
3023 return NULL; /* dummy */
3024 }
3025
Scm_RegMatchStart(ScmRegMatch * rm,ScmObj obj)3026 ScmObj Scm_RegMatchStart(ScmRegMatch *rm, ScmObj obj)
3027 {
3028 struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3029 if (sub == NULL) return SCM_FALSE;
3030 if (sub->start < 0) regmatch_count_start(rm, sub);
3031 return Scm_MakeInteger(sub->start);
3032 }
3033
Scm_RegMatchEnd(ScmRegMatch * rm,ScmObj obj)3034 ScmObj Scm_RegMatchEnd(ScmRegMatch *rm, ScmObj obj)
3035 {
3036 struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3037 if (sub == NULL) return SCM_FALSE;
3038 if (sub->after < 0) regmatch_count_after(rm, sub);
3039 return Scm_MakeInteger(rm->inputLen - sub->after);
3040 }
3041
Scm_RegMatchBefore(ScmRegMatch * rm,ScmObj obj)3042 ScmObj Scm_RegMatchBefore(ScmRegMatch *rm, ScmObj obj)
3043 {
3044 struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3045 if (sub == NULL) return SCM_FALSE;
3046 if (sub->start < 0) regmatch_count_start(rm, sub);
3047 return Scm_MakeString(rm->input, MSUB_BEFORE_SIZE(rm, sub),
3048 sub->start, 0);
3049 }
3050
Scm_RegMatchSubstr(ScmRegMatch * rm,ScmObj obj)3051 ScmObj Scm_RegMatchSubstr(ScmRegMatch *rm, ScmObj obj)
3052 {
3053 struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3054 if (sub == NULL) return SCM_FALSE;
3055 if (sub->length < 0) regmatch_count_length(rm, sub);
3056 return Scm_MakeString(sub->startp, MSUB_SIZE(rm, sub),
3057 sub->length, 0);
3058 }
3059
Scm_RegMatchAfter(ScmRegMatch * rm,ScmObj obj)3060 ScmObj Scm_RegMatchAfter(ScmRegMatch *rm, ScmObj obj)
3061 {
3062 struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3063 if (sub == NULL) return SCM_FALSE;
3064 if (sub->after < 0) regmatch_count_after(rm, sub);
3065 return Scm_MakeString(sub->endp, MSUB_AFTER_SIZE(rm, sub),
3066 sub->after, 0);
3067 }
3068
3069 /* for debug */
Scm_RegMatchDump(ScmRegMatch * rm)3070 void Scm_RegMatchDump(ScmRegMatch *rm)
3071 {
3072 Scm_Printf(SCM_CUROUT, "RegMatch %p\n", rm);
3073 Scm_Printf(SCM_CUROUT, " numMatches = %d\n", rm->numMatches);
3074 Scm_Printf(SCM_CUROUT, " input = %S\n", rm->input);
3075 for (int i=0; i<rm->numMatches; i++) {
3076 struct ScmRegMatchSub *sub = rm->matches[i];
3077 if (sub->startp) {
3078 Scm_Printf(SCM_CUROUT, "[%3d-%3d] %S\n",
3079 sub->startp - rm->input,
3080 sub->endp - rm->input,
3081 Scm_MakeString(sub->startp,
3082 (int)(sub->endp-sub->startp),
3083 -1, 0));
3084 } else {
3085 Scm_Printf(SCM_CUROUT, "[---] #f\n");
3086 }
3087 }
3088 }
3089
3090 /*=======================================================================
3091 * Initializing stuff
3092 */
3093
Scm__InitRegexp(void)3094 void Scm__InitRegexp(void)
3095 {
3096 }
3097