1 /*
2  * regexp.c - regular expression
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *   Copyright (c) 2019-2020  Duy Nguyen <pclouds@gmail.com>
6  *   Copyright (c) 2006 Rui Ueyama, All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *   1. Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  *   3. Neither the name of the authors nor the names of its contributors
20  *      may be used to endorse or promote products derived from this
21  *      software without specific prior written permission.
22  *
23  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 #include <setjmp.h>
37 #include <ctype.h>
38 #define LIBGAUCHE_BODY
39 #include "gauche.h"
40 #include "gauche/regexp.h"
41 #include "gauche/class.h"
42 #include "gauche/priv/builtin-syms.h"
43 #include "gauche/priv/charP.h"
44 #include "gauche/priv/stringP.h"
45 
46 /* I don't like to reinvent wheels, so I looked for a regexp implementation
47  * that can handle multibyte encodings and not bound to Unicode.
48  * Without assuming Unicode it'll be difficult to define character classes
49  * correctly, but there are domains that you don't want to do native
50  * charset <-> UTF-8 each time for regexp match, trading correctness of
51  * character classes.
52  *
53  * The most recent version of famous Henry Spencer's regex is found in Tcl
54  * 8.3, that supports wide characters (the state machine seems to work
55  * with UCS-4, but the internal tables seem to be set up for UCS-2 only).
56  * Tcl does UTF-8 <-> UCS-2 conversion in order to do regexp match.
57  *
58  * Lots of variants of Spencer's old regex code is floating around, such
59  * as http://arglist.com/regex/ and the one in BSD.   They don't support
60  * multibyte strings, as far as I know.
61  *
62  * Another popular package is PCRE.  PCRE 3.4 has UTF-8 support, but only
63  * experimentally.
64  *
65  * None seems to satisfy my criteria.
66  *
67  * So I reluctantly started to write my own.  I don't think I can beat
68  * those guys, and am willing to grab someone's code anytime if it's suitable
69  * for my purpose and under a license like BSD one.
70  */
71 
72 /*
73  * The idea here is to match string without converting mb <-> char as
74  * much as possible.  Actually, the conversion is done only when we see
75  * large character sets.
76  *
77  * The engine is a sort of NFA, by keeping state information for backtrack
78  * in C stack.  It'll bust the C stack if you try to match something like
79  * (..)* with a long input string (there's a code to check the stack size
80  * and aborts matching when the recursion goes too deep).
81  * A possible fix is to check if recursion level exceeds some limit,
82  * then save the C stack into heap (as in the C-stack-copying continuation
83  * does) and reuse the stack area.
84  */
85 
86 /* Instructions.  `RL' suffix indicates that the instruction moves the
87    current position pointer right to left.  These instructions are
88    used within lookbehind assertion. */
89 enum {
90 #define DEF_RE_INSN(name, _) SCM_CPP_CAT(RE_, name),
91 #include "gauche/regexp_insn.h"
92 #undef DEF_RE_INSN
93     RE_NUM_INSN
94 };
95 
96 /* maximum # of {n,m}-type limited repeat count */
97 #define MAX_LIMITED_REPEAT 255
98 
99 /* internal regexp flag. */
100 #define SCM_REGEXP_BOL_ANCHORED   (1L<<2) /* The regexp beginning is anchored
101                                              by ^.*/
102 #define SCM_REGEXP_SIMPLE_PREFIX  (1L<<3) /* The regexp begins with a repeating
103                                              character or charset, e.g. #/a+b/.
104                                              See is_simple_prefixed() below. */
105 
106 /* AST - the first pass of regexp compiler creates intermediate AST.
107  * Alternatively, you can provide AST directly to the regexp compiler,
108  * using Scm_RegCompFromAST().
109  *
110  *  <ast> : (<element> ...)
111  *
112  *  <element> : <clause>   ; special clause
113  *         | <item>        ; matches <item>
114  *
115  *  <item> : <char>       ; matches char
116  *         | <char-set>   ; matches char set
117  *         | (comp . <char-set>) ; matches complement of char set
118  *         | any          ; matches any char
119  *         | bos | eos    ; beginning/end of string assertion
120  *         | bol | eol    ; beginning/end of line assertion
121  *         | bow | eow | wb | nwb ; word-boundary/negative word boundary assertion
122  *
123  *  <clause> : (seq . <ast>)       ; sequence
124  *         | (seq-uncase . <ast>)  ; sequence (case insensitive match)
125  *         | (seq-case . <ast>)    ; sequence (case sensitive match)
126  *         | (alt . <ast>)         ; alternative
127  *         | (rep <m> <n> . <ast>) ; repetition at least <m> up to <n> (greedy)
128  *                                 ; <n> may be `#f'
129  *         | (rep-min <m> <n> . <ast>)
130  *                                 ; repetition at least <m> up to <n> (lazy)
131  *                                 ; <n> may be `#f'
132  *         | (rep-while <m> <n> . <ast>)
133  *                                 ; like rep, but no backtrack
134  *         | (<integer> <symbol> . <ast>)
135  *                                 ; capturing group.  <symbol> may be #f.
136  *         | (cpat <condition> (<ast>) (<ast>))
137  *                                 ; conditional expression
138  *         | (backref . <integer>) ; backreference by group number
139  *         | (backref . <symbol>)  ; backreference by name
140  *         | (once . <ast>)        ; standalone pattern.  no backtrack
141  *         | (assert . <asst>)     ; positive lookahead assertion
142  *         | (nassert . <asst>)    ; negative lookahead assertion
143  *
144  *  <condition> : <integer>     ; (?(1)yes|no) style conditional expression
145  *         | (assert . <asst>)  ; (?(?=condition)...) or (?(?<=condition)...)
146  *         | (nassert . <asst>) ; (?(?!condition)...) or (?(?<!condition)...)
147  *
148  *  <asst> : <ast>
149  *         | (lookbehind . <ast>)
150  *
151  * For seq-uncase, items inside <ast> has to be prepared for case-insensitive
152  * match, i.e. chars have to be downcased and char-sets have to be
153  * case-folded.
154  */
155 
156 /* NB: regexp printer is defined in libobj.scm */
157 static int  regexp_compare(ScmObj x, ScmObj y, int equalp);
158 
159 SCM_DEFINE_BUILTIN_CLASS(Scm_RegexpClass,
160                          NULL, regexp_compare, NULL, NULL,
161                          SCM_CLASS_DEFAULT_CPL);
162 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_RegMatchClass, NULL);
163 
make_regexp(void)164 static ScmRegexp *make_regexp(void)
165 {
166     ScmRegexp *rx = SCM_NEW(ScmRegexp);
167     SCM_SET_CLASS(rx, SCM_CLASS_REGEXP);
168     rx->code = NULL;
169     rx->numCodes = 0;
170     rx->numGroups = 0;
171     rx->numSets = 0;
172     rx->sets = NULL;
173     rx->grpNames = SCM_NIL;
174     rx->mustMatch = NULL;
175     rx->flags = 0;
176     rx->pattern = SCM_FALSE;
177     rx->ast = SCM_FALSE;
178     return rx;
179 }
180 
regexp_compare(ScmObj x,ScmObj y,int equalp)181 static int regexp_compare(ScmObj x, ScmObj y, int equalp)
182 {
183     if (!equalp) {
184         Scm_Error("cannot compare regexps: %S and %S", x, y);
185     }
186     ScmRegexp *rx = SCM_REGEXP(x);
187     ScmRegexp *ry = SCM_REGEXP(y);
188 
189     if ((rx->numCodes != ry->numCodes)
190         || (rx->numGroups != ry->numGroups)
191         || (rx->numSets != ry->numSets)
192         || !Scm_EqualP(rx->grpNames, ry->grpNames)
193         || (rx->flags != ry->flags)) {
194         return 1;
195     } else {
196         /* we compare bytecode. */
197         for (int i=0; i<rx->numCodes; i++) {
198             if (rx->code[i] != ry->code[i]) return 1;
199         }
200         for (int i=0; i<rx->numSets; i++) {
201             if (rx->sets[i] == ry->sets[i]) continue;
202             if (!Scm_CharSetEq(rx->sets[i], ry->sets[i])) return 1;
203         }
204     }
205     return 0;
206 }
207 
208 #ifndef CHAR_MAX
209 #define CHAR_MAX 256
210 #endif
211 
212 #define REGEXP_OFFSET_MAX 65535
213 
214 /*=======================================================================
215  * Compiler
216  */
217 
218 /* 3-pass compiler.
219  *
220  *  pass 1: parses the pattern and creates an AST.
221  *  pass 2: optimize on AST.
222  *  pass 3: byte code generation.
223  */
224 
225 /* compiler state information */
226 typedef struct regcomp_ctx_rec {
227     ScmRegexp *rx;              /* the building regexp */
228     ScmObj pattern;             /* original pattern or AST for diag msg */
229     int casefoldp;              /* TRUE if case-folding match */
230     int lookbehindp;            /* TRUE if lookbehind assertion match */
231     ScmPort *ipat;              /* [pass1] string port for pattern */
232     ScmObj sets;                /* [pass1] list of charsets */
233     int grpcount;               /* [pass1] group count */
234     unsigned char *code;        /* [pass3] code being built */
235     int codep;                  /* [pass3] front of code generation */
236     int emitp;                  /* [pass3] am I generating code? */
237     int codemax;                /* [pass3] max codep */
238 } regcomp_ctx;
239 
240 /* If we run from pass1, input string should be passed to PATTERN.
241    If we run from pass2, PATTERN must be NULL and we take AST from RX. */
rc_ctx_init(regcomp_ctx * ctx,ScmRegexp * rx,ScmString * pattern)242 static void rc_ctx_init(regcomp_ctx *ctx,
243                         ScmRegexp *rx,
244                         ScmString *pattern)
245 {
246     ctx->rx = rx;
247     if (pattern) {
248         ctx->pattern = Scm_CopyStringWithFlags(pattern,
249                                                SCM_STRING_IMMUTABLE,
250                                                SCM_STRING_IMMUTABLE);
251         ctx->ipat = SCM_PORT(Scm_MakeInputStringPort(pattern, FALSE));
252     } else {
253         ctx->pattern = rx->ast;
254         ctx->ipat = NULL;
255     }
256     ctx->casefoldp = FALSE;
257     ctx->lookbehindp = FALSE;
258     ctx->sets = SCM_NIL;
259     ctx->grpcount = 0;
260     ctx->code = NULL;
261     ctx->codep = 0;
262     ctx->emitp = FALSE;
263     ctx->codemax = 1;
264 }
265 
266 static ScmObj rc_charset(regcomp_ctx *ctx);
267 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs);
268 static ScmObj rc1_rep(regcomp_ctx *ctx, ScmObj greedy,
269                       ScmObj lazy, ScmObj atom);
270 static ScmObj rc1_read_integer(regcomp_ctx *ctx);
271 static ScmObj rc1_group_name(regcomp_ctx *ctx);
272 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx);
273 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx);
274 static ScmObj rc1_lex_xdigits(ScmPort *port, int key);
275 
276 /*----------------------------------------------------------------
277  * pass1 - parser
278  */
279 
280 /* EBNF Syntax of Gauche's regexp.
281  * This is a rough sketch.  The condition of BOL/EOL ("^" and "$"), for
282  * example, has to be dealt with context information.
283  * To follow the convention, "{" and "}" token that don't appear to
284  * consist of the "limited repetition" syntax are regarded as literal
285  * characters.
286  *
287  *  <re>   :
288  *         | <re> <alt>
289  *
290  *  <alt>  :
291  *         | <item>
292  *         | <alt> "|" <item>
293  *
294  *  <item> : <atom> "*"
295  *         | <atom> "+"
296  *         | <atom> "?"
297  *         | <atom> "{" <n> ("," <m>?)? "}"
298  *         | <atom> "*?"
299  *         | <atom> "+?"
300  *         | <atom> "??"
301  *         | <atom> "{" <n> ("," <m>?)? "}?"
302  *         | <atom> "*+"
303  *         | <atom> "++"
304  *         | <atom> "?+"
305  *         | <atom>
306  *
307  *  <atom> : a normal char, an escaped char, or a char-set
308  *         | "\\" <integer>     ;; backreference
309  *         | "\\k<" <string> ">" ;; backreference to named group
310  *         | "(" <re> ")"       ;; grouping w/  capturing
311  *         | "(?:"   <re> ")"   ;; grouping w/o capturing
312  *         | "(?i:"  <re> ")"   ;; grouping w/o capturing (case insensitive)
313  *         | "(?-i:" <re> ")"   ;; grouping w/o capturing (case sensitive)
314  *         | "(?<" <string> ">" <re> ")" ;; named capturing group
315  *         | "(?>"   <re> ")"   ;; standalone pattern
316  *         | "(?="   <re> ")"   ;; positive lookahead assertion
317  *         | "(?!"   <re> ")"   ;; negative lookahead assertion
318  *         | "(?<="  <re> ")"   ;; positive lookbehind assertion
319  *         | "(?<!"  <re> ")"   ;; negative lookbehind assertion
320  *         | "(?("cond")"yes-pattern"|"no-pattern")"
321  *         | "(?("cond")"yes-pattern")" ;; conditional pattern
322  *
323  */
324 /* TODO: It'd be nicer to have a dedicated condition to throw a parse error. */
325 
326 /* Lexer */
rc1_lex(regcomp_ctx * ctx)327 static ScmObj rc1_lex(regcomp_ctx *ctx)
328 {
329     ScmObj cs;
330 
331     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
332     if (ch == SCM_CHAR_INVALID) return SCM_EOF;
333     switch (ch) {
334     case '(': return rc1_lex_open_paren(ctx);
335     case ')': return SCM_SYM_CLOSE_PAREN;
336     case '|': return SCM_SYM_ALT;
337     case '^': return SCM_SYM_BOL;
338     case '.': return SCM_SYM_ANY;
339     case '$': return SCM_SYM_EOL;
340     case '[': return rc_charset(ctx);
341     case '{': return rc1_lex_minmax(ctx);
342     case '+': return rc1_rep(ctx, SCM_SYM_PLUS, SCM_SYM_PLUSQ, SCM_SYM_PLUSP);
343     case '*': return rc1_rep(ctx, SCM_SYM_STAR, SCM_SYM_STARQ, SCM_SYM_STARP);
344     case '?': return rc1_rep(ctx, SCM_SYM_QUESTION, SCM_SYM_QUESTIONQ, SCM_SYM_QUESTIONP);
345     case '\\':
346         ch = Scm_GetcUnsafe(ctx->ipat);
347         if (ch == SCM_CHAR_INVALID) {
348             Scm_Error("stray backslash at the end of pattern: %S",
349                       ctx->pattern);
350         }
351         switch (ch) {
352         case 'a': return SCM_MAKE_CHAR(0x07);
353         case 'n': return SCM_MAKE_CHAR('\n');
354         case 'r': return SCM_MAKE_CHAR('\r');
355         case 't': return SCM_MAKE_CHAR('\t');
356         case 'f': return SCM_MAKE_CHAR('\f');
357         case 'e': return SCM_MAKE_CHAR(0x1b);
358         case 'b': return SCM_SYM_WB;
359         case 'B': return SCM_SYM_NWB;
360         case 'x': case 'u': case 'U':
361             return rc1_lex_xdigits(ctx->ipat, ch);
362         case 'd':
363             cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_DIGIT);
364             rc_register_charset(ctx, SCM_CHAR_SET(cs));
365             return cs;
366         case 'D':
367             cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_DIGIT);
368             rc_register_charset(ctx, SCM_CHAR_SET(cs));
369             return cs;
370         case 'w':
371             cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WORD);
372             rc_register_charset(ctx, SCM_CHAR_SET(cs));
373             return cs;
374         case 'W':
375             cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WORD);
376             rc_register_charset(ctx, SCM_CHAR_SET(cs));
377             return cs;
378         case 's':
379             cs = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WHITESPACE);
380             rc_register_charset(ctx, SCM_CHAR_SET(cs));
381             return cs;
382         case 'S':
383             cs = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WHITESPACE);
384             rc_register_charset(ctx, SCM_CHAR_SET(cs));
385             return cs;
386         case 'p': case 'P':
387             cs = Scm_GetStandardCharSet(Scm_CharSetParseCategory(ctx->ipat, ch));
388 
389             rc_register_charset(ctx, SCM_CHAR_SET(cs));
390             return cs;
391         case '0': case '1': case '2': case '3': case '4':
392         case '5': case '6': case '7': case '8': case '9':
393             Scm_UngetcUnsafe(ch, ctx->ipat);
394             return Scm_Cons(SCM_SYM_BACKREF, rc1_read_integer(ctx));
395         case 'k':
396             if (Scm_GetcUnsafe(ctx->ipat) != '<') {
397                 Scm_Error("\\k must be followed by '<': %S", ctx->pattern);
398             }
399             ScmObj name = rc1_group_name(ctx);
400             if (SCM_FALSEP(name)) {
401                 Scm_Error("malformed backreference found in regexp %S", ctx->pattern);
402             }
403             return Scm_Cons(SCM_SYM_BACKREF, name);
404         }
405         /*FALLTHROUGH*/
406     default:
407         if (ctx->casefoldp) ch = SCM_CHAR_DOWNCASE(ch);
408         return SCM_MAKE_CHAR(ch);
409     }
410     /*NOTREACHED*/
411 }
412 
413 /* Read \x, \u, \U escape sequence in the regexp spec.
414    This may read-ahead some hexdigit characters.  In such case, it
415    returns SEQ node. */
rc1_lex_xdigits(ScmPort * port,int key)416 static ScmObj rc1_lex_xdigits(ScmPort *port, int key)
417 {
418     ScmDString buf;
419 
420     Scm_DStringInit(&buf);
421     ScmObj bad = Scm_ReadXdigitsFromPort(port, key, 0, FALSE, &buf);
422     if (SCM_STRINGP(bad)) {
423         /* skip chars to the end of regexp, so that the reader will read
424            after the erroneous string */
425         for (;;) {
426             int c;
427             SCM_GETC(c, port);
428             if (c == EOF || c == '/') break;
429             if (c == '\\') SCM_GETC(c, port);
430         }
431         /* construct an error message */
432         Scm_ReadError(port,
433                       "Bad '\\%c' escape sequence in a regexp literal: \\%c%A",
434                       key, key, bad);
435         return SCM_UNDEFINED;   /* dummy */
436     } else {
437         ScmSmallInt size, len;
438         const char *chars = Scm_DStringPeek(&buf, &size, &len);
439         if (len == 1) {
440             ScmChar ch;
441             SCM_CHAR_GET(chars, ch);
442             return SCM_MAKE_CHAR(ch);
443         } else {
444             ScmObj h = SCM_NIL, t = SCM_NIL;
445             SCM_APPEND1(h, t, SCM_SYM_SEQ);
446             while (len-- > 0) {
447                 ScmChar ch;
448                 SCM_CHAR_GET(chars, ch);
449                 chars += SCM_CHAR_NBYTES(ch);
450                 SCM_APPEND1(h, t, SCM_MAKE_CHAR(ch));
451             }
452             return h;
453         }
454     }
455 }
456 
457 /* Called after '+', '*' or '?' is read, and check if there's a
458    following '?' (lazy quantifier) or '+' (atomic expression) */
rc1_rep(regcomp_ctx * ctx,ScmObj greedy,ScmObj lazy,ScmObj atom)459 static ScmObj rc1_rep(regcomp_ctx *ctx, ScmObj greedy,
460                       ScmObj lazy, ScmObj atom)
461 {
462     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
463     if (ch == '?') return lazy;
464     if (ch == '+') return atom;
465     Scm_UngetcUnsafe(ch, ctx->ipat);
466     return greedy;
467 }
468 
469 /* Reads '('-sequence - either one of "(", "(?:", "(?i:", "(?-i:",
470    "(?=", "(?<=", "(?<!", "(?<name>", "(?(condition)|).  The leading
471    "(" has already been read. */
rc1_lex_open_paren(regcomp_ctx * ctx)472 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx)
473 {
474     ScmObj pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
475     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
476     if (ch != '?') {
477         Scm_UngetcUnsafe(ch, ctx->ipat);
478         return SCM_SYM_OPEN_PAREN;
479     }
480     ch = Scm_GetcUnsafe(ctx->ipat);
481     switch (ch) {
482     case ':': return SCM_SYM_SEQ;
483     case '>': return SCM_SYM_ONCE;
484     case '=': return SCM_SYM_ASSERT;
485     case '!': return SCM_SYM_NASSERT;
486     case '(': return SCM_SYM_CPAT;
487     case '<': {
488         ch = Scm_GetcUnsafe(ctx->ipat);
489         if (ch == '=') return SCM_SYM_LOOKBEHIND;
490         if (ch == '!') return SCM_SYM_NLOOKBEHIND;
491         Scm_UngetcUnsafe(ch, ctx->ipat);
492         ScmObj name = rc1_group_name(ctx);
493         if (!SCM_FALSEP(name)) return Scm_Cons(SCM_SYM_REG, name);
494         break;
495     }
496     case 'i':
497         ch = Scm_GetcUnsafe(ctx->ipat);
498         if (ch == ':') return SCM_SYM_SEQ_UNCASE;
499         break;
500     case '-':
501         ch = Scm_GetcUnsafe(ctx->ipat);
502         if (ch == 'i') {
503             ch = Scm_GetcUnsafe(ctx->ipat);
504             if (ch == ':') return SCM_SYM_SEQ_CASE;
505         }
506         break;
507     }
508     /* fail. */
509     Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
510     return SCM_SYM_OPEN_PAREN;
511 }
512 
rc1_read_integer(regcomp_ctx * ctx)513 static ScmObj rc1_read_integer(regcomp_ctx *ctx)
514 {
515     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
516     if (!isdigit(ch)) {
517         Scm_Error("number expected, but got '%c'", ch);
518     }
519     ScmDString ds;
520     Scm_DStringInit(&ds);
521     do {
522         Scm_DStringPutc(&ds, ch);
523         ch = Scm_GetcUnsafe(ctx->ipat);
524     } while (ch != SCM_CHAR_INVALID && isdigit(ch));
525     if (ch != SCM_CHAR_INVALID) {
526         Scm_UngetcUnsafe(ch, ctx->ipat);
527     }
528     ScmObj r = Scm_StringToNumber(SCM_STRING(Scm_DStringGet(&ds, 0)), 10, 0);
529     if (SCM_BIGNUMP(r)) {
530         Scm_Error("number too big: %S", r);
531     }
532     SCM_ASSERT(SCM_INTP(r));
533     return r;
534 }
535 
rc1_group_name(regcomp_ctx * ctx)536 static ScmObj rc1_group_name(regcomp_ctx *ctx)
537 {
538     ScmDString ds;
539     Scm_DStringInit(&ds);
540 
541     for (;;) {
542         ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
543         if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
544         if (ch == '>') {
545             return Scm_Intern(SCM_STRING(Scm_DStringGet(&ds, 0)));
546         }
547         if (ch == '\\') {
548             ch = Scm_GetcUnsafe(ctx->ipat);
549             if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
550             /* fall through */
551         }
552         Scm_DStringPutc(&ds, ch);
553     }
554     return SCM_UNDEFINED;       /* dummy */
555 }
556 
557 /* Reads {n,m}-type repeat syntax.  The leading "{" has been read.
558    If the character sequence doesn't consist of valid syntax, rollback
559    to the ordinary character sequence.
560    If successfully parsed, returns (rep <n> . <m>) where
561     <n> == <m> if the pattern is "{n}" (exact count), or
562     <m> == #f if the pattern is "{n,}" (minimum count), or
563     <m> == integer if the pattern is "{n,m}" (limited count).
564    If the pattern is followed by '?', rep-min is used instead.
565  */
rc1_lex_minmax(regcomp_ctx * ctx)566 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx)
567 {
568     int rep_min = -1, rep_max = -1, exact = FALSE;
569     ScmObj type = SCM_SYM_REP; /* default is greedy */
570     ScmObj pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
571 
572     for (;;) {
573         int ch = Scm_GetcUnsafe(ctx->ipat);
574         if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
575             if (rep_min < 0) {
576                 rep_min = (ch - '0');
577             } else {
578                 rep_min = rep_min*10 + (ch - '0');
579             }
580         } else if (ch == ',') {
581             /* NB: The following line makes us to treat {,m} as {0,m}.
582                Oniguruma does so.  Perl doesn't.  Strictly speaking they're
583                incompatible to each other (if Perl code /a{,3}/ expects
584                to match "a{,3}", it needs to be written as /a\{,3}/ in
585                Oniguruma).   Let's take Oniguruma now. */
586             if (rep_min < 0) rep_min = 0;
587             break;
588         } else if (ch == '}') {
589             exact = TRUE;
590             break;
591         } else {
592             goto bad_min_max;
593         }
594     }
595     if (rep_min < 0) goto bad_min_max;
596     if (rep_min > MAX_LIMITED_REPEAT) goto out_of_range;
597     if (!exact) {
598         for (;;) {
599             int ch = Scm_GetcUnsafe(ctx->ipat);
600             if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
601                 if (rep_max < 0) {
602                     rep_max = (ch - '0');
603                 } else {
604                     rep_max = rep_max*10 + (ch - '0');
605                 }
606             } else if (ch == '}') {
607                 break;
608             } else {
609                 goto bad_min_max;
610             }
611         }
612         if (rep_max > MAX_LIMITED_REPEAT) goto out_of_range;
613         if (rep_max >= 0 && rep_max < rep_min) {
614             Scm_Error("{n,m}-syntax requires n <= m: %S", ctx->pattern);
615         }
616     }
617 
618     ScmObj m;
619     if (exact)            m = SCM_MAKE_INT(rep_min);
620     else if (rep_max < 0) m = SCM_FALSE;
621     else                  m = SCM_MAKE_INT(rep_max);
622 
623     int ch = Scm_GetcUnsafe(ctx->ipat);
624     if (ch == '?') type = SCM_SYM_REP_MIN;
625     else Scm_UngetcUnsafe(ch, ctx->ipat);
626     return Scm_Cons(type, Scm_Cons(SCM_MAKE_INT(rep_min), m));
627 
628   out_of_range:
629     Scm_Error("{n,m}-syntax can accept up to %d count: %S",
630               MAX_LIMITED_REPEAT, ctx->pattern);
631     /*NOTREACHED*/
632   bad_min_max:
633     /* back up */
634     Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
635     return SCM_MAKE_CHAR('{');
636 }
637 
rc1_fold_alts(regcomp_ctx * ctx SCM_UNUSED,ScmObj alts)638 static ScmObj rc1_fold_alts(regcomp_ctx *ctx SCM_UNUSED, ScmObj alts)
639 {
640     ScmObj r = SCM_NIL, ap;
641     SCM_FOR_EACH(ap, alts) {
642         ScmObj alt = SCM_CAR(ap);
643         if (SCM_PAIRP(alt) && SCM_NULLP(SCM_CDR(alt))) {
644             r = Scm_Cons(SCM_CAR(alt), r);
645         } else {
646             r = Scm_Cons(Scm_Cons(SCM_SYM_SEQ, alt), r);
647         }
648     }
649     return Scm_Cons(SCM_SYM_ALT, r);
650 }
651 
652 static ScmObj rc1_parse(regcomp_ctx *, int, ScmObj);
653 
rc1_lex_conditional_pattern(regcomp_ctx * ctx,int bolp,ScmObj grps)654 static ScmObj rc1_lex_conditional_pattern(regcomp_ctx *ctx, int bolp,
655                                           ScmObj grps)
656 {
657     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
658     if (ch == SCM_CHAR_INVALID)
659         goto error;
660     if (isdigit(ch)) {
661         Scm_UngetcUnsafe(ch, ctx->ipat);
662         ScmObj r = rc1_read_integer(ctx);
663         if (!SCM_EQ(rc1_lex(ctx), SCM_SYM_CLOSE_PAREN))
664             goto error;
665         return r;
666     }
667     if (ch == '?') {
668         ch = Scm_GetcUnsafe(ctx->ipat);
669         if (ch == '=')
670             return Scm_Cons(SCM_SYM_ASSERT, rc1_parse(ctx, bolp, grps));
671         if (ch == '!')
672             return Scm_Cons(SCM_SYM_NASSERT, rc1_parse(ctx, bolp, grps));
673         if (ch == '<') {
674             ScmObj type;
675             ch = Scm_GetcUnsafe(ctx->ipat);
676             if (ch == '=') {
677                 type = SCM_SYM_ASSERT;
678             } else if (ch == '!') {
679                 type = SCM_SYM_NASSERT;
680             } else {
681                 Scm_Error("unknown switch condition (?>%c...) in regexp %S",
682                           ch, ctx->pattern);
683                 type = SCM_SYM_NASSERT; /* dummy */
684             }
685             ScmObj item = rc1_parse(ctx, bolp, grps);
686             return SCM_LIST2(type, Scm_Cons(SCM_SYM_LOOKBEHIND, item));
687         }
688         /* fallthru */
689     }
690     Scm_Error("unknown switch condition (?%c...) in regexp %S", ch, ctx->pattern);
691 
692   error:
693     Scm_Error("unterminated conditional pattern in regexp %S", ctx->pattern);
694     return SCM_UNDEFINED;       /* dummy */
695 }
696 
697 /* Parser */
698 /* Groups represents the current nestings of parentheses, including both
699    capturing groups and non-capturing parens.  It works like a stack, where
700    the leftmost item is for the innermost open paren.  The value is a
701    group number for capturing groups, and #f for others. */
rc1_parse(regcomp_ctx * ctx,int bolp,ScmObj groups)702 static ScmObj rc1_parse(regcomp_ctx *ctx, int bolp, ScmObj groups)
703 {
704     ScmObj stack = SCM_NIL, alts = SCM_NIL;
705     int bolpsave = bolp;
706 
707 #define TOPP()     (SCM_NULLP(groups))
708 #define PUSH(elt)  (stack = Scm_Cons((elt), stack))
709 #define PUSH1(elt) (stack = Scm_Cons((elt), SCM_CDR(stack)))
710 
711     for (;;) {
712         ScmObj token = rc1_lex(ctx);
713         ScmObj item;
714         if (SCM_EOFP(token)) {
715             if (!TOPP()) {
716                 Scm_Error("unterminated grouping in regexp %S", ctx->pattern);
717             }
718             break;
719         }
720         if (SCM_EQ(token, SCM_SYM_CLOSE_PAREN)) {
721             if (TOPP()) {
722                 Scm_Error("extra close parenthesis in regexp %S", ctx->pattern);
723             }
724             groups = SCM_CDR(groups);
725             break;
726         }
727         if (SCM_EQ(token, SCM_SYM_BOL)) {
728             if (bolp) {
729                 PUSH(SCM_SYM_BOL);
730                 bolp = FALSE;
731                 continue;
732             } else {
733                 token = SCM_MAKE_CHAR('^');
734             }
735             /*FALLTHROUGH*/
736         }
737         if (SCM_EQ(token, SCM_SYM_ALT)) {
738             alts = Scm_Cons(Scm_ReverseX(stack), alts);
739             stack = SCM_NIL;
740             bolp = bolpsave;
741             continue;
742         }
743         if (SCM_EQ(token, SCM_SYM_OPEN_PAREN)) {
744             int grpno = ++ctx->grpcount;
745             item = rc1_parse(ctx, bolp, Scm_Cons(SCM_MAKE_INT(grpno), groups));
746             PUSH(Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(SCM_FALSE, item)));
747             bolp = FALSE;
748             continue;
749         }
750         if (SCM_EQ(token, SCM_SYM_SEQ)) {
751             item = rc1_parse(ctx, bolp, Scm_Cons(SCM_FALSE, groups));
752             PUSH(Scm_Cons(SCM_SYM_SEQ, item));
753             bolp = FALSE;
754             continue;
755         }
756         if (SCM_EQ(token, SCM_SYM_SEQ_UNCASE) || SCM_EQ(token, SCM_SYM_SEQ_CASE)) {
757             int oldflag = ctx->casefoldp;
758             ctx->casefoldp = SCM_EQ(token, SCM_SYM_SEQ_UNCASE);
759             item = rc1_parse(ctx, bolp, Scm_Cons(SCM_FALSE, groups));
760             PUSH(Scm_Cons(token, item));
761             ctx->casefoldp = oldflag;
762             bolp = FALSE;
763             continue;
764         }
765         if (SCM_EQ(token, SCM_SYM_ONCE)) {
766             /* (?>re) can have BOL/EOL.*/
767             item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
768             PUSH(Scm_Cons(SCM_SYM_ONCE, item));
769             bolp = FALSE;
770             continue;
771         }
772         if (SCM_EQ(token, SCM_SYM_ASSERT) || SCM_EQ(token, SCM_SYM_NASSERT)) {
773             /* (?=re) and (?!re) can have BOL/EOL.*/
774             item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
775             PUSH(Scm_Cons(token, item));
776             continue;
777         }
778         if (SCM_EQ(token, SCM_SYM_LOOKBEHIND) || SCM_EQ(token, SCM_SYM_NLOOKBEHIND)) {
779             /* "(?<=a)" => (assert (lookbehind a))
780                "(?<!a)" => (nassert (lookbehind a)) */
781             item = rc1_parse(ctx, TRUE, Scm_Cons(SCM_FALSE, groups));
782             PUSH(SCM_LIST2(SCM_EQ(token, SCM_SYM_LOOKBEHIND)? SCM_SYM_ASSERT : SCM_SYM_NASSERT,
783                            Scm_Cons(SCM_SYM_LOOKBEHIND, item)));
784             continue;
785         }
786         if (SCM_EQ(token, SCM_SYM_STAR) || SCM_EQ(token, SCM_SYM_STARP)) {
787             /* "x*"  => (rep 0 #f x)
788                "x*+" => (once (rep 0 #f x)) */
789             if (SCM_NULLP(stack)) goto synerr;
790             item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(0), SCM_FALSE, SCM_CAR(stack));
791             PUSH1(SCM_EQ(token, SCM_SYM_STAR) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
792             bolp = FALSE;
793             continue;
794         }
795         if (SCM_EQ(token, SCM_SYM_STARQ)) {
796             /* "x*?" => (rep-min 0 #f x) */
797             if (SCM_NULLP(stack)) goto synerr;
798             item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(0), SCM_FALSE, SCM_CAR(stack));
799             PUSH1(item);
800             bolp = FALSE;
801             continue;
802         }
803         if (SCM_EQ(token, SCM_SYM_PLUS) || SCM_EQ(token, SCM_SYM_PLUSP)) {
804             /* "x+"  => (rep 1 #f x))
805                "x++" => (once (rep 1 #f x)) */
806             if (SCM_NULLP(stack)) goto synerr;
807             item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(1), SCM_FALSE, SCM_CAR(stack));
808             PUSH1(SCM_EQ(token, SCM_SYM_PLUS) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
809             bolp = FALSE;
810             continue;
811         }
812         if (SCM_EQ(token, SCM_SYM_PLUSQ)) {
813             /* "x+?" => (rep-min 1 #f x) */
814             if (SCM_NULLP(stack)) goto synerr;
815             item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(1), SCM_FALSE, SCM_CAR(stack));
816             PUSH1(item);
817             bolp = FALSE;
818             continue;
819         }
820         if (SCM_EQ(token, SCM_SYM_QUESTION) || SCM_EQ(token, SCM_SYM_QUESTIONP)) {
821             /* "x?"  => (rep 0 1 x)
822                "x?+" => (once (rep 0 1 x)) */
823             if (SCM_NULLP(stack)) goto synerr;
824             item = SCM_LIST4(SCM_SYM_REP, SCM_MAKE_INT(0), SCM_MAKE_INT(1), SCM_CAR(stack));
825             PUSH1(SCM_EQ(token, SCM_SYM_QUESTION) ? item : SCM_LIST2(SCM_SYM_ONCE, item));
826             bolp = FALSE;
827             continue;
828         }
829         if (SCM_EQ(token, SCM_SYM_QUESTIONQ)) {
830             /* "x??" => (rep-min 0 1 x) */
831             if (SCM_NULLP(stack)) goto synerr;
832             item = SCM_LIST4(SCM_SYM_REP_MIN, SCM_MAKE_INT(0), SCM_MAKE_INT(1), SCM_CAR(stack));
833             PUSH1(item);
834             bolp = FALSE;
835             continue;
836         }
837         if (SCM_EQ(token, SCM_SYM_CPAT)) {
838             ScmObj cond, ypat, npat;
839             ScmObj new_groups = Scm_Cons(SCM_FALSE, groups);
840             cond = rc1_lex_conditional_pattern(ctx, bolp, new_groups);
841             item = rc1_parse(ctx, bolp, new_groups);
842             if (SCM_PAIRP(item) && SCM_PAIRP(SCM_CAR(item))
843                 && SCM_EQ(SCM_CAAR(item), SCM_SYM_ALT)) {
844                 ScmObj elt = SCM_CAR(item);
845                 if (Scm_Length(elt) > 3) {
846                     Scm_Error("Conditional pattern contains too much branches: %S",
847                               ctx->pattern);
848                 }
849                 ypat = SCM_LIST1(SCM_CADR(elt));
850                 npat = SCM_CDDR(elt);
851             } else {
852                 ypat = item;
853                 npat = SCM_NIL;
854             }
855             PUSH(SCM_LIST4(SCM_SYM_CPAT, cond, ypat, npat));
856             bolp = FALSE;
857             continue;
858         }
859         if (SCM_PAIRP(token) && SCM_EQ(SCM_CAR(token), SCM_SYM_REG)) {
860             /* "(?P<name>x)" => (<integer> name . <ast>)) */
861             int grpno = ++ctx->grpcount;
862             ScmObj name = SCM_CDR(token);
863             item = rc1_parse(ctx, bolp, Scm_Cons(SCM_MAKE_INT(grpno), groups));
864             PUSH(Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(name, item)));
865             ctx->rx->grpNames = Scm_Acons(name, SCM_MAKE_INT(grpno), ctx->rx->grpNames);
866             bolp = FALSE;
867             continue;
868         }
869         if (SCM_PAIRP(token) &&
870             (SCM_EQ(SCM_CAR(token), SCM_SYM_REP) ||
871              SCM_EQ(SCM_CAR(token), SCM_SYM_REP_MIN))) {
872             /* "x{n}"    => (rep n n x)
873                "x{n,}"   => (rep n #f x)
874                "x{n,m}"  => (rep n m x)
875                "x{n,}?"  => (rep-min n #t x)
876                "x{n,m}?" => (rep-min n m x) */
877             if (SCM_NULLP(stack)) goto synerr;
878             item = SCM_LIST4(SCM_CAR(token), SCM_CADR(token), SCM_CDDR(token), SCM_CAR(stack));
879             PUSH1(item);
880             bolp = FALSE;
881             continue;
882         }
883         if (SCM_PAIRP(token) && SCM_EQ(SCM_CAR(token), SCM_SYM_BACKREF)) {
884             ScmObj h = SCM_NIL, t = SCM_NIL;
885             ScmObj ref = SCM_CDR(token);
886             if (SCM_INTP(ref)) {
887                 int grpno = SCM_INT_VALUE(ref);
888                 if (ctx->grpcount < grpno
889                     || !SCM_FALSEP(Scm_Memv(SCM_MAKE_INT(grpno), groups))) {
890                     Scm_Error("Backreference \\%d refers to an unfinished group.",
891                               grpno);
892                 }
893                 PUSH(token);
894                 bolp = FALSE;
895                 continue;
896             }
897 
898             SCM_ASSERT(SCM_SYMBOLP(ref));
899             ScmObj ep;
900             SCM_FOR_EACH(ep, ctx->rx->grpNames) {
901                 if (!SCM_EQ(SCM_CAAR(ep), ref)) continue;
902                 SCM_APPEND1(h, t, Scm_Cons(SCM_SYM_BACKREF, SCM_CDAR(ep)));
903             }
904             if (SCM_NULLP(h)) goto synerr;
905 
906             PUSH(SCM_NULLP(SCM_CDR(h))? SCM_CAR(h) : Scm_Cons(SCM_SYM_ALT, h));
907             bolp = FALSE;
908             continue;
909         }
910         PUSH(token);
911         bolp = FALSE;
912         continue;
913     }
914     if (SCM_NULLP(alts)) {
915         return Scm_ReverseX(stack);
916     } else {
917         alts = Scm_Cons(Scm_ReverseX(stack), alts);
918         return SCM_LIST1(rc1_fold_alts(ctx, alts));
919     }
920   synerr:
921     Scm_Error("bad regexp syntax in %S", ctx->pattern);
922     return SCM_UNDEFINED;       /* dummy */
923 #undef PUSH
924 #undef PUSH1
925 }
926 
rc1(regcomp_ctx * ctx)927 static ScmObj rc1(regcomp_ctx *ctx)
928 {
929     ScmObj ast = rc1_parse(ctx, TRUE, SCM_NIL);
930     if (ctx->casefoldp) {
931         ast = SCM_LIST3(SCM_MAKE_INT(0), SCM_FALSE,
932                         Scm_Cons(SCM_SYM_SEQ_UNCASE, ast));
933     } else {
934         ast = Scm_Cons(SCM_MAKE_INT(0), Scm_Cons(SCM_FALSE, ast));
935     }
936     int ngrp = ctx->grpcount + 1;
937     ctx->rx->numGroups = ngrp;
938     return ast;
939 }
940 
rc_casefold(ScmObj * set,int complement)941 static void rc_casefold(ScmObj *set, int complement)
942 {
943     int set1 = SCM_CHAR_SET_LOWER;
944     int set2 = SCM_CHAR_SET_UPPER;
945     int set3 = SCM_CHAR_SET_TITLE;
946     int set4 = SCM_CHAR_SET_LC;
947     if (complement) {
948         set1 = -set1;
949         set2 = -set2;
950         set3 = -set3;
951         set4 = -set4;
952     }
953 
954     ScmCharSet* cur = SCM_CHAR_SET(*set);
955     /* NB: Scm_CharSetEq can be slow.  This is only called during regexp
956        compilation, so we expect this isn't too bad, but if this ever gets
957        a bottle-neck, we can add an extra field in ScmCharSet to indicate
958        that it's one of the predefined charset. */
959     if (SCM_CHAR_SET_LARGE_P(cur) &&
960         (Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set1))) ||
961          Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set2))) ||
962          Scm_CharSetEq(cur, SCM_CHAR_SET(Scm_GetStandardCharSet(set3))))) {
963             *set = Scm_MakeEmptyCharSet();
964             Scm_CharSetAdd(SCM_CHAR_SET(*set),
965                            SCM_CHAR_SET(Scm_GetStandardCharSet(set4)));
966     }
967 
968     Scm_CharSetCaseFold(SCM_CHAR_SET(*set));
969 }
970 
971 /* character range */
rc_charset(regcomp_ctx * ctx)972 static ScmObj rc_charset(regcomp_ctx *ctx)
973 {
974     int complement;
975     ScmObj set = Scm_CharSetRead(ctx->ipat, &complement, FALSE, TRUE);
976     if (!SCM_CHAR_SET_P(set)) {
977         Scm_Error("bad charset spec in pattern: %S", ctx->pattern);
978     }
979     if (ctx->casefoldp) {
980         rc_casefold(&set, complement);
981     }
982     Scm_CharSetFreezeX(SCM_CHAR_SET(set));
983     rc_register_charset(ctx, SCM_CHAR_SET(set));
984     if (complement) {
985         return Scm_Cons(SCM_SYM_COMP, SCM_OBJ(set));
986     } else {
987         return SCM_OBJ(set);
988     }
989 }
990 
991 /* Remember charset so that we can construct charset vector later */
rc_register_charset(regcomp_ctx * ctx,ScmCharSet * cs)992 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs)
993 {
994     if (SCM_FALSEP(Scm_Memq(SCM_OBJ(cs), ctx->sets))) {
995         ctx->sets = Scm_Cons(SCM_OBJ(cs), ctx->sets);
996     }
997 }
998 
999 /* An interlude between pass1 and pass2.  From the information of
1000    parser context, build a charset vector. */
rc_setup_charsets(ScmRegexp * rx,regcomp_ctx * ctx)1001 static void rc_setup_charsets(ScmRegexp *rx, regcomp_ctx *ctx)
1002 {
1003     rx->numSets = Scm_Length(ctx->sets);
1004     rx->sets = SCM_NEW_ARRAY(ScmCharSet*, rx->numSets);
1005     ScmObj cp = Scm_Reverse(ctx->sets);
1006     for (int i=0; !SCM_NULLP(cp); cp = SCM_CDR(cp)) {
1007         rx->sets[i++] = SCM_CHAR_SET(SCM_CAR(cp));
1008     }
1009 }
1010 
1011 /*-------------------------------------------------------------
1012  * pass 2: optimizer
1013  *
1014  *  - flattening nested sequences: (seq a (seq b) c) => (seq a b c)
1015  *  - introduces short-cut construct for certain cases.
1016  *       (... (rep <m> <n> #\a) #\b ...)
1017  *       => (... (rep-while <m> <n> #\a) #\b ...)
1018  */
1019 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest);
1020 static int    is_distinct(ScmObj x, ScmObj y);
1021 
rc2_optimize_seq(ScmObj seq,ScmObj rest)1022 static ScmObj rc2_optimize_seq(ScmObj seq, ScmObj rest)
1023 {
1024     if (!SCM_PAIRP(seq)) return seq;
1025     ScmObj opted;
1026     ScmObj elt = SCM_CAR(seq);
1027     ScmObj tail = rc2_optimize_seq(SCM_CDR(seq), rest);
1028     rest = SCM_NULLP(tail)? rest : tail;
1029     if (!SCM_PAIRP(elt) || SCM_EQ(SCM_CAR(elt), SCM_SYM_COMP)) {
1030         if (SCM_EQ(tail, SCM_CDR(seq))) return seq;
1031         else return Scm_Cons(elt, tail);
1032     }
1033     ScmObj etype = SCM_CAR(elt);
1034     if (SCM_EQ(etype, SCM_SYM_SEQ)) {
1035         return Scm_Append2(rc2_optimize_seq(SCM_CDR(elt), rest), tail);
1036     }
1037     if (SCM_EQ(etype, SCM_SYM_REP)) {
1038         /* If the head of repeating sequence and the beginning of the
1039            following sequence are distinct, like #/\s*foo/, the branch
1040            becomes deterministic (i.e. we don't need backtrack). */
1041         ScmObj repbody = rc2_optimize_seq(SCM_CDR(SCM_CDDR(elt)), rest);
1042         if (SCM_NULLP(repbody)) {
1043             /* This is the case that an empy string (?:) is repeated.
1044                it always matches, so we collapse entire thing to (). */
1045             return tail;
1046         }
1047         if (SCM_NULLP(rest) || is_distinct(SCM_CAR(repbody), SCM_CAR(rest))) {
1048             ScmObj elt2 = Scm_Append2(SCM_LIST3(SCM_SYM_REP_WHILE, SCM_CADR(elt),
1049                                                 SCM_CAR(SCM_CDDR(elt))),
1050                                       repbody);
1051             return Scm_Cons(elt2, tail);
1052         }
1053         if (SCM_EQ(repbody, SCM_CDR(SCM_CDDR(elt)))) opted = elt;
1054         else opted = Scm_Append2(SCM_LIST3(SCM_SYM_REP, SCM_CADR(elt),
1055                                            SCM_CAR(SCM_CDDR(elt))),
1056                                  repbody);
1057     } else {
1058         opted = rc2_optimize(elt, rest);
1059     }
1060     if (SCM_EQ(elt, opted) && SCM_EQ(tail, SCM_CDR(seq))) return seq;
1061     else return Scm_Cons(opted, tail);
1062 }
1063 
rc2_optimize(ScmObj ast,ScmObj rest)1064 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest)
1065 {
1066     if (!SCM_PAIRP(ast)) return ast;
1067     ScmObj seq, seqo;
1068     ScmObj type = SCM_CAR(ast);
1069     if (SCM_EQ(type, SCM_SYM_COMP)) return ast;
1070     if (SCM_EQ(type, SCM_SYM_LOOKBEHIND)) return ast;
1071 
1072     if (SCM_EQ(type, SCM_SYM_ALT)) {
1073         ScmObj sp, sp2, e = SCM_UNBOUND, h, t;
1074         SCM_FOR_EACH(sp, SCM_CDR(ast)) {
1075             e = rc2_optimize(SCM_CAR(sp), rest);
1076             if (!SCM_EQ(e, SCM_CAR(sp))) break;
1077         }
1078         if (SCM_NULLP(sp)) return ast;
1079         /* need to copy the spine */
1080         h = t = SCM_NIL;
1081         SCM_FOR_EACH(sp2, SCM_CDR(ast)) {
1082             if (SCM_EQ(sp, sp2)) { SCM_APPEND1(h, t, e); break; }
1083             SCM_APPEND1(h, t, SCM_CAR(sp2));
1084         }
1085         SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
1086             SCM_APPEND1(h, t, rc2_optimize(SCM_CAR(sp2), rest));
1087         }
1088         return Scm_Cons(SCM_SYM_ALT, h);
1089     }
1090     if (SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)
1091         || SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1092         seq = SCM_CADR(SCM_CDDR(ast));
1093         seqo = rc2_optimize_seq(seq, rest);
1094         if (SCM_EQ(seq, seqo)) return ast;
1095         return SCM_LIST4(type, SCM_CADR(ast), SCM_CAR(SCM_CDDR(ast)), seqo);
1096     }
1097     seq = SCM_CDR(ast);
1098     seqo = rc2_optimize_seq(seq, rest);
1099     if (SCM_EQ(seq, seqo)) return ast;
1100     return Scm_Cons(type, seqo);
1101 }
1102 
is_distinct(ScmObj x,ScmObj y)1103 static int is_distinct(ScmObj x, ScmObj y)
1104 {
1105     if (SCM_PAIRP(x)) {
1106         ScmObj carx = SCM_CAR(x);
1107         if (SCM_EQ(carx, SCM_SYM_COMP)) {
1108             SCM_ASSERT(SCM_CHAR_SET_P(SCM_CDR(x)));
1109             if (SCM_CHARP(y) || SCM_CHAR_SET_P(y)) {
1110                 return !is_distinct(SCM_CDR(x), y);
1111             }
1112             return FALSE;
1113         }
1114         if (SCM_INTP(carx)) {
1115             if (SCM_PAIRP(SCM_CDDR(x))) {
1116                 return is_distinct(SCM_CAR(SCM_CDDR(x)), y);
1117             }
1118         }
1119         if (SCM_EQ(carx, SCM_SYM_SEQ_UNCASE)
1120             || SCM_EQ(carx, SCM_SYM_SEQ_CASE)) {
1121             if (SCM_PAIRP(SCM_CDR(x))) {
1122                 return is_distinct(SCM_CADR(x), y);
1123             }
1124         }
1125         return FALSE;
1126     }
1127     if (SCM_CHARP(x)) {
1128         if (SCM_CHARP(y)) return !SCM_EQ(x, y);
1129         return is_distinct(y, x);
1130     }
1131     if (SCM_CHAR_SET_P(x)) {
1132         if (SCM_CHARP(y)) {
1133             return !Scm_CharSetContains(SCM_CHAR_SET(x), SCM_CHAR_VALUE(y));
1134         }
1135         if (SCM_CHAR_SET_P(y)) {
1136             ScmObj ccs = Scm_CharSetCopy(SCM_CHAR_SET(y));
1137             ccs = Scm_CharSetComplement(SCM_CHAR_SET(ccs));
1138             return Scm_CharSetLE(SCM_CHAR_SET(x), SCM_CHAR_SET(ccs));
1139         }
1140         return is_distinct(y, x);
1141     }
1142     return FALSE;
1143 }
1144 
Scm_RegOptimizeAST(ScmObj ast)1145 ScmObj Scm_RegOptimizeAST(ScmObj ast)
1146 {
1147     return rc2_optimize(ast, SCM_NIL);
1148 }
1149 
1150 /*-------------------------------------------------------------
1151  * pass 3 - code generation
1152  *          This pass actually called twice; the first run counts
1153  *          the size of the bytecode, and the second run fills
1154  *          the bytecode.   EMITP == FALSE for the first, EMITP == TRUE
1155  *          for the second.
1156  *          LASTP indicates this call is dealing with the last part of
1157  *          the compiled tree, thus need to deal with EOL marker.
1158  */
1159 
1160 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp);
1161 
1162 /* Util function for pass3, to get an index of the charset vector
1163  * for the given charset.
1164  */
rc3_charset_index(ScmRegexp * rx,ScmObj cs)1165 static int rc3_charset_index(ScmRegexp *rx, ScmObj cs)
1166 {
1167     for (int i=0; i<rx->numSets; i++)
1168         if (cs == SCM_OBJ(rx->sets[i])) return i;
1169     Scm_Panic("rc3_charset_index: can't be here");
1170     return 0;                   /* dummy */
1171 }
1172 
rc3_emit(regcomp_ctx * ctx,char code)1173 static void rc3_emit(regcomp_ctx *ctx, char code)
1174 {
1175     if (ctx->emitp) {
1176         SCM_ASSERT(ctx->codep < ctx->codemax);
1177         ctx->code[ctx->codep++] = code;
1178     } else {
1179         ctx->codemax++;
1180     }
1181 }
1182 
rc3_emit_offset(regcomp_ctx * ctx,int offset)1183 static void rc3_emit_offset(regcomp_ctx *ctx, int offset)
1184 {
1185     if (offset > REGEXP_OFFSET_MAX) {
1186         Scm_Error("regexp too large.  consider splitting it up: %50.1S",
1187                   SCM_OBJ(ctx->rx));
1188     }
1189 
1190     if (ctx->emitp) {
1191         SCM_ASSERT(ctx->codep < ctx->codemax-1);
1192         ctx->code[ctx->codep++] = (offset>>8) & 0xff;
1193         ctx->code[ctx->codep++] = offset & 0xff;
1194     } else {
1195         ctx->codemax+=2;
1196     }
1197 }
1198 
rc3_fill_offset(regcomp_ctx * ctx,int codep,int offset)1199 static void rc3_fill_offset(regcomp_ctx *ctx, int codep, int offset)
1200 {
1201     if (offset > REGEXP_OFFSET_MAX) {
1202         Scm_Error("regexp too large.  consider splitting it up: %50.1S",
1203                   SCM_OBJ(ctx->rx));
1204     }
1205 
1206     if (ctx->emitp) {
1207         SCM_ASSERT(codep < ctx->codemax-1);
1208         ctx->code[codep] = (offset >> 8) & 0xff;
1209         ctx->code[codep+1] = offset & 0xff;
1210     }
1211 }
1212 
1213 #define EMIT4(cond, insn1, insn2, insn3, insn4)                     \
1214     rc3_emit(ctx, (cond)? (!ctx->lookbehindp)? insn1 : insn2     \
1215                         : (!ctx->lookbehindp)? insn3 : insn4)
1216 
rc3_seq(regcomp_ctx * ctx,ScmObj seq,int lastp)1217 static void rc3_seq(regcomp_ctx *ctx, ScmObj seq, int lastp)
1218 {
1219     ScmObj cp;
1220 
1221     if (ctx->lookbehindp) seq = Scm_Reverse(seq);
1222 
1223     SCM_FOR_EACH(cp, seq) {
1224         ScmObj item = SCM_CAR(cp);
1225 
1226         /* concatenate literal character sequence */
1227         if (SCM_CHARP(item)) {
1228             ScmObj h = SCM_NIL, t = SCM_NIL;
1229             int nrun = 0;
1230             char chbuf[SCM_CHAR_MAX_BYTES];
1231 
1232             do {
1233                 ScmChar ch = SCM_CHAR_VALUE(item);
1234                 nrun += SCM_CHAR_NBYTES(ch);
1235                 SCM_APPEND1(h, t, item);
1236                 cp = SCM_CDR(cp);
1237                 if (SCM_NULLP(cp)) break;
1238                 item = SCM_CAR(cp);
1239             } while (SCM_CHARP(item) && nrun < CHAR_MAX);
1240             if (ctx->lookbehindp) h = Scm_ReverseX(h);
1241             if (nrun == 1) {
1242                 EMIT4(!ctx->casefoldp, RE_MATCH1, RE_MATCH1_RL, RE_MATCH1_CI, RE_MATCH1_CI_RL);
1243                 rc3_emit(ctx, (char)SCM_CHAR_VALUE(SCM_CAR(h)));
1244             } else {
1245                 EMIT4(!ctx->casefoldp, RE_MATCH, RE_MATCH_RL, RE_MATCH_CI, RE_MATCH_CI_RL);
1246                 rc3_emit(ctx, (char)nrun);
1247                 ScmObj ht;
1248                 SCM_FOR_EACH(ht, h) {
1249                     ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(ht));
1250                     int nb = SCM_CHAR_NBYTES(ch);
1251                     SCM_CHAR_PUT(chbuf, ch);
1252                     for (int i = 0; i < nb; i++) rc3_emit(ctx, chbuf[i]);
1253                 }
1254             }
1255             if (SCM_NULLP(cp)) break;
1256             cp = Scm_Cons(item, cp); /* pushback */
1257         } else {
1258             int p;
1259             if (ctx->lookbehindp) p = lastp && SCM_EQ(cp, seq);
1260             else p = lastp && SCM_NULLP(SCM_CDR(cp));
1261             rc3_rec(ctx, item, p);
1262         }
1263     }
1264 }
1265 
rc3_seq_rep(regcomp_ctx * ctx,ScmObj seq,int count,int lastp)1266 static void rc3_seq_rep(regcomp_ctx *ctx, ScmObj seq, int count, int lastp)
1267 {
1268     ScmObj h = SCM_NIL, t = SCM_NIL;
1269     if (count <= 0) return;
1270     while (count-- > 0) {
1271         SCM_APPEND(h, t, Scm_CopyList(seq));
1272     }
1273     rc3_seq(ctx, h, lastp);
1274 }
1275 
rc3_minmax(regcomp_ctx * ctx,ScmObj type,int count,ScmObj item,int lastp SCM_UNUSED)1276 static void rc3_minmax(regcomp_ctx *ctx, ScmObj type, int count,
1277                        ScmObj item, int lastp SCM_UNUSED)
1278 {
1279     /* (rep <n> . <x>)
1280                  TRY  #01
1281                  JUMP #11
1282            #01:  TRY  #02
1283                  JUMP #12
1284                   :
1285            #0<n>:JUMP #1<N>
1286            #11:  <X>
1287            #12:  <X>
1288                   :
1289            #1<n>:<X>
1290            #1<N>:
1291 
1292        (rep-min <n> . <x>)
1293                  TRY  #01
1294                  JUMP #1N
1295            #01:  TRY  #02
1296                  JUMP #1n
1297                   :
1298            #0<n>:TRY  #11
1299                  JUMP #12
1300            #11:  <X>
1301            #12:  <X>
1302                   :
1303            #1<n>:<X>
1304            #1<N>:
1305     */
1306     ScmObj jlist = SCM_NIL;
1307     int j0 = 0, jn;
1308     int greedy = SCM_EQ(type, SCM_SYM_REP);
1309 
1310     /* first part - TRYs and JUMPs
1311        j0 is used to patch the label #0k
1312        the destination of jumps to be patched are linked to jlist */
1313     for (int n=0; n<count; n++) {
1314         if (n>0) rc3_fill_offset(ctx, j0, ctx->codep);
1315         rc3_emit(ctx, RE_TRY);
1316         if (ctx->emitp) j0 = ctx->codep;
1317         rc3_emit_offset(ctx, 0); /* to be patched */
1318         rc3_emit(ctx, RE_JUMP);
1319         if (ctx->emitp) {
1320             jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1321         }
1322         rc3_emit_offset(ctx, 0); /* to be patched */
1323     }
1324     rc3_fill_offset(ctx, j0, ctx->codep); /* patch #0n */
1325     /* finishing the first part.
1326        for non-greedy match, we need one more TRY. */
1327     if (greedy) {
1328         rc3_emit(ctx, RE_JUMP);
1329         jn = ctx->codep;
1330         rc3_emit_offset(ctx, 0); /* to be patched */
1331     } else {
1332         rc3_emit(ctx, RE_TRY);
1333         jn = ctx->codep;
1334         rc3_emit_offset(ctx, 0);  /* to be patched */
1335         rc3_emit(ctx, RE_JUMP);
1336         if (ctx->emitp) {
1337             jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1338         }
1339         rc3_emit_offset(ctx, 0);  /* to be patched */
1340         rc3_fill_offset(ctx, jn, ctx->codep);
1341     }
1342     if (ctx->emitp && greedy) jlist = Scm_ReverseX(jlist);
1343     for (int n=0; n<count; n++) {
1344         if (ctx->emitp) {
1345             rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)),
1346                             ctx->codep);
1347         }
1348         rc3_seq(ctx, item, FALSE);
1349         if (ctx->emitp) jlist = SCM_CDR(jlist);
1350     }
1351     if (greedy) {
1352         /* the last JUMP to #1N */
1353         rc3_fill_offset(ctx, jn, ctx->codep);
1354     } else {
1355         /* the first JUMP to #1N */
1356         if (ctx->emitp) {
1357             SCM_ASSERT(SCM_PAIRP(jlist));
1358             rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)), ctx->codep);
1359         }
1360     }
1361 }
1362 
rc3_rec(regcomp_ctx * ctx,ScmObj ast,int lastp)1363 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp)
1364 {
1365     ScmRegexp *rx = ctx->rx;
1366 
1367     /* first, deal with atoms */
1368     if (!SCM_PAIRP(ast)) {
1369         /* a char */
1370         if (SCM_CHARP(ast)) {
1371             char chbuf[SCM_CHAR_MAX_BYTES];
1372             ScmChar ch = SCM_CHAR_VALUE(ast);
1373             int nb = SCM_CHAR_NBYTES(ch);
1374             SCM_CHAR_PUT(chbuf, ch);
1375             if (nb == 1) {
1376                 EMIT4(!ctx->casefoldp, RE_MATCH1, RE_MATCH1_RL, RE_MATCH1_CI, RE_MATCH1_CI_RL);
1377                 rc3_emit(ctx, chbuf[0]);
1378             } else {
1379                 EMIT4(!ctx->casefoldp, RE_MATCH, RE_MATCH_RL, RE_MATCH_CI, RE_MATCH_CI_RL);
1380                 rc3_emit(ctx, nb);
1381                 for (int i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
1382             }
1383             return;
1384         }
1385         /* charset */
1386         if (SCM_CHAR_SET_P(ast)) {
1387             EMIT4(SCM_CHAR_SET_LARGE_P(ast),
1388                   RE_SET, RE_SET_RL, RE_SET1, RE_SET1_RL);
1389             rc3_emit(ctx, rc3_charset_index(rx, ast));
1390             return;
1391         }
1392         /* special stuff */
1393         if (SCM_SYMBOLP(ast)) {
1394             if (SCM_EQ(ast, SCM_SYM_ANY)) {
1395                 rc3_emit(ctx, ctx->lookbehindp?RE_ANY_RL:RE_ANY);
1396                 return;
1397             }
1398             if (SCM_EQ(ast, SCM_SYM_BOS)) {
1399                 rc3_emit(ctx, RE_BOS);
1400                 return;
1401             }
1402             if (SCM_EQ(ast, SCM_SYM_EOS)) {
1403                 rc3_emit(ctx, RE_EOS);
1404                 return;
1405             }
1406             if (SCM_EQ(ast, SCM_SYM_BOL)) {
1407                 rc3_emit(ctx, RE_BOL);
1408                 return;
1409             }
1410             if (SCM_EQ(ast, SCM_SYM_EOL)) {
1411                 if (lastp || (ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) {
1412                     rc3_emit(ctx, RE_EOL);
1413                 } else {
1414                     rc3_emit(ctx, ctx->lookbehindp? RE_MATCH1_RL:RE_MATCH1);
1415                     rc3_emit(ctx, '$');
1416                 }
1417                 return;
1418             }
1419             if (SCM_EQ(ast, SCM_SYM_WB)) {
1420                 rc3_emit(ctx, RE_WB);
1421                 return;
1422             }
1423             if (SCM_EQ(ast, SCM_SYM_BOW)) {
1424                 rc3_emit(ctx, RE_BOW);
1425                 return;
1426             }
1427             if (SCM_EQ(ast, SCM_SYM_EOW)) {
1428                 rc3_emit(ctx, RE_EOW);
1429                 return;
1430             }
1431             if (SCM_EQ(ast, SCM_SYM_NWB)) {
1432                 rc3_emit(ctx, RE_NWB);
1433                 return;
1434             }
1435             if (SCM_EQ(ast, SCM_SYM_BOG)) {
1436                 rc3_emit(ctx, RE_BOG);
1437                 return;
1438             }
1439             if (SCM_EQ(ast, SCM_SYM_EOG)) {
1440                 rc3_emit(ctx, RE_EOG);
1441                 return;
1442             }
1443             /* fallback */
1444         }
1445         Scm_Error("internal error in regexp compilation: unrecognized AST item: %S", ast);
1446     }
1447 
1448     /* now we have a structured node */
1449     ScmObj type = SCM_CAR(ast);
1450     if (SCM_EQ(type, SCM_SYM_COMP)) {
1451         ScmObj cs = SCM_CDR(ast);
1452         SCM_ASSERT(SCM_CHAR_SET_P(cs));
1453         EMIT4(SCM_CHAR_SET_LARGE_P(cs),
1454               RE_NSET, RE_NSET_RL, RE_NSET1, RE_NSET1_RL);
1455         rc3_emit(ctx, rc3_charset_index(rx, cs));
1456         return;
1457     }
1458     if (SCM_EQ(type, SCM_SYM_SEQ)) {
1459         rc3_seq(ctx, SCM_CDR(ast), lastp);
1460         return;
1461     }
1462     if (SCM_INTP(type)) {
1463         /* (<integer> <name> . <ast>) */
1464         int grpno = SCM_INT_VALUE(SCM_CAR(ast));
1465         rc3_emit(ctx, ctx->lookbehindp?RE_BEGIN_RL:RE_BEGIN);
1466         rc3_emit(ctx, grpno);
1467         rc3_seq(ctx, SCM_CDDR(ast), lastp);
1468         rc3_emit(ctx, ctx->lookbehindp?RE_END_RL:RE_END);
1469         rc3_emit(ctx, grpno);
1470         return;
1471     }
1472     if (SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1473         int oldcase = ctx->casefoldp;
1474         ctx->casefoldp = SCM_EQ(type, SCM_SYM_SEQ_UNCASE);
1475         rc3_seq(ctx, SCM_CDR(ast), lastp);
1476         ctx->casefoldp = oldcase;
1477         return;
1478     }
1479     if (SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1480         /* here we have an opportunity to generate an optimized code.
1481            for now, we only check elem is a single item case, but we can
1482            do better. */
1483         /* (rep-while m n . elem) */
1484         ScmObj m = SCM_CADR(ast), n = SCM_CAR(SCM_CDDR(ast));
1485         ScmObj elem = SCM_CDR(SCM_CDDR(ast));
1486         if (SCM_FALSEP(n) && SCM_PAIRP(elem) && SCM_NULLP(SCM_CDR(elem))) {
1487             /* (rep-while m #f elem1) */
1488             ScmObj elem1 = SCM_CAR(elem);
1489             if (SCM_EQ(elem1, SCM_SYM_ANY) && !ctx->lookbehindp) {
1490                 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1491                 rc3_emit(ctx, RE_ANYR);
1492                 return;
1493             }
1494             if (SCM_CHARP(elem1) && !ctx->lookbehindp) {
1495                 ScmChar ch = SCM_CHAR_VALUE(elem1);
1496                 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1497                 int n = SCM_CHAR_NBYTES(ch);
1498                 if (n == 1) {
1499                     rc3_emit(ctx, RE_MATCH1R);
1500                     rc3_emit(ctx, (char)ch);
1501                 } else {
1502                     char chbuf[SCM_CHAR_MAX_BYTES];
1503                     SCM_CHAR_PUT(chbuf, ch);
1504                     rc3_emit(ctx, RE_MATCHR);
1505                     rc3_emit(ctx, (char)n);  /* we know it's never overflow */
1506                     for (int i=0; i < n; i++) rc3_emit(ctx, chbuf[i]);
1507                 }
1508                 return;
1509             }
1510             if (SCM_CHAR_SET_P(elem1)) {
1511                 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1512                 EMIT4(SCM_CHAR_SET_LARGE_P(elem1),
1513                       RE_SETR, RE_SETR_RL, RE_SET1R, RE_SET1R_RL);
1514                 rc3_emit(ctx, rc3_charset_index(rx, elem1));
1515                 return;
1516             }
1517             if (SCM_PAIRP(elem1)&&SCM_EQ(SCM_CAR(elem1), SCM_SYM_COMP)) {
1518                 rc3_seq_rep(ctx, elem, SCM_INT_VALUE(m), FALSE);
1519                 ScmObj cs = SCM_CDR(elem1);
1520                 SCM_ASSERT(SCM_CHAR_SET_P(cs));
1521                 EMIT4(!ctx->lookbehindp, RE_NSETR, RE_NSETR_RL, RE_NSET1R, RE_NSET1R_RL);
1522                 rc3_emit(ctx, rc3_charset_index(rx, cs));
1523                 return;
1524             }
1525         }
1526         /* fallthrough to rep */
1527         type = SCM_SYM_REP;
1528     }
1529     if (SCM_EQ(type, SCM_SYM_ONCE) && ctx->lookbehindp) {
1530         /* [Rui] I couldn't make a decision about the behavior of standalone
1531            pattern (?>re) within a lookbehind assertion ((?<=re) or
1532            (?<!re)).  It raises an error for now. */
1533         Scm_Error("standalone pattern in lookbehind assertion is not supported: %S",
1534                   ctx->pattern);
1535     }
1536     if (SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)
1537         || SCM_EQ(type, SCM_SYM_ONCE)) {
1538         int ocodep = ctx->codep;
1539         int op = SCM_EQ(type, SCM_SYM_ASSERT) ? RE_ASSERT :
1540                  SCM_EQ(type, SCM_SYM_NASSERT) ? RE_NASSERT : RE_ONCE;
1541         rc3_emit(ctx, op);
1542         rc3_emit_offset(ctx, 0); /* will be patched */
1543         /* Assertions can check EOF even other regexps follow, so '$'
1544            in the last pos of this group should be treated as EOL.
1545            (?>$) as well.  It is consistent with Perl and Oniguruma. */
1546         rc3_seq(ctx, SCM_CDR(ast), TRUE);
1547         rc3_emit(ctx, RE_SUCCESS);
1548         rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1549         return;
1550     }
1551     if (SCM_EQ(type, SCM_SYM_ALT)) {
1552         /*     TRY #1
1553                <alt0>
1554                JUMP next
1555            #1: TRY #2
1556                <alt1>
1557                JUMP next
1558                 :
1559                 :
1560                TRY next
1561                <altN>
1562            next:
1563         */
1564         ScmObj clause;
1565         ScmObj jumps = SCM_NIL;
1566         int patchp;
1567 
1568         if (SCM_PAIRP(SCM_CDR(ast))) {
1569             for (clause = SCM_CDR(ast);
1570                  SCM_PAIRP(SCM_CDR(clause));
1571                  clause = SCM_CDR(clause)) {
1572                 rc3_emit(ctx, RE_TRY);
1573                 patchp = ctx->codep;
1574                 rc3_emit_offset(ctx, 0); /* will be patched */
1575                 rc3_rec(ctx, SCM_CAR(clause), lastp);
1576                 rc3_emit(ctx, RE_JUMP);
1577                 if (ctx->emitp) {
1578                     jumps = Scm_Cons(SCM_MAKE_INT(ctx->codep), jumps);
1579                 }
1580                 rc3_emit_offset(ctx, 0); /* will be patched */
1581                 rc3_fill_offset(ctx, patchp, ctx->codep);
1582             }
1583             rc3_rec(ctx, SCM_CAR(clause), lastp);
1584             if (ctx->emitp) {
1585                 SCM_FOR_EACH(jumps, jumps) {
1586                     patchp = SCM_INT_VALUE(SCM_CAR(jumps));
1587                     rc3_fill_offset(ctx, patchp, ctx->codep);
1588                 }
1589             }
1590         } else {
1591             /* NB: alternation without any choices won't appear from the
1592                parsed AST, but the caller can pass in a programatically
1593                constructed AST.  It fails unconditionally, since we have
1594                no possible choice. */
1595             rc3_emit(ctx, RE_FAIL);
1596         }
1597         return;
1598     }
1599     if (SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)) {
1600         ScmObj min = SCM_CADR(ast), max = SCM_CAR(SCM_CDDR(ast));
1601         ScmObj item = SCM_CDR(SCM_CDDR(ast));
1602         int multip = 0;
1603 
1604         if (SCM_FALSEP(max) || SCM_INT_VALUE(max) > 1)
1605             multip = TRUE;
1606         rc3_seq_rep(ctx, item, SCM_INT_VALUE(min), multip);
1607 
1608         if (SCM_EQ(min, max)) {
1609             /* (rep <m> <m> <x>)
1610                     <x>
1611                      : (m-1 times)
1612             */
1613             return;
1614         }
1615         if (!SCM_FALSEP(max)) {
1616             int count = SCM_INT_VALUE(max) - SCM_INT_VALUE(min);
1617             rc3_minmax(ctx, type, count, item, lastp);
1618             return;
1619         }
1620         if (SCM_EQ(type, SCM_SYM_REP)) {
1621             /* (rep <m> #f <x>)
1622                     <x>
1623                      : (m-1 times)
1624                rep: TRY next
1625                     <x>
1626                     JUMP rep
1627                next:
1628             */
1629             int ocodep = ctx->codep;
1630             rc3_emit(ctx, RE_TRY);
1631             rc3_emit_offset(ctx, 0); /* will be patched */
1632             rc3_seq(ctx, item, FALSE);
1633             rc3_emit(ctx, RE_JUMP);
1634             rc3_emit_offset(ctx, ocodep);
1635             rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1636             return;
1637         }
1638         if (SCM_EQ(type, SCM_SYM_REP_MIN)) {
1639             /* (rep-min <m> #f <x>)
1640                     <x>
1641                      : (m-1 times)
1642                rep: TRY seq
1643                JUMP next
1644                seq: <seq>
1645                JUMP rep
1646                next:
1647             */
1648             int ocodep1 = ctx->codep, ocodep2;
1649             rc3_emit(ctx, RE_TRY);
1650             rc3_emit_offset(ctx, 0); /* will be patched */
1651             ocodep2 = ctx->codep;
1652             rc3_emit(ctx, RE_JUMP);
1653             rc3_emit_offset(ctx, 0); /* will be patched */
1654             rc3_fill_offset(ctx, ocodep1+1, ctx->codep);
1655             rc3_seq(ctx, item, FALSE);
1656             rc3_emit(ctx, RE_JUMP);
1657             rc3_emit_offset(ctx, ocodep1);
1658             rc3_fill_offset(ctx, ocodep2+1, ctx->codep);
1659             return;
1660         }
1661     }
1662     if (SCM_EQ(type, SCM_SYM_LOOKBEHIND)) {
1663         int oldval = ctx->lookbehindp;
1664         ctx->lookbehindp = TRUE;
1665         rc3_seq(ctx, SCM_CDR(ast), lastp);
1666         ctx->lookbehindp = oldval;
1667         return;
1668     }
1669     if (SCM_EQ(type, SCM_SYM_BACKREF)) {
1670         SCM_ASSERT(SCM_INTP(SCM_CDR(ast)));
1671         EMIT4(!ctx->casefoldp, RE_BACKREF, RE_BACKREF_RL, RE_BACKREF_CI, RE_BACKREF_CI_RL);
1672         rc3_emit(ctx, (char)SCM_INT_VALUE(SCM_CDR(ast)));
1673         return;
1674     }
1675     if (SCM_EQ(type, SCM_SYM_CPAT)) {
1676         /* (cpat <n> <yes-pattern> <no-pattern>)
1677                  CPAT <n> #1
1678                  <yes-pattern>
1679                  JUMP #2
1680            #1:   <no-pattern>
1681            #2:
1682 
1683            (cpat <assert> <yes-pattern> <no-pattern>)
1684                  CPATA #1 #2
1685                  <assert>
1686                  SUCCESS
1687            #1:   <yes-pattern>
1688                  JUMP #3
1689            #2:   <no-pattern>
1690            #3:
1691         */
1692         ScmObj cond = SCM_CADR(ast);
1693         ScmObj ypat = SCM_CAR(SCM_CDDR(ast));
1694         ScmObj npat = SCM_CADR(SCM_CDDR(ast));
1695         if (SCM_INTP(cond)) {
1696             rc3_emit(ctx, RE_CPAT);
1697             rc3_emit(ctx, (char)SCM_INT_VALUE(cond));
1698             int ocodep1 = ctx->codep;
1699             rc3_emit_offset(ctx, 0); /* will be patched */
1700             rc3_seq(ctx, ypat, lastp);
1701             rc3_emit(ctx, RE_JUMP);
1702             int ocodep2 = ctx->codep;
1703             rc3_emit_offset(ctx, 0); /* will be patched */
1704             rc3_fill_offset(ctx, ocodep1, ctx->codep);
1705             rc3_seq(ctx, npat, lastp);
1706             rc3_fill_offset(ctx, ocodep2, ctx->codep);
1707         } else {
1708             SCM_ASSERT(SCM_EQ(SCM_CAR(cond), SCM_SYM_ASSERT)
1709                        || SCM_EQ(SCM_CAR(cond), SCM_SYM_NASSERT));
1710             rc3_emit(ctx, RE_CPATA);
1711             int ocodep1 = ctx->codep;
1712             rc3_emit_offset(ctx, 0); /* will be patched */
1713             int ocodep2 = ctx->codep;
1714             rc3_emit_offset(ctx, 0); /* will be patched */
1715             rc3_rec(ctx, cond, lastp);
1716             rc3_emit(ctx, RE_SUCCESS);
1717             rc3_fill_offset(ctx, ocodep1, ctx->codep);
1718             rc3_seq(ctx, ypat, lastp);
1719             rc3_emit(ctx, RE_JUMP);
1720             int ocodep3 = ctx->codep;
1721             rc3_emit_offset(ctx, 0); /* will be patched */
1722             rc3_fill_offset(ctx, ocodep2, ctx->codep);
1723             rc3_seq(ctx, npat, lastp);
1724             rc3_fill_offset(ctx, ocodep3, ctx->codep);
1725         }
1726         return;
1727     }
1728     Scm_Error("internal error in regexp compilation: bad node: %S", ast);
1729 }
1730 
is_atom_anchored(ScmObj ast,ScmObj atom)1731 static int is_atom_anchored(ScmObj ast, ScmObj atom)
1732 {
1733     if (!SCM_PAIRP(ast)) {
1734         if (SCM_EQ(ast, atom)) return TRUE;
1735         else return FALSE;
1736     }
1737     ScmObj type = SCM_CAR(ast);
1738     if (SCM_INTP(type)) {
1739         if (!SCM_PAIRP(SCM_CDDR(ast))) return FALSE;
1740         return is_atom_anchored(SCM_CAR(SCM_CDDR(ast)), atom);
1741     } else if (SCM_EQ(type, SCM_SYM_SEQ)
1742                || SCM_EQ(type, SCM_SYM_SEQ_UNCASE)
1743                || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1744         if (!SCM_PAIRP(SCM_CDR(ast))) return FALSE;
1745         return is_atom_anchored(SCM_CADR(ast), atom);
1746     }
1747     if (SCM_EQ(type, SCM_SYM_ALT)) {
1748         ScmObj ap;
1749         SCM_FOR_EACH(ap, SCM_CDR(ast)) {
1750             if (!is_atom_anchored(SCM_CAR(ap), atom)) return FALSE;
1751         }
1752         return TRUE;
1753     }
1754     return FALSE;
1755 }
1756 
1757 /* Aux function for is_simple_prefixed.
1758    Returns TRUE if AST is <char>, <char-set>, or (comp . <char-set>)*/
is_char_or_charset(ScmObj ast)1759 static int is_char_or_charset(ScmObj ast)
1760 {
1761     if (SCM_CHARP(ast) || SCM_CHAR_SET_P(ast)
1762         || (SCM_PAIRP(ast)
1763             && SCM_EQ(SCM_CAR(ast), SCM_SYM_COMP)
1764             && SCM_CHAR_SET_P(SCM_CDR(ast)))) {
1765         return TRUE;
1766     } else {
1767         return FALSE;
1768     }
1769 }
1770 
1771 /* Returns TRUE iff ast has a form #/A+B/ where A is a char or charset,
1772    and B begins with distinct charset from A (B may be empty).
1773    After optimization, the AST begins with (rep-while 1 #f A).
1774    If so, we can greatly optimize the failure case.
1775    Suppose if we try input s against #/A+B/ and find it fail.  Then
1776    we can skip prefix of s as far as it matches #/A/. */
is_simple_prefixed(ScmObj ast)1777 static int is_simple_prefixed(ScmObj ast)
1778 {
1779     if (!SCM_PAIRP(ast)) return FALSE;
1780     ScmObj car = SCM_CAR(ast);
1781     if (SCM_EQ(car, SCM_SYM_REP_WHILE)) {
1782         if (SCM_EQ(SCM_CADR(ast), SCM_MAKE_INT(1))
1783             && SCM_FALSEP(SCM_CAR(SCM_CDDR(ast)))) {
1784             ScmObj body = SCM_CDR(SCM_CDDR(ast));
1785             if (SCM_PAIRP(body) && SCM_NULLP(SCM_CDR(body))) {
1786                 return is_char_or_charset(SCM_CAR(body));
1787             }
1788         }
1789         return FALSE;
1790     } else if (SCM_EQ(car, SCM_SYM_SEQ)) { /* TODO: handle uncase */
1791         if (SCM_PAIRP(SCM_CDR(ast))) {
1792             return is_simple_prefixed(SCM_CADR(ast));
1793         }
1794         return FALSE;
1795     } else if (SCM_INTP(car)) {
1796         ScmObj s = SCM_CDDR(ast);
1797         if (SCM_PAIRP(s)) return is_simple_prefixed(SCM_CAR(s));
1798     }
1799     return FALSE;
1800 }
1801 
1802 
1803 /* returns lookahead set.  modifies the first arg.  */
merge_laset(ScmObj la1,ScmObj la2)1804 static ScmObj merge_laset(ScmObj la1, ScmObj la2)
1805 {
1806     if (SCM_CHAR_SET_P(la1) && SCM_CHAR_SET_P(la2)) {
1807         return Scm_CharSetAdd(SCM_CHAR_SET(la1),
1808                               SCM_CHAR_SET(la2));
1809     } else {
1810         return SCM_FALSE;
1811     }
1812 }
1813 
1814 static ScmObj calculate_lasetn(ScmObj ast);
1815 
1816 /* returns lookahead set.  returned charset is fresh.
1817    TODO: We can also take advantage of wb and nwb condition to
1818    skip the input. */
calculate_laset(ScmObj head,ScmObj rest)1819 static ScmObj calculate_laset(ScmObj head, ScmObj rest)
1820 {
1821     if (!SCM_PAIRP(head)) {
1822         if (SCM_CHARP(head)) {
1823             return Scm_CharSetAddRange(SCM_CHAR_SET(Scm_MakeEmptyCharSet()),
1824                                        SCM_CHAR_VALUE(head),
1825                                        SCM_CHAR_VALUE(head));
1826         } else if (SCM_CHAR_SET_P(head)) {
1827             return Scm_CharSetCopy(SCM_CHAR_SET(head));
1828         }
1829         return SCM_FALSE;
1830     }
1831     ScmObj head_car = SCM_CAR(head);
1832 
1833     if (SCM_EQ(head_car, SCM_SYM_COMP)) {
1834         SCM_ASSERT(SCM_CHAR_SET_P(SCM_CDR(head)));
1835         ScmObj cs = Scm_CharSetCopy(SCM_CHAR_SET(SCM_CDR(head)));
1836         return Scm_CharSetComplement(SCM_CHAR_SET(cs));
1837     } else if (SCM_EQ(head_car, SCM_SYM_SEQ)||SCM_EQ(head_car, SCM_SYM_ONCE)) {
1838         return calculate_lasetn(SCM_CDR(head));
1839     } else if (SCM_EQ(head_car, SCM_SYM_ALT)) {
1840         ScmObj choices = SCM_CDR(head);
1841         if (!SCM_PAIRP(choices)) return SCM_FALSE;
1842         ScmObj r = calculate_laset(SCM_CAR(choices), SCM_NIL);
1843         choices = SCM_CDR(choices);
1844         while (!SCM_FALSEP(r) && SCM_PAIRP(choices)) {
1845             r = merge_laset(r, calculate_laset(SCM_CAR(choices), SCM_NIL));
1846             choices = SCM_CDR(choices);
1847         }
1848         return r;
1849     } else if (SCM_EQ(head_car, SCM_SYM_REP)
1850                || SCM_EQ(head_car, SCM_SYM_REP_WHILE)
1851                || SCM_EQ(head_car, SCM_SYM_REP_MIN)) {
1852         SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)) && SCM_PAIRP(SCM_CDDR(head)));
1853         if (SCM_EQ(SCM_CADR(head), SCM_MAKE_INT(0))) {
1854             return merge_laset(calculate_lasetn(SCM_CDR(SCM_CDDR(head))),
1855                                calculate_lasetn(rest));
1856         } else {
1857             return calculate_lasetn(SCM_CDR(SCM_CDDR(head)));
1858         }
1859     } else if (SCM_INTP(head_car)) {
1860         SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)));
1861         return calculate_lasetn(SCM_CDDR(head));
1862     } else {
1863         return SCM_FALSE;
1864     }
1865 }
1866 
calculate_lasetn(ScmObj ast)1867 static ScmObj calculate_lasetn(ScmObj ast)
1868 {
1869     if (!SCM_PAIRP(ast)) return SCM_FALSE;
1870     else return calculate_laset(SCM_CAR(ast), SCM_CDR(ast));
1871 }
1872 
1873 /* pass 3 */
rc3(regcomp_ctx * ctx,ScmObj ast)1874 static ScmObj rc3(regcomp_ctx *ctx, ScmObj ast)
1875 {
1876     /* set flags and laset */
1877     if (is_atom_anchored(ast, SCM_SYM_BOS)
1878         || (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)
1879             && is_atom_anchored(ast, SCM_SYM_BOL))) {
1880         ctx->rx->flags |= SCM_REGEXP_BOL_ANCHORED;
1881     }
1882     else if (is_simple_prefixed(ast)) ctx->rx->flags |= SCM_REGEXP_SIMPLE_PREFIX;
1883     ctx->rx->laset = calculate_laset(ast, SCM_NIL);
1884 
1885     /* pass 3-1 : count # of insns */
1886     ctx->codemax = 1;
1887     ctx->emitp = FALSE;
1888     rc3_rec(ctx, ast, TRUE);
1889 
1890     /* pass 3-2 : code generation */
1891     ctx->code = SCM_NEW_ATOMIC2(unsigned char *, ctx->codemax);
1892     ctx->emitp = TRUE;
1893     rc3_rec(ctx, ast, TRUE);
1894     rc3_emit(ctx, RE_SUCCESS);
1895     ctx->rx->code = ctx->code;
1896     ctx->rx->numCodes = ctx->codep;
1897 
1898     ctx->rx->ast = ast;
1899     return SCM_OBJ(ctx->rx);
1900 }
1901 
1902 /* For debug */
Scm_RegDump(ScmRegexp * rx)1903 void Scm_RegDump(ScmRegexp *rx)
1904 {
1905     static const char *opnames[] = {
1906 #define DEF_RE_INSN(name, _) #name,
1907 #include "gauche/regexp_insn.h"
1908 #undef DEF_RE_INSN
1909     };
1910 
1911     static enum {
1912         OP_none,
1913         OP_octet,
1914         OP_string,
1915         OP_cset,
1916         OP_group,
1917         OP_offset2,
1918         OP_offset1_2,
1919         OP_offset2_2
1920     } optypes[] = {
1921 #define DEF_RE_INSN(_, optype) optype,
1922 #include "gauche/regexp_insn.h"
1923 #undef DEF_RE_INSN
1924     };
1925 
1926     Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x", rx, rx->flags);
1927     if (rx->flags&SCM_REGEXP_BOL_ANCHORED)
1928         Scm_Printf(SCM_CUROUT, ",BOL_ANCHORED");
1929     if (rx->flags&SCM_REGEXP_SIMPLE_PREFIX)
1930         Scm_Printf(SCM_CUROUT, ",SIMPLE_PREFIX");
1931     Scm_Printf(SCM_CUROUT, ")\n");
1932     Scm_Printf(SCM_CUROUT, " laset = %S\n", rx->laset);
1933     Scm_Printf(SCM_CUROUT, "  must = ");
1934     if (rx->mustMatch) {
1935         Scm_Printf(SCM_CUROUT, "%S\n", rx->mustMatch);
1936     } else {
1937         Scm_Printf(SCM_CUROUT, "(none)\n");
1938     }
1939 
1940     int end = rx->numCodes;
1941     for (int codep = 0; codep < end; codep++) {
1942         int code = rx->code[codep];
1943         int optype = optypes[code];
1944         Scm_Printf(SCM_CUROUT, "%4d  ", codep);
1945         switch (optype) {
1946         case OP_none:
1947             Scm_Printf(SCM_CUROUT, "%s\n", opnames[code]);
1948             break;
1949         case OP_octet:
1950             Scm_Printf(SCM_CUROUT, "%s  0x%02x  '%c'\n",
1951                        opnames[code],
1952                        rx->code[codep+1],
1953                        rx->code[codep+1]);
1954             codep++;
1955             break;
1956         case OP_string:
1957             {
1958                 u_int numchars = (u_int)rx->code[++codep];
1959                 u_int i;
1960                 Scm_Printf(SCM_CUROUT, "%s(%3d) '",
1961                            opnames[code], numchars);
1962                 for (i=0; i< numchars; i++)
1963                     Scm_Printf(SCM_CUROUT, "%c", rx->code[++codep]);
1964                 Scm_Printf(SCM_CUROUT, "'\n");
1965             }
1966             break;
1967         case OP_cset:
1968             Scm_Printf(SCM_CUROUT, "%s  %d    %S\n",
1969                        opnames[code],
1970                        rx->code[codep+1],
1971                        rx->sets[rx->code[codep+1]]);
1972             codep++;
1973             break;
1974         case OP_group:
1975             Scm_Printf(SCM_CUROUT, "%s  %d\n",
1976                        opnames[code],
1977                        rx->code[codep+1]);
1978             codep++;
1979             break;
1980         case OP_offset2:
1981             Scm_Printf(SCM_CUROUT, "%s  %d\n",
1982                        opnames[code],
1983                        (rx->code[codep+1])*256 + rx->code[codep+2]);
1984             codep += 2;
1985             break;
1986         case OP_offset1_2:
1987             Scm_Printf(SCM_CUROUT, "%s %d %d\n",
1988                        opnames[code],
1989                        rx->code[codep+1],
1990                        rx->code[codep+2]*256 + rx->code[codep+3]);
1991             codep += 3;
1992             break;
1993         case OP_offset2_2:
1994             Scm_Printf(SCM_CUROUT, "%s %d %d\n",
1995                        opnames[code],
1996                        rx->code[codep+1]*256 + rx->code[codep+2],
1997                        rx->code[codep+3]*256 + rx->code[codep+4]);
1998             codep += 4;
1999             break;
2000         }
2001     }
2002 }
2003 
2004 /* Helper routine to be used for compilation from AST.
2005    Traverses AST to reorder groups and collect charsets.
2006    Note that the native regcomp path doesn't use these fns.
2007    Only the AST provided from outside is processed. */
2008 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq);
2009 
rc_setup_context(regcomp_ctx * ctx,ScmObj ast)2010 static ScmObj rc_setup_context(regcomp_ctx *ctx, ScmObj ast)
2011 {
2012     if (!SCM_PAIRP(ast)) {
2013         if (SCM_CHARP(ast)) return ast;
2014         if (SCM_CHAR_SET_P(ast)) {
2015             rc_register_charset(ctx, SCM_CHAR_SET(ast));
2016             return ast;
2017         }
2018         if (SCM_EQ(ast, SCM_SYM_BOS) || SCM_EQ(ast, SCM_SYM_EOS)
2019             || SCM_EQ(ast, SCM_SYM_BOL) || SCM_EQ(ast, SCM_SYM_EOL)
2020             || SCM_EQ(ast, SCM_SYM_WB) || SCM_EQ(ast, SCM_SYM_NWB)
2021             || SCM_EQ(ast, SCM_SYM_BOW) || SCM_EQ(ast, SCM_SYM_EOW)
2022             || SCM_EQ(ast, SCM_SYM_BOG) || SCM_EQ(ast, SCM_SYM_EOG)
2023             || SCM_EQ(ast, SCM_SYM_ANY)) {
2024             return ast;
2025         }
2026         goto badast;
2027     }
2028     ScmObj type = SCM_CAR(ast);
2029     if (SCM_INTP(type)) {
2030         int grpno = ctx->grpcount++;
2031         ScmObj prevno = type, name = SCM_CADR(ast), body = SCM_CDDR(ast);
2032         ScmObj rest = rc_setup_context_seq(ctx, body);
2033         if (SCM_SYMBOLP(name)) {
2034             ctx->rx->grpNames = Scm_Acons(name, SCM_MAKE_INT(grpno),
2035                                           ctx->rx->grpNames);
2036         }
2037         if (SCM_INT_VALUE(prevno) == grpno && SCM_EQ(body, rest)) {
2038             return ast;
2039         } else {
2040             return Scm_Cons(SCM_MAKE_INT(grpno), Scm_Cons(name, rest));
2041         }
2042     }
2043     if (SCM_EQ(type, SCM_SYM_COMP)) {
2044         if (!SCM_CHAR_SET_P(SCM_CDR(ast))) goto badast;
2045         rc_register_charset(ctx, SCM_CHAR_SET(SCM_CDR(ast)));
2046         return ast;
2047     }
2048     if (SCM_EQ(type, SCM_SYM_BACKREF)) {
2049        if (!SCM_INTP(SCM_CDR(ast))) goto badast;
2050        return ast;
2051     }
2052     if (SCM_EQ(type, SCM_SYM_CPAT)) {
2053        if (!SCM_PAIRP(SCM_CDR(ast))
2054            || !SCM_PAIRP(SCM_CDDR(ast))
2055            || !SCM_PAIRP(SCM_CDR(SCM_CDDR(ast)))
2056            || !SCM_NULLP(SCM_CDDR(SCM_CDDR(ast))))
2057            goto badast;
2058        ScmObj cond = SCM_CADR(ast);
2059        ScmObj then = SCM_CAR(SCM_CDDR(ast));
2060        ScmObj alt = SCM_CADR(SCM_CDDR(ast));
2061        if (SCM_PAIRP(cond)) {
2062            if (!SCM_EQ(SCM_CAR(cond), SCM_SYM_ASSERT)
2063                && !SCM_EQ(SCM_CAR(cond), SCM_SYM_NASSERT)) goto badast;
2064            cond = rc_setup_context(ctx, cond);
2065        } else if (!SCM_INTP(cond)) {
2066            goto badast;
2067        }
2068        then = rc_setup_context_seq(ctx, then);
2069        if (!SCM_FALSEP(alt))
2070            alt = rc_setup_context_seq(ctx, alt);
2071        if (SCM_EQ(cond, SCM_CADR(ast))
2072            && SCM_EQ(then, SCM_CAR(SCM_CDDR(ast)))
2073            && SCM_EQ(alt, SCM_CADR(SCM_CDDR(ast)))) return ast;
2074        else return SCM_LIST4(type, cond, then, alt);
2075     }
2076     if (SCM_EQ(type, SCM_SYM_SEQ) || SCM_EQ(type, SCM_SYM_ALT)
2077         || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)
2078         || SCM_EQ(type, SCM_SYM_ONCE) || SCM_EQ(type, SCM_SYM_LOOKBEHIND)
2079         || SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)) {
2080         ScmObj rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
2081         if (SCM_EQ(SCM_CDR(ast), rest)) return ast;
2082         else return Scm_Cons(type, rest);
2083     }
2084     if (SCM_EQ(type, SCM_SYM_REP_WHILE) || SCM_EQ(type, SCM_SYM_REP)
2085         || SCM_EQ(type, SCM_SYM_REP_MIN)) {
2086         if (!SCM_PAIRP(SCM_CDR(ast)) || !SCM_PAIRP(SCM_CDDR(ast)))
2087             goto badast;
2088         ScmObj m = SCM_CADR(ast);
2089         ScmObj n = SCM_CAR(SCM_CDDR(ast));
2090         ScmObj item = SCM_CDR(SCM_CDDR(ast));
2091         if (!SCM_INTP(m) || SCM_INT_VALUE(m) < 0) goto badast;
2092         if (!SCM_FALSEP(n) && (!SCM_INTP(n) || SCM_INT_VALUE(m) < 0))
2093             goto badast;
2094         ScmObj rest = rc_setup_context_seq(ctx, item);
2095         if (SCM_EQ(item, rest)) return ast;
2096         else return SCM_LIST4(type, m, n, rest);
2097     }
2098   badast:
2099     Scm_Error("invalid regexp AST: %S", ast);
2100     return SCM_UNDEFINED;       /* dummy */
2101 }
2102 
rc_setup_context_seq(regcomp_ctx * ctx,ScmObj seq)2103 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq)
2104 {
2105     ScmObj sp, sp2, obj = SCM_NIL, head = SCM_NIL, tail = SCM_NIL;
2106     SCM_FOR_EACH(sp, seq) {
2107         obj = rc_setup_context(ctx, SCM_CAR(sp));
2108         if (!SCM_EQ(obj, SCM_CAR(sp))) break;
2109     }
2110     if (SCM_NULLP(sp)) return seq;
2111     /* we need to copy the spine */
2112     SCM_FOR_EACH(sp2, seq) {
2113         if (SCM_EQ(sp2, sp)) break;
2114         SCM_APPEND1(head, tail, SCM_CAR(sp2));
2115     }
2116     SCM_APPEND1(head, tail, obj);
2117     SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
2118         SCM_APPEND1(head, tail, rc_setup_context(ctx, SCM_CAR(sp2)));
2119     }
2120     return head;
2121 }
2122 
2123 /*--------------------------------------------------------------
2124  * Compiler entry point
2125  */
Scm_RegComp(ScmString * pattern,int flags)2126 ScmObj Scm_RegComp(ScmString *pattern, int flags)
2127 {
2128     if (SCM_STRING_INCOMPLETE_P(pattern)) {
2129         Scm_Error("incomplete string is not allowed: %S", pattern);
2130     }
2131 
2132     ScmRegexp *rx = make_regexp();
2133     regcomp_ctx cctx;
2134     rc_ctx_init(&cctx, rx, pattern);
2135     cctx.casefoldp = flags & SCM_REGEXP_CASE_FOLD;
2136     rx->flags |= (flags & SCM_REGEXP_CASE_FOLD);
2137     rx->flags |= (flags & SCM_REGEXP_MULTI_LINE);
2138 
2139     /* pass 1 : parse regexp spec */
2140     ScmObj ast = rc1(&cctx);
2141     rc_setup_charsets(rx, &cctx);
2142     if (flags & SCM_REGEXP_PARSE_ONLY) return ast;
2143 
2144     /* pass 2 : optimization */
2145     ast = rc2_optimize(ast, SCM_NIL);
2146 
2147     /* pass 3 : generate bytecode */
2148     return rc3(&cctx, ast);
2149 }
2150 
2151 /* alternative entry that compiles from AST */
2152 #if GAUCHE_API_VERSION >= 1000
Scm_RegCompFromAST(ScmObj ast,int flags)2153 ScmObj Scm_RegCompFromAST(ScmObj ast, int flags)
2154 #else /* GAUCHE_API_VERSION < 1000 */
2155 ScmObj Scm_RegCompFromAST(ScmObj ast)
2156 {
2157     return Scm_RegCompFromAST2(ast, 0);
2158 }
2159 
2160 ScmObj Scm_RegCompFromAST2(ScmObj ast, int flags)
2161 #endif /* GAUCHE_API_VERSION < 1000 */
2162 {
2163     ScmRegexp *rx = make_regexp();
2164     regcomp_ctx cctx;
2165     rc_ctx_init(&cctx, rx, NULL);
2166     rx->flags |= (flags & SCM_REGEXP_MULTI_LINE);
2167 
2168     /* prepare some context */
2169     if (!SCM_PAIRP(ast) || !SCM_INTP(SCM_CAR(ast))) {
2170         /* ensure the entire AST is in a group #0 */
2171         ast = SCM_LIST3(SCM_MAKE_INT(0), SCM_FALSE, ast);
2172     }
2173     ast = rc_setup_context(&cctx, ast);
2174     rc_setup_charsets(rx, &cctx);
2175     rx->numGroups = cctx.grpcount;
2176 
2177     /* pass 3 */
2178     return rc3(&cctx, ast);
2179 }
2180 
2181 /*=======================================================================
2182  * Matcher
2183  */
2184 
2185 /* For now, I use C-stack directly to keep information for backtrack,
2186  * i.e. anytime I should try something I recursively call rex_rec().
2187  * It may run out the stack space if regexp requires deep recursion.
2188  *
2189  * Rex_rec doesn't return as long as match succeeds.  At the end of
2190  * code, it longjmp's to the start of matcher.
2191  *
2192  * My preliminary test showed that using C-stack & longjmp is faster than
2193  * allocating and maintaining the stack by myself.   Further test is required
2194  * for practical case, though.
2195  */
2196 
2197 struct match_ctx {
2198     ScmRegexp *rx;
2199     const unsigned char *codehead; /* start of code */
2200     const char *input;          /* start of input */
2201     const char *stop;           /* end of input */
2202     const char *last;
2203     struct ScmRegMatchSub **matches;
2204     void *begin_stack;          /* C stack pointer the match began from. */
2205     sigjmp_buf *cont;
2206     ScmObj grapheme_predicate;
2207 };
2208 
2209 #define MAX_STACK_USAGE   0x100000
2210 
match_ci(const char ** input,const unsigned char ** code,int length)2211 static int match_ci(const char **input, const unsigned char **code, int length)
2212 {
2213     do {
2214         ScmChar inch, c;
2215         SCM_CHAR_GET(*input, inch);
2216         int csize = SCM_CHAR_NBYTES(inch);
2217         *input += csize;
2218         SCM_CHAR_GET(*code, c);
2219         *code += SCM_CHAR_NBYTES(c);
2220         if (Scm_CharDowncase(inch) != c)
2221             return FALSE;
2222         length -= csize;
2223     } while (length > 0);
2224     return TRUE;
2225 }
2226 
2227 /* Check if input points to the word boundary.  For now, I consider
2228    all multibyte chars word-constituent. */
is_word_constituent(unsigned char b)2229 static int is_word_constituent(unsigned char b)
2230 {
2231     if (b >= 128) return TRUE;
2232     if (b >= '0' && b <= '9') return TRUE;
2233     if (b >= 'A' && b <= 'Z') return TRUE;
2234     if (b >= 'a' && b <= 'z') return TRUE;
2235     return FALSE;
2236 }
2237 
is_word_boundary(struct match_ctx * ctx,const char * input,unsigned int code)2238 static int is_word_boundary(struct match_ctx *ctx, const char *input, unsigned int code)
2239 {
2240     const char *prevp;
2241 
2242     if ((code == RE_BOW || code == RE_WB) && input == ctx->input) return TRUE;
2243     if ((code == RE_EOW || code == RE_WB) && input == ctx->stop) return TRUE;
2244     unsigned char nextb = (unsigned char)*input;
2245     SCM_CHAR_BACKWARD(input, ctx->input, prevp);
2246     SCM_ASSERT(prevp != NULL);
2247     unsigned char prevb = (unsigned char)*prevp;
2248     if ((code == RE_BOW || code == RE_WB)
2249         && is_word_constituent(nextb) && !is_word_constituent(prevb)) {
2250         return TRUE;
2251     }
2252     if ((code == RE_EOW || code == RE_WB)
2253         && !is_word_constituent(nextb) && is_word_constituent(prevb)) {
2254         return TRUE;
2255     }
2256     return FALSE;
2257 }
2258 
is_beginning_of_line(struct match_ctx * ctx,const char * input)2259 static int is_beginning_of_line(struct match_ctx *ctx, const char *input)
2260 {
2261     if (input == ctx->input) return TRUE;
2262     if (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) return FALSE;
2263 
2264     const char *prevp;
2265     SCM_CHAR_BACKWARD(input, ctx->input, prevp);
2266     SCM_ASSERT(prevp != NULL);
2267 
2268     unsigned char prevb = (unsigned char)*prevp;
2269     if (prevb == '\n' || prevb == '\r') return TRUE;
2270 
2271     return FALSE;
2272 }
2273 
is_end_of_line(struct match_ctx * ctx,const char * input)2274 static int is_end_of_line(struct match_ctx *ctx, const char *input)
2275 {
2276     if (input == ctx->stop) return TRUE;
2277     if (!(ctx->rx->flags & SCM_REGEXP_MULTI_LINE)) return FALSE;
2278 
2279     unsigned char nextb = (unsigned char)*input;
2280     if (nextb == '\n' || nextb == '\r') return TRUE;
2281 
2282     return FALSE;
2283 }
2284 
is_grapheme_boundary(struct match_ctx * ctx,const char * input,unsigned int code)2285 static int is_grapheme_boundary(struct match_ctx *ctx,
2286                                 const char *input,
2287                                 unsigned int code)
2288 {
2289     if (input == ctx->input) return code == RE_BOG;
2290     if (input == ctx->stop) return code == RE_EOG;
2291 
2292     if (ctx->grapheme_predicate == SCM_UNDEFINED) {
2293         ScmObj make_predicate = SCM_UNDEFINED;
2294         /*
2295          * So far only gauche.regexp.sre can produce AST grapheme
2296          * nodes, so we're sure it's already loaded.
2297          */
2298         SCM_BIND_PROC(make_predicate, "make-grapheme-predicate",
2299                       SCM_FIND_MODULE("gauche.regexp.sre", 0));
2300 
2301         ScmObj str = Scm_MakeString(ctx->input,
2302                                     ctx->stop - ctx->input, -1,
2303                                     SCM_STRING_IMMUTABLE);
2304         ctx->grapheme_predicate = Scm_ApplyRec1(make_predicate, str);
2305     }
2306 
2307     /*
2308      * Using small cursors essentially puts a limit on input string's
2309      * length. But regex should not be run on very long strings to
2310      * begin with, so this should be fine.
2311      */
2312     ScmObj cursor = SCM_MAKE_STRING_CURSOR_SMALL(input - ctx->input);
2313     ScmObj result = Scm_ApplyRec1(ctx->grapheme_predicate, cursor);
2314     return !SCM_FALSEP(result);
2315 }
2316 
rex_rec(const unsigned char * code,const char * input,struct match_ctx * ctx)2317 static void rex_rec(const unsigned char *code,
2318                     const char *input,
2319                     struct match_ctx *ctx)
2320 {
2321     int param;
2322     ScmChar ch;
2323     ScmCharSet *cset;
2324     const char *bpos;
2325 
2326     /* TODO: here we assume C-stack grows downward; need to check by
2327        configure */
2328     if ((char*)&cset < (char*)ctx->begin_stack - MAX_STACK_USAGE) {
2329         Scm_Error("Ran out of stack during matching regexp %S. "
2330                   "Too many retries?", ctx->rx);
2331     }
2332 
2333     for (;;) {
2334         switch(*code++) {
2335         case RE_MATCH:
2336             param = *code++;
2337             if (ctx->stop - input < param) return;
2338             while (param-- > 0) {
2339                 if (*code++ != (unsigned char)*input++) return;
2340             }
2341             continue;
2342         case RE_MATCH_RL:
2343             param = *code++;
2344             if (input - param < ctx->input) return;
2345             bpos = input = input - param;
2346             while (param-- > 0) {
2347                 if (*code++ != (unsigned char)*bpos++) return;
2348             }
2349             continue;
2350         case RE_MATCH1:
2351             if (ctx->stop == input) return;
2352             if (*code++ != (unsigned char)*input++) return;
2353             continue;
2354         case RE_MATCH1_RL:
2355             if (ctx->input == input) return;
2356             if (*code++ != (unsigned char)*--input) return;
2357             continue;
2358         case RE_MATCH_CI:
2359             param = *code++;
2360             if (ctx->stop - input < param) return;
2361             if (!match_ci(&input, &code, param)) return;
2362             continue;
2363         case RE_MATCH_CI_RL:
2364             param = *code++;
2365             if (input - param < ctx->input) return;
2366             bpos = input = input - param;
2367             if (!match_ci(&bpos, &code, param)) return;
2368             continue;
2369         case RE_MATCH1_CI:
2370             if (ctx->stop == input) return;
2371             param  = (unsigned char)*input++;
2372             if (SCM_CHAR_NFOLLOWS(param)!=0
2373                 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
2374                 return;
2375             }
2376             continue;
2377         case RE_MATCH1_CI_RL:
2378             if (ctx->input == input) return;
2379             param = (unsigned char)*--input;
2380             if (SCM_CHAR_NFOLLOWS(param)!=0
2381                 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
2382                 return;
2383             }
2384             continue;
2385         case RE_ANY:
2386             if (ctx->stop == input) return;
2387             input += SCM_CHAR_NFOLLOWS(*input) + 1;
2388             continue;
2389         case RE_ANY_RL:
2390             if (ctx->input == input) return;
2391             SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2392             input = bpos;
2393             continue;
2394         case RE_TRY:
2395             rex_rec(code+2, input, ctx);
2396             code = ctx->codehead + code[0]*256 + code[1];
2397             continue;
2398         case RE_JUMP:
2399             code = ctx->codehead + code[0]*256 + code[1];
2400             continue;
2401         case RE_SET1:
2402             if (ctx->stop == input) return;
2403             if ((unsigned char)*input >= 128) return;
2404             if (!Scm_CharSetContains(ctx->rx->sets[*code++], *input)) return;
2405             input++;
2406             continue;
2407         case RE_SET1_RL:
2408             if (ctx->input == input) return;
2409             SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2410             if ((unsigned char)*bpos >= 128) return;
2411             if (!Scm_CharSetContains(ctx->rx->sets[*code++], *bpos)) return;
2412             input = bpos;
2413             continue;
2414         case RE_NSET1:
2415             if (ctx->stop == input) return;
2416             if ((unsigned char)*input < 128) {
2417                 if (Scm_CharSetContains(ctx->rx->sets[*code++], *input))
2418                     return;
2419                 input++;
2420             } else {
2421                 code++;
2422                 input += SCM_CHAR_NFOLLOWS((unsigned char)*input) + 1;
2423             }
2424             continue;
2425         case RE_NSET1_RL:
2426             if (ctx->input == input) return;
2427             SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2428             if ((unsigned char)*bpos < 128) {
2429                 if (Scm_CharSetContains(ctx->rx->sets[*code++], *bpos))
2430                     return;
2431             }
2432             input = bpos;
2433             continue;
2434         case RE_SET:
2435             if (ctx->stop == input) return;
2436             SCM_CHAR_GET(input, ch);
2437             cset = ctx->rx->sets[*code++];
2438             if (!Scm_CharSetContains(cset, ch)) return;
2439             input += SCM_CHAR_NBYTES(ch);
2440             continue;
2441         case RE_SET_RL:
2442             if (ctx->input == input) return;
2443             SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2444             SCM_CHAR_GET(bpos, ch);
2445             cset = ctx->rx->sets[*code++];
2446             if (!Scm_CharSetContains(cset, ch)) return;
2447             input = bpos;
2448             continue;
2449         case RE_NSET:
2450             if (ctx->stop == input) return;
2451             SCM_CHAR_GET(input, ch);
2452             cset = ctx->rx->sets[*code++];
2453             if (Scm_CharSetContains(cset, ch)) return;
2454             input += SCM_CHAR_NBYTES(ch);
2455             continue;
2456         case RE_NSET_RL:
2457             if (ctx->input == input) return;
2458             SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2459             SCM_CHAR_GET(bpos, ch);
2460             cset = ctx->rx->sets[*code++];
2461             if (Scm_CharSetContains(cset, ch)) return;
2462             input = bpos;
2463             continue;
2464         case RE_BEGIN: {
2465             int grpno = *code++;
2466             const char *opos = ctx->matches[grpno]->startp;
2467             const char *oend = ctx->matches[grpno]->endp;
2468             ctx->matches[grpno]->startp = input;
2469             rex_rec(code, input, ctx);
2470             ctx->matches[grpno]->startp = opos;
2471             ctx->matches[grpno]->endp = oend;
2472             return;
2473         }
2474         case RE_BEGIN_RL: {
2475             int grpno = *code++;
2476             const char *opos = ctx->matches[grpno]->endp;
2477             ctx->matches[grpno]->endp = input;
2478             rex_rec(code, input, ctx);
2479             ctx->matches[grpno]->endp = opos;
2480             return;
2481         }
2482         case RE_END: {
2483             int grpno = *code++;
2484             ctx->matches[grpno]->endp = input;
2485             continue;
2486         }
2487         case RE_END_RL: {
2488             int grpno = *code++;
2489             ctx->matches[grpno]->startp = input;
2490             continue;
2491         }
2492         case RE_BOS:
2493             if (input != ctx->input) return;
2494             continue;
2495         case RE_EOS:
2496             if (input != ctx->stop) return;
2497             continue;
2498         case RE_BOL:
2499             if (!is_beginning_of_line(ctx, input)) return;
2500             continue;
2501         case RE_EOL:
2502             if (!is_end_of_line(ctx, input)) return;
2503             continue;
2504         case RE_WB: case RE_BOW: case RE_EOW:
2505             if (!is_word_boundary(ctx, input, code[-1])) return;
2506             continue;
2507         case RE_NWB:
2508             if (is_word_boundary(ctx, input, RE_WB)) return;
2509             continue;
2510 	case RE_BOG: case RE_EOG:
2511             if (!is_grapheme_boundary(ctx, input, code[-1])) return;
2512             continue;
2513         case RE_SUCCESS:
2514             ctx->last = input;
2515             siglongjmp(*ctx->cont, 1);
2516             /*NOTREACHED*/
2517         case RE_FAIL:
2518             return;
2519         case RE_SET1R:
2520             cset = ctx->rx->sets[*code++];
2521             for (;;) {
2522                 if (ctx->stop <= input) break;
2523                 if ((unsigned char)*input >= 128) break;
2524                 if (!Scm_CharSetContains(cset, *input)) break;
2525                 input++;
2526             }
2527             continue;
2528         case RE_SET1R_RL:
2529             cset = ctx->rx->sets[*code++];
2530             for (;;) {
2531                 if (input == ctx->input) break;
2532                 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2533                 if ((unsigned char)*bpos >= 128) break;
2534                 if (!Scm_CharSetContains(cset, *bpos)) break;
2535                 input = bpos;
2536             }
2537             continue;
2538         case RE_NSET1R:
2539             cset = ctx->rx->sets[*code++];
2540             for (;;) {
2541                 if (ctx->stop <= input) break;
2542                 if ((unsigned char)*input < 128 ) {
2543                     if (Scm_CharSetContains(cset, *input)) break;
2544                     input++;
2545                 } else {
2546                     input+=SCM_CHAR_NFOLLOWS(*input)+1;
2547                 }
2548             }
2549             continue;
2550         case RE_NSET1R_RL:
2551             cset = ctx->rx->sets[*code++];
2552             for (;;) {
2553                 if (ctx->input == input) break;
2554                 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2555                 if ((unsigned char)*bpos < 128) {
2556                     if (Scm_CharSetContains(cset, *bpos)) break;
2557                 }
2558                 input = bpos;
2559             }
2560             continue;
2561         case RE_SETR:
2562             cset = ctx->rx->sets[*code++];
2563             for (;;) {
2564                 if (ctx->stop <= input) break;
2565                 SCM_CHAR_GET(input, ch);
2566                 if (!Scm_CharSetContains(cset, ch)) break;
2567                 input += SCM_CHAR_NBYTES(ch);
2568             }
2569             continue;
2570         case RE_SETR_RL:
2571             cset = ctx->rx->sets[*code++];
2572             for (;;) {
2573                 if (ctx->input == input) break;
2574                 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2575                 SCM_CHAR_GET(bpos, ch);
2576                 if (!Scm_CharSetContains(cset, ch)) break;
2577                 input = bpos;
2578             }
2579             continue;
2580         case RE_NSETR:
2581             cset = ctx->rx->sets[*code++];
2582             for (;;) {
2583                 if (ctx->stop <= input) break;
2584                 SCM_CHAR_GET(input, ch);
2585                 if (Scm_CharSetContains(cset, ch)) break;
2586                 input += SCM_CHAR_NBYTES(ch);
2587             }
2588             continue;
2589         case RE_NSETR_RL:
2590             cset = ctx->rx->sets[*code++];
2591             for (;;) {
2592                 if (ctx->input == input) break;
2593                 SCM_CHAR_BACKWARD(input, ctx->input, bpos);
2594                 SCM_CHAR_GET(bpos, ch);
2595                 if (Scm_CharSetContains(cset, ch)) break;
2596                 input = bpos;
2597             }
2598             continue;
2599         case RE_MATCH1R:
2600             for (;;) {
2601                 if (ctx->stop <= input) break;
2602                 if ((unsigned char)*input >= 128) break;
2603                 if (*code != (unsigned char)*input) break;
2604                 input++;
2605             }
2606             code++;
2607             continue;
2608         case RE_MATCHR:
2609             param = *code++;
2610             for (;;) {
2611                 if (ctx->stop < input + param) break;
2612                 const unsigned char *str = code;
2613                 const unsigned char *ip = (const unsigned char*)input;
2614                 for (int i = 0; i < param; i++) {
2615                     if (*str++ != (unsigned char)*ip++) goto matchr_out;
2616                 }
2617                 input = (const char *)ip;
2618             }
2619         matchr_out:
2620             code += param;
2621             continue;
2622         case RE_ANYR:
2623             for (;;) {
2624                 if (ctx->stop <= input) break;
2625                 input += SCM_CHAR_NFOLLOWS(*input) + 1;
2626             }
2627             continue;
2628         case RE_CPAT: {
2629             int grpno = *code++;
2630             if (ctx->matches[grpno]->startp) code += 2;
2631             else code = ctx->codehead + code[0]*256 + code[1];
2632             continue;
2633         }
2634         case RE_CPATA: {
2635             sigjmp_buf cont, *ocont = ctx->cont;
2636             ctx->cont = &cont;
2637             if (sigsetjmp(cont, FALSE) == 0) {
2638                 rex_rec(code+4, input, ctx);
2639                 ctx->cont = ocont;
2640                 code = ctx->codehead + code[2]*256 + code[3];
2641                 continue;
2642             }
2643             code = ctx->codehead + code[0]*256 + code[1];
2644             ctx->cont = ocont;
2645             continue;
2646         }
2647         case RE_BACKREF: {
2648             int grpno = *code++;
2649             const char *match = ctx->matches[grpno]->startp;
2650             const char *end = ctx->matches[grpno]->endp;
2651             if (!match || !end) return;
2652             while (match < end) {
2653                 if (*input++ != *match++) return;
2654             }
2655             continue;
2656         }
2657         case RE_BACKREF_RL: {
2658             int grpno = *code++, len;
2659             const char *match = ctx->matches[grpno]->startp;
2660             const char *end = ctx->matches[grpno]->endp;
2661             if (!match || !end) return;
2662             len = (int)(end - match);
2663             if (input - len < ctx->input) return;
2664             bpos = input = input - len;
2665             while (len-- > 0) {
2666                 if (*match++ != (unsigned char)*bpos++) return;
2667             }
2668             continue;
2669         }
2670         case RE_BACKREF_CI: {
2671             int grpno = *code++;
2672             const char *match = ctx->matches[grpno]->startp;
2673             const char *end = ctx->matches[grpno]->endp;
2674             int i = 0;
2675             ScmChar cx, cy;
2676             if (!match || !end) return;
2677             while (match+i < end) {
2678                 if (input == ctx->stop) return;
2679                 SCM_CHAR_GET(input+i, cx);
2680                 SCM_CHAR_GET(match+i, cy);
2681                 if (SCM_CHAR_UPCASE(cx) != SCM_CHAR_UPCASE(cy))
2682                     return;
2683                 i += SCM_CHAR_NBYTES(cx);
2684             }
2685             input += i;
2686             continue;
2687         }
2688         case RE_BACKREF_CI_RL: {
2689             int grpno = *code++, i = 0, len;
2690             const char *match = ctx->matches[grpno]->startp;
2691             const char *end = ctx->matches[grpno]->endp;
2692             ScmChar cx, cy;
2693             if (!match || !end) return;
2694 
2695             len = (int)(end - match);
2696             if (input - len < ctx->input) return;
2697             bpos = input = input - len;
2698             while (match+i < end) {
2699                 if (bpos == ctx->stop) return;
2700                 SCM_CHAR_GET(bpos+i, cx);
2701                 SCM_CHAR_GET(match+i, cy);
2702                 if (SCM_CHAR_UPCASE(cx) != SCM_CHAR_UPCASE(cy))
2703                     return;
2704                 i += SCM_CHAR_NBYTES(cx);
2705             }
2706             continue;
2707         }
2708         case RE_ONCE: case RE_ASSERT: {
2709             sigjmp_buf cont, *ocont = ctx->cont;
2710             ctx->cont = &cont;
2711             if (sigsetjmp(cont, FALSE) == 0) {
2712                 rex_rec(code+2, input, ctx);
2713                 ctx->cont = ocont;
2714                 return;
2715             }
2716             if (code[-1] == RE_ONCE) input = ctx->last;
2717             code = ctx->codehead + code[0]*256 + code[1];
2718             ctx->cont = ocont;
2719             continue;
2720         }
2721         case RE_NASSERT: {
2722             sigjmp_buf cont, *ocont = ctx->cont;
2723             ctx->cont = &cont;
2724             if (sigsetjmp(cont, FALSE) == 0) {
2725                 rex_rec(code+2, input, ctx);
2726                 code = ctx->codehead + code[0]*256 + code[1];
2727                 ctx->cont = ocont;
2728                 continue;
2729             }
2730             ctx->cont = ocont;
2731             return;
2732         }
2733         default:
2734             /* shouldn't be here */
2735             Scm_Error("regexp implementation seems broken");
2736         }
2737     }
2738 }
2739 
make_match(ScmRegexp * rx,ScmString * orig,struct match_ctx * ctx)2740 static ScmObj make_match(ScmRegexp *rx, ScmString *orig,
2741                          struct match_ctx *ctx)
2742 {
2743     ScmRegMatch *rm = SCM_NEW(ScmRegMatch);
2744     SCM_SET_CLASS(rm, SCM_CLASS_REGMATCH);
2745     rm->numMatches = rx->numGroups;
2746     rm->grpNames = rx->grpNames;
2747     /* we keep information of original string separately, instead of
2748        keeping a pointer to orig; For orig may be destructively modified,
2749        but its elements are not. */
2750     const ScmStringBody *origb = SCM_STRING_BODY(orig);
2751     rm->input = SCM_STRING_BODY_START(origb);
2752     rm->inputLen = SCM_STRING_BODY_LENGTH(origb);
2753     rm->inputSize = SCM_STRING_BODY_SIZE(origb);
2754     rm->matches = ctx->matches;
2755     return SCM_OBJ(rm);
2756 }
2757 
rex(ScmRegexp * rx,ScmString * orig,const char * orig_start,const char * start,const char * end)2758 static ScmObj rex(ScmRegexp *rx, ScmString *orig,
2759                   const char *orig_start,
2760                   const char *start, const char *end)
2761 {
2762     struct match_ctx ctx;
2763     sigjmp_buf cont;
2764 
2765     ctx.rx = rx;
2766     ctx.codehead = rx->code;
2767     ctx.input = orig_start;
2768     ctx.stop = end;
2769     ctx.begin_stack = (void*)&ctx;
2770     ctx.cont = &cont;
2771     ctx.matches = SCM_NEW_ARRAY(struct ScmRegMatchSub *, rx->numGroups);
2772     ctx.grapheme_predicate = SCM_UNDEFINED;
2773 
2774     for (int i = 0; i < rx->numGroups; i++) {
2775         ctx.matches[i] = SCM_NEW(struct ScmRegMatchSub);
2776         ctx.matches[i]->start = -1;
2777         ctx.matches[i]->length = -1;
2778         ctx.matches[i]->after = -1;
2779         ctx.matches[i]->startp = NULL;
2780         ctx.matches[i]->endp = NULL;
2781     }
2782 
2783     if (sigsetjmp(cont, FALSE) == 0) {
2784         rex_rec(ctx.codehead, start, &ctx);
2785         return SCM_FALSE;
2786     }
2787     return make_match(rx, orig, &ctx);
2788 }
2789 
2790 /* advance start pointer while the character matches (skip_match=TRUE) or does
2791    not match (skip_match=FALSE), until start pointer hits limit. */
skip_input(const char * start,const char * limit,ScmObj laset,int skip_match)2792 static inline const char *skip_input(const char *start, const char *limit,
2793                                      ScmObj laset, int skip_match)
2794 {
2795     while (start <= limit) {
2796         ScmChar ch;
2797         SCM_CHAR_GET(start, ch);
2798         if (Scm_CharSetContains(SCM_CHAR_SET(laset), ch)) {
2799             if (!skip_match) return start;
2800         } else {
2801             if (skip_match) return start;
2802         }
2803         start += SCM_CHAR_NFOLLOWS(*start)+1;
2804     }
2805     return limit;
2806 }
2807 
2808 /*----------------------------------------------------------------------
2809  * entry point
2810  */
Scm_RegExec(ScmRegexp * rx,ScmString * str,ScmObj start_scm,ScmObj end_scm)2811 ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str, ScmObj start_scm, ScmObj end_scm)
2812 {
2813     const ScmStringBody *b = SCM_STRING_BODY(str);
2814     const char *orig_start = SCM_STRING_BODY_START(b);
2815     const char *start;
2816     const char *end;
2817     const ScmStringBody *mb = rx->mustMatch? SCM_STRING_BODY(rx->mustMatch) : NULL;
2818     int mustMatchLen = mb? SCM_STRING_BODY_SIZE(mb) : 0;
2819     const char *start_limit;
2820 
2821     if (SCM_STRING_INCOMPLETE_P(str)) {
2822         Scm_Error("incomplete string is not allowed: %S", str);
2823     }
2824     if (!SCM_UNBOUNDP(start_scm) && !SCM_UNDEFINEDP(start_scm)) {
2825         if (!SCM_INTEGERP(start_scm)) {
2826             Scm_TypeError("start", "exact integer required but got %S", start_scm);
2827         }
2828         int value = Scm_GetInteger(start_scm);
2829         if (value < 0 || value >= SCM_STRING_BODY_LENGTH(b)) {
2830             Scm_Error("invalid start parameter: %S", start_scm);
2831         }
2832         while (value--) {
2833             orig_start += SCM_CHAR_NFOLLOWS(*orig_start) + 1;
2834         }
2835     }
2836     start = orig_start;
2837     end = SCM_STRING_BODY_START(b);
2838     if (!SCM_UNBOUNDP(end_scm) && !SCM_UNDEFINEDP(end_scm)) {
2839         if (!SCM_INTEGERP(end_scm)) {
2840             Scm_TypeError("end", "exact integer required but got %S", end_scm);
2841         }
2842         int value = Scm_GetInteger(end_scm);
2843         if (value < 0 || value > SCM_STRING_BODY_LENGTH(b)) {
2844             Scm_Error("invalid end parameter: %S", end_scm);
2845         }
2846         while (value--) {
2847             end += SCM_CHAR_NFOLLOWS(*end) + 1;
2848         }
2849         if (end < start) {
2850             Scm_Error("invalid end parameter: %S", end_scm);
2851         }
2852     } else {
2853         end += SCM_STRING_BODY_SIZE(b);
2854     }
2855     start_limit = end - mustMatchLen;
2856 #if 0
2857     /* Disabled for now; we need to use more heuristics to determine
2858        when we should apply mustMatch.  For example, if the regexp
2859        begins with BOL assertion and constant string, then it would be
2860        faster to go for rex directly. */
2861     if (rx->mustMatch) {
2862         /* Prescreening.  If the input string doesn't contain mustMatch
2863            string, it can't match the entire expression. */
2864         if (SCM_FALSEP(Scm_StringScan(str, rx->mustMatch,
2865                                       SCM_STRING_SCAN_INDEX))) {
2866             return SCM_FALSE;
2867         }
2868     }
2869 #endif
2870     /* short cut : if rx matches only at the beginning of the string,
2871        we only run from the beginning of the string */
2872     if (rx->flags & SCM_REGEXP_BOL_ANCHORED) {
2873         return rex(rx, str, orig_start, start, end);
2874     }
2875 
2876     /* if we have lookahead-set, we may be able to skip input efficiently. */
2877     if (!SCM_FALSEP(rx->laset)) {
2878         if (rx->flags & SCM_REGEXP_SIMPLE_PREFIX) {
2879             while (start <= start_limit) {
2880                 ScmObj r = rex(rx, str, orig_start, start, end);
2881                 if (!SCM_FALSEP(r)) return r;
2882                 const char *next = skip_input(start, start_limit, rx->laset,
2883                                               TRUE);
2884                 if (start != next) start = next;
2885                 else start = next + SCM_CHAR_NFOLLOWS(*start) + 1;
2886             }
2887         } else {
2888             while (start <= start_limit) {
2889                 start = skip_input(start, start_limit, rx->laset, FALSE);
2890                 ScmObj r = rex(rx, str, orig_start, start, end);
2891                 if (!SCM_FALSEP(r)) return r;
2892                 start += SCM_CHAR_NFOLLOWS(*start)+1;
2893             }
2894         }
2895         return SCM_FALSE;
2896     }
2897 
2898     /* normal matching */
2899     while (start <= start_limit) {
2900         ScmObj r = rex(rx, str, orig_start, start, end);
2901         if (!SCM_FALSEP(r)) return r;
2902         start += SCM_CHAR_NFOLLOWS(*start)+1;
2903     }
2904     return SCM_FALSE;
2905 }
2906 
2907 /*=======================================================================
2908  * Retrieving matches
2909  */
2910 
2911 /* We calculate string length and position (in characters) lazily.
2912  * The match routine only sets ScmRegMatchSub's startp and endp
2913  * fields, and leaves start, length, and after fields -1.
2914  * When the submatch is retrieved we calculate values of those fields.
2915  *
2916  * Note that, even such retrieval functions mutate the state of
2917  * submatch objects, we don't need mutex to avoid race condition
2918  * in MT environment.  The state transition is one way (-1 to a
2919  * fixed value) and idempotent, so there's no problem if more than
2920  * one thread try to change the fields.
2921  *
2922  * The three parameters, start, length, and after, indicates the
2923  * # of characters.  Character counting is expensive, so we try
2924  * to avoid calling Scm_MBLen as much as possible.   If other two
2925  * values are known, we just subtract them from the inputLen.
2926  *
2927  * |<-------original string------------>|
2928  * |      |<---matched substr -->|      |
2929  * |      |                      |      |
2930  * |<---->|<-------------------->|<---->|
2931  * |start          length          after|
2932  * |<---------------------------------->|
2933  *               inputLen
2934  */
2935 
2936 /* We want to avoid unnecessary character counting as much as
2937    possible. */
2938 
2939 #define MSUB_BEFORE_SIZE(rm, sub) ((int)((sub)->startp - (rm)->input))
2940 #define MSUB_SIZE(rm, sub)        ((int)((sub)->endp - (sub)->startp))
2941 #define MSUB_AFTER_SIZE(rm, sub)  ((int)((rm)->input + (rm)->inputSize - (sub)->endp))
2942 
2943 #define MSUB_BEFORE_LENGTH(rm, sub) \
2944     Scm_MBLen((rm)->input, (sub)->startp)
2945 #define MSUB_LENGTH(rm, sub) \
2946     Scm_MBLen((sub)->startp, (sub)->endp)
2947 #define MSUB_AFTER_LENGTH(rm, sub) \
2948     Scm_MBLen((sub)->endp, (rm)->input + (rm)->inputSize)
2949 
2950 #define UNCOUNTED(rm, sub)                                      \
2951     (((sub)->start    >= 0 ? 0 : MSUB_BEFORE_SIZE(rm, sub))     \
2952      + ((sub)->length >= 0 ? 0 : MSUB_SIZE(rm, sub))            \
2953      + ((sub)->after  >= 0 ? 0 : MSUB_AFTER_SIZE(rm, sub)))
2954 
regmatch_count_start(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2955 static void regmatch_count_start(ScmRegMatch *rm,
2956                                  struct ScmRegMatchSub *sub)
2957 {
2958     if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2959         sub->start = MSUB_BEFORE_SIZE(rm, sub);
2960     } else if (UNCOUNTED(rm, sub) / 2 > MSUB_BEFORE_SIZE(rm, sub)) {
2961         sub->start = MSUB_BEFORE_LENGTH(rm, sub);
2962     } else {
2963         if (sub->length < 0) sub->length = MSUB_LENGTH(rm, sub);
2964         if (sub->after < 0)  sub->after  = MSUB_AFTER_LENGTH(rm, sub);
2965         sub->start = rm->inputLen - sub->after - sub->length;
2966     }
2967 }
2968 
regmatch_count_length(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2969 static void regmatch_count_length(ScmRegMatch *rm,
2970                                   struct ScmRegMatchSub *sub)
2971 {
2972     if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2973         sub->length = MSUB_SIZE(rm, sub);
2974     } else if (UNCOUNTED(rm, sub) / 2 > MSUB_SIZE(rm, sub)) {
2975         sub->length = MSUB_LENGTH(rm, sub);
2976     } else {
2977         if (sub->start < 0) sub->start = MSUB_BEFORE_LENGTH(rm, sub);
2978         if (sub->after < 0) sub->after = MSUB_AFTER_LENGTH(rm, sub);
2979         sub->length = rm->inputLen - sub->start - sub->after;
2980     }
2981 }
2982 
regmatch_count_after(ScmRegMatch * rm,struct ScmRegMatchSub * sub)2983 static void regmatch_count_after(ScmRegMatch *rm,
2984                                  struct ScmRegMatchSub *sub)
2985 {
2986     if (SCM_REG_MATCH_SINGLE_BYTE_P(rm)) {
2987         sub->after = MSUB_AFTER_SIZE(rm, sub);
2988     } else if (UNCOUNTED(rm, sub) / 2 > MSUB_AFTER_SIZE(rm, sub)) {
2989         sub->after = MSUB_AFTER_LENGTH(rm, sub);
2990     } else {
2991         if (sub->start < 0)  sub->start  = MSUB_BEFORE_LENGTH(rm, sub);
2992         if (sub->length < 0) sub->length = MSUB_LENGTH(rm, sub);
2993         sub->after = rm->inputLen - sub->start - sub->length;
2994     }
2995 }
2996 
regmatch_ref(ScmRegMatch * rm,ScmObj obj)2997 static struct ScmRegMatchSub *regmatch_ref(ScmRegMatch *rm, ScmObj obj)
2998 {
2999     struct ScmRegMatchSub *sub = NULL;
3000     if (SCM_INTP(obj)) {
3001         int i = SCM_INT_VALUE(obj);
3002         if (i < 0 || i >= rm->numMatches)
3003             Scm_Error("submatch index out of range: %d", i);
3004         sub = rm->matches[i];
3005         if (!sub->startp || !sub->endp) return NULL;
3006         return sub;
3007     }
3008     if (SCM_SYMBOLP(obj)) {
3009         ScmObj ep;
3010         SCM_FOR_EACH(ep, rm->grpNames) {
3011             if (!SCM_EQ(obj, SCM_CAAR(ep))) continue;
3012             sub = rm->matches[SCM_INT_VALUE(SCM_CDAR(ep))];
3013             if (!sub->startp || !sub->endp) continue;
3014             return sub;
3015         }
3016         if (sub != NULL) {
3017             if (sub->startp && sub->endp) return sub;
3018             else return NULL;
3019         }
3020         Scm_Error("named submatch not found: %S", obj);
3021     }
3022     Scm_Error("integer or symbol expected, but got %S", obj);
3023     return NULL;       /* dummy */
3024 }
3025 
Scm_RegMatchStart(ScmRegMatch * rm,ScmObj obj)3026 ScmObj Scm_RegMatchStart(ScmRegMatch *rm, ScmObj obj)
3027 {
3028     struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3029     if (sub == NULL) return SCM_FALSE;
3030     if (sub->start < 0) regmatch_count_start(rm, sub);
3031     return Scm_MakeInteger(sub->start);
3032 }
3033 
Scm_RegMatchEnd(ScmRegMatch * rm,ScmObj obj)3034 ScmObj Scm_RegMatchEnd(ScmRegMatch *rm, ScmObj obj)
3035 {
3036     struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3037     if (sub == NULL) return SCM_FALSE;
3038     if (sub->after < 0) regmatch_count_after(rm, sub);
3039     return Scm_MakeInteger(rm->inputLen - sub->after);
3040 }
3041 
Scm_RegMatchBefore(ScmRegMatch * rm,ScmObj obj)3042 ScmObj Scm_RegMatchBefore(ScmRegMatch *rm, ScmObj obj)
3043 {
3044     struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3045     if (sub == NULL) return SCM_FALSE;
3046     if (sub->start < 0) regmatch_count_start(rm, sub);
3047     return Scm_MakeString(rm->input, MSUB_BEFORE_SIZE(rm, sub),
3048                           sub->start, 0);
3049 }
3050 
Scm_RegMatchSubstr(ScmRegMatch * rm,ScmObj obj)3051 ScmObj Scm_RegMatchSubstr(ScmRegMatch *rm, ScmObj obj)
3052 {
3053     struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3054     if (sub == NULL) return SCM_FALSE;
3055     if (sub->length < 0) regmatch_count_length(rm, sub);
3056     return Scm_MakeString(sub->startp, MSUB_SIZE(rm, sub),
3057                           sub->length, 0);
3058 }
3059 
Scm_RegMatchAfter(ScmRegMatch * rm,ScmObj obj)3060 ScmObj Scm_RegMatchAfter(ScmRegMatch *rm, ScmObj obj)
3061 {
3062     struct ScmRegMatchSub *sub = regmatch_ref(rm, obj);
3063     if (sub == NULL) return SCM_FALSE;
3064     if (sub->after < 0) regmatch_count_after(rm, sub);
3065     return Scm_MakeString(sub->endp, MSUB_AFTER_SIZE(rm, sub),
3066                           sub->after, 0);
3067 }
3068 
3069 /* for debug */
Scm_RegMatchDump(ScmRegMatch * rm)3070 void Scm_RegMatchDump(ScmRegMatch *rm)
3071 {
3072     Scm_Printf(SCM_CUROUT, "RegMatch %p\n", rm);
3073     Scm_Printf(SCM_CUROUT, "  numMatches = %d\n", rm->numMatches);
3074     Scm_Printf(SCM_CUROUT, "  input = %S\n", rm->input);
3075     for (int i=0; i<rm->numMatches; i++) {
3076         struct ScmRegMatchSub *sub = rm->matches[i];
3077         if (sub->startp) {
3078             Scm_Printf(SCM_CUROUT, "[%3d-%3d]  %S\n",
3079                        sub->startp - rm->input,
3080                        sub->endp - rm->input,
3081                        Scm_MakeString(sub->startp,
3082                                       (int)(sub->endp-sub->startp),
3083                                       -1, 0));
3084         } else {
3085             Scm_Printf(SCM_CUROUT, "[---] #f\n");
3086         }
3087     }
3088 }
3089 
3090 /*=======================================================================
3091  * Initializing stuff
3092  */
3093 
Scm__InitRegexp(void)3094 void Scm__InitRegexp(void)
3095 {
3096 }
3097