1 /*
2 * @(#)regexp.c 1.3 of 18 April 87
3 * Revised for Racket starting 1995
4 *
5 * Copyright (c) 1986 by University of Toronto.
6 * Written by Henry Spencer. Not derived from licensed software.
7 *
8 * Permission is granted to anyone to use this software for any
9 * purpose on any computer system, and to redistribute it freely,
10 * subject to the following restrictions:
11 *
12 * 1. The author is not responsible for the consequences of use of
13 * this software, no matter how awful, even if they arise
14 * from defects in it.
15 *
16 * 2. The origin of this software must not be misrepresented, either
17 * by explicit claim or by omission.
18 *
19 * 3. Altered versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 *
22 * Beware that some of this code is subtly aware of the way operator
23 * precedence is structured in regular expressions. Serious changes in
24 * regular-expression syntax might require a total rethink.
25 *
26 * Notable changes for Racket:
27 * Removed hardwired limits on parenthesis nesting
28 * Changed to index-based instead of pointer-based (better for GC)
29 * Added non-greedy operators *?, +?, and ??
30 * Added (?:...) grouping without reporting the group match
31 * Added (?=...), (?!...), (?<=...), and (?<!...) lookahead and lookback
32 * Added \n backreferences
33 * Added numeric quantifiers
34 * Added case-insensitive and multi-line modes
35 * Added Racket glue
36 *
37 * from Vladimir Tsyshevsky:
38 * additional optional parameter `offset' in `regexp-match'
39 * and `regexp-match-positions'
40 */
41
42 #include "schpriv.h"
43 #include "schmach.h"
44 #include "schgencat.h"
45 #include "schrx.h"
46
47 #include <stdio.h>
48 #include <string.h>
49
50 #ifdef SIXTY_FOUR_BIT_INTEGERS
51 # define BIGGEST_RXPOS 0x7FFFFFFFFFFFFFFF
52 #else
53 # define BIGGEST_RXPOS 0x7FFFFFFF
54 #endif
55
56 # define rOP(o) OP(o, regstr)
57 # define rNEXT(o) NEXT(o, regstr)
58 # define rOPLEN(o) OPLEN(o, regstr)
59 # define rOPRNGS(o) OPRNGS(o, regstr)
60 # define NEXT_OP(scan) (scan + rNEXT(scan))
61
62 static regexp *regcomp(char *, rxpos, int, int, Scheme_Object*);
63 /* static int regexec(regexp *, char *, int, int, rxpos *, rxpos * ...); */
64
65 /*
66 * Global work variables for regcomp().
67 */
68 THREAD_LOCAL_DECL(static char *regstr);
69 THREAD_LOCAL_DECL(static char *regparsestr);
70 THREAD_LOCAL_DECL(static int regmatchmin);
71 THREAD_LOCAL_DECL(static int regmatchmax);
72 THREAD_LOCAL_DECL(static int regmaxbackposn);
73 THREAD_LOCAL_DECL(static int regsavepos);
74
75 THREAD_LOCAL_DECL(static Scheme_Hash_Table *regbackknown); /* known/assumed backreference [non-]empty */
76 THREAD_LOCAL_DECL(static Scheme_Hash_Table *regbackdepends); /* backreferences required to be non-empty for the current to be non-empty */
77
78 THREAD_LOCAL_DECL(static rxpos regparse);
79 THREAD_LOCAL_DECL(static rxpos regparse_end); /* Input-scan pointer. */
80 THREAD_LOCAL_DECL(static int regnpar); /* () count. */
81 THREAD_LOCAL_DECL(static int regncounter); /* {} count */
82 THREAD_LOCAL_DECL(static rxpos regcode) ; /* Code-emit pointer, if less than regcodesize */
83 THREAD_LOCAL_DECL(static rxpos regcodesize);
84 THREAD_LOCAL_DECL(static rxpos regcodemax);
85 THREAD_LOCAL_DECL(static intptr_t regmaxlookback);
86
87 THREAD_LOCAL_DECL(static const char *regerrorwho);
88 THREAD_LOCAL_DECL(static Scheme_Object *regerrorproc); /* error handler for regexp construction */
89 THREAD_LOCAL_DECL(static Scheme_Object *regerrorval); /* result of error handler for failed regexp construction */
90
91 /* caches to avoid gc */
92 THREAD_LOCAL_DECL(static intptr_t rx_buffer_size);
93 THREAD_LOCAL_DECL(static rxpos *startp_buffer_cache);
94 THREAD_LOCAL_DECL(static rxpos *endp_buffer_cache);
95 THREAD_LOCAL_DECL(static rxpos *maybep_buffer_cache);
96 THREAD_LOCAL_DECL(static rxpos *match_stack_buffer_cache);
97
98 #define MATCH_STACK_SIZE 24
99
100 /*
101 * Forward declarations for regcomp()'s friends.
102 */
103 static rxpos reg(int, int *, int, int, int);
104 static rxpos regbranch(int *, int, int);
105 static rxpos regpiece(int *, int, int);
106 static rxpos regatom(int *, int, int);
107 static rxpos regranges(int parse_flags, int at_start);
108 static rxpos regunicode(int invert);
109 static int regdigit();
110 static rxpos regnode(char);
111 static void regarg(int);
112 static rxpos regnext(rxpos);
113 static void regc(char);
114 static void reginsert(char, rxpos);
115 static rxpos reginsertwithop(char, rxpos, int);
116 static rxpos reginsertwithopop(char, rxpos, int, int);
117 static void regtail(rxpos, rxpos);
118 static void regoptail(rxpos, rxpos);
119 static int regstrcspn(char *, char *, char *);
120 static unsigned char *extract_regstart(rxpos scan, int *_anch);
121
122 static int check_and_propagate_depends(void);
123 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src);
124
125 READ_ONLY static Scheme_Object *empty_byte_string;
126
127 #define FAIL(m) { regcomperror(m); return 0; }
128
129 static void
regerror(char * s)130 regerror(char *s)
131 {
132 if (!regerrorval) {
133 if (SCHEME_FALSEP(regerrorproc)) {
134 const char *who = regerrorwho;
135 regerrorwho = NULL;
136 scheme_raise_exn(MZEXN_FAIL_CONTRACT,
137 "%s: %s",
138 (who ? who : "regexp"),
139 s);
140 } else {
141 Scheme_Object *a[1];
142 a[0] = scheme_make_utf8_string(s);
143 regerrorval = scheme_apply_multi(regerrorproc, 1, a);
144 }
145 }
146 }
147
148 THREAD_LOCAL_DECL(const char *failure_msg_for_read);
149
150 static void
regcomperror(char * s)151 regcomperror(char *s)
152 {
153 if (failure_msg_for_read) {
154 failure_msg_for_read = s;
155 scheme_longjmp(scheme_error_buf, 1);
156 } else
157 regerror(s);
158 }
159
160 /*
161 - regcomp - compile a regular expression into internal code
162 *
163 * We can't allocate space until we know how big the compiled form will be,
164 * but we can't compile it (and thus know how big it is) until we've got a
165 * place to put the code. So we cheat: we compile it twice, once with code
166 * generation turned off and size counting turned on, and once "for real".
167 * This also means that we don't allocate space until we are sure that the
168 * thing really will compile successfully, and we never have to move the
169 * code and thus invalidate pointers into it. (Note that it has to be in
170 * one piece because free() must be able to free it all.)
171 *
172 * Beware that the optimization-preparation code in here knows about some
173 * of the structure of the compiled regexp.
174 */
175 static regexp *
regcomp(char * expstr,rxpos exp,int explen,int pcre,Scheme_Object * handler)176 regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
177 {
178 regexp *r;
179 rxpos scan, next;
180 rxpos longest;
181 int len, longest_is_ci;
182 int flags;
183
184 /* First pass: determine size, legality. */
185 regstr = NULL;
186 regparsestr = expstr;
187 regparse = exp;
188 regparse_end = exp + explen;
189 regnpar = 1;
190 regncounter = 0;
191 regmaxlookback = 0;
192 regcode = 1;
193 regcodesize = 0;
194 regcodemax = 0;
195 regmaxbackposn = 0;
196 regbackknown = NULL;
197 regbackdepends = NULL;
198 regerrorproc = handler;
199 regerrorval = NULL;
200 regc((char)MAGIC);
201 if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
202 if (regerrorval)
203 return NULL;
204 FAIL("unknown regexp failure");
205 }
206
207 /* Small enough for pointer-storage convention? */
208 if (regcodemax >= 32767L) /* Probably could be 65535L. */
209 FAIL("regexp too big");
210
211 if (regmaxbackposn >= regnpar)
212 FAIL("backreference number is larger than the highest-numbered cluster");
213
214 /* Allocate space. */
215 r = (regexp *)scheme_malloc_tagged(sizeof(regexp) + N_ITO_SPACE((unsigned)regcodemax));
216 r->type = scheme_regexp_type;
217
218 #ifdef INDIRECT_TO_PROGRAM
219 r->program = (char *)scheme_malloc_atomic((unsigned)regcodemax + 1);
220 #endif
221
222 r->regsize = regcodemax;
223
224 r->nsubexp = regnpar;
225 r->ncounter = regncounter;
226 r->maxlookback = regmaxlookback;
227
228 /* Second pass: emit code. */
229 regparse = exp;
230 regparse_end = exp + explen;
231 regnpar = 1;
232 regncounter = 0;
233 regcodesize = regcodemax;
234 #ifdef INDIRECT_TO_PROGRAM
235 regstr = r->program;
236 regcode = 0;
237 #else
238 regstr = (char *)r;
239 regcode = (char *)r->program - (char *)r;
240 #endif
241 regcodesize += regcode;
242 regcodemax = 0;
243 regbackknown = NULL;
244 regbackdepends = NULL;
245 regc((char)MAGIC);
246 if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
247 FAIL("unknown regexp failure (late)");
248 }
249
250 if (regcode >= regcodesize) {
251 FAIL("wrote too far");
252 }
253
254 /* Dig out information for optimizations. */
255 r->regstart = NULL; /* Worst-case defaults. */
256 r->regmust = -1;
257 r->regmlen = 0;
258 scan = N_ITO_DELTA(r->program, 1, (char *)r); /* First BRANCH. */
259 {
260 unsigned char *rs;
261 int anch = 0;
262 rs = extract_regstart(scan, &anch);
263 r->regstart = rs;
264 if (anch)
265 r->flags |= REGEXP_ANCH;
266 }
267 next = regnext(scan);
268 if (rOP(next) == END) { /* Only one top-level choice. */
269 scan = OPERAND(scan);
270 /*
271 * If there's something expensive in the r.e., find the
272 * longest literal string that must appear and make it the
273 * regmust. Resolve ties in favor of later strings, since
274 * the regstart check works with the beginning of the r.e.
275 * and avoiding duplication strengthens checking. Not a
276 * strong reason, but sufficient in the absence of others.
277 */
278 if (flags&SPSTART) {
279 int prev_op = 0;
280 longest = 0;
281 longest_is_ci = 0;
282 len = 0;
283 for (; scan != 0; ) {
284 int mscan = scan;
285 while (1) {
286 int mop;
287 mop = rOP(mscan);
288 if ((mop == LOOKF) || (mop == LOOKBF)) {
289 /* skip over part that we don't want to match */
290 mscan = mscan + rOPLEN(OPERAND(mscan));
291 mscan = NEXT_OP(mscan);
292 } else if (((mop == EXACTLY) || (mop == EXACTLY_CI))
293 && rOPLEN(OPERAND(mscan)) >= len) {
294 /* Skip regmust if it contains a null character: */
295 rxpos ls = OPSTR(OPERAND(mscan));
296 int ll = rOPLEN(OPERAND(mscan)), i;
297 for (i = 0; i < ll; i++) {
298 if (!regstr[ls + i])
299 break;
300 }
301 if (i >= ll) {
302 longest = ls;
303 len = ll;
304 longest_is_ci = (rOP(mscan) == EXACTLY_CI);
305 }
306 break;
307 } else if ((mop == EXACTLY1) && (1 >= len)) {
308 /* Skip if it's a null character */
309 if (regstr[OPERAND(mscan)]) {
310 longest = OPERAND(mscan);
311 len = 1;
312 longest_is_ci = 0;
313 }
314 break;
315 } else if ((mop == BRANCH) && (prev_op != BRANCH)) {
316 int mnext;
317 mnext = NEXT_OP(mscan);
318 if (rOP(mnext) != BRANCH) {
319 /* A branch with only one choice */
320 mscan = OPERAND(mscan);
321 } else
322 break;
323 } else
324 break;
325 }
326 prev_op = rOP(scan);
327 if ((prev_op == LOOKF) || (prev_op == LOOKBF)) {
328 /* skip over part that we don't want to match */
329 scan = scan + rOPLEN(OPERAND(scan));
330 scan = NEXT_OP(scan);
331 } else {
332 scan = regnext(scan);
333 }
334 }
335 if (longest) {
336 r->regmust = longest;
337 if (longest_is_ci)
338 r->flags |= REGEXP_MUST_CI;
339 r->regmlen = len;
340 }
341 }
342 }
343
344 #if 0
345 if (regcode > r->regsize + sizeof(regexp))
346 scheme_signal_error("regexp too large!");
347 #endif
348
349 return(r);
350 }
351
map_create(unsigned char * map)352 static unsigned char *map_create(unsigned char *map)
353 {
354 if (!map) {
355 map = (unsigned char *)scheme_malloc_atomic(32);
356 memset(map, 0, 32);
357 }
358 return map;
359 }
360
map_start(unsigned char * map,int c)361 static unsigned char *map_start(unsigned char *map, int c)
362 {
363 map = map_create(map);
364 map[c >> 3] |= ((unsigned char)1 << (c & 0x7));
365 return map;
366 }
367
map_copy(unsigned char * map,char * s,int pos)368 static unsigned char *map_copy(unsigned char *map, char *s, int pos)
369 {
370 map = map_create(map);
371 memcpy(map, s XFORM_OK_PLUS pos, 32);
372 return map;
373 }
374
map_range(unsigned char * map,char * s,int pos,int invert)375 static unsigned char *map_range(unsigned char *map, char *s, int pos, int invert)
376 {
377 int rs, re;
378
379 rs = UCHAR(s[pos++]);
380 re = UCHAR(s[pos++]);
381
382 if (!invert) {
383 while (rs <= re) {
384 map = map_start(map, rs);
385 rs++;
386 }
387 } else {
388 while (rs > 0) {
389 map = map_start(map, rs - 1);
390 --rs;
391 }
392 while (re < 255) {
393 map = map_start(map, re + 1);
394 re++;
395 }
396 }
397
398 return map;
399 }
400
extract_regstart(rxpos scan,int * _anch)401 static unsigned char *extract_regstart(rxpos scan, int *_anch)
402 {
403 rxpos next;
404 int retry, the_op;
405 unsigned char *map = NULL;
406
407 do {
408 retry = 0;
409
410 the_op = rOP(scan);
411 switch (the_op) {
412 case BOL:
413 case EOL:
414 case NOTHING:
415 case SAVECONST:
416 case MAYBECONST:
417 case COUNTINIT:
418 case COUNTOVER:
419 case COUNTUNDER:
420 /* We can ignore zero-length things when finding starting info */
421 scan = regnext(scan);
422 retry = 1;
423 break;
424 case LOOKT:
425 case LOOKF:
426 case LOOKBT:
427 case LOOKBF:
428 /* Zero-length, but continuation in an unusual place */
429 scan += rOPLEN(OPERAND(scan));
430 scan = regnext(scan);
431 retry = 1;
432 break;
433 case LOOKTX:
434 scan = regnext(scan);
435 retry = 1;
436 break;
437 case PLUS:
438 case PLUS2:
439 scan = OPERAND(scan);
440 retry = 1;
441 break;
442 case STAR3:
443 case STAR4:
444 if (rOPLEN(OPERAND(scan)) > 0) {
445 scan = OPERAND3(scan);
446 retry = 1;
447 }
448 break;
449 case EXACTLY:
450 map = map_start(map, UCHAR(regstr[OPSTR(OPERAND(scan))]));
451 break;
452 case EXACTLY_CI:
453 {
454 int c = UCHAR(regstr[OPSTR(OPERAND(scan))]);
455 map = map_start(map, c);
456 map = map_start(map, rx_toupper(c));
457 }
458 break;
459 case ANYOF:
460 map = map_copy(map, regstr, OPERAND(scan));
461 break;
462 case EXACTLY1:
463 map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
464 break;
465 case EXACTLY2:
466 map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
467 map = map_start(map, UCHAR(regstr[OPERAND(scan)+1]));
468 break;
469 case RANGE:
470 map = map_range(map, regstr, OPERAND(scan), 0);
471 break;
472 case NOTRANGE:
473 map = map_range(map, regstr, OPERAND(scan), 1);
474 break;
475 case BOI:
476 if (_anch)
477 *_anch = 1;
478 break;
479 case BRANCH:
480 next = regnext(scan);
481 if (!next || (rOP(next) == END) || (rOP(next) == LOOKE)) {
482 /* Only one branch */
483 scan = OPERAND(scan);
484 retry = 1;
485 }
486 break;
487 default:
488 if ((the_op == OPENN) || (the_op >= OPEN && the_op < CLOSE)) {
489 scan = NEXT_OP(scan);
490 retry = 1;
491 }
492 break;
493 }
494 } while (retry);
495
496 return map;
497 }
498
499 #ifdef DO_STACK_CHECK
500
reg_k(void)501 static Scheme_Object *reg_k(void)
502 {
503 Scheme_Thread *p = scheme_current_thread;
504 int *flagp = (int *)p->ku.k.p1;
505 int res;
506
507 p->ku.k.p1 = NULL;
508
509 res = reg(p->ku.k.i1, flagp, p->ku.k.i2, p->ku.k.i3, p->ku.k.i4);
510
511 return scheme_make_integer(res);
512 }
513
514 #endif
515
516 /*
517 - reg - regular expression, i.e. main body or parenthesized thing
518 *
519 * Caller must absorb opening parenthesis.
520 *
521 * Combining parenthesis handling with the base level of regular expression
522 * is a trifle forced, but the need to tie the tails of the branches to what
523 * follows makes it hard to avoid.
524 */
525 static rxpos
reg(int paren,int * flagp,int paren_set,int lookahead,int parse_flags)526 reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
527 {
528 rxpos ret;
529 rxpos br;
530 rxpos ender;
531 int parno = 0;
532 int flags, matchmin, matchmax, maxlookback, brcount;
533 Scheme_Hash_Table *backdepends;
534
535 #ifdef DO_STACK_CHECK
536 {
537 # include "mzstkchk.h"
538 {
539 Scheme_Thread *p = scheme_current_thread;
540 Scheme_Object *ov;
541 p->ku.k.i1 = paren;
542 p->ku.k.p1 = (void *)flagp;
543 p->ku.k.i2 = paren_set;
544 p->ku.k.i3 = lookahead;
545 p->ku.k.i4 = parse_flags;
546 ov = scheme_handle_stack_overflow(reg_k);
547 return SCHEME_INT_VAL(ov);
548 }
549 }
550 #endif
551
552 *flagp = HASWIDTH; /* Tentatively. */
553
554 /* Make an OPEN node, if parenthesized. */
555 if (paren) {
556 if (lookahead) {
557 parno = 0;
558 ret = regnode(lookahead);
559 regarg(0); /* space for LOOKE pointer */
560 if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
561 regarg(0); /* space for min count */
562 regarg(0); /* space for max count */
563 }
564 } else if (paren_set) {
565 parno = regnpar;
566 regnpar++;
567 if (OPEN + parno >= CLOSE) {
568 ret = regnode(OPENN);
569 regarg(parno);
570 } else {
571 ret = regnode(OPEN+parno);
572 }
573 } else
574 ret = 0;
575 } else
576 ret = 0;
577
578 /* Pick up the branches, linking them together. */
579 br = regbranch(&flags, parse_flags, 0);
580 if (br == 0)
581 FAIL("branch failed!?");
582 if (ret != 0)
583 regtail(ret, br); /* OPEN -> first. */
584 else
585 ret = br;
586 if (!(flags&HASWIDTH)) {
587 *flagp &= ~HASWIDTH;
588 backdepends = NULL;
589 } else if (regbackdepends) {
590 backdepends = regbackdepends;
591 regbackdepends = NULL;
592 } else
593 backdepends = NULL;
594 *flagp |= flags&(SPSTART|SPFIXED);
595 matchmin = regmatchmin;
596 matchmax = regmatchmax;
597 maxlookback = regmaxlookback;
598 brcount = 1;
599 while (regparsestr[regparse] == '|') {
600 brcount++;
601 regparse++;
602 br = regbranch(&flags, parse_flags, 0);
603 if (br == 0)
604 FAIL("next branch failed!?");
605 regtail(ret, br); /* BRANCH -> BRANCH. */
606 if (!(flags&HASWIDTH))
607 *flagp &= ~HASWIDTH;
608 else if ((*flagp) & HASWIDTH) {
609 if (regbackdepends) {
610 if (backdepends)
611 merge_tables(backdepends, regbackdepends);
612 else
613 backdepends = regbackdepends;
614 regbackdepends = NULL;
615 } else
616 backdepends = NULL;
617 }
618 *flagp |= flags&SPSTART;
619 if (!(flags&SPFIXED))
620 *flagp &= ~SPFIXED;
621 else {
622 if (regmatchmin < matchmin)
623 matchmin = regmatchmin;
624 if (regmatchmax > matchmax)
625 matchmax = regmatchmax;
626 if (regmaxlookback > maxlookback)
627 maxlookback = regmaxlookback;
628 }
629 }
630 regbackdepends = backdepends;
631 regmatchmin = matchmin;
632 regmatchmax = matchmax;
633 regmaxlookback = maxlookback;
634
635 if (paren && paren_set) {
636 Scheme_Object *assumed;
637
638 if (!regbackknown)
639 regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
640 assumed = scheme_hash_get(regbackknown, scheme_make_integer(parno));
641
642 if (!((*flagp) & HASWIDTH)) {
643 if (assumed && !SCHEME_FALSEP(assumed)) {
644 FAIL("`*', `+', or `{...,}' operand can be empty due to backreference");
645 }
646 scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_false);
647 } else {
648 if (!backdepends)
649 scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_true);
650 else {
651 if (assumed) {
652 check_and_propagate_depends();
653 } else
654 scheme_hash_set(regbackknown, scheme_make_integer(parno), (Scheme_Object *)backdepends);
655 }
656 }
657 }
658
659 if ((brcount == 1)
660 && paren
661 && (!paren_set || ((flags & SPFIXED)
662 && (regmatchmin == regmatchmax)
663 && (regmatchmax < 0x7FFFF)))
664 && !lookahead) {
665 /* Simplify to just the single branch: */
666 if (br + 3 < regcodesize) {
667 int top;
668 if (regcode <= regcodesize)
669 top = regcode;
670 else
671 top = regcodesize;
672 memmove(regstr + ret, regstr + br + 3, top - (br + 3));
673 }
674 *flagp = flags;
675 regcode -= (br + 3 - ret);
676 if (paren_set) {
677 /* Collude with regpiece: */
678 *flagp |= NEEDSAVECONST;
679 *flagp &= ~SPNOTHING;
680 regsavepos = parno;
681 }
682 } else {
683 if (lookahead) {
684 if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
685 if (!((*flagp) & SPFIXED))
686 FAIL("lookbehind pattern does not match a bounded byte width");
687 if (matchmax > 0x7FFF)
688 FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
689 regmaxlookback = matchmax + maxlookback;
690 if (ret + 8 < regcodesize) {
691 regstr[ret + 5] = (matchmin >> 8);
692 regstr[ret + 6] = (matchmin & 255);
693 regstr[ret + 7] = (matchmax >> 8);
694 regstr[ret + 8] = (matchmax & 255);
695 }
696 }
697 }
698
699 /* Make a closing node, and hook it on the end. */
700 if (paren) {
701 if (lookahead) {
702 ender = regnode(LOOKE);
703 if (ret + 4 < regcodesize) {
704 int delta = (ender - ret);
705 regstr[ret + 3] = (delta >> 8);
706 regstr[ret + 4] = (delta & 255);
707 }
708 } else if (paren_set) {
709 if (OPEN + parno >= CLOSE) {
710 ender = regcode;
711 regarg(parno);
712 reginsert(CLOSEN, ender);
713 } else
714 ender = regnode(CLOSE+parno);
715 } else {
716 ender = regnode(NOTHING);
717 }
718 } else {
719 ender = regnode(END);
720 }
721 regtail(ret, ender);
722
723 /* Hook the tails of the branches to the closing node. */
724 if (regcodesize) {
725 for (br = ret; br != 0; br = regnext(br)) {
726 regoptail(br, ender);
727 }
728 }
729 }
730
731 /* Check for proper termination. */
732 if (paren && regparsestr[regparse++] != ')') {
733 FAIL("missing closing parenthesis in pattern");
734 } else if (!paren && regparse != regparse_end) {
735 if (regparsestr[regparse] == ')') {
736 FAIL("extra closing parenthesis in pattern");
737 } else
738 FAIL("junk on end"); /* "Can't happen". */
739 /* NOTREACHED */
740 }
741
742 return ret;
743 }
744
745 /*
746 - regbranch - one alternative of an | operator
747 *
748 * Implements the concatenation operator.
749 */
750 static rxpos
regbranch(int * flagp,int parse_flags,int without_branch_node)751 regbranch(int *flagp, int parse_flags, int without_branch_node)
752 {
753 rxpos ret;
754 rxpos chain, latest;
755 int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, pcount = 0, save_flags;
756
757 *flagp = (WORST|SPFIXED); /* Tentatively. */
758
759 if (!without_branch_node)
760 ret = regnode(BRANCH);
761 else
762 ret = 0;
763 chain = 0;
764 while (regparse != regparse_end
765 && regparsestr[regparse] != '|'
766 && regparsestr[regparse] != ')') {
767 save_flags = flags;
768 latest = regpiece(&flags, parse_flags, !chain && !without_branch_node);
769 if (latest == 0)
770 FAIL("piece failed!?");
771 if (flags & SPNOTHING) {
772 /* no need to match nothing */
773 regcode = latest; /* throw away dead code */
774 flags = save_flags; /* in case all but the first is discarded */
775 } else {
776 pcount++;
777 *flagp |= flags&HASWIDTH;
778 if (chain == 0) { /* First piece. */
779 *flagp |= flags&SPSTART;
780 if (without_branch_node)
781 ret = latest;
782 } else
783 regtail(chain, latest);
784 if (!(flags&SPFIXED))
785 *flagp &= ~SPFIXED;
786 if ((regmaxlookback - matchmin) > maxlookback)
787 maxlookback = regmaxlookback - matchmin;
788 matchmin += regmatchmin;
789 matchmax += regmatchmax;
790 if (matchmax > 0x7FFF)
791 matchmax = 0x10000;
792 chain = latest;
793 }
794 }
795 regmatchmin = matchmin;
796 regmatchmax = matchmax;
797 regmaxlookback = maxlookback;
798 if (chain == 0) { /* Loop ran zero times. */
799 latest = regnode(NOTHING);
800 if (without_branch_node)
801 ret = latest;
802 *flagp = SIMPLE|SPNOTHING|SPFIXED;
803 regmatchmin = regmatchmax = 0;
804 }
805
806 if (pcount == 1) {
807 *flagp = flags; /* BRANCH will be deleted if simplicity is relevant */
808 }
809
810 return(ret);
811 }
812
813 /*
814 - regpiece - something followed by possible [*+?]
815 *
816 * Note that the branching code sequences used for ? and the general cases
817 * of * and + are somewhat optimized: they use the same NOTHING node as
818 * both the endmarker for their branch list and the body of the last branch.
819 * It might seem that this node could be dispensed with entirely, but the
820 * endmarker role is not redundant.
821 */
822 static rxpos
regpiece(int * flagp,int parse_flags,int at_start)823 regpiece(int *flagp, int parse_flags, int at_start)
824 {
825 rxpos ret;
826 char op;
827 rxpos next;
828 int flags, greedy;
829 int minreps = 0, maxreps = 0, counter;
830 int origsavepos, origmatchmin, origmatchmax;
831
832 ret = regatom(&flags, parse_flags, at_start);
833 if (ret == 0)
834 FAIL("atom failed!?");
835
836 origsavepos = regsavepos;
837 origmatchmin = regmatchmin;
838 origmatchmax = regmatchmax;
839
840 op = regparsestr[regparse];
841 if (!ISMULT(op, parse_flags)) {
842 *flagp = (flags & ~NEEDSAVECONST);
843 } else {
844 if (op == '{') {
845 int ch, maxspec = 0;
846 minreps = maxreps = 0;
847 regparse++;
848 do {
849 ch = regparsestr[regparse];
850 if ((ch >= '0') && (ch <= '9')) {
851 minreps = (minreps * 10) + (ch - '0');
852 if (minreps > 0x7FFF)
853 FAIL("minimum repetition count too large");
854 regparse++;
855 } else if (ch == ',' || ch == '}')
856 break;
857 else {
858 FAIL("expected digit, comma, or `}' to end repetition specification started with `{'");
859 }
860 } while (1);
861 if (ch == ',') {
862 regparse++;
863 do {
864 ch = regparsestr[regparse];
865 if ((ch >= '0') && (ch <= '9')) {
866 maxspec = 1;
867 maxreps = (maxreps * 10) + (ch - '0');
868 if (maxreps > 0x7FFF)
869 FAIL("maximum repetition count too large");
870 regparse++;
871 } else if (ch == '}')
872 break;
873 else {
874 FAIL("expected digit or `}' to end repetition specification started with `{'");
875 }
876 } while (1);
877 } else {
878 maxspec = 1;
879 maxreps = minreps;
880 }
881 if (maxspec && (maxreps < minreps)) {
882 FAIL("maximum repetition count is less than maximum repetition count");
883 }
884 if (maxspec && !maxreps) {
885 /* Match 0 instances */
886 regparse++;
887 if (regparsestr[regparse] == '?')
888 regparse++; /* non-greedy */
889 if (ISMULT(regparsestr[regparse], parse_flags))
890 FAIL("nested `*', `?', `+', or `{...}' in pattern");
891 regcode = ret; /* throw away dead code */
892 *flagp = SPFIXED|SPNOTHING;
893 regmatchmin = regmatchmax = 0;
894 return regnode(NOTHING);
895 }
896 op = '*';
897 if (maxreps || minreps)
898 counter = regncounter++;
899 else
900 counter = 0;
901 } else
902 counter = 0;
903
904 if (!(flags&HASWIDTH) && (op != '?')) {
905 FAIL("`*', `+', or `{...}' operand could be empty");
906 }
907
908 if (regbackdepends) {
909 /* Operand has width only if the indicated backreferences have width. */
910 check_and_propagate_depends();
911 /* Assumptions are registered, so we no longer need these backdepends: */
912 regbackdepends = NULL;
913 }
914
915 if (maxreps || minreps) {
916 if (minreps > 0)
917 *flagp = HASWIDTH;
918 if ((flags & SPFIXED) && maxreps) {
919 regmatchmin = (origmatchmin * minreps);
920 regmatchmax = (origmatchmax * maxreps);
921 if (regmatchmax > 0x7FFF)
922 regmatchmax = 0x10000;
923 *flagp |= SPFIXED;
924 }
925 } else {
926 *flagp = (op != '+') ? WORST : HASWIDTH;
927 if ((op == '*') || (op == '?'))
928 regmatchmin = 0;
929 }
930 *flagp |= SPSTART;
931 if ((op == '?') && (flags & SPFIXED)) {
932 *flagp |= SPFIXED;
933 regmatchmin = 0;
934 }
935
936 if (regparsestr[regparse+1] == '?') {
937 greedy = 0;
938 regparse++;
939 } else
940 greedy = 1;
941
942 if (op == '*' && (flags&SIMPLE)) {
943 if (!minreps && !maxreps)
944 reginsert(greedy ? STAR : STAR2, ret);
945 else
946 reginsertwithopop(greedy ? STAR3 : STAR4, ret, minreps, maxreps);
947 } else if (op == '*' && greedy) {
948 /* Emit x* as (x&|), where & means "self".
949 If minreps or maxreps, also insert counter-managing
950 nodes. This counter detects empty matches, too.
951 The code is a little difficult to read because it often
952 uses reginsert, which puts nodes before existing nodes.
953 So, you almost have to read it backward. */
954 rxpos br, nothing;
955 if (minreps || maxreps) {
956 /* Increment iteration counter, and fail if it's
957 already at the max: */
958 rxpos x;
959 x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
960 regtail(ret, x);
961 }
962 reginsert(BRANCH, ret); /* Either x */
963 if (minreps || maxreps) {
964 /* Initialize the iteration counter on entry: */
965 br = reginsertwithop(COUNTINIT, ret, counter);
966 regtail(ret, br);
967 } else
968 br = ret;
969 regoptail(br, regnode(BACK)); /* and loop */
970 regoptail(br, br); /* back */
971 regtail(br, regnode(BRANCH)); /* or */
972 nothing = regnode(NOTHING);
973 if (minreps) {
974 /* Fail to match if the counter isn't big enough, yet: */
975 rxpos n;
976 n = reginsertwithopop(COUNTOVER, nothing, counter, minreps);
977 regtail(nothing, n);
978 }
979 if (minreps || maxreps) {
980 /* We incremented the counter for an x match, but now
981 we're backtracking, so decrement it: */
982 rxpos n;
983 n = reginsertwithop(COUNTBACK, nothing, counter);
984 regtail(nothing, n);
985 }
986 regtail(br, nothing); /* null. */
987 } else if (op == '*') {
988 /* Emit x*? as (|x&), where & means "self".
989 With a counter, we need (|(x|-)&), where - reverts
990 the iteration count and fails. */
991 rxpos br, nothing, x, next_to_x;
992 if (minreps || maxreps) {
993 /* Increment iteration counter, and fail if it's
994 already at the max: */
995 rxpos fail;
996 x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
997 regtail(ret, x);
998
999 fail = regnode(BRANCH);
1000 regnode(COUNTBACKFAIL);
1001 regarg(counter);
1002 reginsert(BRANCH, ret);
1003 fail += 3;
1004 regtail(ret, fail);
1005 x += 3;
1006 } else
1007 x = ret;
1008 reginsert(BRANCH, ret); /* = next */
1009 next = ret;
1010 next_to_x = (x - next) + 3;
1011 reginsert(NOTHING, ret); /* = nothing */
1012 next += 3;
1013 nothing = ret;
1014 if (minreps) {
1015 /* Fail to match if the counter isn't big enough, yet: */
1016 nothing = reginsertwithopop(COUNTOVER, ret, counter, minreps);
1017 regtail(ret, nothing); /* chain countover -> nothing */
1018 next += (nothing - ret);
1019 }
1020 reginsert(BRANCH, ret); /* b3 */
1021 next += 3;
1022 nothing += 3;
1023 if (minreps || maxreps) {
1024 /* Initialize the iteration counter on entry: */
1025 br = reginsertwithop(COUNTINIT, ret, counter);
1026 regtail(ret, br); /* chain countinit to b3 */
1027 next += (br - ret);
1028 nothing += (br - ret);
1029 } else
1030 br = ret;
1031 regtail(br, next); /* chain b3 to next */
1032 x = next + next_to_x;
1033 regtail(x, regnode(BACK)); /* loop */
1034 regtail(x, br); /* back. */
1035 regtail(next, regnode(BACK)); /* chain next to nothing */
1036 regtail(next, nothing);
1037 } else if (op == '+' && (flags&SIMPLE))
1038 reginsert(greedy ? PLUS : PLUS2, ret);
1039 else if (op == '+' && greedy) {
1040 /* Emit x+ as x(&|), where & means "self". */
1041 next = regnode(BRANCH); /* Either */
1042 regtail(ret, next);
1043 regtail(regnode(BACK), ret); /* loop back */
1044 regtail(next, regnode(BRANCH)); /* or */
1045 regtail(ret, regnode(NOTHING)); /* null. */
1046 } else if (op == '+') {
1047 /* Emit x+? as x(|&), where & means "self". */
1048 next = regnode(BRANCH); /* Either */
1049 regtail(ret, next);
1050 regnode(NOTHING); /* op */
1051 regtail(next, regnode(BRANCH)); /* or */
1052 regtail(regnode(BACK), ret); /* loop back. */
1053 regtail(next, regnode(BACK));
1054 regtail(next, next + 3);
1055 } else if (op == '?' && greedy) {
1056 /* Emit x? as (x|) */
1057 reginsert(BRANCH, ret); /* Either x */
1058 regtail(ret, regnode(BRANCH)); /* or */
1059 next = regnode(NOTHING); /* null. */
1060 regtail(ret, next);
1061 regoptail(ret, next);
1062 } else if (op == '?') {
1063 /* Emit x?? as (|x) */
1064 reginsert(BRANCH, ret); /* will be next... */
1065 reginsert(NOTHING, ret);
1066 reginsert(BRANCH, ret);
1067 regtail(ret, ret + 6);
1068 next = regnode(BACK);
1069 regtail(ret + 6, next);
1070 regoptail(ret + 6, next);
1071 regoptail(ret + 6, ret + 3);
1072 }
1073 regparse++;
1074 if (ISMULT(regparsestr[regparse], parse_flags))
1075 FAIL("nested `*', `?', `+', or `{...}' in pattern");
1076 }
1077
1078 if (flags & NEEDSAVECONST) {
1079 rxpos sv;
1080 sv = regnode(SAVECONST);
1081 regarg(origsavepos);
1082 regarg(origmatchmax);
1083 regtail(ret, sv);
1084 if (origmatchmax) {
1085 sv = reginsertwithop(MAYBECONST, ret, origsavepos);
1086 regtail(ret, sv);
1087 }
1088 *flagp &= ~SIMPLE;
1089 }
1090
1091 return(ret);
1092 }
1093
1094 /*
1095 - regatom - the lowest level
1096 *
1097 * Optimization: gobbles an entire sequence of ordinary characters so that
1098 * it can turn them into a single node, which is smaller to store and
1099 * faster to run. Backslashed characters are exceptions, each becoming a
1100 * separate node; the code is simpler that way and it's not worth fixing.
1101 */
1102 static rxpos
regatom(int * flagp,int parse_flags,int at_start)1103 regatom(int *flagp, int parse_flags, int at_start)
1104 {
1105 rxpos ret;
1106 int flags;
1107
1108 *flagp = (WORST|SPFIXED); /* Tentatively. */
1109 regmatchmin = regmatchmax = 1;
1110 regmaxlookback = 0;
1111
1112 switch (regparsestr[regparse++]) {
1113 case '^':
1114 if (parse_flags & PARSE_SINGLE_LINE)
1115 ret = regnode(BOI);
1116 else
1117 ret = regnode(BOL);
1118 regmaxlookback = 1;
1119 regmatchmin = regmatchmax = 0;
1120 break;
1121 case '$':
1122 if (parse_flags & PARSE_SINGLE_LINE)
1123 ret = regnode(EOI);
1124 else
1125 ret = regnode(EOL);
1126 regmatchmin = regmatchmax = 0;
1127 break;
1128 case '.':
1129 --regparse;
1130 ret = regranges(parse_flags, at_start);
1131 *flagp |= HASWIDTH|SIMPLE;
1132 break;
1133 case '[':
1134 --regparse;
1135 ret = regranges(parse_flags, at_start);
1136 *flagp |= HASWIDTH|SIMPLE;
1137 break;
1138 case '(':
1139 {
1140 if (regparsestr[regparse] == '?') {
1141 int moded = 0;
1142
1143 while (1) {
1144 if (regparsestr[regparse+1] == 'i') {
1145 parse_flags &= ~PARSE_CASE_SENS;
1146 regparse++;
1147 moded = 1;
1148 } else if (regparsestr[regparse+1] == 'm') {
1149 parse_flags &= ~PARSE_SINGLE_LINE;
1150 regparse++;
1151 moded = 1;
1152 } else if (regparsestr[regparse+1] == 's') {
1153 parse_flags |= PARSE_SINGLE_LINE;
1154 regparse++;
1155 moded = 1;
1156 } else if ((regparsestr[regparse+1] == '-')
1157 && (regparsestr[regparse+2] == 'i')) {
1158 parse_flags |= PARSE_CASE_SENS;
1159 regparse += 2;
1160 moded = 1;
1161 } else if ((regparsestr[regparse+1] == '-')
1162 && (regparsestr[regparse+2] == 'm')) {
1163 parse_flags |= PARSE_SINGLE_LINE;
1164 regparse += 2;
1165 moded = 1;
1166 } else if ((regparsestr[regparse+1] == '-')
1167 && (regparsestr[regparse+2] == 's')) {
1168 parse_flags &= ~PARSE_SINGLE_LINE;
1169 regparse += 2;
1170 moded = 1;
1171 } else {
1172 break;
1173 }
1174 }
1175
1176 if (regparsestr[regparse+1] == ':') {
1177 regparse += 2;
1178 ret = reg(1, &flags, 0, 0, parse_flags);
1179 *flagp = flags;
1180 } else if (moded) {
1181 FAIL("expected `:' or another mode after `(?' and a mode sequence (where a mode is `i', `-i', `m', `-m', `s', or `-s')");
1182 } else if (regparsestr[regparse+1] == '(') {
1183 /* Conditional */
1184 if (((regparsestr[regparse+2] >= '0')
1185 && (regparsestr[regparse+2] <= '9'))
1186 || ((regparsestr[regparse+2] == '?')
1187 && ((regparsestr[regparse+3] == '=')
1188 || (regparsestr[regparse+3] == '!')
1189 || (regparsestr[regparse+3] == '<')))) {
1190 rxpos test, tbr, fbr, ender;
1191 int flags, matchmin, matchmax;
1192 Scheme_Hash_Table *backdepends;
1193
1194 regparse++;
1195 ret = regnode(CONDITIONAL);
1196 regarg(0); /* space for then */
1197 regarg(0); /* space for else */
1198 if (regparsestr[regparse+1] != '?') {
1199 int posn;
1200 regparse++;
1201 posn = regdigit();
1202 test = regnode(BACKREF);
1203 regarg(posn);
1204 if (regparsestr[regparse] == ')') {
1205 regparse++;
1206 } else {
1207 FAIL("expected `)' after `(?(' followed by a digit");
1208 }
1209 } else {
1210 test = regatom(&flags, parse_flags, 1);
1211 }
1212 if (test != OPERAND3(ret)) {
1213 FAIL("test went to wrong place!?");
1214 }
1215 regtail(test, regnode(END));
1216 if (regparsestr[regparse] == ')') {
1217 FAIL("expected an expression after test in `(?(...))'");
1218 }
1219
1220 regbackdepends = NULL;
1221 *flagp |= HASWIDTH; /* tentatively */
1222
1223 tbr = regbranch(&flags, parse_flags, 1);
1224
1225 if (!(flags&HASWIDTH)) {
1226 *flagp &= ~HASWIDTH;
1227 backdepends = NULL;
1228 } else if (regbackdepends) {
1229 backdepends = regbackdepends;
1230 regbackdepends = NULL;
1231 } else
1232 backdepends = NULL;
1233
1234 if (!(flags & SPFIXED))
1235 *flagp &= ~SPFIXED;
1236 matchmin = regmatchmin;
1237 matchmax = regmatchmax;
1238
1239 if (regparsestr[regparse] == ')') {
1240 fbr = regnode(NOTHING);
1241 *flagp &= ~HASWIDTH;
1242 matchmin = 0;
1243 } else if (regparsestr[regparse] != '|') {
1244 FAIL("expected `)' or `|' after first branch of `(?(...)...)'");
1245 } else {
1246 regparse++;
1247 fbr = regbranch(&flags, parse_flags, 1);
1248 if (regparsestr[regparse] != ')') {
1249 FAIL("expected `)' to close `(?(...)...' after second branch");
1250 }
1251
1252 if (!(flags&HASWIDTH)) {
1253 *flagp &= ~HASWIDTH;
1254 backdepends = NULL;
1255 } else if (regbackdepends) {
1256 if (backdepends)
1257 merge_tables(backdepends, regbackdepends);
1258 else
1259 backdepends = regbackdepends;
1260 }
1261
1262 if (!(flags & SPFIXED))
1263 *flagp &= ~SPFIXED;
1264 else {
1265 if (regmatchmin < matchmin)
1266 matchmin = regmatchmin;
1267 if (regmatchmax > matchmax)
1268 matchmax = regmatchmax;
1269 }
1270 }
1271
1272 regmatchmax = matchmax;
1273 regmatchmin = matchmin;
1274 regbackdepends = backdepends;
1275
1276 if (OPERAND2(ret) + 1 < regcodesize) {
1277 int delta;
1278 delta = tbr - ret;
1279 regstr[OPERAND(ret)] = delta >> 8;
1280 regstr[OPERAND(ret)+1] = delta & 255;
1281 delta = fbr - ret;
1282 regstr[OPERAND2(ret)] = delta >> 8;
1283 regstr[OPERAND2(ret)+1] = delta & 255;
1284 }
1285 ender = regnode(NOTHING);
1286 regtail(tbr, ender);
1287 regtail(fbr, ender);
1288 regtail(ret, ender);
1289 regparse++;
1290 } else
1291 FAIL("expected `(?=', `(?!', `(?<', or digit after `(?('");
1292 } else if (regparsestr[regparse+1] == '>') {
1293 regparse += 2;
1294 ret = reg(1, &flags, 0, LOOKTX, parse_flags);
1295 *flagp = flags;
1296 } else {
1297 if (regparsestr[regparse+1] == '=') {
1298 regparse += 2;
1299 ret = reg(1, &flags, 0, LOOKT, parse_flags);
1300 } else if (regparsestr[regparse+1] == '!') {
1301 regparse += 2;
1302 ret = reg(1, &flags, 0, LOOKF, parse_flags);
1303 } else if ((regparsestr[regparse+1] == '<')
1304 && (regparsestr[regparse+2] == '=')) {
1305 regparse += 3;
1306 ret = reg(1, &flags, 0, LOOKBT, parse_flags);
1307 } else if ((regparsestr[regparse+1] == '<')
1308 && (regparsestr[regparse+2] == '!')) {
1309 regparse += 3;
1310 ret = reg(1, &flags, 0, LOOKBF, parse_flags);
1311 } else {
1312 FAIL("expected `:', `=', `!', `<=', `<!', `i', `-i', `m', `-m', `s', or `-s' after `(?'");
1313 }
1314 regmatchmin = regmatchmax = 0;
1315 *flagp = SPFIXED;
1316 regbackdepends = NULL;
1317 }
1318 } else {
1319 ret = reg(1, &flags, 1, 0, parse_flags);
1320 if (flags & NEEDSAVECONST) {
1321 *flagp = flags;
1322 } else {
1323 *flagp |= flags&(HASWIDTH|SPSTART);
1324 if (!(flags&SPFIXED))
1325 *flagp &= ~SPFIXED;
1326 }
1327 }
1328 /* otherwise, regmatchmin/regmatchmax is set */
1329 if (ret == 0)
1330 FAIL("cluster failed!?");
1331 }
1332 break;
1333 case '|':
1334 case ')':
1335 FAIL("internal urp"); /* Supposed to be caught earlier. */
1336 break;
1337 case '?':
1338 FAIL("`?' follows nothing in pattern");
1339 break;
1340 case '+':
1341 FAIL("`+' follows nothing in pattern");
1342 break;
1343 case '*':
1344 FAIL("`*' follows nothing in pattern");
1345 break;
1346 case '\\':
1347 {
1348 int c;
1349 if (regparse == regparse_end)
1350 FAIL("trailing backslash in pattern");
1351 c = regparsestr[regparse++];
1352 if ((parse_flags & PARSE_PCRE) && (c == 'b')) {
1353 ret = regnode(WORDBOUND);
1354 regmatchmin = regmatchmax = 0;
1355 regmaxlookback = 1;
1356 } else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
1357 ret = regnode(NOTWORDBOUND);
1358 regmatchmin = regmatchmax = 0;
1359 regmaxlookback = 1;
1360 } else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
1361 ret = regunicode(0);
1362 regmatchmax = MAX_UTF8_CHAR_BYTES;
1363 *flagp |= HASWIDTH;
1364 } else if ((parse_flags & PARSE_PCRE) && (c == 'P')) {
1365 ret = regunicode(1);
1366 regmatchmax = MAX_UTF8_CHAR_BYTES;
1367 *flagp |= HASWIDTH;
1368 } else if ((parse_flags & PARSE_PCRE) && (c >= '0') && (c <= '9')) {
1369 int posn;
1370 --regparse;
1371 posn = regdigit();
1372 if (parse_flags & PARSE_CASE_SENS)
1373 ret = regnode(BACKREF);
1374 else
1375 ret = regnode(BACKREF_CI);
1376 regarg(posn);
1377 *flagp &= ~SPFIXED;
1378 /* Set HASWIDTH flag: */
1379 {
1380 Scheme_Object *f;
1381 if (regbackknown)
1382 f = scheme_hash_get(regbackknown, scheme_make_integer(posn));
1383 else
1384 f = NULL;
1385 if (f) {
1386 if (SCHEME_TRUEP(f))
1387 *flagp |= HASWIDTH;
1388 } else {
1389 *flagp |= HASWIDTH;
1390 if (!regbackdepends)
1391 regbackdepends = scheme_make_hash_table(SCHEME_hash_ptr);
1392 scheme_hash_set(regbackdepends, scheme_make_integer(posn), scheme_true);
1393 }
1394 }
1395 } else {
1396 regparse -= 2;
1397 ret = regranges(parse_flags, at_start);
1398 *flagp |= HASWIDTH|SIMPLE;
1399 }
1400 }
1401 break;
1402 default:
1403 {
1404 int len, ilen, c;
1405 char ender;
1406
1407 regparse--;
1408
1409 if (parse_flags & PARSE_PCRE) {
1410 if (regparsestr[regparse] == '{')
1411 FAIL("`{' follows nothing in pattern");
1412 if (regparsestr[regparse] == '}')
1413 FAIL("unmatched `}' in pattern");
1414 if (regparsestr[regparse] == ']')
1415 FAIL("unmatched `]' in pattern");
1416 }
1417
1418 for (len = ilen = 0; regparse + ilen < regparse_end; len++, ilen++) {
1419 if (regparsestr[regparse + ilen] == '\\') {
1420 if (regparse + ilen + 1 >= regparse_end)
1421 break;
1422 c = regparsestr[regparse + ilen + 1];
1423 if (((c >= 'a') && (c <= 'z'))
1424 || ((c >= 'A') && (c <= 'Z'))
1425 || ((c >= '0') && (c <= '9')))
1426 break;
1427 ilen++;
1428 } else if (regstrcspn(regparsestr + regparse + ilen, regparsestr + regparse + ilen + 1,
1429 (parse_flags & PARSE_PCRE) ? PCRE_META : META) < 1)
1430 break;
1431 }
1432 if (len <= 0)
1433 FAIL("internal disaster");
1434
1435 if ((len == 1) && at_start) {
1436 /* Maybe convert "x|y" to "[xy]", etc.: */
1437 ret = regranges(parse_flags, at_start);
1438 *flagp |= HASWIDTH|SIMPLE;
1439 } else {
1440 if (!(parse_flags & PARSE_CASE_SENS)) {
1441 /* Need case insensitivity? */
1442 int i;
1443 for (i = 0; i < ilen; i++) {
1444 c = regparsestr[regparse + i];
1445 if ((rx_toupper(c) != c)
1446 || (rx_tolower(c) != c)) {
1447 break;
1448 }
1449 }
1450 if (i >= ilen)
1451 parse_flags |= PARSE_CASE_SENS;
1452 }
1453
1454 ender = regparsestr[regparse+ilen];
1455 if (len > 1 && ISMULT(ender, parse_flags)) {
1456 /* Back off from ?+* operand. */
1457 len--;
1458 ilen--;
1459 if (regparsestr[regparse + ilen] == '\\')
1460 --ilen;
1461 }
1462 *flagp |= HASWIDTH;
1463 if (len == 1)
1464 *flagp |= SIMPLE;
1465 regmatchmin = regmatchmax = len;
1466 ret = regnode((parse_flags & PARSE_CASE_SENS) ? EXACTLY : EXACTLY_CI);
1467 regarg(len);
1468 while (len > 0) {
1469 c = regparsestr[regparse++];
1470 if (c == '\\')
1471 c = regparsestr[regparse++];
1472 if (!(parse_flags & PARSE_CASE_SENS))
1473 c = rx_tolower(c);
1474 regc(c);
1475 len--;
1476 }
1477 }
1478 }
1479 break;
1480 }
1481
1482 if (!ret)
1483 FAIL("failed!?");
1484
1485 return ret;
1486 }
1487
regcharclass(int c,char * map,int * _non_ascii)1488 static int regcharclass(int c, char *map, int *_non_ascii)
1489 {
1490 switch(c) {
1491 case 'd':
1492 for (c = 0; c < 10; c++) {
1493 map['0' + c] = 1;
1494 }
1495 break;
1496 case 'D':
1497 for (c = 0; c < '0'; c++) {
1498 map[c] = 1;
1499 }
1500 for (c = '9' + 1; c < (_non_ascii ? 128 : 256); c++) {
1501 map[c] = 1;
1502 }
1503 if (_non_ascii)
1504 *_non_ascii = 1;
1505 break;
1506 case 'w':
1507 for (c = 0; c < 26; c++) {
1508 map['a' + c] = 1;
1509 map['A' + c] = 1;
1510 }
1511 for (c = 0; c < 10; c++) {
1512 map['0' + c] = 1;
1513 }
1514 map['_'] = 1;
1515 break;
1516 case 'W':
1517 for (c = 0; c < '0'; c++) {
1518 map[c] = 1;
1519 }
1520 for (c = '9' + 1; c < 'A'; c++) {
1521 map[c] = 1;
1522 }
1523 for (c = 'Z' + 1; c < '_'; c++) {
1524 map[c] = 1;
1525 }
1526 for (c = 'z' + 1; c < (_non_ascii ? 128 : 256); c++) {
1527 map[c] = 1;
1528 }
1529 if (_non_ascii)
1530 *_non_ascii = 1;
1531 break;
1532 case 's':
1533 map['\t'] = 1;
1534 map['\n'] = 1;
1535 map['\f'] = 1;
1536 map['\r'] = 1;
1537 map[' '] = 1;
1538 break;
1539 case 'S':
1540 for (c = 0; c < (_non_ascii ? 128 : 256); c++) {
1541 switch (c) {
1542 case '\t':
1543 case '\n':
1544 case '\f':
1545 case '\r':
1546 case ' ':
1547 break;
1548 default:
1549 map[c] = 1;
1550 break;
1551 }
1552 }
1553 if (_non_ascii)
1554 *_non_ascii = 1;
1555 break;
1556 default:
1557 if (((c >= 'a') && (c <= 'z'))
1558 || ((c >= 'A') && (c <= 'Z'))) {
1559 FAIL("illegal alphabetic escape");
1560 }
1561 map[c] = 1;
1562 break;
1563 }
1564
1565 return 1;
1566 }
1567
is_posix_char_class(char * str,int pos,int len,char * map)1568 static int is_posix_char_class(char *str, int pos, int len, char *map)
1569 {
1570 int c;
1571
1572 if (pos + 8 <= len) {
1573 if (!scheme_strncmp(":alnum:]", str XFORM_OK_PLUS pos, 8)) {
1574 if (map) {
1575 regcharclass('d', map, NULL);
1576 for (c = 'a'; c <= 'z'; c++) {
1577 map[c] = 1;
1578 map[c - ('a' - 'A')] = 1;
1579 }
1580 }
1581 return 1;
1582 } else if (!scheme_strncmp(":alpha:]", str XFORM_OK_PLUS pos, 8)) {
1583 if (map) {
1584 for (c = 'a'; c <= 'z'; c++) {
1585 map[c] = 1;
1586 map[c - ('a' - 'A')] = 1;
1587 }
1588 }
1589 return 1;
1590 } else if (!scheme_strncmp(":ascii:]", str XFORM_OK_PLUS pos, 8)) {
1591 if (map) {
1592 for (c = 0; c <= 127; c++) {
1593 map[c] = 1;
1594 }
1595 }
1596 return 1;
1597 } else if (!scheme_strncmp(":blank:]", str XFORM_OK_PLUS pos, 8)) {
1598 if (map) {
1599 map[' '] = 1;
1600 map['\t'] = 1;
1601 }
1602 return 1;
1603 } else if (!scheme_strncmp(":cntrl:]", str XFORM_OK_PLUS pos, 8)) {
1604 if (map) {
1605 for (c = 0; c <= 31; c++) {
1606 map[c] = 1;
1607 }
1608 }
1609 return 1;
1610 } else if (!scheme_strncmp(":digit:]", str XFORM_OK_PLUS pos, 8)) {
1611 if (map) {
1612 regcharclass('d', map, NULL);
1613 }
1614 return 1;
1615 } else if (!scheme_strncmp(":graph:]", str XFORM_OK_PLUS pos, 8)) {
1616 if (map) {
1617 for (c = 0; c <= 127; c++) {
1618 if (scheme_isgraphic(c))
1619 map[c] = 1;
1620 }
1621 }
1622 return 1;
1623 } else if (!scheme_strncmp(":lower:]", str XFORM_OK_PLUS pos, 8)) {
1624 if (map) {
1625 for (c = 'a'; c <= 'z'; c++) {
1626 map[c] = 1;
1627 }
1628 }
1629 return 1;
1630 } else if (!scheme_strncmp(":print:]", str XFORM_OK_PLUS pos, 8)) {
1631 if (map) {
1632 for (c = 0; c <= 127; c++) {
1633 if (scheme_isgraphic(c))
1634 map[c] = 1;
1635 }
1636 map[' '] = 1;
1637 map['\t'] = 1;
1638 }
1639 return 1;
1640 } else if (!scheme_strncmp(":space:]", str XFORM_OK_PLUS pos, 8)) {
1641 if (map) {
1642 regcharclass('s', map, NULL);
1643 }
1644 return 1;
1645 } else if (!scheme_strncmp(":upper:]", str XFORM_OK_PLUS pos, 8)) {
1646 if (map) {
1647 for (c = 'A'; c <= 'Z'; c++) {
1648 map[c] = 1;
1649 }
1650 }
1651 return 1;
1652 }
1653 }
1654
1655 if ((pos + 7 <= len)
1656 && !scheme_strncmp(":word:]", str XFORM_OK_PLUS pos, 7)) {
1657 if (map) {
1658 regcharclass('w', map, NULL);
1659 }
1660 return 1;
1661 }
1662
1663 if ((pos + 9 <= len)
1664 && !scheme_strncmp(":xdigit:]", str XFORM_OK_PLUS pos, 9)) {
1665 if (map) {
1666 regcharclass('d', map, NULL);
1667 for (c = 'a'; c <= 'f'; c++) {
1668 map[c] = 1;
1669 map[c - ('a' - 'A')] = 1;
1670 }
1671 }
1672 return 1;
1673 }
1674
1675 return 0;
1676 }
1677
is_posix_char_class_in_unicode(mzchar * str,int pos,int len,char * map)1678 static int is_posix_char_class_in_unicode(mzchar *str, int pos, int len, char *map)
1679 {
1680 int ulen;
1681 int i;
1682 char buf[10];
1683
1684 if (pos + 7 > len)
1685 return 0;
1686
1687 ulen = len - pos;
1688 if (ulen > 9)
1689 ulen = 9;
1690
1691 for (i = 0; i < ulen; i++) {
1692 if (str[pos + i] > 127)
1693 return 0;
1694 buf[i] = (char)str[pos + i];
1695 }
1696
1697 return is_posix_char_class(buf, 0, ulen, map);
1698 }
1699
regrange(int parse_flags,char * map)1700 static char *regrange(int parse_flags, char *map)
1701 /* [ is already consumed; result is an array of 256 bytes of included chars */
1702 {
1703 int xclass, c;
1704 int classend, can_range = 0;
1705 int exclude = 0;
1706
1707 if (regparsestr[regparse] == '^') { /* Complement of range. */
1708 exclude = 1;
1709 regparse++;
1710 }
1711
1712 if (regparsestr[regparse] == ']' || regparsestr[regparse] == '-') {
1713 c = regparsestr[regparse];
1714 map[c] = 1;
1715 regparse++;
1716 }
1717 while (regparse != regparse_end && regparsestr[regparse] != ']') {
1718 if (regparsestr[regparse] == '-') {
1719 regparse++;
1720 if (regparsestr[regparse] == ']' || regparse == regparse_end) {
1721 map['-'] = 1;
1722 } else {
1723 if (!can_range) {
1724 FAIL("misplaced hyphen within square brackets in pattern");
1725 } else {
1726 xclass = UCHAR(regparsestr[regparse-2])+1;
1727 classend = UCHAR(regparsestr[regparse]);
1728 if (classend == '-') {
1729 FAIL("misplaced hyphen within square brackets in pattern");
1730 }
1731 if ((classend == '\\') && (parse_flags & PARSE_PCRE)) {
1732 if (regparse+1 == regparse_end) {
1733 FAIL("escaping backslash at end pattern (within square brackets)");
1734 }
1735 regparse++;
1736 classend = UCHAR(regparsestr[regparse]);
1737 if (((classend >= 'a') && (classend <= 'z'))
1738 || ((classend >= 'A') && (classend <= 'Z'))) {
1739 FAIL("misplaced hyphen within square brackets in pattern");
1740 }
1741 }
1742 if (xclass > classend+1)
1743 FAIL("invalid range within square brackets in pattern");
1744 for (; xclass <= classend; xclass++) {
1745 c = xclass;
1746 map[c] = 1;
1747 if (!(parse_flags & PARSE_CASE_SENS)) {
1748 c = rx_toupper(c);
1749 map[c] = 1;
1750 c = rx_tolower(c);
1751 map[c] = 1;
1752 }
1753 }
1754 regparse++;
1755 }
1756 }
1757 can_range = 0;
1758 } else if ((regparsestr[regparse] == '\\') && (parse_flags & PARSE_PCRE)) {
1759 c = UCHAR(regparsestr[regparse + 1]);
1760 if (((c >= 'a') && (c <= 'z'))
1761 || ((c >= 'A') && (c <= 'Z'))) {
1762 regcharclass(c, map, NULL);
1763 can_range = 0;
1764 } else {
1765 map[c] = 1;
1766 can_range = 1;
1767 }
1768 regparse += 2;
1769 } else if ((regparsestr[regparse] == '[')
1770 && (parse_flags & PARSE_PCRE)
1771 && (regparsestr[regparse+1] == ':')
1772 && is_posix_char_class(regparsestr, regparse + 1, regparse_end, map)) {
1773 regparse += 2;
1774 while (regparsestr[regparse] != ']') {
1775 regparse++;
1776 }
1777 regparse++;
1778 can_range = 0;
1779 } else {
1780 c = UCHAR(regparsestr[regparse++]);
1781 map[c] = 1;
1782 if (!(parse_flags & PARSE_CASE_SENS)) {
1783 c = rx_tolower(c);
1784 map[c] = 1;
1785 c = rx_toupper(c);
1786 map[c] = 1;
1787 }
1788 can_range = 1;
1789 }
1790 }
1791
1792 if (exclude) {
1793 for (c = 0; c < 256; c++) {
1794 map[c] = !map[c];
1795 }
1796 }
1797
1798 if (regparsestr[regparse] != ']')
1799 FAIL("missing closing square bracket in pattern");
1800 regparse++;
1801
1802 return map;
1803 }
1804
1805 static rxpos
regranges(int parse_flags,int at_start)1806 regranges(int parse_flags, int at_start)
1807 {
1808 int c;
1809 rxpos ret, save_regparse = 0;
1810 int count, off_ranges, on_ranges, now_on, last_on, prev_last_on;
1811 #ifdef COUNT_CI_CHARS
1812 /* These could be used to pick an encoding as a _CI variant, but
1813 _CI variants are not picked currently: */
1814 int all_ci, num_ci;
1815 #endif
1816 char *new_map = NULL, *accum_map = NULL;
1817
1818 count = 0;
1819 while (1) {
1820 /* This loop can end up parsing a range and not using the result,
1821 so that the range is parsed twice. That's ok, because there's
1822 no nesting (and therefore no exponential explosion). */
1823
1824 if (!new_map)
1825 new_map = (char *)scheme_malloc_atomic(256);
1826 memset(new_map, 0, 256);
1827
1828 if (regparsestr[regparse] == '\\'
1829 && (regparse + 1 < regparse_end)) {
1830 /* \<char> */
1831 c = UCHAR(regparsestr[++regparse]);
1832 if (parse_flags & PARSE_PCRE) {
1833 if ((c >= '0') && (c <= '9'))
1834 break;
1835 if (((c >= 'a') && (c <= 'z'))
1836 || ((c >= 'A') && (c <= 'Z'))) {
1837 if ((c == 'p') || (c == 'P')) {
1838 /* unicode char class; give up */
1839 break;
1840 }
1841 regcharclass(regparsestr[regparse], new_map, NULL);
1842
1843 } else
1844 new_map[c] = 1;
1845 } else
1846 new_map[c] = 1;
1847 regparse++;
1848 } else if (regstrcspn(regparsestr + regparse, regparsestr + regparse + 1,
1849 (parse_flags & PARSE_PCRE) ? PCRE_META : META)) {
1850 /* <char> */
1851 c = UCHAR(regparsestr[regparse]);
1852 new_map[c] = 1;
1853 if (!(parse_flags & PARSE_CASE_SENS)) {
1854 c = rx_tolower(c);
1855 new_map[c] = 1;
1856 c = rx_toupper(c);
1857 new_map[c] = 1;
1858 }
1859 regparse++;
1860 } else if (regparsestr[regparse] == '.') {
1861 /* . */
1862 for (c = 0; c < 256; c++) {
1863 new_map[c] = 1;
1864 }
1865 if (!(parse_flags & PARSE_SINGLE_LINE))
1866 new_map['\n'] = 0;
1867 regparse++;
1868 } else if (regparsestr[regparse] == '[') {
1869 /* [...] */
1870 regparse++;
1871 regrange(parse_flags, new_map);
1872 } else
1873 break;
1874
1875 /* If the most recently parsed range is not
1876 continued by a branch or the end of a sub-sequence,
1877 then abandon it, because it actually belongs
1878 with a new sequence. */
1879 if (accum_map
1880 && (regparse < regparse_end)
1881 && (regparsestr[regparse] != '|')
1882 && (regparsestr[regparse] != ')'))
1883 break;
1884
1885 /* We'll keep it. Merge char maps so far: */
1886 if (accum_map) {
1887 for (c = 0; c < 256; c++) {
1888 accum_map[c] |= new_map[c];
1889 }
1890 } else {
1891 accum_map = new_map;
1892 new_map = NULL;
1893 }
1894 save_regparse = regparse;
1895
1896 /* If we're at the end, or if we can only do one, then we're done. */
1897 if (!at_start
1898 || (regparsestr[regparse] != '|')
1899 || (regparse >= regparse_end)
1900 || (regparsestr[regparse] == ')'))
1901 break;
1902
1903 regparse++;
1904 if (regparse == regparse_end)
1905 break;
1906 }
1907
1908 regparse = save_regparse;
1909
1910 if (!accum_map)
1911 FAIL("should have found one range!");
1912
1913 while (1) {
1914 /* Collect stats to pick the best run-time implementation for a range.
1915 We may do this twice if we decide to use a _CI variant. */
1916 count = 0;
1917 #ifdef COUNT_CI_CHARS
1918 num_ci = 0;
1919 all_ci = 1;
1920 #endif
1921 on_ranges = 0;
1922 off_ranges = 0;
1923 now_on = 0;
1924 last_on = -1;
1925 prev_last_on = -1;
1926 for (c = 0; c < 256; c++) {
1927 if (accum_map[c]) {
1928 if (now_on < 0)
1929 off_ranges++;
1930 now_on = 1;
1931 count++;
1932 prev_last_on = last_on;
1933 last_on = c;
1934
1935 #ifdef COUNT_CI_CHARS
1936 if (c != rx_tolower(c)) {
1937 if (accum_map[rx_tolower(c)] != accum_map[c])
1938 all_ci = 0;
1939 num_ci++;
1940 } else if (c != rx_toupper(c)) {
1941 if (accum_map[rx_toupper(c)] != accum_map[c])
1942 all_ci = 0;
1943 num_ci++;
1944 }
1945 #endif
1946 } else {
1947 if (now_on > 0)
1948 on_ranges++;
1949 now_on = -1;
1950 }
1951 }
1952 if (now_on > 0)
1953 on_ranges++;
1954 else
1955 off_ranges++;
1956
1957 /* Pick the best run-time implementation for a range. */
1958 if (count == 256) {
1959 return regnode(ANY);
1960 } else if ((count == 255) && !accum_map['\n']) {
1961 return regnode(ANYL);
1962 } else if (count == 1) {
1963 ret = regnode(EXACTLY1);
1964 regc(last_on);
1965 return ret;
1966 } else if (count == 2) {
1967 ret = regnode(EXACTLY2);
1968 regc(last_on);
1969 regc(prev_last_on);
1970 return ret;
1971 } else if ((on_ranges == 1)
1972 || (off_ranges == 1)) {
1973 int rs = 255, re = 255, on;
1974
1975 if (on_ranges == 1)
1976 on = 1;
1977 else
1978 on = 0;
1979
1980 for (c = 0; c < 256; c++) {
1981 if (!!accum_map[c] == on) {
1982 rs = c;
1983 break;
1984 }
1985 }
1986 for (c++; c < 256; c++) {
1987 if ((!accum_map[c]) == on) {
1988 re = c - 1;
1989 break;
1990 }
1991 }
1992
1993 if (on)
1994 ret = regnode(RANGE);
1995 else
1996 ret = regnode(NOTRANGE);
1997 regc(rs);
1998 regc(re);
1999 return ret;
2000 } else {
2001 rxpos a;
2002
2003 ret = regnode(ANYOF);
2004 a = regcode;
2005 for (c = 0; c < 32; c++) {
2006 regc(0);
2007 }
2008
2009 if (regcode <= regcodesize) {
2010 for (c = 0; c < 256; c++) {
2011 if (accum_map[c]) {
2012 regstr[a + (c >> 3)] |= (1 << (c & 0x7));
2013 }
2014 }
2015 }
2016
2017 return ret;
2018 }
2019 }
2020 }
2021
2022 READ_ONLY static const char *prop_names[] = { "Cn",
2023 "Cc",
2024 "Cf",
2025 "Cs",
2026 "Co",
2027 "Ll",
2028 "Lu",
2029 "Lt",
2030 "Lm",
2031 "Lo",
2032 "Nd",
2033 "Nl",
2034 "No",
2035 "Ps",
2036 "Pe",
2037 "Pi",
2038 "Pf",
2039 "Pc",
2040 "Pd",
2041 "Po",
2042 "Mn",
2043 "Mc",
2044 "Me",
2045 "Sc",
2046 "Sk",
2047 "Sm",
2048 "So",
2049 "Zl",
2050 "Zp",
2051 "Zs",
2052 NULL};
2053
2054 static rxpos
regunicode(int negate)2055 regunicode(int negate)
2056 {
2057 rxpos ret;
2058 int len, bottom, top, i;
2059
2060 if (regparsestr[regparse] != '{') {
2061 FAIL("expected { after \\p or \\P");
2062 }
2063 regparse++;
2064 if (regparsestr[regparse] == '^') {
2065 negate = !negate;
2066 regparse++;
2067 }
2068
2069 len = 0;
2070 while ((regparsestr[regparse + len] != '}')
2071 && (regparse + len < regparse_end)) {
2072 len++;
2073 }
2074
2075 if (regparse + len >= regparse_end) {
2076 FAIL("missing } to close \\p{ or \\P{");
2077 }
2078
2079 bottom = top = -1;
2080 if (len == 2) {
2081 for (i = 0; prop_names[i]; i++) {
2082 if ((regparsestr[regparse] == prop_names[i][0])
2083 && (regparsestr[regparse+1] == prop_names[i][1])) {
2084 bottom = top = i;
2085 break;
2086 }
2087 }
2088 if (bottom == -1) {
2089 if ((regparsestr[regparse] == 'L')
2090 && (regparsestr[regparse+1] == '&')) {
2091 bottom = mzu_Ll;
2092 top = mzu_Lm;
2093 }
2094 }
2095 } else if (len == 1) {
2096 if (regparsestr[regparse] == '.') {
2097 bottom = 0;
2098 top = mzu_LAST;
2099 } else {
2100 for (i = 0; prop_names[i]; i++) {
2101 if (regparsestr[regparse] == prop_names[i][0]) {
2102 bottom = i;
2103 while (prop_names[i+1]) {
2104 if (regparsestr[regparse] != prop_names[i+1][0])
2105 break;
2106 i++;
2107 }
2108 top = i;
2109 break;
2110 }
2111 }
2112 }
2113 }
2114
2115 if (bottom < 0) {
2116 FAIL("unrecognized property name in \\p{} or \\P{}");
2117 }
2118
2119 regparse += len + 1;
2120
2121 ret = regnode(UNIPROP);
2122 /* This encoding accommodates up to 63 categories: */
2123 regarg((negate << 13) | (bottom << 6) | top);
2124
2125 return ret;
2126 }
2127
regdigit()2128 static int regdigit()
2129 {
2130 int posn, c;
2131 c = regparsestr[regparse++];
2132 posn = c - '0';
2133 while (regparse < regparse_end) {
2134 c = regparsestr[regparse];
2135 if ((c >= '0') && (c <= '9')) {
2136 posn = (posn * 10) + (c - '0');
2137 if (posn > 0x7FFF)
2138 FAIL("backreference number is too large");
2139 regparse++;
2140 } else
2141 break;
2142 }
2143 if (posn > regmaxbackposn)
2144 regmaxbackposn = posn;
2145 return posn;
2146 }
2147
2148 /*
2149 - regnode - emit a node
2150 */
2151 static rxpos /* Location. */
regnode(char op)2152 regnode(char op)
2153 {
2154 rxpos ret;
2155 rxpos ptr;
2156
2157 ret = regcode;
2158 if (regcode + 3 >= regcodesize) {
2159 regcode += 3;
2160 if (regcode > regcodemax)
2161 regcodemax = regcode;
2162 return ret;
2163 }
2164
2165 ptr = ret;
2166 regstr[ptr++] = op;
2167 regstr[ptr++] = '\0'; /* Null "next" pointer. */
2168 regstr[ptr++] = '\0';
2169 regcode = ptr;
2170
2171 if (regcode > regcodemax)
2172 regcodemax = regcode;
2173
2174 return ret;
2175 }
2176
2177 /*
2178 - regc - emit (if appropriate) a byte of code
2179 */
2180 static void
regc(char b)2181 regc(char b)
2182 {
2183 if (regcode + 1 < regcodesize)
2184 regstr[regcode] = b;
2185 regcode++;
2186 if (regcode > regcodemax)
2187 regcodemax = regcode;
2188 }
2189
2190 static void
regarg(int v)2191 regarg(int v)
2192 {
2193 regc(v >> 8);
2194 regc(v & 255);
2195 }
2196
2197 /*
2198 - reginsert - insert an operator in front of already-emitted operand
2199 *
2200 * Means relocating the operand.
2201 */
2202 static void
regshift(int amt,rxpos opnd)2203 regshift(int amt, rxpos opnd)
2204 {
2205 if (regcode + amt < regcodesize) {
2206 memmove(regstr XFORM_OK_PLUS opnd + amt,
2207 regstr XFORM_OK_PLUS opnd,
2208 regcode - opnd);
2209 }
2210 regcode += amt;
2211 if (regcode > regcodemax)
2212 regcodemax = regcode;
2213 }
2214
2215 static void
reginsert(char op,rxpos opnd)2216 reginsert(char op, rxpos opnd)
2217 {
2218 regshift(3, opnd);
2219
2220 if (opnd + 3 >= regcodesize) {
2221 return;
2222 }
2223
2224 regstr[opnd++] = op;
2225 regstr[opnd++] = '\0'; /* tail */
2226 regstr[opnd++] = '\0';
2227 }
2228
2229 static rxpos
reginsertwithop(char op,rxpos opnd,int arg)2230 reginsertwithop(char op, rxpos opnd, int arg)
2231 {
2232 regshift(5, opnd);
2233
2234 if (opnd + 5 >= regcodesize) {
2235 return opnd + 5;
2236 }
2237
2238 regstr[opnd++] = op;
2239 regstr[opnd++] = '\0'; /* tail */
2240 regstr[opnd++] = '\0';
2241 regstr[opnd++] = (arg >> 8);
2242 regstr[opnd++] = (arg & 255);
2243
2244 return opnd;
2245 }
2246
2247 static rxpos
reginsertwithopop(char op,rxpos opnd,int arg,int arg2)2248 reginsertwithopop(char op, rxpos opnd, int arg, int arg2)
2249 {
2250 regshift(7, opnd);
2251
2252 if (opnd + 7 >= regcodesize) {
2253 return opnd + 7;
2254 }
2255
2256 regstr[opnd++] = op;
2257 regstr[opnd++] = '\0'; /* tail */
2258 regstr[opnd++] = '\0';
2259 regstr[opnd++] = (arg >> 8);
2260 regstr[opnd++] = (arg & 255);
2261 regstr[opnd++] = (arg2 >> 8);
2262 regstr[opnd++] = (arg2 & 255);
2263
2264 return opnd;
2265 }
2266
2267 /*
2268 - regtail - set the next-pointer at the end of a node chain
2269 */
2270 static void
regtail(rxpos p,rxpos val)2271 regtail(rxpos p, rxpos val)
2272 {
2273 rxpos scan;
2274 rxpos temp;
2275 int offset;
2276
2277 /* Find last node. */
2278 scan = p;
2279 for (;;) {
2280 if (scan + 2 >= regcodesize) {
2281 return;
2282 }
2283 temp = regnext(scan);
2284 if (temp == 0)
2285 break;
2286 scan = temp;
2287 }
2288
2289 if (scan + 2 >= regcodesize) {
2290 return;
2291 }
2292
2293 if (rOP(scan) == BACK)
2294 offset = scan - val;
2295 else
2296 offset = val - scan;
2297 regstr[scan+1] = (offset>>8)&255;
2298 regstr[scan+2] = offset&255;
2299 }
2300
2301 /*
2302 - regoptail - regtail on operand of first argument; nop if operandless
2303 */
2304 static void
regoptail(rxpos p,rxpos val)2305 regoptail(rxpos p, rxpos val)
2306 {
2307 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
2308 if (p == 0 || (p >= regcodesize) || rOP(p) != BRANCH) {
2309 return;
2310 }
2311 regtail(OPERAND(p), val);
2312 }
2313
merge_tables(Scheme_Hash_Table * dest,Scheme_Hash_Table * src)2314 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src)
2315 {
2316 int i;
2317
2318 for (i = src->size; i--; ) {
2319 if (src->vals[i]) {
2320 scheme_hash_set(dest, src->keys[i], src->vals[i]);
2321 }
2322 }
2323
2324 return 1;
2325 }
2326
check_and_propagate_depends(void)2327 static int check_and_propagate_depends(void)
2328 {
2329 int i, j;
2330 Scheme_Hash_Table *backdepends = regbackdepends, *ht, *next_ht = NULL;
2331 Scheme_Object *v;
2332
2333 while (backdepends) {
2334 for (i = backdepends->size; i--; ) {
2335 if (backdepends->vals[i]) {
2336 if (regbackknown)
2337 v = scheme_hash_get(regbackknown, backdepends->keys[i]);
2338 else
2339 v = NULL;
2340 if (v) {
2341 /* Check assumption: */
2342 if (SCHEME_FALSEP(v)) {
2343 FAIL("*, +, or {...,} operand could be empty (via empty backreference)");
2344 }
2345 if (SCHEME_HASHTP(v)) {
2346 /* Check/propagate assumption. The fixpoint direction is
2347 determined by assuming "true" while recursively checking. */
2348 scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
2349 if (!next_ht)
2350 next_ht = scheme_make_hash_table(SCHEME_hash_ptr);
2351 ht = (Scheme_Hash_Table *)v;
2352 for (j = ht->size; j--; ) {
2353 if (ht->vals[j]) {
2354 scheme_hash_set(next_ht, ht->keys[j], ht->vals[j]);
2355 }
2356 }
2357 }
2358 } else {
2359 /* Add assumption */
2360 if (!regbackknown)
2361 regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
2362 scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
2363 }
2364 }
2365 }
2366 backdepends = next_ht;
2367 next_ht = NULL;
2368 }
2369
2370 return 1;
2371 }
2372
l_strchr(char * str,rxpos a,int l,int c)2373 static MZ_INLINE rxpos l_strchr(char *str, rxpos a, int l, int c)
2374 {
2375 int i;
2376
2377 for (i = 0; i < l; i++) {
2378 if (str[a + i] == c)
2379 return a + i;
2380 }
2381
2382 return -1;
2383 }
2384
l_strchr_ci(char * str,rxpos a,int l,int c)2385 static MZ_INLINE rxpos l_strchr_ci(char *str, rxpos a, int l, int c)
2386 {
2387 int i, ch;
2388
2389 for (i = 0; i < l; i++) {
2390 ch = str[a + i];
2391 ch = rx_tolower(ch);
2392 if (ch == c)
2393 return a + i;
2394 }
2395
2396 return -1;
2397 }
2398
2399 #if 0
2400 static MZ_INLINE int in_ranges(char *str, rxpos a, int l, int c)
2401 {
2402 int i;
2403
2404 l *= 2;
2405
2406 for (i = 0; i < l; i += 2) {
2407 if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
2408 return 1;
2409 }
2410
2411 return 0;
2412 }
2413
2414 static MZ_INLINE int in_ranges_ci(char *str, rxpos a, int l, int c)
2415 {
2416 int i;
2417
2418 l *= 2;
2419
2420 c = rx_tolower(c);
2421
2422 for (i = 0; i < l; i += 2) {
2423 if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
2424 return 1;
2425 }
2426
2427 return 0;
2428 }
2429 #endif
2430
2431 /*
2432 * regexec and friends
2433 */
2434
2435 /*
2436 * Forwards.
2437 */
2438 static int regtry(regexp *, char *, int, int, rx_lazy_str_t *, rxpos *, rxpos *, rxpos *, rxpos *, int *, Regwork *rw, rxpos,
2439 char *, rxpos, rxpos, int);
2440 static int regtry_port(regexp *, Scheme_Object *, Scheme_Object *, int nonblock,
2441 rxpos *, rxpos *, rxpos *, rxpos *, int *,
2442 char **, rxpos *, rxpos *, rxpos, Scheme_Object*, Scheme_Object*, rxpos,
2443 char*, rxpos, rxpos,
2444 int, int *);
2445 static int regmatch(Regwork *rw, rxpos);
2446 static int regrepeat(Regwork *rw, rxpos, int);
2447
stack_room(Regwork * rw,int amt)2448 static void stack_room(Regwork *rw, int amt)
2449 {
2450 if (rw->rewind_stack_count + amt > rw->rewind_stack_size) {
2451 int sz;
2452 rxpos *p;
2453 sz = rw->rewind_stack_size * 2;
2454 if (!sz) sz = MATCH_STACK_SIZE;
2455 p = (rxpos *)scheme_malloc_atomic(sizeof(rxpos)*sz);
2456 if (rw->rewind_stack_size)
2457 memcpy(p, rw->rewind_stack, rw->rewind_stack_size * sizeof(rxpos));
2458 rw->rewind_stack = p;
2459 rw->rewind_stack_size = sz;
2460 }
2461 }
2462
match_push(Regwork * rw)2463 static int match_push(Regwork *rw)
2464 {
2465 if (rw->non_tail >= 0) {
2466 int pos;
2467
2468 rw->non_tail++;
2469 pos = rw->rewind_stack_count;
2470 rw->rewind_stack_prompt = pos;
2471
2472 return pos;
2473 } else
2474 return 0;
2475 }
2476
match_pop(Regwork * rw,int pos,int matched)2477 static void match_pop(Regwork *rw, int pos, int matched)
2478 {
2479 if (rw->non_tail >= 0) {
2480 --rw->non_tail;
2481
2482 if (matched) {
2483 /* Save elements on stack in case an enclosing match
2484 needs to rewind. Area between prompt and pos are
2485 mapping that don't need to be re-recorded. */
2486 rw->rewind_stack_prompt = pos;
2487 } else {
2488 int i, no;
2489 for (i = rw->rewind_stack_count; i > pos; i -= 3) {
2490 no = rw->rewind_stack[i-3];
2491 if (no < 0) {
2492 rw->maybep[-no] = rw->rewind_stack[i-2];
2493 } else {
2494 rw->startp[no] = rw->rewind_stack[i-2];
2495 rw->endp[no] = rw->rewind_stack[i-1];
2496 }
2497 }
2498 rw->rewind_stack_count = pos;
2499 rw->rewind_stack_prompt = pos;
2500 }
2501 }
2502 }
2503
match_set(Regwork * rw,int no,rxpos start,rxpos end)2504 static void match_set(Regwork *rw, int no, rxpos start, rxpos end)
2505 {
2506 int i, count;
2507
2508 if (rw->non_tail > 0) {
2509 count = rw->rewind_stack_count;
2510 for (i = rw->rewind_stack_prompt; i < count; i += 3) {
2511 if (rw->rewind_stack[i] == no)
2512 break;
2513 }
2514
2515 if (i >= count) {
2516 stack_room(rw, 3);
2517 i = count;
2518 rw->rewind_stack[i++] = no;
2519 rw->rewind_stack[i++] = rw->startp[no];
2520 rw->rewind_stack[i++] = rw->endp[no];
2521 rw->rewind_stack_count = i;
2522 }
2523 }
2524
2525 rw->startp[no] = start;
2526 rw->endp[no] = end;
2527 }
2528
match_maybe(Regwork * rw,int no,rxpos pos)2529 static void match_maybe(Regwork *rw, int no, rxpos pos)
2530 {
2531 int i, count;
2532
2533 if (rw->non_tail > 0) {
2534 count = rw->rewind_stack_count;
2535 for (i = rw->rewind_stack_prompt; i < count; i += 3) {
2536 if (rw->rewind_stack[i] == (- no))
2537 break;
2538 }
2539
2540 if (i >= count) {
2541 stack_room(rw, 3);
2542 i = count;
2543 rw->rewind_stack[i++] = (- no);
2544 rw->rewind_stack[i++] = rw->maybep[no];
2545 rw->rewind_stack[i++] = 0;
2546 rw->rewind_stack_count = i;
2547 }
2548 }
2549
2550 rw->maybep[no] = pos;
2551 }
2552
2553 #ifdef DEBUG
2554 int regnarrate = 0;
2555 void regdump();
2556 static char *regprop();
2557 #endif
2558
2559 #define REGPORT_FLUSH_THRESHOLD 256
2560
2561 /*
2562 - regexec - match a regexp against a string
2563 */
2564 static int
regexec(const char * who,regexp * prog,char * string,int stringpos,int stringlen,int stringorigin,rx_lazy_str_t * lazy_string,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,Scheme_Object * port,Scheme_Object * unless_evt,int nonblock,char ** stringp,int peek,int get_offsets,intptr_t save_prior,Scheme_Object * discard_oport,Scheme_Object * portstart,Scheme_Object * portend,Scheme_Object ** _dropped,char * prefix,rxpos prefix_len,rxpos prefix_offset)2565 regexec(const char *who,
2566 regexp *prog, char *string,
2567 /* Used only for (bytes) strings: */
2568 int stringpos, int stringlen, int stringorigin,
2569 /* For lazy strings: */
2570 rx_lazy_str_t *lazy_string,
2571 /* Always used: */
2572 rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack,
2573 /* For port mode: */
2574 Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
2575 /* Used only when port is non-NULL: */
2576 char **stringp, int peek, int get_offsets, intptr_t save_prior,
2577 Scheme_Object *discard_oport,
2578 Scheme_Object *portstart, Scheme_Object *portend, Scheme_Object **_dropped,
2579 char *prefix, rxpos prefix_len, rxpos prefix_offset)
2580 {
2581 int spos;
2582 int *counters;
2583 Scheme_Object *dropped = NULL, *peekskip = NULL; /* used for ports, only */
2584
2585 /* Check validity of program. */
2586 if (UCHAR(prog->program[0]) != MAGIC) {
2587 regerror("corrupted program");
2588 return(0);
2589 }
2590
2591 /* If there is a "must appear" string, look for it. */
2592 if (!port && !lazy_string && (prog->regmust >= 0)) {
2593 spos = stringpos;
2594 while (1) {
2595 int i, l = prog->regmlen, ch, pos;
2596 GC_CAN_IGNORE char *p;
2597
2598 if ((spos - stringpos) + l <= stringlen) {
2599 if (prog->flags & REGEXP_MUST_CI)
2600 pos = l_strchr_ci(string, spos, stringlen - (spos - stringpos) - (l - 1),
2601 (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
2602 else
2603 pos = l_strchr(string, spos, stringlen - (spos - stringpos) - (l - 1),
2604 (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
2605 if (pos == -1)
2606 return 0; /* Not present. */
2607 } else
2608 return 0; /* Not present, since there's not enough room left. */
2609
2610 /* ASSUMING NO GC HERE! */
2611 p = (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust);
2612 if (prog->flags & REGEXP_MUST_CI) {
2613 for (i = 0; i < l; i++) {
2614 ch = string[pos + i];
2615 ch = rx_tolower(ch);
2616 if (ch != p[i])
2617 break;
2618 }
2619 } else {
2620 for (i = 0; i < l; i++) {
2621 if (string[pos + i] != p[i])
2622 break;
2623 }
2624 }
2625 if (i >= l)
2626 break; /* Found it. */
2627 spos = pos + 1;
2628 }
2629 }
2630
2631 if (prog->ncounter) {
2632 counters = (int *)scheme_malloc_atomic(sizeof(int) * prog->ncounter);
2633 } else
2634 counters = NULL;
2635
2636 if (port) {
2637 if (peek) {
2638 peekskip = portstart;
2639 dropped = portstart;
2640 /* Make sure that's there's not an EOF before peekskip: */
2641 if (!SAME_OBJ(peekskip, scheme_make_integer(0))) {
2642 char tmp[1];
2643 intptr_t got;
2644 got = scheme_get_byte_string_unless("regexp-match", port,
2645 tmp, 0, 1, 1,
2646 1, scheme_bin_minus(peekskip, scheme_make_integer(1)),
2647 unless_evt);
2648 if (got == EOF) {
2649 /* Hit EOF before peekstart, so cannot match */
2650 return 0;
2651 }
2652 }
2653 } else {
2654 /* In non-peek port mode, skip over portstart chars: */
2655 intptr_t amt, got;
2656
2657 if (SCHEME_INTP(portstart)) {
2658 amt = SCHEME_INT_VAL(portstart);
2659 if (amt > 4096)
2660 amt = 4096;
2661 } else
2662 amt = 4096;
2663
2664 dropped = scheme_make_integer(0);
2665
2666 if (amt) {
2667 char *drain;
2668
2669 drain = (char *)scheme_malloc_atomic(amt);
2670
2671 do {
2672 got = scheme_get_byte_string(who, port, drain, 0, amt, 0, 0, 0);
2673 if (got != EOF) {
2674 Scheme_Object *delta;
2675
2676 if (discard_oport)
2677 scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
2678
2679 dropped = scheme_bin_plus(dropped, scheme_make_integer(got));
2680 delta = scheme_bin_minus(portstart, dropped);
2681 if (scheme_bin_gt(scheme_make_integer(amt), delta))
2682 amt = SCHEME_INT_VAL(delta);
2683 }
2684 } while ((got != EOF) && amt);
2685 if (amt)
2686 return 0; /* can't skip far enough, so it fails */
2687 }
2688 }
2689
2690 if (portend)
2691 portend = scheme_bin_minus(portend, dropped);
2692 }
2693
2694 /* Simplest case: anchored match need be tried only once. */
2695 if (prog->flags & REGEXP_ANCH) {
2696 if (port) {
2697 rxpos len = 0, space = 0;
2698 int aborted = 0;
2699
2700 *stringp = NULL;
2701 if (regtry_port(prog, port, unless_evt, nonblock,
2702 startp, maybep, endp, match_stack, counters, stringp, &len, &space, 0,
2703 portend, peekskip, 0, prefix, prefix_len, prefix_offset, 0,
2704 &aborted)) {
2705 if (!peek) {
2706 /* Need to consume matched chars: */
2707 char *drain;
2708
2709 if (discard_oport && *startp)
2710 scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
2711
2712 if (get_offsets)
2713 drain = *stringp;
2714 else
2715 /* Allocate fresh in case we get different results from previous peek: */
2716 drain = (char *)scheme_malloc_atomic(*endp);
2717 (void)scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
2718 }
2719
2720 *_dropped = dropped;
2721
2722 return 1;
2723 } else {
2724 if (!peek) {
2725 /* Need to consume all chars, up to portend */
2726 char *drain;
2727 intptr_t got;
2728
2729 if (portend && SCHEME_INTP(portend) && SCHEME_INT_VAL(portend) < 4096) {
2730 got = SCHEME_INT_VAL(portend);
2731 } else
2732 got = 4096;
2733
2734 drain = (char *)scheme_malloc_atomic(got);
2735
2736 while ((got = scheme_get_byte_string(who, port, drain, 0, got, 0, 0, 0)) != EOF) {
2737 if (discard_oport)
2738 scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
2739
2740 if (portend) {
2741 portend = scheme_bin_minus(portend, scheme_make_integer(got));
2742 if (SCHEME_INTP(portend)) {
2743 got = SCHEME_INT_VAL(portend);
2744 if (!got)
2745 break;
2746 else if (got > 4096)
2747 got = 4096;
2748 }
2749 } else
2750 got = 4096;
2751 }
2752 }
2753 return 0;
2754 }
2755 } else
2756 return regtry(prog, string, stringpos, stringlen, lazy_string, startp, maybep, endp,
2757 match_stack, counters, 0,
2758 stringorigin, prefix, prefix_len, prefix_offset, 0);
2759 }
2760
2761 /* Messy cases: unanchored match. */
2762 if (port) {
2763 rxpos len = 0, skip = 0, space = 0;
2764 *stringp = NULL;
2765
2766 do {
2767 int discard = skip - prog->maxlookback;
2768 int aborted = 0;
2769
2770 if (discard > skip - save_prior)
2771 discard = skip - save_prior;
2772
2773 if (discard >= REGPORT_FLUSH_THRESHOLD) {
2774 if (!peek) {
2775 if (discard_oport)
2776 scheme_put_byte_string(who, discard_oport, *stringp, 0, discard, 0);
2777
2778 scheme_get_byte_string(who, port, *stringp, 0, discard, 0, 0, 0);
2779
2780 if (portend)
2781 portend = scheme_bin_minus(portend, scheme_make_integer(discard));
2782 } else {
2783 peekskip = scheme_bin_plus(peekskip, scheme_make_integer(discard));
2784 }
2785
2786 dropped = scheme_bin_plus(dropped, scheme_make_integer(discard));
2787
2788 len -= discard;
2789 skip -= discard;
2790 memmove(*stringp, *stringp + discard, len);
2791
2792 prefix = NULL;
2793 prefix_len = 0;
2794 }
2795
2796 if (regtry_port(prog, port, unless_evt, nonblock,
2797 startp, maybep, endp, match_stack, counters, stringp, &len, &space, skip,
2798 portend, peekskip, 0, prefix, prefix_len, prefix_offset, 1,
2799 &aborted)) {
2800 if (!peek) {
2801 char *drain;
2802
2803 if (discard_oport && *startp)
2804 scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
2805
2806 if (get_offsets)
2807 drain = *stringp;
2808 else
2809 /* Allocate fresh in case we get different results from previous peek: */
2810 drain = (char *)scheme_malloc_atomic(*endp);
2811
2812 scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
2813 }
2814
2815 *_dropped = dropped;
2816
2817 return 1;
2818 } else if (aborted)
2819 return 0;
2820 skip++;
2821 } while (len >= skip);
2822
2823 if (!peek) {
2824 /* If we get here, there must be `len' leftover characters in the port,
2825 and `*stringp' must hold the characters: */
2826 if (len > 0) {
2827 if (discard_oport)
2828 scheme_put_byte_string(who, discard_oport, *stringp, 0, len, 0);
2829 scheme_get_byte_string(who, port, *stringp, 0, len, 0, 0, 0);
2830 }
2831 }
2832 } else {
2833 if (regtry(prog, string, stringpos, stringlen, lazy_string,
2834 startp, maybep, endp, match_stack, counters,
2835 0, stringorigin, prefix, prefix_len, prefix_offset, 1))
2836 return 1;
2837 }
2838
2839 /* Failure. */
2840 return 0;
2841 }
2842
2843 #define NEED_INPUT(rw, v, n) if (rw->port && (((v) + (n)) > rw->input_end)) read_more_from_regport(rw, (v) + (n))
2844 static void read_more_from_regport(Regwork *rw, rxpos need_total);
2845
2846 /*
2847 - regtry - try match at specific point
2848 */
2849 static int /* 0 failure, 1 success */
regtry(regexp * prog,char * string,int stringpos,int stringlen,rx_lazy_str_t * lazy_string,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,int * counters,Regwork * rw,rxpos stringorigin,char * prefix,rxpos prefix_len,rxpos prefix_offset,int unanchored)2850 regtry(regexp *prog, char *string, int stringpos, int stringlen, rx_lazy_str_t *lazy_string,
2851 rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack, int *counters,
2852 Regwork *rw, rxpos stringorigin,
2853 char *prefix, rxpos prefix_len, rxpos prefix_offset,
2854 int unanchored)
2855 /* stringpos: where to start looking;
2856 stringlen: available bytes, counting from stringpos;
2857 stringorigin: start of input, after prefix
2858 prefix: bytes to appear before the origin to count as input */
2859 {
2860 int i;
2861 Regwork _rw;
2862
2863 if (!rw) {
2864 rw = &_rw;
2865 rw->port = NULL;
2866 }
2867 rw->instr = string;
2868 rw->input = stringpos;
2869 rw->input_end = stringpos + stringlen;
2870 rw->input_start = stringorigin;
2871 rw->input_min = stringorigin - prefix_len;
2872 rw->startp = startp;
2873 rw->maybep = maybep;
2874 rw->endp = endp;
2875 rw->counters = counters;
2876 rw->prefix = prefix;
2877 rw->prefix_len = prefix_len;
2878 rw->prefix_delta = prefix_len + prefix_offset - stringorigin;
2879 rw->boi = stringorigin - prefix_len;
2880 rw->rewind_stack_size = (match_stack ? MATCH_STACK_SIZE : 0);
2881 rw->rewind_stack_count = 0;
2882 rw->rewind_stack_prompt = 0;
2883 rw->rewind_stack = match_stack;
2884 if (prog->nsubexp < 2)
2885 rw->non_tail = -1;
2886 else
2887 rw->non_tail = 0;
2888 rw->lazy_string = lazy_string;
2889 if (lazy_string)
2890 rw->port = scheme_true; /* hack to make NEED_INPUT() work */
2891
2892 for (i = prog->nsubexp; i--; ) {
2893 startp[i] = rw->input_min - 1;
2894 endp[i] = rw->input_min - 1;
2895 }
2896
2897 #ifdef INDIRECT_TO_PROGRAM
2898 regstr = prog->program;
2899 #else
2900 regstr = (char *)prog;
2901 #endif
2902
2903 while (1) {
2904 int found;
2905
2906 found = regmatch(rw, N_ITO_DELTA(prog->program, 1, (char *)prog));
2907
2908 if (found) {
2909 startp[0] = stringpos;
2910 endp[0] = rw->input;
2911 return 1;
2912 } else if (unanchored) {
2913 if (lazy_string) {
2914 NEED_INPUT(rw, stringpos, 1);
2915 stringlen = rw->input_end - stringpos;
2916 }
2917 if (!stringlen)
2918 return 0;
2919 stringpos++;
2920 --stringlen;
2921 if (prog->regstart) {
2922 unsigned char *rs = prog->regstart;
2923 int c;
2924 while (1) {
2925 if (lazy_string) {
2926 NEED_INPUT(rw, stringpos, 1);
2927 stringlen = rw->input_end - stringpos;
2928 string = rw->instr;
2929 }
2930
2931 if (!stringlen)
2932 return 0;
2933
2934 c = UCHAR(string[stringpos]);
2935 if (rs[c >> 3] & (1 << (c & 0x7)))
2936 break;
2937 stringpos++;
2938 --stringlen;
2939 }
2940 }
2941 rw->input = stringpos;
2942 for (i = prog->nsubexp; i--; ) {
2943 startp[i] = rw->input_min - 1;
2944 endp[i] = rw->input_min - 1;
2945 }
2946 /* try again... */
2947 } else
2948 return 0;
2949 }
2950 }
2951
2952 #define LAZY_STRING_CHUNK_SIZE 32
2953
read_more_from_lazy_string(Regwork * rw,rxpos need_total)2954 static void read_more_from_lazy_string(Regwork *rw, rxpos need_total)
2955 {
2956 rx_lazy_str_t *ls = rw->lazy_string;
2957
2958 if (ls->start + ls->done < ls->end) {
2959 intptr_t amt = ls->done, blen, tlen;
2960 char *s;
2961
2962 amt = amt ? (2 * amt) : LAZY_STRING_CHUNK_SIZE;
2963 if (ls->done + amt < need_total)
2964 amt = need_total - ls->done;
2965 if (ls->start + ls->done + amt > ls->end)
2966 amt = ls->end - ls->start - ls->done;
2967
2968 blen = scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
2969 NULL, 0,
2970 0 /* not UTF-16 */);
2971 tlen = blen + ls->blen;
2972 s = (char *)scheme_malloc_atomic(tlen);
2973 if (ls->blen)
2974 memcpy(s, ls->s, ls->blen);
2975 scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
2976 (unsigned char *)s, ls->blen,
2977 0 /* not UTF-16 */);
2978
2979 ls->blen = tlen;
2980 ls->s = s;
2981 ls->done += amt;
2982
2983 rw->instr = s;
2984 rw->input_end = tlen;
2985 } else {
2986 /* turn off further port reading */
2987 rw->port = NULL;
2988 }
2989 }
2990
read_more_from_regport(Regwork * rw,rxpos need_total)2991 static void read_more_from_regport(Regwork *rw, rxpos need_total)
2992 /* Called when we're about to look past our read-ahead */
2993 {
2994 intptr_t got;
2995 Scheme_Object *peekskip;
2996
2997 if (rw->lazy_string) {
2998 read_more_from_lazy_string(rw, need_total);
2999 return;
3000 }
3001
3002 /* limit reading by rw->input_maxend: */
3003 if (need_total > rw->input_maxend) {
3004 need_total = rw->input_maxend;
3005 if (need_total <= rw->input_end) {
3006 rw->port = NULL; /* turn off further port reading */
3007 return;
3008 }
3009 }
3010
3011 if (rw->instr_size < need_total) {
3012 char *naya;
3013 intptr_t size = rw->instr_size;
3014
3015 size = size * 2;
3016 if (size < need_total)
3017 size += need_total;
3018 if (size < 16)
3019 size = 16;
3020
3021 naya = (char *)scheme_malloc_atomic(size);
3022 if (rw->input_end)
3023 memcpy(naya, rw->instr, rw->input_end);
3024
3025 rw->instr = naya;
3026 rw->instr_size = size;
3027 }
3028
3029 rw->str = regstr; /* get_string can swap threads */
3030
3031 if (rw->input_maxend < rw->instr_size)
3032 got = rw->input_maxend - rw->input_end;
3033 else
3034 got = rw->instr_size - rw->input_end;
3035
3036 if (rw->peekskip)
3037 peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
3038 else
3039 peekskip = scheme_make_integer(rw->input_end);
3040
3041 /* Fill as much of our buffer as possible: */
3042 got = scheme_get_byte_string_unless("regexp-match", rw->port,
3043 rw->instr, rw->input_end, got,
3044 (rw->nonblock
3045 ? 2 /* non-blocking read, as much as possible */
3046 : 1), /* read at least one char, and as much as possible */
3047 1, peekskip,
3048 rw->unless_evt);
3049
3050 regstr = rw->str;
3051
3052 if (got < 1) {
3053 /* EOF, special, or 0-due-to-unless/nonblock */
3054 if (!got)
3055 rw->aborted = 1;
3056 rw->port = NULL; /* turn off further port reading */
3057 rw->unless_evt = NULL;
3058 } else {
3059 rw->input_end += got;
3060
3061 /* Non-blocking read got enough? If not, try again in blocking mode: */
3062 while (need_total > rw->input_end) {
3063 if (rw->peekskip)
3064 peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
3065 else
3066 peekskip = scheme_make_integer(rw->input_end);
3067
3068 rw->str = regstr; /* get_string can swap threads */
3069 got = scheme_get_byte_string_unless("regexp-match", rw->port,
3070 rw->instr, rw->input_end, need_total - rw->input_end,
3071 (rw->nonblock ? 2 : 0), /* blocking mode */
3072 1, peekskip,
3073 rw->unless_evt);
3074 regstr = rw->str;
3075
3076 if (!got && rw->nonblock) {
3077 rw->port = NULL; /* turn off further port reading */
3078 rw->unless_evt = NULL;
3079 rw->aborted = 1;
3080 break;
3081 } else if (got == EOF) {
3082 rw->port = NULL; /* turn off further port reading */
3083 rw->unless_evt = NULL;
3084 break;
3085 } else {
3086 rw->input_end += got;
3087 if (!rw->nonblock)
3088 break;
3089 }
3090 }
3091 }
3092 }
3093
3094 /*
3095 - regtry - try match in a port
3096 */
3097 static int
regtry_port(regexp * prog,Scheme_Object * port,Scheme_Object * unless_evt,int nonblock,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,int * counters,char ** work_string,rxpos * len,rxpos * size,rxpos skip,Scheme_Object * maxlen,Scheme_Object * peekskip,rxpos origin,char * prefix,rxpos prefix_len,rxpos prefix_offset,int read_at_least_one,int * _aborted)3098 regtry_port(regexp *prog, Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
3099 rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack, int *counters,
3100 char **work_string, rxpos *len, rxpos *size, rxpos skip,
3101 Scheme_Object *maxlen, Scheme_Object *peekskip,
3102 rxpos origin, char *prefix, rxpos prefix_len, rxpos prefix_offset,
3103 int read_at_least_one, int *_aborted)
3104 {
3105 int m;
3106 Regwork rw;
3107
3108 rw.port = port;
3109 rw.unless_evt = unless_evt;
3110 rw.nonblock = (short)nonblock;
3111 rw.aborted = 0;
3112 rw.instr_size = *size;
3113 if (maxlen && SCHEME_INTP(maxlen))
3114 rw.input_maxend = SCHEME_INT_VAL(maxlen);
3115 else
3116 rw.input_maxend = BIGGEST_RXPOS;
3117 rw.peekskip = peekskip;
3118
3119 m = regtry(prog, *work_string, skip, (*len) - skip, NULL,
3120 startp, maybep, endp, match_stack, counters,
3121 &rw, origin, prefix, prefix_len, prefix_offset, 0);
3122
3123 if (read_at_least_one
3124 && !rw.aborted
3125 && (rw.input_end == skip)
3126 && rw.port) {
3127 read_more_from_regport(&rw, rw.input_end + 1);
3128 }
3129
3130 *work_string = rw.instr;
3131 *len = rw.input_end;
3132 *size = rw.instr_size;
3133
3134 if (rw.aborted) {
3135 *_aborted = 1;
3136 return 0;
3137 } else
3138 return m;
3139 }
3140
3141 #ifdef DO_STACK_CHECK
3142
regmatch_k(void)3143 static Scheme_Object *regmatch_k(void)
3144 {
3145 Scheme_Thread *p = scheme_current_thread;
3146 Regwork *rw = (Regwork *)p->ku.k.p1;
3147 int res;
3148
3149 p->ku.k.p1 = NULL;
3150
3151 regstr = rw->str; /* in case of thread swap */
3152
3153 res = regmatch(rw, p->ku.k.i1);
3154
3155 return (res ? scheme_true : scheme_false);
3156 }
3157
3158 #endif
3159
INPUT_REF_S(Regwork * rw,rxpos c,rxpos input_start)3160 XFORM_NONGCING static MZ_INLINE char INPUT_REF_S(Regwork *rw, rxpos c, rxpos input_start)
3161 {
3162 if (c < input_start)
3163 return rw->prefix[rw->prefix_delta + c];
3164 else
3165 return rw->instr[c];
3166 }
3167
INPUT_REF(Regwork * rw,int c)3168 XFORM_NONGCING static MZ_INLINE char INPUT_REF(Regwork *rw, int c)
3169 {
3170 return INPUT_REF_S(rw, c, rw->input_start);
3171 }
3172
3173 /*
3174 - regmatch - main matching routine
3175 *
3176 * Conceptually the strategy is simple: check to see whether the current
3177 * node matches, call self recursively to see whether the rest matches,
3178 * and then act accordingly. In practice we make some effort to avoid
3179 * recursion, in particular by going through "ordinary" nodes (that don't
3180 * need to know whether the rest of the match failed) by a loop instead of
3181 * by recursion.
3182 */
3183 static int /* 0 failure, 1 success */
regmatch(Regwork * rw,rxpos prog)3184 regmatch(Regwork *rw, rxpos prog)
3185 {
3186 rxpos scan; /* Current node. */
3187 rxpos is; /* Input string pos */
3188 int the_op;
3189
3190 #ifdef DO_STACK_CHECK
3191 {
3192 # include "mzstkchk.h"
3193 {
3194 Scheme_Thread *p = scheme_current_thread;
3195 Regwork *rw2;
3196 Scheme_Object *res;
3197
3198 /* rw is likely be stack allocated, so copy out to
3199 the heap and then copy result back in on return. */
3200 rw2 = MALLOC_ONE_RT(Regwork);
3201 memcpy(rw2, rw, sizeof(Regwork));
3202 #ifdef MZTAG_REQUIRED
3203 rw2->type = scheme_rt_regwork;
3204 #endif
3205
3206 rw2->str = regstr; /* in case of thread swap */
3207 p->ku.k.p1 = rw2;
3208 p->ku.k.i1 = prog;
3209 res = scheme_handle_stack_overflow(regmatch_k);
3210
3211 memcpy(rw, rw2, sizeof(Regwork));
3212
3213 return SCHEME_TRUEP(res);
3214 }
3215 }
3216 #endif
3217
3218 if (DECREMENT_FUEL(scheme_fuel_counter, 1) <= 0) {
3219 char *rs;
3220 rs = regstr;
3221 scheme_out_of_fuel();
3222 regstr = rs;
3223 }
3224
3225 is = rw->input;
3226 scan = prog;
3227 while (scan != 0) {
3228 the_op = rOP(scan);
3229 switch (the_op) {
3230 case BOI:
3231 if (is != rw->boi)
3232 return(0);
3233 scan = NEXT_OP(scan);
3234 break;
3235 case EOI:
3236 NEED_INPUT(rw, is, 1);
3237 if (is != rw->input_end)
3238 return(0);
3239 scan = NEXT_OP(scan);
3240 break;
3241 case BOL:
3242 if ((is != rw->boi)
3243 && ((is <= rw->input_min)
3244 || (INPUT_REF(rw, is - 1) != '\n')))
3245 return(0);
3246 scan = NEXT_OP(scan);
3247 break;
3248 case EOL:
3249 NEED_INPUT(rw, is, 1);
3250 if (is != rw->input_end) {
3251 if (INPUT_REF(rw, is) != '\n')
3252 return(0);
3253 }
3254 scan = NEXT_OP(scan);
3255 break;
3256 case ANY:
3257 NEED_INPUT(rw, is, 1);
3258 if (is == rw->input_end)
3259 return(0);
3260 is++;
3261 scan = NEXT_OP(scan);
3262 break;
3263 case ANYL:
3264 NEED_INPUT(rw, is, 1);
3265 if (is == rw->input_end)
3266 return(0);
3267 if (INPUT_REF(rw, is) == '\n')
3268 return 0;
3269 is++;
3270 scan = NEXT_OP(scan);
3271 break;
3272 case EXACTLY:
3273 {
3274 int len, i;
3275 rxpos opnd;
3276
3277 opnd = OPSTR(OPERAND(scan));
3278 len = rOPLEN(OPERAND(scan));
3279 if (rw->port) {
3280 /* Like the other branch, but demand chars one at a time, as
3281 we need them */
3282 rxpos input_start = rw->input_start;
3283 for (i = 0; i < len; i++) {
3284 NEED_INPUT(rw, is + i, 1);
3285 if (is + i >= rw->input_end)
3286 return 0;
3287 if (regstr[opnd+i] != INPUT_REF_S(rw, is+i, input_start))
3288 return 0;
3289 }
3290 } else {
3291 rxpos input_start;
3292 if (len > rw->input_end - is)
3293 return 0;
3294 input_start = rw->input_start;
3295 for (i = 0; i < len; i++) {
3296 if (regstr[opnd+i] != INPUT_REF_S(rw, is+i, input_start))
3297 return 0;
3298 }
3299 }
3300 is += len;
3301 }
3302 scan = NEXT_OP(scan);
3303 break;
3304 case EXACTLY_CI:
3305 {
3306 int len, i;
3307 char c;
3308 rxpos opnd;
3309
3310 opnd = OPSTR(OPERAND(scan));
3311 len = rOPLEN(OPERAND(scan));
3312 if (rw->port) {
3313 /* Like the other branch, but demand chars one at a time, as
3314 we need them */
3315 for (i = 0; i < len; i++) {
3316 NEED_INPUT(rw, is + i, 1);
3317 if (is + i >= rw->input_end)
3318 return 0;
3319 c = INPUT_REF(rw, is+i);
3320 c = rx_tolower(c);
3321 if (regstr[opnd+i] != c)
3322 return 0;
3323 }
3324 } else {
3325 rxpos input_start;
3326 if (len > rw->input_end - is)
3327 return 0;
3328 input_start = rw->input_start;
3329 for (i = 0; i < len; i++) {
3330 c = INPUT_REF_S(rw, is+i, input_start);
3331 c = rx_tolower(c);
3332 if (regstr[opnd+i] != c)
3333 return 0;
3334 }
3335 }
3336 is += len;
3337 }
3338 scan = NEXT_OP(scan);
3339 break;
3340 case ANYOF:
3341 {
3342 int c;
3343 NEED_INPUT(rw, is, 1);
3344 if (is == rw->input_end)
3345 return 0;
3346 c = UCHAR(INPUT_REF(rw, is));
3347 if (!(regstr[OPERAND(scan) + (c >> 3)] & (1 << (c & 0x7))))
3348 return(0);
3349 is++;
3350 scan = NEXT_OP(scan);
3351 }
3352 break;
3353 case EXACTLY1:
3354 NEED_INPUT(rw, is, 1);
3355 if (is == rw->input_end)
3356 return 0;
3357 if (INPUT_REF(rw, is) != regstr[OPERAND(scan)])
3358 return 0;
3359 is++;
3360 scan = NEXT_OP(scan);
3361 break;
3362 case EXACTLY2:
3363 NEED_INPUT(rw, is, 1);
3364 if (is == rw->input_end)
3365 return 0;
3366 if (INPUT_REF(rw, is) != regstr[OPERAND(scan)])
3367 if (INPUT_REF(rw, is) != regstr[OPERAND(scan)+1])
3368 return 0;
3369 is++;
3370 scan = NEXT_OP(scan);
3371 break;
3372 case RANGE:
3373 {
3374 int c;
3375 NEED_INPUT(rw, is, 1);
3376 if (is == rw->input_end)
3377 return 0;
3378 c = UCHAR(INPUT_REF(rw, is));
3379 if ((c < UCHAR(regstr[OPERAND(scan)]))
3380 || (c > UCHAR(regstr[OPERAND(scan)+1])))
3381 return(0);
3382 is++;
3383 scan = NEXT_OP(scan);
3384 }
3385 break;
3386 case NOTRANGE:
3387 {
3388 int c;
3389 NEED_INPUT(rw, is, 1);
3390 if (is == rw->input_end)
3391 return 0;
3392 c = UCHAR(INPUT_REF(rw, is));
3393 if ((c >= UCHAR(regstr[OPERAND(scan)]))
3394 && (c <= UCHAR(regstr[OPERAND(scan)+1])))
3395 return(0);
3396 is++;
3397 scan = NEXT_OP(scan);
3398 }
3399 break;
3400 case NOTHING:
3401 scan = NEXT_OP(scan);
3402 break;
3403 case BACK:
3404 scan = scan - rNEXT(scan);
3405 break;
3406 case BRANCH:
3407 {
3408 rxpos delta;
3409 rxpos next; /* Next node. */
3410 int stack_pos, ok;
3411
3412 next = NEXT_OP(scan);
3413
3414 if (rOP(next) != BRANCH) /* No choice. */
3415 scan = OPERAND(scan); /* Avoid recursion. */
3416 else {
3417 do {
3418 rw->input = is;
3419 stack_pos = match_push(rw);
3420 ok = regmatch(rw, OPERAND(scan));
3421 match_pop(rw, stack_pos, ok);
3422
3423 if (ok) return(1);
3424
3425 scan = next;
3426 delta = rNEXT(scan);
3427 if (!delta)
3428 break;
3429 next = scan + delta;
3430 } while (rOP(next) == BRANCH);
3431 scan = OPERAND(scan);
3432 }
3433 }
3434 break;
3435 case STAR:
3436 case PLUS:
3437 case STAR2:
3438 case PLUS2:
3439 case STAR3:
3440 case STAR4:
3441 {
3442 char nextch;
3443 int no;
3444 rxpos save, body;
3445 int min, maxc;
3446 int nongreedy = (the_op == STAR2 || the_op == PLUS2 || the_op == STAR4);
3447 rxpos next; /* Next node. */
3448 int stack_pos, ok;
3449
3450 /*
3451 * Lookahead to avoid useless match attempts
3452 * when we know what character comes next.
3453 */
3454 nextch = '\0';
3455 next = NEXT_OP(scan);
3456 if (rOP(next) == EXACTLY)
3457 nextch = regstr[OPSTR(OPERAND(next))];
3458 if ((the_op == STAR3) || (the_op == STAR4)) {
3459 min = rOPLEN(OPERAND(scan));
3460 maxc = rOPLEN(OPERAND2(scan));
3461 body = OPERAND3(scan);
3462 } else {
3463 body = OPERAND(scan);
3464 min = ((the_op == STAR) || (the_op == STAR2)) ? 0 : 1;
3465 maxc = 0;
3466 }
3467 save = is;
3468
3469 rw->input = is;
3470 if (nongreedy && rw->port) {
3471 /* Get at least `min' bytes, but then don't
3472 let regrepeat pull in arbitrary bytes: */
3473 Scheme_Object *saveport;
3474 NEED_INPUT(rw, save, min ? min : 1);
3475 saveport = rw->port;
3476 rw->port = NULL;
3477 no = regrepeat(rw, body, maxc);
3478 rw->port = saveport;
3479 nongreedy = 2;
3480 } else
3481 no = regrepeat(rw, body, maxc);
3482
3483 if (!nongreedy) {
3484 if (nextch)
3485 NEED_INPUT(rw, save + no, 1);
3486 while (no >= min) {
3487 /* If it could work, try it. */
3488 if (nextch == '\0' || ((save + no < rw->input_end)
3489 && (INPUT_REF(rw, save + no) == nextch))) {
3490 rw->input = is + no;
3491 stack_pos = match_push(rw);
3492 ok = regmatch(rw, next);
3493 match_pop(rw, stack_pos, ok);
3494 if (ok) return(1);
3495 }
3496 /* Couldn't or didn't -- back up. */
3497 no--;
3498 }
3499 } else {
3500 int i;
3501 for (i = min; i <= no; i++) {
3502 /* If it could work, try it. */
3503 if (nextch)
3504 NEED_INPUT(rw, save + i, 1);
3505 if (nextch == '\0' || ((save+i < rw->input_end)
3506 && (INPUT_REF(rw, save+i) == nextch))) {
3507 rw->input = save + i;
3508 stack_pos = match_push(rw);
3509 ok = regmatch(rw, next);
3510 match_pop(rw, stack_pos, ok);
3511 if (ok) return(1);
3512 }
3513
3514 if ((i == no) && (nongreedy == 2)) {
3515 /* Maybe regrepeat can match more if we let it read from
3516 the port. */
3517 if ((rw->input_end - save) > no) {
3518 /* We have pulled-in chars to try. */
3519 int moreno;
3520 Scheme_Object *saveport;
3521
3522 saveport = rw->port;
3523 rw->port = NULL;
3524 is = save + no;
3525 rw->input = is;
3526 moreno = regrepeat(rw, body, maxc ? maxc - no : 0);
3527 rw->port = saveport;
3528
3529 if (!moreno)
3530 nongreedy = 1;
3531 else
3532 no += moreno;
3533 }
3534 }
3535 }
3536 }
3537 return(0);
3538 }
3539 break;
3540 case END:
3541 case LOOKE:
3542 rw->input = is;
3543 return(1); /* Success! */
3544 break;
3545 case BACKREF:
3546 {
3547 int no, len, start, i;
3548 no = rOPLEN(OPERAND(scan));
3549 if (rw->endp[no] < rw->input_min)
3550 return 0;
3551
3552 start = rw->startp[no];
3553 len = rw->endp[no] - start;
3554
3555 if (rw->port) {
3556 /* Like the other branch, but demand chars one at a time, as
3557 we need them */
3558 rxpos input_start = rw->input_start;
3559 for (i = 0; i < len; i++) {
3560 NEED_INPUT(rw, is + i, 1);
3561 if (is + i >= rw->input_end)
3562 return 0;
3563 if (INPUT_REF_S(rw, start+i, input_start) != INPUT_REF_S(rw, is+i, input_start))
3564 return 0;
3565 }
3566 } else {
3567 rxpos input_start = rw->input_start;
3568 if (len > rw->input_end - is)
3569 return 0;
3570 for (i = 0; i < len; i++) {
3571 if (INPUT_REF_S(rw, start+i, input_start) != INPUT_REF_S(rw, is+i, input_start))
3572 return 0;
3573 }
3574 }
3575 is += len;
3576 scan = NEXT_OP(scan);
3577 break;
3578 }
3579 case BACKREF_CI:
3580 {
3581 int no, len, start, i, c1, c2;
3582 no = rOPLEN(OPERAND(scan));
3583 if (rw->endp[no] < rw->input_min)
3584 return 0;
3585
3586 start = rw->startp[no];
3587 len = rw->endp[no] - start;
3588
3589 if (rw->port) {
3590 /* Like the other branch, but demand chars one at a time, as
3591 we need them */
3592 rxpos input_start = rw->input_start;
3593 for (i = 0; i < len; i++) {
3594 NEED_INPUT(rw, is + i, 1);
3595 if (is + i >= rw->input_end)
3596 return 0;
3597 c1 = INPUT_REF_S(rw, start+i, input_start);
3598 c1 = rx_tolower(c1);
3599 c2 = INPUT_REF_S(rw, is+i, input_start);
3600 c2 = rx_tolower(c2);
3601 if (c1 != c2)
3602 return 0;
3603 }
3604 } else {
3605 rxpos input_start = rw->input_start;
3606 if (len > rw->input_end - is)
3607 return 0;
3608 for (i = 0; i < len; i++) {
3609 c1 = INPUT_REF_S(rw, start+i, input_start);
3610 c1 = rx_tolower(c1);
3611 c2 = INPUT_REF_S(rw, is+i, input_start);
3612 c2 = rx_tolower(c2);
3613 if (c1 != c2)
3614 return 0;
3615 }
3616 }
3617 is += len;
3618 scan = NEXT_OP(scan);
3619 break;
3620 }
3621 case LOOKT:
3622 case LOOKF:
3623 case LOOKTX:
3624 case LOOKBT:
3625 case LOOKBF:
3626 {
3627 int t, no, no_start, no_end;
3628 rxpos save, next;
3629 int stack_pos, ok;
3630 next = NEXT_OP(scan);
3631 t = ((the_op != LOOKF) && (the_op != LOOKBF));
3632 if ((the_op == LOOKBT) || (the_op == LOOKBF)) {
3633 no_start = rOPLEN(OPERAND2(scan));
3634 no_end = rOPLEN(OPERAND3(scan));
3635 } else
3636 no_start = no_end = 0;
3637 save = is;
3638 if (no_end) {
3639 /* lookbehind */
3640 int found = 0;
3641 for (no = no_start; no <= no_end; no++) {
3642 if (is - rw->input_min >= no) {
3643 rw->input = save - no;
3644 stack_pos = match_push(rw);
3645 ok = regmatch(rw, next);
3646 match_pop(rw, stack_pos, ok);
3647 if (ok) {
3648 if (rw->input == save) {
3649 /* Match */
3650 if (!t) return 0;
3651 found = 1;
3652 break;
3653 }
3654 }
3655 } else
3656 break;
3657 }
3658 if (!found) {
3659 /* No matches */
3660 if (t) return 0;
3661 }
3662 } else {
3663 /* lookahead */
3664 rw->input = is;
3665 stack_pos = match_push(rw);
3666 ok = regmatch(rw, next);
3667 match_pop(rw, stack_pos, ok);
3668 if (ok) {
3669 if (!t) return 0;
3670 } else {
3671 if (t) return 0;
3672 }
3673 if (the_op == LOOKTX)
3674 is = rw->input;
3675 }
3676 scan = scan + rOPLEN(OPERAND(scan));
3677 scan = NEXT_OP(scan);
3678 }
3679 break;
3680 case COUNTINIT:
3681 {
3682 int no;
3683 no = rOPLEN(OPERAND(scan));
3684 rw->counters[no] = 0;
3685 scan = NEXT_OP(scan);
3686 }
3687 break;
3688 case COUNTBACK:
3689 {
3690 int no;
3691 no = rOPLEN(OPERAND(scan));
3692 rw->counters[no] -= 1;
3693 scan = NEXT_OP(scan);
3694 }
3695 break;
3696 case COUNTBACKFAIL:
3697 {
3698 int no;
3699 no = rOPLEN(OPERAND(scan));
3700 rw->counters[no] -= 1;
3701 return 0;
3702 }
3703 break;
3704 case COUNTUNDER:
3705 {
3706 int no, maxreps;
3707 no = rOPLEN(OPERAND(scan));
3708 maxreps = rOPLEN(OPERAND2(scan));
3709 rw->counters[no]++;
3710 if (maxreps && (rw->counters[no] > maxreps))
3711 return 0;
3712 scan = NEXT_OP(scan);
3713 }
3714 break;
3715 case COUNTOVER:
3716 {
3717 int no, minreps;
3718 no = rOPLEN(OPERAND(scan));
3719 minreps = rOPLEN(OPERAND2(scan));
3720 if (rw->counters[no] < minreps)
3721 return 0;
3722 scan = NEXT_OP(scan);
3723 }
3724 break;
3725 case SAVECONST:
3726 {
3727 int no, len;
3728 no = rOPLEN(OPERAND(scan));
3729 len = rOPLEN(OPERAND2(scan));
3730 /* Check that the match happened more than 0 times: */
3731 if (!len || (is > rw->maybep[no])) {
3732 match_set(rw, no, is-len, is);
3733 } else {
3734 match_set(rw, no, rw->input_min - 1, rw->input_min - 1);
3735 }
3736 scan = NEXT_OP(scan);
3737 }
3738 break;
3739 case MAYBECONST:
3740 {
3741 int no;
3742 no = rOPLEN(OPERAND(scan));
3743 match_maybe(rw, no, is);
3744 scan = NEXT_OP(scan);
3745 }
3746 break;
3747 case WORDBOUND:
3748 {
3749 int c, w1, w2;
3750 NEED_INPUT(rw, is, 1);
3751 if (is > rw->input_min) {
3752 c = INPUT_REF(rw, is - 1);
3753 w1 = rx_isword(c);
3754 } else
3755 w1 = 0;
3756 if (is < rw->input_end) {
3757 c = INPUT_REF(rw, is);
3758 w2 = rx_isword(c);
3759 } else
3760 w2 = 0;
3761 if (w1 == w2) return 0;
3762 scan = NEXT_OP(scan);
3763 }
3764 break;
3765 case NOTWORDBOUND:
3766 {
3767 int c, w1, w2;
3768 NEED_INPUT(rw, is, 1);
3769 if (is > rw->input_min) {
3770 c = INPUT_REF(rw, is - 1);
3771 w1 = rx_isword(c);
3772 } else
3773 w1 = 0;
3774 if (is < rw->input_end) {
3775 c = INPUT_REF(rw, is);
3776 w2 = rx_isword(c);
3777 } else
3778 w2 = 0;
3779 if (w1 != w2) return 0;
3780 scan = NEXT_OP(scan);
3781 }
3782 break;
3783 case UNIPROP:
3784 {
3785 unsigned char buf[MAX_UTF8_CHAR_BYTES];
3786 mzchar us[1];
3787 int c, data;
3788 int v, pos;
3789 int negate, bottom, top;
3790
3791 data = rOPLEN(OPERAND(scan));
3792
3793 negate = data >> 13;
3794 bottom = (data >> 6) & 0x3F;
3795 top = data & 0x3F;
3796
3797 NEED_INPUT(rw, is, 1);
3798 if (is < rw->input_end) {
3799 c = UCHAR(INPUT_REF(rw, is));
3800 if (c < 128) {
3801 v = c;
3802 pos = 1;
3803 } else {
3804 pos = 1;
3805 buf[0] = c;
3806 while (1) {
3807 v = scheme_utf8_decode_prefix(buf, pos, us, 0);
3808 if (v == 1) {
3809 v = us[0];
3810 break;
3811 } else if (v < -1)
3812 return 0;
3813 NEED_INPUT(rw, is, pos+1);
3814 if (is + pos < rw->input_end) {
3815 buf[pos] = INPUT_REF(rw, is + pos);
3816 pos++;
3817 } else
3818 return 0;
3819 }
3820 }
3821 } else
3822 return 0;
3823
3824 is += pos;
3825
3826 v = scheme_general_category(v);
3827
3828 if (negate) {
3829 if ((v >= bottom) && (v <= top))
3830 return 0;
3831 } else {
3832 if ((v < bottom) || (v > top))
3833 return 0;
3834 }
3835
3836 scan = NEXT_OP(scan);
3837 }
3838 break;
3839 case CONDITIONAL:
3840 {
3841 rxpos test = OPERAND3(scan);
3842 int t;
3843 int stack_pos;
3844
3845 if (rOP(test) == BACKREF) {
3846 int no;
3847 no = rOPLEN(OPERAND(test));
3848 t = (rw->endp[no] > rw->input_min);
3849 } else {
3850 rw->input = is;
3851 stack_pos = match_push(rw);
3852 t = regmatch(rw, test);
3853 match_pop(rw, stack_pos, t);
3854 }
3855
3856 if (t)
3857 scan = scan + rOPLEN(OPERAND(scan));
3858 else
3859 scan = scan + rOPLEN(OPERAND2(scan));
3860 }
3861 break;
3862 default:
3863 {
3864 int isopen;
3865 int no;
3866
3867 switch (the_op) {
3868 case OPENN:
3869 isopen = 1;
3870 no = rOPLEN(OPERAND(scan));
3871 if (!no)
3872 no = -1; /* => don't set in result array */
3873 break;
3874 case CLOSEN:
3875 isopen = 0;
3876 no = rOPLEN(OPERAND(scan));
3877 if (!no)
3878 no = -1; /* => don't set in result array */
3879 break;
3880 default:
3881 if (the_op < CLOSE) {
3882 isopen = 1;
3883 no = the_op - OPEN;
3884 } else {
3885 isopen = 0;
3886 no = the_op - CLOSE;
3887 }
3888 }
3889
3890 if (no < 0) {
3891 /* Nothing to set */
3892 } else if (isopen) {
3893 /* Storing the position in maybep instead of startp
3894 allows a backreference to refer to a match from a
3895 previous iteration in patterns like `(a|\1x)*'. */
3896 match_maybe(rw, no, is);
3897 } else
3898 match_set(rw, no, rw->maybep[no], is);
3899
3900 scan = NEXT_OP(scan);
3901 }
3902 break;
3903 }
3904 }
3905
3906 /*
3907 * We get here only if there's trouble -- normally "case END" is
3908 * the terminating point.
3909 */
3910 regerror("corrupted pointers");
3911 return(0);
3912 }
3913
3914 /*
3915 - regrepeat - repeatedly match something simple, report how many
3916 */
3917 static int
regrepeat(Regwork * rw,rxpos p,int maxc)3918 regrepeat(Regwork *rw, rxpos p, int maxc)
3919 {
3920 int count = 0;
3921 rxpos scan;
3922 rxpos opnd;
3923
3924 scan = rw->input;
3925 opnd = OPERAND(p);
3926 switch (rOP(p)) {
3927 case ANY:
3928 if (rw->port) {
3929 if (maxc) {
3930 while (rw->port && (rw->input_end < scan + maxc)) {
3931 read_more_from_regport(rw, scan + maxc);
3932 }
3933 } else {
3934 /* need all port input: */
3935 while (rw->port) {
3936 read_more_from_regport(rw, rw->input_end + 4096);
3937 }
3938 }
3939 }
3940 count = rw->input_end - scan;
3941 if (maxc && (count > maxc))
3942 count = maxc;
3943 scan += count;
3944 break;
3945 case ANYL:
3946 {
3947 rxpos input_start = rw->input_start;
3948 NEED_INPUT(rw, scan, 1);
3949 while (scan != rw->input_end
3950 && (INPUT_REF_S(rw, scan, input_start) != '\n')) {
3951 count++;
3952 scan++;
3953 if (maxc) { maxc--; if (!maxc) break; }
3954 NEED_INPUT(rw, scan, 1);
3955 }
3956 }
3957 break;
3958 case EXACTLY:
3959 {
3960 rxpos input_start = rw->input_start;
3961 rxpos opnd2 = OPSTR(opnd);
3962 NEED_INPUT(rw, scan, 1);
3963 while (scan != rw->input_end
3964 && (regstr[opnd2] == INPUT_REF_S(rw, scan, input_start))) {
3965 count++;
3966 scan++;
3967 if (maxc) { maxc--; if (!maxc) break; }
3968 NEED_INPUT(rw, scan, 1);
3969 }
3970 }
3971 break;
3972 case EXACTLY_CI:
3973 {
3974 char c;
3975 rxpos input_start = rw->input_start;
3976 rxpos opnd2 = OPSTR(opnd);
3977 NEED_INPUT(rw, scan, 1);
3978 while (scan != rw->input_end) {
3979 c = INPUT_REF_S(rw, scan, input_start);
3980 c = rx_tolower(c);
3981 if (regstr[opnd2] != c)
3982 break;
3983 count++;
3984 scan++;
3985 if (maxc) { maxc--; if (!maxc) break; }
3986 NEED_INPUT(rw, scan, 1);
3987 }
3988 }
3989 break;
3990 case ANYOF:
3991 {
3992 int c;
3993 rxpos input_start = rw->input_start;
3994 rxpos init = scan;
3995 if (rw->port || maxc) {
3996 /* Slow but general version */
3997 NEED_INPUT(rw, scan, 1);
3998 while (scan != rw->input_end) {
3999 c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4000 if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
4001 break;
4002 scan++;
4003 if (maxc) { maxc--; if (!maxc) break; }
4004 NEED_INPUT(rw, scan, 1);
4005 }
4006 } else {
4007 /* Fast version */
4008 int e = rw->input_end;
4009 while (scan != e) {
4010 c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4011 if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
4012 break;
4013 scan++;
4014 }
4015 }
4016 count = scan - init;
4017 }
4018 break;
4019 case EXACTLY1:
4020 {
4021 rxpos init = scan;
4022 rxpos input_start = rw->input_start;
4023 char c;
4024 c = regstr[opnd];
4025 if (rw->port || maxc) {
4026 /* Slow but general version */
4027 NEED_INPUT(rw, scan, 1);
4028 while ((scan != rw->input_end)
4029 && (INPUT_REF_S(rw, scan, input_start) == c)) {
4030 scan++;
4031 if (maxc) { maxc--; if (!maxc) break; }
4032 NEED_INPUT(rw, scan, 1);
4033 }
4034 } else {
4035 /* Fast version */
4036 int e = rw->input_end;
4037 while ((scan != e)
4038 && (INPUT_REF(rw, scan) == c)) {
4039 scan++;
4040 }
4041 }
4042 count = scan - init;
4043 }
4044 break;
4045 case EXACTLY2:
4046 {
4047 rxpos init = scan;
4048 rxpos input_start = rw->input_start;
4049 char c1, c2;
4050 c1 = regstr[opnd];
4051 c2 = regstr[opnd+1];
4052 if (rw->port || maxc) {
4053 /* Slow but general version */
4054 NEED_INPUT(rw, scan, 1);
4055 while ((scan != rw->input_end)
4056 && ((INPUT_REF_S(rw, scan, input_start) == c1)
4057 || (INPUT_REF_S(rw, scan, input_start) == c2))) {
4058 scan++;
4059 if (maxc) { maxc--; if (!maxc) break; }
4060 NEED_INPUT(rw, scan, 1);
4061 }
4062 } else {
4063 /* Fast version */
4064 int e = rw->input_end;
4065 while ((scan != e)
4066 && ((INPUT_REF(rw, scan) == c1)
4067 || (INPUT_REF(rw, scan) == c2))) {
4068 scan++;
4069 }
4070 }
4071 count = scan - init;
4072 }
4073 break;
4074 case RANGE:
4075 {
4076 rxpos init = scan;
4077 rxpos input_start = rw->input_start;
4078 int c, sr, er;
4079 NEED_INPUT(rw, scan, 1);
4080 sr = UCHAR(regstr[opnd]);
4081 er = UCHAR(regstr[opnd + 1]);
4082 if (rw->port || maxc) {
4083 /* Slow but general version */
4084 while (scan != rw->input_end) {
4085 c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4086 if ((c < sr) || (c > er))
4087 break;
4088 scan++;
4089 if (maxc) { maxc--; if (!maxc) break; }
4090 NEED_INPUT(rw, scan, 1);
4091 }
4092 } else {
4093 /* Fast version */
4094 int e = rw->input_end;
4095 while (scan != e) {
4096 c = UCHAR(INPUT_REF(rw, scan));
4097 if ((c < sr) || (c > er))
4098 break;
4099 scan++;
4100 }
4101 }
4102 count = scan - init;
4103 }
4104 break;
4105 case NOTRANGE:
4106 {
4107 rxpos input_start = rw->input_start;
4108 rxpos init = scan;
4109 int c, sr, er;
4110 NEED_INPUT(rw, scan, 1);
4111 sr = UCHAR(regstr[opnd]);
4112 er = UCHAR(regstr[opnd + 1]);
4113 if (rw->port || maxc) {
4114 /* Slow but general version */
4115 while (scan != rw->input_end) {
4116 c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4117 if ((c >= sr) && (c <= er))
4118 break;
4119 scan++;
4120 if (maxc) { maxc--; if (!maxc) break; }
4121 NEED_INPUT(rw, scan, 1);
4122 }
4123 } else {
4124 /* Fast version */
4125 int e = rw->input_end;
4126 while (scan != e) {
4127 c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4128 if ((c >= sr) && (c <= er))
4129 break;
4130 scan++;
4131 }
4132 }
4133 count = scan - init;
4134 }
4135 break;
4136 default: /* Oh dear. Called inappropriately. */
4137 regerror("internal foulup");
4138 count = 0; /* Best compromise. */
4139 break;
4140 }
4141 rw->input = scan;
4142
4143 return(count);
4144 }
4145
4146 /*
4147 - regnext - dig the "next" pointer out of a node
4148 */
4149 static rxpos
regnext(rxpos p)4150 regnext(rxpos p)
4151 {
4152 int offset;
4153
4154 if (p + 2 >= regcodesize)
4155 return 0;
4156
4157 offset = rNEXT(p);
4158 if (offset == 0)
4159 return 0;
4160
4161 if (rOP(p) == BACK)
4162 return (p-offset);
4163 else
4164 return (p+offset);
4165 }
4166
4167 /*
4168 * strcspn - find length of initial segment of s1 consisting entirely
4169 * of characters not from s2
4170 */
4171
4172 static int
regstrcspn(char * s1,char * e1,char * s2)4173 regstrcspn(char *s1, char *e1, char *s2)
4174 {
4175 char *scan1;
4176 char *scan2;
4177 int count;
4178
4179 count = 0;
4180 for (scan1 = s1; scan1 != e1; scan1++) {
4181 for (scan2 = s2; *scan2 != '\0';) { /* ++ moved down. */
4182 if (*scan1 == *scan2++)
4183 return(count);
4184 }
4185 count++;
4186 }
4187 return(count);
4188 }
4189
4190 #ifndef strncpy
4191 extern char *strncpy();
4192 #endif
4193
4194 /*
4195 - regsub - perform substitutions after a regexp match
4196 */
4197 static
regsub(regexp * prog,char * src,int sourcelen,intptr_t * lenout,char * insrc,rxpos * startp,rxpos * endp,rxpos minpos,char * prefix,rxpos prefix_offset)4198 char *regsub(regexp *prog, char *src, int sourcelen, intptr_t *lenout, char *insrc,
4199 rxpos *startp, rxpos *endp, rxpos minpos,
4200 char *prefix, rxpos prefix_offset)
4201 {
4202 char *dest;
4203 char c;
4204 intptr_t no;
4205 intptr_t len;
4206 intptr_t destalloc, destlen, srcpos;
4207
4208 destalloc = 2 * sourcelen;
4209 destlen = 0;
4210 dest = (char *)scheme_malloc_atomic(destalloc + 1);
4211
4212 srcpos = 0;
4213 while (srcpos < sourcelen) {
4214 c = src[srcpos++];
4215 if (c == '&')
4216 no = 0;
4217 else if (c == '\\') {
4218 if (src[srcpos] == '\\' || src[srcpos] == '&')
4219 no = -1;
4220 else if (src[srcpos] == '$') {
4221 no = prog->nsubexp + 1; /* Gives the empty string */
4222 srcpos++;
4223 } else {
4224 no = 0;
4225 while ('0' <= src[srcpos] && src[srcpos] <= '9') {
4226 no = (no * 10) + (src[srcpos++] - '0');
4227 }
4228 }
4229 } else
4230 no = -1;
4231
4232
4233 if (no < 0) { /* Ordinary character. */
4234 if (c == '\\' && (src[srcpos] == '\\' || src[srcpos] == '&'))
4235 c = src[srcpos++];
4236 if (destlen + 1 >= destalloc) {
4237 char *old = dest;
4238 destalloc *= 2;
4239 dest = (char *)scheme_malloc_atomic(destalloc + 1);
4240 memcpy(dest, old, destlen);
4241 }
4242 dest[destlen++] = c;
4243 } else if (no >= prog->nsubexp) {
4244 /* Number too big; prentend it's the empty string */
4245 } else if ((startp[no] >= minpos) && (endp[no] >= minpos)) {
4246 len = endp[no] - startp[no];
4247 if (len + destlen >= destalloc) {
4248 char *old = dest;
4249 destalloc = 2 * destalloc + len + destlen;
4250 dest = (char *)scheme_malloc_atomic(destalloc + 1);
4251 memcpy(dest, old, destlen);
4252 }
4253 if (startp[no] >= 0)
4254 memcpy(dest + destlen, insrc + startp[no], len);
4255 else if (endp[no] <= 0)
4256 memcpy(dest + destlen, prefix + prefix_offset + (startp[no] - minpos), len);
4257 else {
4258 intptr_t pre_len = 0 - startp[no];
4259 memcpy(dest + destlen, prefix + prefix_offset + (startp[no] - minpos), pre_len);
4260 memcpy(dest + destlen + pre_len, insrc, len - pre_len);
4261 }
4262 destlen += len;
4263 }
4264 }
4265 dest[destlen] = '\0';
4266
4267 if (lenout)
4268 *lenout = destlen;
4269
4270 return dest;
4271 }
4272
4273 /************************************************************/
4274 /* UTF-8 -> per-byte translation */
4275 /* Translate a UTF-8-encode regexp over the language of */
4276 /* unicode code points into a per-byte regexp that matches */
4277 /* equivalent portionals of a UTF-8-encoded sequences of */
4278 /* code points. */
4279 /************************************************************/
4280
4281 #ifdef MZ_XFORM
4282 START_XFORM_SKIP;
4283 #endif
4284 #include "../gc2/my_qsort.c"
4285 #ifdef MZ_XFORM
4286 END_XFORM_SKIP;
4287 #endif
4288
compare_ranges(const void * a,const void * b)4289 static int compare_ranges(const void *a, const void *b)
4290 {
4291 unsigned int av, bv;
4292 av = *(unsigned int *)a;
4293 bv = *(unsigned int *)b;
4294 if (av == bv)
4295 return 0;
4296 else if (av < bv)
4297 return -1;
4298 else
4299 return 1;
4300 }
4301
4302 /* For allocating the traslated string, as we go. When writing an
4303 original char (or something that takes its place), there's always
4304 space, but call make_room() before adding new content. */
4305 typedef struct {
4306 int i; /* number of original chars written */
4307 int orig_len; /* original length */
4308 int size; /* allocated size */
4309 } RoomState;
4310
make_room(unsigned char * r,int j,int need_extra,RoomState * rs)4311 static unsigned char *make_room(unsigned char *r, int j, int need_extra, RoomState *rs)
4312 {
4313 int nrs;
4314 unsigned char *nr;
4315
4316 if ((rs->size - j - (rs->orig_len - rs->i)) < need_extra) {
4317 nrs = ((rs->size) * 2) + need_extra;
4318 nr = (unsigned char *)scheme_malloc_atomic(nrs+1);
4319 memcpy(nr, r, j);
4320 r = nr;
4321 rs->size = nrs;
4322 }
4323
4324 return r;
4325 }
4326
add_byte_range(const unsigned char * lo,const unsigned char * hi,int count,unsigned char * r,int * _j,RoomState * rs,int did_alt,int wrap_alts)4327 static unsigned char *add_byte_range(const unsigned char *lo, const unsigned char *hi, int count,
4328 unsigned char *r, int *_j, RoomState *rs,
4329 /* did_alt => no need to start with "|" */
4330 int did_alt,
4331 /* wrap_alts => wrap "(?:...)" around multiple alts */
4332 int wrap_alts)
4333 /* Adds alternatives for matching valid UTF-8 encodings lo
4334 through hi lexicographically. See add_range to get started. */
4335 {
4336 int same_chars, j, i;
4337 const unsigned char *lowest = (unsigned char *)"\200\200\200\200\200";
4338 const unsigned char *highest = (unsigned char *)"\277\277\277\277\277";
4339 unsigned char p, q;
4340
4341 /* Look for a common prefix: */
4342 for (same_chars = 0; same_chars < count; same_chars++) {
4343 if (lo[same_chars] != hi[same_chars])
4344 break;
4345 }
4346
4347 j = *_j;
4348
4349 /* Match exactly the part that's the same for hi and lo */
4350 if (same_chars) {
4351 r = make_room(r, j, 4 + same_chars, rs);
4352 if (!did_alt) {
4353 r[j++] = '|';
4354 did_alt = 1;
4355 }
4356 for (i = 0; i < same_chars; i++) {
4357 r[j++] = lo[i];
4358 }
4359 }
4360
4361 if (same_chars < count) {
4362 /* We have something like nxxxx to mxxxx where n < m.
4363 Find p such that p >= n and p0000 >= nxxxx, and
4364 find q such that q0000 <= mxxxx */
4365 int choices = 0;
4366
4367 /* If the xxxxs in nxxxx are 0, then p is n,
4368 otherwise it's n + 1 */
4369 for (i = same_chars + 1; i < count; i++) {
4370 if (lo[i] != 128)
4371 break;
4372 }
4373 if (i == count)
4374 p = lo[same_chars];
4375 else {
4376 p = lo[same_chars] + 1;
4377 choices++;
4378 }
4379
4380 /* If the xxxxs in mxxxx are 0, then q is m,
4381 otherwise it's m - 1 */
4382 for (i = same_chars + 1; i < count; i++) {
4383 if (hi[i] != 191)
4384 break;
4385 }
4386 if (i == count)
4387 q = hi[same_chars];
4388 else {
4389 q = hi[same_chars] - 1;
4390 choices++;
4391 }
4392
4393 if (p <= q)
4394 choices++;
4395
4396 if ((wrap_alts || same_chars) && (choices > 1)) {
4397 r = make_room(r, j, 4, rs);
4398 if (!did_alt) {
4399 r[j++] = '|';
4400 did_alt = 1;
4401 }
4402 r[j++] = '(';
4403 r[j++] = '?';
4404 r[j++] = ':';
4405 }
4406
4407 /* Fill out [nxxxx, p0000) */
4408 if (p > lo[same_chars]) {
4409 r = make_room(r, j, 2, rs);
4410 if (!did_alt)
4411 r[j++] = '|';
4412 r[j++] = lo[same_chars];
4413 *_j = j;
4414 r = add_byte_range(lo XFORM_OK_PLUS same_chars + 1, highest, count - same_chars - 1,
4415 r, _j, rs, 1, 1);
4416 j = *_j;
4417 p = lo[same_chars] + 1;
4418 did_alt = 0;
4419 }
4420
4421 /* Fill out [m0000, mxxxx] */
4422 if (q < hi[same_chars]) {
4423 r = make_room(r, j, 2, rs);
4424 if (!did_alt)
4425 r[j++] = '|';
4426 r[j++] = hi[same_chars];
4427 *_j = j;
4428 r = add_byte_range(lowest, hi XFORM_OK_PLUS same_chars + 1, count - same_chars - 1,
4429 r, _j, rs, 1, 1);
4430 j = *_j;
4431 did_alt = 0;
4432
4433 q = hi[same_chars] - 1;
4434 }
4435
4436 /* Fill out [p0000,m0000) */
4437 if (p <= q) {
4438 /* Make the alternative that lets the initial digit vary,
4439 since there's room between the lo and hi leading digit */
4440 const char *any_str = "[\200-\277]";
4441 const int any_len = 5;
4442
4443 r = make_room(r, j, 6 + ((count - same_chars - 1) * any_len), rs);
4444 if (!did_alt)
4445 r[j++] = '|';
4446
4447 if (p == q) {
4448 r[j++] = p;
4449 } else {
4450 r[j++] = '[';
4451 r[j++] = p;
4452 r[j++] = '-';
4453 r[j++] = q;
4454 r[j++] = ']';
4455 }
4456 for (i = same_chars + 1; i < count; i++) {
4457 memcpy(r + j, any_str, any_len);
4458 j += any_len;
4459 }
4460 }
4461
4462 if ((wrap_alts || same_chars) && (choices > 1)) {
4463 /* Close out the grouping */
4464 r = make_room(r, j, 1, rs);
4465 r[j++] = ')';
4466 }
4467 }
4468
4469 *_j = j;
4470 return r;
4471 }
4472
add_range(unsigned char * r,int * _j,RoomState * rs,unsigned int start,unsigned int end,int did_alt)4473 static unsigned char *add_range(unsigned char *r, int *_j, RoomState *rs,
4474 unsigned int start, unsigned int end, int did_alt)
4475 {
4476 unsigned int top;
4477 int count;
4478 unsigned char lo[6], hi[6];
4479
4480 /* If this range spans different-sized encodings, split it up
4481 with a recursive call. */
4482 if (start <= 0x7FF) {
4483 top = 0x7FF;
4484 count = 2;
4485 } else if (start <= 0xFFFF) {
4486 top = 0xFFFF;
4487 count = 3;
4488 } else if (start <= 0x1FFFFF) {
4489 top = 0x1FFFFF;
4490 count = 4;
4491 } else if (start <= 0x3FFFFFF) {
4492 top = 0x3FFFFFF;
4493 count = 5;
4494 } else {
4495 top = 0x7FFFFFFF;
4496 count = 6;
4497 }
4498
4499 if (end > top) {
4500 r = add_range(r, _j, rs, top + 1, end, did_alt);
4501 end = top;
4502 did_alt = 0;
4503 }
4504
4505 /* At this point, the situation is much like creating a
4506 regexp to match decimal digits. If we wanted to match the
4507 range 28 to 75 (inclusive), we'd need three parts:
4508
4509 2[8-9]|[3-6][0-9]|7[0-5]
4510
4511 It gets more complex with three digits, say
4512 128 to 715:
4513
4514 12[8-9]|1[3-6][0-9]|[2-6][0-9][0-9]|7[0-0][0-9]|71[0-5]
4515
4516 but you get the idea. Note that any_str takes the place of
4517 [0-9].
4518
4519 This same idea works with UTF-8 "digits", so first encode
4520 our code-point numbers in UTF-8: */
4521
4522 scheme_utf8_encode_all(&start, 1, lo);
4523 scheme_utf8_encode_all(&end, 1, hi);
4524
4525 return add_byte_range(lo, hi, count, r, _j, rs, did_alt, 0);
4526 }
4527
need_ci_alternates(unsigned char * s,int delta,int len)4528 static int need_ci_alternates(unsigned char *s, int delta, int len)
4529 {
4530 mzchar us[1], c;
4531
4532 scheme_utf8_decode(s, delta, len, us, 0, 1, NULL, 0, 0);
4533 c = us[0];
4534
4535 return ((c != scheme_toupper(c))
4536 || (c != scheme_tolower(c))
4537 || (c != scheme_tofold(c))
4538 || (c != scheme_totitle(c)));
4539 }
4540
translate(unsigned char * s,int len,char ** result,int pcre)4541 static int translate(unsigned char *s, int len, char **result, int pcre)
4542 {
4543 int j, parse_flags = PARSE_CASE_SENS | PARSE_SINGLE_LINE;
4544 RoomState rs;
4545 unsigned char *r;
4546 Scheme_Object *parse_params = NULL;
4547
4548 rs.orig_len = len;
4549 rs.size = len;
4550
4551 r = (unsigned char *)scheme_malloc_atomic(rs.size + 1);
4552
4553 /* We need to translate if the pattern contains any use of ".", if
4554 there's a big character in a range, if there's a not-range, or if
4555 there's a big character before '+', '*', or '?'. */
4556
4557 for (rs.i = j = 0; rs.i < len;) {
4558 if (s[rs.i] == '[') {
4559 int k = rs.i + 1, saw_big = 0;
4560 int not_mode = 0;
4561
4562 /* First, check whether we need to translate this particular
4563 range. */
4564
4565 /* Caret at start is special: */
4566 if ((k < len) && (s[k] == '^')) {
4567 not_mode = 1;
4568 k++;
4569 }
4570 /* Close bracket start is special: */
4571 if ((k < len) && (s[k] == ']'))
4572 k++;
4573 while ((k < len) && (s[k] != ']')) {
4574 if (s[k] > 127)
4575 saw_big = 1;
4576 else if (pcre && (s[k] == '\\') && (k + 1 < len)) {
4577 if ((s[k+1] == 'D') || (s[k+1] == 'W') || (s[k+1] == 'S'))
4578 saw_big = 1;
4579 k++;
4580 } else if (pcre
4581 && (s[k] == '[')
4582 && (k + 1 < len)
4583 && (s[k+1] == ':')
4584 && is_posix_char_class((char *)s, k + 1, len, NULL)) {
4585 while (s[k] != ']') {
4586 k++;
4587 }
4588 }
4589 k++;
4590 }
4591 if ((k >= len) || (!saw_big && !not_mode)) {
4592 /* No translation necessary. */
4593 while (rs.i <= k) {
4594 r[j++] = s[rs.i++];
4595 }
4596 } else {
4597 /* Need to translate. */
4598 char *simple_on;
4599 int non_ascii = 0;
4600 Scheme_Object *ranges;
4601 unsigned int *us, *range_array;
4602 int ulen, on_count, range_len, rp, p;
4603
4604 ulen = scheme_utf8_decode(s, rs.i + 1, k, NULL, 0, -1, NULL, 0, 0);
4605 us = (unsigned int *)scheme_malloc_atomic(ulen * sizeof(unsigned int));
4606 scheme_utf8_decode(s, rs.i + 1, k, us, 0, -1, NULL, 0, 0);
4607
4608 /* The simple_on array lists ASCII chars to (not) find
4609 for the match, and `non_ascii` virtually extends
4610 to the rest of Unicode */
4611 simple_on = (char *)scheme_malloc_atomic(128);
4612 memset(simple_on, 0, 128);
4613 /* The ranges list is pairs of larger ranges */
4614 ranges = scheme_null;
4615
4616 p = 0;
4617 if (not_mode)
4618 p++;
4619 if (us[p] == '-') {
4620 simple_on['-'] = 1;
4621 p++;
4622 }
4623
4624 while (p < ulen) {
4625 if (((p + 2) < ulen)
4626 && us[p+1] == '-'
4627 && (!pcre || ((us[p] != '\\') && (us[p+2] != '\\')))) {
4628 int beg = us[p], end = us[p+2];
4629 if (end == '-') {
4630 FAIL("misplaced hyphen within square brackets in pattern");
4631 }
4632 if (end < beg) {
4633 /* Bad regexp */
4634 FAIL("invalid range within square brackets in pattern");
4635 }
4636
4637 if ((beg > 127) || (end > 127)) {
4638 /* A big-char range */
4639 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg),
4640 scheme_make_integer_value_from_unsigned(end)),
4641 ranges);
4642 if (!(parse_flags & PARSE_CASE_SENS)) {
4643 /* Try to build up parallel ranges, though they may
4644 not turn out to be parallel. If the ranges overlap,
4645 we'll clean them up in the final sort-and-merge
4646 pass for the whole ranges list. */
4647 int c, beg2, end2, c2, mode;
4648 for (mode = 0; mode < 4; mode++) {
4649 for (c = beg; c <= end; c++) {
4650 switch (mode) {
4651 case 0:
4652 beg2 = scheme_tofold(c);
4653 break;
4654 case 1:
4655 beg2 = scheme_tolower(c);
4656 break;
4657 case 2:
4658 beg2 = scheme_toupper(c);
4659 break;
4660 case 3:
4661 default:
4662 beg2 = scheme_totitle(c);
4663 break;
4664 }
4665 if (c != beg2) {
4666 end2 = beg2;
4667 for (; c <= end; c++) {
4668 switch (mode) {
4669 case 0:
4670 c2 = scheme_tofold(c);
4671 break;
4672 case 1:
4673 c2 = scheme_tolower(c);
4674 break;
4675 case 2:
4676 c2 = scheme_toupper(c);
4677 break;
4678 case 3:
4679 default:
4680 c2 = scheme_totitle(c);
4681 break;
4682 }
4683 if ((c2 == c) || (c2 != end2 + 1))
4684 break;
4685 }
4686 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg2),
4687 scheme_make_integer_value_from_unsigned(end2)),
4688 ranges);
4689 }
4690 }
4691 }
4692 }
4693 } else {
4694 /* Small range */
4695 int w;
4696 for (w = beg; w <= end; w++) {
4697 simple_on[w] = 1;
4698 }
4699 }
4700 p += 3;
4701 } else if (pcre && (us[p] == '\\')) {
4702 if ((p + 1) < ulen) {
4703 int c = us[p + 1];
4704 if (((c >= 'a') && (c <= 'z'))
4705 || ((c >= 'A') && (c <= 'Z'))) {
4706 regcharclass(c, simple_on, &non_ascii);
4707 p += 2;
4708 } else if (c < 128) {
4709 simple_on[c] = 1;
4710 p += 2;
4711 } else {
4712 /* Let next iteration handle it.
4713 (There's no danger of using it as a meta-character.) */
4714 p++;
4715 }
4716 } else
4717 FAIL("trailing \\ in pattern");
4718 } else if (us[p] > 127) {
4719 int c = us[p];
4720 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c),
4721 scheme_make_integer_value_from_unsigned(c)),
4722 ranges);
4723 if (!(parse_flags & PARSE_CASE_SENS)) {
4724 int mode, c2;
4725 for (mode = 0; mode < 4; mode++) {
4726 switch (mode) {
4727 case 0:
4728 c2 = scheme_tofold(c);
4729 break;
4730 case 1:
4731 c2 = scheme_tolower(c);
4732 break;
4733 case 2:
4734 c2 = scheme_toupper(c);
4735 break;
4736 case 3:
4737 default:
4738 c2 = scheme_totitle(c);
4739 break;
4740 }
4741 if (c2 != c) {
4742 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c2),
4743 scheme_make_integer_value_from_unsigned(c2)),
4744 ranges);
4745 }
4746 }
4747 }
4748 p++;
4749 } else if (pcre
4750 && (us[p] == '[')
4751 && ((p + 1) < ulen)
4752 && (us[p+1] == ':')
4753 && is_posix_char_class_in_unicode(us, p + 1, ulen, simple_on)) {
4754 while (us[p] != ']') {
4755 p++;
4756 }
4757 p++;
4758 } else {
4759 if (((p + 1) < ulen) && (us[p] == '-')) {
4760 FAIL("misplaced hyphen within square brackets in pattern");
4761 return 0;
4762 }
4763 simple_on[us[p]] = 1;
4764 p++;
4765 }
4766 }
4767
4768 if (non_ascii) {
4769 /* Replace the ranges array to cover all non-ASCII characters */
4770 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer(128),
4771 scheme_make_integer(0x10FFFF)),
4772 scheme_null);
4773 }
4774
4775 /* Turn the ranges list into an array */
4776 range_len = scheme_list_length(ranges);
4777 range_array = (unsigned int *)scheme_malloc_atomic(2 * range_len * sizeof(unsigned int));
4778 for (rp = 0; SCHEME_PAIRP(ranges); ranges = SCHEME_CDR(ranges), rp += 2) {
4779 uintptr_t hi, lo;
4780 scheme_get_unsigned_int_val(SCHEME_CAAR(ranges), &lo);
4781 scheme_get_unsigned_int_val(SCHEME_CDR(SCHEME_CAR(ranges)), &hi);
4782 range_array[rp] = (unsigned int)lo;
4783 range_array[rp+1] = (unsigned int)hi;
4784 }
4785 range_len *= 2;
4786 /* Sort the ranges by the starting index. */
4787 my_qsort(range_array, range_len >> 1, 2 * sizeof(unsigned int), compare_ranges);
4788
4789 /* If a range starts below 128, fill in the simple array */
4790 for (rp = 0; rp < range_len; rp += 2) {
4791 if (range_array[rp] < 128) {
4792 for (p = range_array[rp]; p < 128; p++) {
4793 simple_on[p] = 1;
4794 }
4795 range_array[rp] = 128;
4796 }
4797 }
4798
4799 if (!(parse_flags & PARSE_CASE_SENS)) {
4800 for (p = 'a'; p <= 'z'; p++) {
4801 if (simple_on[p])
4802 simple_on[rx_toupper(p)] = 1;
4803 if (simple_on[rx_toupper(p)])
4804 simple_on[p] = 1;
4805 }
4806 }
4807
4808 /* Count simples that are on */
4809 on_count = 0;
4810 for (p = 0; p < 128; p++) {
4811 if (simple_on[p])
4812 on_count++;
4813 }
4814
4815 if (not_mode) {
4816 /* "Not" mode. We produce something in regular mode */
4817 /* Start with "(?:[...]|" for simples. */
4818 unsigned int last_end;
4819 int did_alt;
4820 r = make_room(r, j, 6 + (128 - on_count) + ((pcre && !simple_on['\\']) ? 1 : 0), &rs);
4821 r[j++] = '(';
4822 r[j++] = '?';
4823 r[j++] = ':';
4824 if (on_count < 128) {
4825 if (!on_count) {
4826 r[j++] = '[';
4827 r[j++] = 0;
4828 r[j++] = '-';
4829 r[j++] = 127;
4830 r[j++] = ']';
4831 } else {
4832 r[j++] = '[';
4833 if (!simple_on[']'])
4834 r[j++] = ']';
4835 for (p = 0; p < 128; p++) {
4836 if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
4837 if (!simple_on[p])
4838 r[j++] = p;
4839 }
4840 if (pcre && !simple_on['\\']) {
4841 r[j++] = '\\';
4842 r[j++] = '\\';
4843 }
4844 if (!simple_on['-'])
4845 r[j++] = '-';
4846 r[j++] = ']';
4847 }
4848 did_alt = 0;
4849 } else
4850 did_alt = 1;
4851 last_end = 128;
4852 for (rp = 0; rp < range_len; rp += 2) {
4853 if (range_array[rp] > last_end) {
4854 r = add_range(r, &j, &rs, last_end, range_array[rp] - 1, did_alt);
4855 did_alt = 0;
4856 }
4857 if ((range_array[rp + 1] + 1) > last_end)
4858 last_end = range_array[rp + 1] + 1;
4859 }
4860 if (last_end <= 0x10FFFF) {
4861 if (last_end < 0xD800) {
4862 r = add_range(r, &j, &rs, last_end, 0xD7FF, did_alt);
4863 did_alt = 0;
4864 r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, did_alt);
4865 } else {
4866 r = add_range(r, &j, &rs, last_end, 0x10FFFF, did_alt);
4867 did_alt = 0;
4868 }
4869 }
4870 r = make_room(r, j, 1, &rs);
4871 r[j++] = ')';
4872 } else {
4873 /* Normal mode */
4874 /* Start with "(?:[...]|" for simples. */
4875 int p, did_alt;
4876 r = make_room(r, j, 5 + on_count + ((pcre && simple_on['\\']) ? 1 : 0), &rs);
4877 r[j++] = '(';
4878 r[j++] = '?';
4879 r[j++] = ':';
4880 if (on_count) {
4881 if (on_count == 128) {
4882 r[j++] = '[';
4883 r[j++] = 0;
4884 r[j++] = '-';
4885 r[j++] = 127;
4886 r[j++] = ']';
4887 } else {
4888 r[j++] = '[';
4889 if (simple_on[']'])
4890 r[j++] = ']';
4891 for (p = 0; p < 128; p++) {
4892 if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
4893 if (simple_on[p])
4894 r[j++] = p;
4895 }
4896 if (pcre && simple_on['\\']) {
4897 r[j++] = '\\';
4898 r[j++] = '\\';
4899 }
4900 if (simple_on['-'])
4901 r[j++] = '-';
4902 r[j++] = ']';
4903 }
4904 did_alt = 0;
4905 } else
4906 did_alt = 1;
4907 for (rp = 0; rp < range_len; rp += 2) {
4908 r = add_range(r, &j, &rs, range_array[rp], range_array[rp+1], did_alt);
4909 did_alt = 0;
4910 }
4911 r = make_room(r, j, 1, &rs);
4912 r[j++] = ')';
4913 }
4914 }
4915 rs.i = k + 1;
4916 } else if (s[rs.i] == '\\') {
4917 if (pcre
4918 && (rs.i+1 < len)
4919 && ((s[rs.i+1] == 'D')
4920 || (s[rs.i+1] == 'W')
4921 || (s[rs.i+1] == 'S'))) {
4922 /* matches non-ASCII characters, so convert */
4923 char *simple_on;
4924 int non_ascii;
4925 int n;
4926
4927 simple_on = (char *)scheme_malloc_atomic(128);
4928 memset(simple_on, 0, 128);
4929
4930 rs.i++;
4931 regcharclass(s[rs.i++], simple_on, &non_ascii);
4932
4933 r = make_room(r, j, 4, &rs);
4934 r[j++] = '(';
4935 r[j++] = '?';
4936 r[j++] = ':';
4937 r[j++] = '[';
4938
4939 for (n = 0; n < 128; ) {
4940 if (simple_on[n]) {
4941 int m;
4942 for (m = n + 1; (m < 128) && simple_on[m]; m++) {
4943 }
4944 r = make_room(r, j, 3, &rs);
4945 r[j++] = n;
4946 r[j++] = '-';
4947 r[j++] = m-1;
4948 n = m;
4949 } else
4950 n++;
4951 }
4952 r = make_room(r, j, 1, &rs);
4953 r[j++] = ']';
4954
4955 if (non_ascii) { /* we expect this to be true! */
4956 r = add_range(r, &j, &rs, 128, 0x10FFFF, 0);
4957 r = make_room(r, j, 1, &rs);
4958 }
4959 r[j++] = ')';
4960 } else {
4961 /* Skip over next char, possibly big: */
4962 r[j++] = s[rs.i++];
4963 if ((rs.i < len)
4964 && (s[rs.i] > 127)) {
4965 r[j++] = s[rs.i++];
4966 while ((rs.i < len) && ((s[rs.i] & 0xC0) == 0x80)) {
4967 r[j++] = s[rs.i++];
4968 }
4969 } else
4970 r[j++] = s[rs.i++];
4971 }
4972 } else if ((s[rs.i] == '.')
4973 && (!pcre
4974 || (rs.i < 3)
4975 || (s[rs.i-1] != '{')
4976 || ((s[rs.i-2] == 'p')
4977 && (s[rs.i-2] == 'P'))
4978 || (s[rs.i-3] != '\\'))) {
4979 /* "." has to be expanded. */
4980 r = make_room(r, j, (parse_flags & PARSE_SINGLE_LINE) ? 9 : 8, &rs);
4981 r[j++] = '(';
4982 r[j++] = '?';
4983 r[j++] = ':';
4984 r[j++] = '[';
4985 r[j++] = '\00';
4986 r[j++] = '-';
4987 if (!(parse_flags & PARSE_SINGLE_LINE)) {
4988 r[j++] = '\n' - 1;
4989 r[j++] = '\n' + 1;
4990 r[j++] = '-';
4991 }
4992 r[j++] = '\177';
4993 r[j++] = ']';
4994 r = add_range(r, &j, &rs, 128, 0xD7FF, 0);
4995 r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, 0);
4996 r = make_room(r, j, 1, &rs);
4997 r[j++] = ')';
4998 rs.i++;
4999 } else if (s[rs.i] > 127) {
5000 int k = rs.i + 1;
5001 /* Look for *, +, or ? after this big char */
5002 while ((k < len) && ((s[k] & 0xC0) == 0x80)) {
5003 k++;
5004 }
5005 if ((k < len) && ((s[k] == '+')
5006 || (s[k] == '*')
5007 || (s[k] == '?')
5008 || (!(parse_flags & PARSE_CASE_SENS)
5009 && need_ci_alternates(s, rs.i, k)))) {
5010 /* Need to translate; wrap char in (?: ...) */
5011 int orig_i;
5012 r = make_room(r, j, 4, &rs);
5013 r[j++] = '(';
5014 r[j++] = '?';
5015 r[j++] = ':';
5016 orig_i = rs.i;
5017 while (rs.i < k) {
5018 r[j++] = s[rs.i++];
5019 }
5020 if (!(parse_flags & PARSE_CASE_SENS)) {
5021 /* Add alternates for different cases: */
5022 mzchar us[1], c0, c1, wrote[4];
5023 int clen, ci, num_wrote = 1, mode;
5024 unsigned char s2[MAX_UTF8_CHAR_BYTES];
5025
5026 scheme_utf8_decode(s, orig_i, k, us, 0, 1, NULL, 0, 0);
5027 c0 = us[0];
5028 wrote[0] = c0;
5029 for (mode = 0; mode < 4; mode++) {
5030 switch (mode) {
5031 case 0:
5032 c1 = scheme_tofold(c0);
5033 break;
5034 case 1:
5035 c1 = scheme_tolower(c0);
5036 break;
5037 case 2:
5038 c1 = scheme_toupper(c0);
5039 break;
5040 case 3:
5041 default:
5042 c1 = scheme_totitle(c0);
5043 break;
5044 }
5045 for (ci = 0; ci < num_wrote; ci++) {
5046 if (c1 == wrote[ci])
5047 break;
5048 }
5049 if (ci >= num_wrote) {
5050 wrote[num_wrote++] = c1;
5051 us[0] = c1;
5052 clen = scheme_utf8_encode(us, 0, 1, s2, 0, 0);
5053 r = make_room(r, j, clen + 1, &rs);
5054 r[j++] = '|';
5055 for (ci = 0; ci < clen; ci++) {
5056 r[j++] = s2[ci];
5057 }
5058 }
5059 }
5060 }
5061 r[j++] = ')';
5062 } else {
5063 /* No translation. */
5064 while (rs.i < k) {
5065 r[j++] = s[rs.i++];
5066 }
5067 }
5068 } else {
5069 /* The translation needs to know about case-insensitive
5070 and single-line modes, so track parens: */
5071 if (s[rs.i] == '(') {
5072 int old_flags = parse_flags;
5073 if ((rs.i + 1 < len) && (s[rs.i + 1] == '?')) {
5074 int k;
5075 for (k = rs.i + 2; k < len; k++) {
5076 if ((s[k] == ':')
5077 || (s[k] == '<')
5078 || (s[k] == '>')
5079 || (s[k] == '=')
5080 || (s[k] == '!'))
5081 break;
5082 if (s[k] == 'i') {
5083 parse_flags &= ~PARSE_CASE_SENS;
5084 } else if (s[k] == 's') {
5085 parse_flags |= PARSE_SINGLE_LINE;
5086 } else if (s[k] == 'm') {
5087 parse_flags &= ~PARSE_SINGLE_LINE;
5088 } else if (s[k] == '-') {
5089 if (k + 1 < len) {
5090 k++;
5091 if (s[k] == 'i') {
5092 parse_flags |= PARSE_CASE_SENS;
5093 } else if (s[k] == 's') {
5094 parse_flags &= ~PARSE_SINGLE_LINE;
5095 } else if (s[k] == 'm') {
5096 parse_flags |= PARSE_SINGLE_LINE;
5097 }
5098 }
5099 }
5100 }
5101 }
5102 if (parse_params || (parse_flags != old_flags)) {
5103 parse_params = scheme_make_raw_pair(scheme_make_integer(old_flags),
5104 parse_params);
5105 }
5106 } else if (s[rs.i] == ')') {
5107 if (parse_params) {
5108 parse_flags = SCHEME_INT_VAL(SCHEME_CAR(parse_params));
5109 parse_params = SCHEME_CDR(parse_params);
5110 }
5111 }
5112 r[j++] = s[rs.i++];
5113 }
5114 }
5115
5116 r[j] = 0;
5117 *result = (char *)r;
5118 return j;
5119 }
5120
5121 /************************************************************/
5122 /* Racket front end */
5123 /************************************************************/
5124
scheme_is_pregexp(Scheme_Object * o)5125 int scheme_is_pregexp(Scheme_Object *o)
5126 {
5127 return !!(((regexp *)o)->flags & REGEXP_IS_PCRE);
5128 }
5129
do_make_regexp(const char * who,int is_byte,int pcre,int argc,Scheme_Object * argv[])5130 static Scheme_Object *do_make_regexp(const char *who, int is_byte, int pcre, int argc, Scheme_Object *argv[])
5131 {
5132 Scheme_Object *re, *bs;
5133 Scheme_Object *handler;
5134 char *s;
5135 int slen;
5136
5137 if (is_byte) {
5138 if (!SCHEME_BYTE_STRINGP(argv[0]))
5139 scheme_wrong_contract(who, "bytes?", 0, argc, argv);
5140 bs = argv[0];
5141 } else {
5142 if (!SCHEME_CHAR_STRINGP(argv[0]))
5143 scheme_wrong_contract(who, "string?", 0, argc, argv);
5144 bs = scheme_char_string_to_byte_string(argv[0]);
5145 }
5146
5147 if (argc >= 2) {
5148 if (!scheme_check_proc_arity2(who, 1, 1, argc, argv, 1))
5149 scheme_wrong_contract(who, "(or/c #f (string? -> any))", 1, argc, argv);
5150 handler = argv[1];
5151 } else
5152 handler = scheme_false;
5153
5154 s = SCHEME_BYTE_STR_VAL(bs);
5155 slen = SCHEME_BYTE_STRTAG_VAL(bs);
5156
5157 if (!is_byte) {
5158 slen = translate((unsigned char *)s, slen, &s, pcre);
5159 #if 0
5160 /* Debugging, to see the translated regexp: */
5161 {
5162 char *cp;
5163 int i;
5164 cp = (char *)scheme_malloc_atomic(slen + 1);
5165 memcpy(cp, s, slen + 1);
5166 for (i = 0; i < slen; i++) {
5167 if (!cp[i]) cp[i] = '0';
5168 }
5169 printf("%d %s\n", slen, scheme_write_to_string(scheme_make_byte_string(cp), 0));
5170 }
5171 #endif
5172 }
5173
5174 regerrorwho = who;
5175 re = (Scheme_Object *)regcomp(s, 0, slen, pcre, handler);
5176 regerrorwho = NULL;
5177
5178 /* passed a handler and regexp compilation failed */
5179 if (!re) {
5180 return regerrorval;
5181 }
5182
5183 if (!is_byte)
5184 ((regexp *)re)->flags |= REGEXP_IS_UTF8;
5185 if (pcre)
5186 ((regexp *)re)->flags |= REGEXP_IS_PCRE;
5187
5188 if (SCHEME_IMMUTABLEP(argv[0]))
5189 ((regexp *)re)->source = argv[0];
5190 else if (is_byte) {
5191 Scheme_Object *src;
5192 src = scheme_make_immutable_sized_byte_string(SCHEME_BYTE_STR_VAL(argv[0]),
5193 SCHEME_BYTE_STRTAG_VAL(argv[0]),
5194 1);
5195 ((regexp *)re)->source = src;
5196 } else {
5197 Scheme_Object *src;
5198 src = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(argv[0]),
5199 SCHEME_CHAR_STRTAG_VAL(argv[0]),
5200 1);
5201 ((regexp *)re)->source = src;
5202 }
5203
5204 return re;
5205 }
5206
make_regexp(int argc,Scheme_Object * argv[])5207 static Scheme_Object *make_regexp(int argc, Scheme_Object *argv[])
5208 {
5209 return do_make_regexp("byte-regexp", 1, 0, argc, argv);
5210 }
5211
make_utf8_regexp(int argc,Scheme_Object * argv[])5212 static Scheme_Object *make_utf8_regexp(int argc, Scheme_Object *argv[])
5213 {
5214 return do_make_regexp("regexp", 0, 0, argc, argv);
5215 }
5216
make_pregexp(int argc,Scheme_Object * argv[])5217 static Scheme_Object *make_pregexp(int argc, Scheme_Object *argv[])
5218 {
5219 return do_make_regexp("byte-pregexp", 1, 1, argc, argv);
5220 }
5221
make_utf8_pregexp(int argc,Scheme_Object * argv[])5222 static Scheme_Object *make_utf8_pregexp(int argc, Scheme_Object *argv[])
5223 {
5224 return do_make_regexp("pregexp", 0, 1, argc, argv);
5225 }
5226
scheme_make_regexp(Scheme_Object * str,int is_byte,int pcre,int * volatile result_is_err_string)5227 Scheme_Object *scheme_make_regexp(Scheme_Object *str, int is_byte, int pcre, int * volatile result_is_err_string)
5228 {
5229 mz_jmp_buf * volatile save, newbuf;
5230 Scheme_Object * volatile result;
5231
5232 *result_is_err_string = 0;
5233
5234 /* we rely on single-threaded, non-blocking regexp compilation: */
5235 save = scheme_current_thread->error_buf;
5236 scheme_current_thread->error_buf = &newbuf;
5237 failure_msg_for_read = "yes";
5238 if (!scheme_setjmp(newbuf)) {
5239 if (is_byte) {
5240 if (pcre)
5241 result = make_pregexp(1, &str);
5242 else
5243 result = make_regexp(1, &str);
5244 } else {
5245 if (pcre)
5246 result = make_utf8_pregexp(1, &str);
5247 else
5248 result = make_utf8_regexp(1, &str);
5249 }
5250 } else {
5251 result = (Scheme_Object *)failure_msg_for_read;
5252 *result_is_err_string = 1;
5253 }
5254
5255 failure_msg_for_read = NULL;
5256 scheme_current_thread->error_buf = save;
5257 return result;
5258 }
5259
regcomp_object(Scheme_Object * str)5260 static regexp *regcomp_object(Scheme_Object *str)
5261 {
5262 if (SCHEME_BYTE_STRINGP(str))
5263 return (regexp *)make_regexp(1, &str);
5264 else
5265 return (regexp *)make_utf8_regexp(1, &str);
5266 }
5267
scheme_clear_rx_buffers(void)5268 void scheme_clear_rx_buffers(void)
5269 {
5270 startp_buffer_cache = NULL;
5271 endp_buffer_cache = NULL;
5272 maybep_buffer_cache = NULL;
5273 match_stack_buffer_cache = NULL;
5274 }
5275
gen_compare(char * name,int pos,int argc,Scheme_Object * argv[],int peek,int nonblock,int last_bytes)5276 static Scheme_Object *gen_compare(char *name, int pos,
5277 int argc, Scheme_Object *argv[],
5278 int peek, int nonblock, int last_bytes)
5279 {
5280 regexp *r;
5281 char *full_s, *prefix = NULL;
5282 rxpos *startp, *maybep, *endp, *match_stack, prefix_len = 0, prefix_offset = 0, minpos;
5283 int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
5284 Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
5285 Scheme_Object *last_bytes_str = scheme_false, *srcin;
5286 rx_lazy_str_t *lazy_string = NULL;
5287
5288 if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
5289 && !SCHEME_BYTE_STRINGP(argv[0])
5290 && !SCHEME_CHAR_STRINGP(argv[0]))
5291 scheme_wrong_contract(name, "(or/c regexp? byte-regexp? string? bytes?)", 0, argc, argv);
5292 if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
5293 && !SCHEME_INPUT_PORTP(argv[1])
5294 && !SCHEME_PATHP(argv[1]))
5295 scheme_wrong_contract(name, peek ? "input-port?" : "(or/c string? bytes? path? input-port?)", 1, argc, argv);
5296
5297 srcin = argv[1];
5298 if (SCHEME_PATHP(srcin)) {
5299 if (SCHEME_BYTE_STRINGP(argv[0])
5300 || (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type)
5301 && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)))
5302 srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin),
5303 SCHEME_PATH_LEN(srcin),
5304 1);
5305 else
5306 srcin = scheme_path_to_char_string(srcin);
5307 }
5308
5309 if (SCHEME_CHAR_STRINGP(srcin)) {
5310 iport = NULL;
5311 endset = SCHEME_CHAR_STRLEN_VAL(srcin);
5312 } else if (SCHEME_INPUT_PORTP(srcin)) {
5313 iport = srcin;
5314 endset = -2;
5315 } else {
5316 iport = NULL;
5317 endset = SCHEME_BYTE_STRLEN_VAL(srcin);
5318 }
5319
5320 if (argc > 2) {
5321 int len = endset;
5322
5323 offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
5324
5325 if (!iport && (offset > len)) {
5326 scheme_out_of_range(name, NULL, "offset ", argv[2], srcin, 0, len);
5327 return NULL;
5328 } else if (offset < 0) {
5329 /* argument was a bignum */
5330 offset = 0x7FFFFFFF;
5331 }
5332 startv = argv[2];
5333
5334 if (argc > 3) {
5335 if (!SCHEME_FALSEP(argv[3])) {
5336 endset = scheme_extract_index(name, 3, argc, argv, len + 1, 1);
5337
5338 if (iport) {
5339 if (endset < 0) {
5340 /* argument was a bignum */
5341 endset = 0x7FFFFFFF;
5342 }
5343 /* compare numbers */
5344 if (scheme_bin_lt(argv[3], argv[2])) {
5345 scheme_contract_error(name,
5346 "ending index is smaller than starting index",
5347 "starting index", 1, argv[2],
5348 "ending index", 1, argv[3],
5349 NULL);
5350 return NULL;
5351 }
5352 } else if (endset < offset || endset > len) {
5353 scheme_out_of_range(name, NULL, "ending ", argv[3], srcin, offset, len);
5354 return NULL;
5355 }
5356 endv = argv[3];
5357 }
5358
5359 if (argc > 4) {
5360 if (peek) {
5361 if (!SCHEME_FALSEP(argv[4])) {
5362 unless_evt = argv[4];
5363 if (!SAME_TYPE(SCHEME_TYPE(unless_evt), scheme_progress_evt_type)) {
5364 scheme_wrong_contract(name, "(or/c progress-evt? #f)", 4, argc, argv);
5365 return NULL;
5366 }
5367 if (!iport) {
5368 scheme_contract_error(name,
5369 "progress evt cannot be used with string input",
5370 "progress evt", 1, unless_evt,
5371 NULL);
5372 } else if (!SAME_OBJ(iport, SCHEME_PTR1_VAL(unless_evt))) {
5373 scheme_contract_error(name,
5374 "evt is not a progress evt for the given port",
5375 "progress evt", 1, unless_evt,
5376 "port", 1, iport,
5377 NULL);
5378 return NULL;
5379 }
5380 }
5381 } else {
5382 if (SCHEME_TRUEP(argv[4])) {
5383 if (!SCHEME_OUTPUT_PORTP(argv[4]))
5384 scheme_wrong_contract(name, "(or/c output-port? #f)", 4, argc, argv);
5385 oport = argv[4];
5386 }
5387 }
5388 }
5389
5390 if (argc > 5) {
5391 if (!SCHEME_BYTE_STRINGP(argv[5]))
5392 scheme_wrong_contract(name, "bytes?", 5, argc, argv);
5393 prefix = SCHEME_BYTE_STR_VAL(argv[5]);
5394 prefix_len = SCHEME_BYTE_STRLEN_VAL(argv[5]);
5395 prefix_offset = 0;
5396
5397 if (argc > 6) {
5398 if (!scheme_nonneg_exact_p(argv[6]))
5399 scheme_wrong_contract(name, "exact-nonnegative-integer?", 6, argc, argv);
5400 if (SCHEME_INTP(argv[6]))
5401 last_bytes_count = SCHEME_INT_VAL(argv[6]);
5402 else
5403 last_bytes_count = -1; /* => as many as available */
5404 }
5405 }
5406 }
5407 }
5408
5409 if (iport && !startv)
5410 startv = scheme_make_integer(0);
5411
5412 if (SCHEME_BYTE_STRINGP(argv[0])
5413 || SCHEME_CHAR_STRINGP(argv[0]))
5414 r = regcomp_object(argv[0]);
5415 else
5416 r = (regexp *)argv[0];
5417
5418 was_non_byte = 0;
5419 orig_offset = 0; /* extra offset */
5420 if (!iport) {
5421 if (SCHEME_BYTE_STRINGP(srcin))
5422 full_s = SCHEME_BYTE_STR_VAL(srcin);
5423 else {
5424 /* Extract substring and UTF-8 encode: */
5425 if (endset - offset < LAZY_STRING_CHUNK_SIZE) {
5426 /* String is short enough to decode in one go: */
5427 int blen;
5428 blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
5429 NULL, 0,
5430 0 /* not UTF-16 */);
5431 full_s = (char *)scheme_malloc_atomic(blen);
5432 scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
5433 (unsigned char *)full_s, 0,
5434 0 /* not UTF-16 */);
5435 orig_offset = offset;
5436 offset = 0;
5437 endset = blen;
5438 } else {
5439 /* Handle extremely long strings by decoding lazily: */
5440 lazy_string = MALLOC_ONE_RT(rx_lazy_str_t);
5441 #ifdef MZTAG_REQUIRED
5442 lazy_string->type = scheme_rt_rx_lazy_string;
5443 #endif
5444 lazy_string->start = offset;
5445 lazy_string->end = endset;
5446 lazy_string->done = 0;
5447 lazy_string->blen = 0;
5448 lazy_string->s = NULL;
5449 lazy_string->chars = SCHEME_CHAR_STR_VAL(srcin);
5450 full_s = NULL;
5451 orig_offset = offset;
5452 offset = 0;
5453 endset = 0;
5454 }
5455 if (r->flags & REGEXP_IS_UTF8)
5456 was_non_byte = 1;
5457 else {
5458 /* Convert orig_offset into encoded bytes */
5459 orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset,
5460 NULL, 0,
5461 0);
5462 }
5463 }
5464 } else
5465 full_s = NULL;
5466
5467 if (startp_buffer_cache && (r->nsubexp <= rx_buffer_size)) {
5468 startp = startp_buffer_cache;
5469 maybep = maybep_buffer_cache;
5470 endp = endp_buffer_cache;
5471 startp_buffer_cache = NULL;
5472 } else {
5473 startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5474 maybep = NULL;
5475 match_stack = NULL;
5476 endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5477 }
5478 if ((r->nsubexp > 1) && !maybep) {
5479 maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5480
5481 if (match_stack_buffer_cache) {
5482 match_stack = match_stack_buffer_cache;
5483 match_stack_buffer_cache = NULL;
5484 } else
5485 match_stack = MALLOC_N_ATOMIC(rxpos, MATCH_STACK_SIZE);
5486 } else {
5487 match_stack = NULL;
5488 }
5489
5490 dropped = scheme_make_integer(0);
5491
5492 m = regexec(name, r, full_s,
5493 offset, (endset < 0 ? endset : endset - offset),
5494 offset, lazy_string,
5495 startp, maybep, endp, match_stack,
5496 iport, unless_evt, nonblock,
5497 &full_s, peek, pos, last_bytes_count, oport,
5498 startv, endv, &dropped,
5499 prefix, prefix_len, prefix_offset);
5500
5501 if (lazy_string) {
5502 full_s = lazy_string->s;
5503 endset = lazy_string->blen;
5504 }
5505
5506 if (iport) {
5507 minpos = -prefix_len;
5508 offset = 0;
5509 } else
5510 minpos = offset - prefix_len;
5511
5512 if (m) {
5513 int i;
5514 Scheme_Object *l = scheme_null, *rs;
5515
5516 if (oport && !iport)
5517 scheme_put_byte_string(name, oport, full_s, 0, *startp, 0);
5518
5519 if (last_bytes) {
5520 rxpos frompos, tooffset;
5521
5522 if ((last_bytes_count < 0)
5523 || (endp[0] - minpos < last_bytes_count))
5524 last_bytes_count = endp[0] - minpos;
5525
5526 if (!last_bytes_count) {
5527 last_bytes_str = empty_byte_string;
5528 } else {
5529 frompos = endp[0] - last_bytes_count;
5530 tooffset = 0;
5531
5532 last_bytes_str = scheme_alloc_byte_string(last_bytes_count, 0);
5533 if (frompos < offset) {
5534 /* draw from prefix: */
5535 rxpos amt = last_bytes_count;
5536 if (frompos + last_bytes_count > offset)
5537 amt = offset - frompos;
5538 memcpy(SCHEME_BYTE_STR_VAL(last_bytes_str) XFORM_OK_PLUS tooffset,
5539 prefix + prefix_offset + prefix_len - (offset - frompos),
5540 amt);
5541 frompos += amt;
5542 tooffset += amt;
5543 last_bytes_count -= amt;
5544 }
5545 memcpy(SCHEME_BYTE_STR_VAL(last_bytes_str) XFORM_OK_PLUS tooffset,
5546 full_s + frompos,
5547 last_bytes_count);
5548 }
5549 }
5550
5551 if (pos > 1) {
5552 /* pos == 2 => just get true or false */
5553 dropped = scheme_true;
5554 } else {
5555 for (i = r->nsubexp; i--; ) {
5556 if (startp[i] >= minpos) {
5557 if (pos) {
5558 Scheme_Object *startpd, *endpd;
5559
5560 if (was_non_byte) {
5561 /* Need to figure out how startpd and endpd correspond to
5562 code points. Note that the input regexp matches only
5563 unicode chars, so the start and end points can't be in
5564 the middle of encoded characters. */
5565 int uspd, uepd;
5566 if (startp[i] >= offset)
5567 uspd = scheme_utf8_decode((const unsigned char *)full_s, offset, startp[i],
5568 NULL, 0, -1,
5569 NULL, 0, 0);
5570 else {
5571 uspd = scheme_utf8_decode((const unsigned char *)prefix,
5572 prefix_offset + prefix_len + (startp[i] - offset),
5573 prefix_offset + prefix_len,
5574 NULL, 0, -1,
5575 NULL, 0, 0);
5576 uspd = offset - uspd;
5577 }
5578 uspd += orig_offset;
5579 startpd = scheme_make_integer(uspd);
5580 if (startp[i] >= offset) {
5581 uepd = scheme_utf8_decode((const unsigned char *)full_s, startp[i], endp[i],
5582 NULL, 0, -1,
5583 NULL, 0, 0);
5584 uepd += uspd;
5585 } else if (endp[i] <= offset) {
5586 uepd = scheme_utf8_decode((const unsigned char *)prefix,
5587 prefix_offset + prefix_len + (endp[i] - offset),
5588 prefix_offset + prefix_len,
5589 NULL, 0, -1,
5590 NULL, 0, 0);
5591 uepd = offset - uepd;
5592 uepd += orig_offset;
5593 } else {
5594 uepd = scheme_utf8_decode((const unsigned char *)full_s, 0, endp[i],
5595 NULL, 0, -1,
5596 NULL, 0, 0);
5597 uepd += orig_offset;
5598 }
5599 endpd = scheme_make_integer(uepd);
5600 } else {
5601 int v;
5602 v = startp[i] + orig_offset;
5603 startpd = scheme_make_integer(v);
5604 v = endp[i] + orig_offset;
5605 endpd = scheme_make_integer(v);
5606
5607 if (iport) {
5608 /* Increment by drop count: */
5609 startpd = scheme_bin_plus(startpd, dropped);
5610 endpd = scheme_bin_plus(endpd, dropped);
5611 }
5612 }
5613
5614 l = scheme_make_pair(scheme_make_pair(startpd, endpd),
5615 l);
5616 } else {
5617 intptr_t len;
5618 len = endp[i] - startp[i];
5619 if (startp[i] >= offset) {
5620 if (was_non_byte) {
5621 rs = scheme_make_sized_offset_utf8_string(full_s, startp[i], len);
5622 } else {
5623 rs = scheme_make_sized_offset_byte_string(full_s, startp[i], len, 1);
5624 }
5625 } else if (endp[i] <= offset) {
5626 /* all in prefix */
5627 rs = scheme_make_sized_offset_byte_string(prefix,
5628 prefix_offset + (startp[i] - minpos),
5629 endp[i] - startp[i],
5630 1);
5631 if (was_non_byte)
5632 rs = scheme_byte_string_to_char_string(rs);
5633 } else {
5634 /* span both */
5635 Scheme_Object *rs2;
5636 rs = scheme_make_sized_offset_byte_string(prefix,
5637 prefix_offset + (startp[i] - minpos),
5638 offset - startp[i],
5639 1);
5640 rs2 = scheme_make_sized_offset_byte_string(full_s, offset, endp[i] - offset, 1);
5641 rs = scheme_append_byte_string(rs, rs2);
5642 if (was_non_byte)
5643 rs = scheme_byte_string_to_char_string(rs);
5644 }
5645 l = scheme_make_pair(rs, l);
5646 }
5647 } else
5648 l = scheme_make_pair(scheme_false, l);
5649 }
5650 dropped = l;
5651 }
5652 } else {
5653 if (oport && !iport)
5654 scheme_put_byte_string(name, oport, full_s, 0, endset, 0);
5655
5656 dropped = scheme_false;
5657 last_bytes_str = scheme_false;
5658 }
5659
5660 if (!startp_buffer_cache || (r->nsubexp > rx_buffer_size)) {
5661 rx_buffer_size = r->nsubexp;
5662 startp_buffer_cache = startp;
5663 maybep_buffer_cache = maybep;
5664 endp_buffer_cache = endp;
5665 } else if (maybep && !maybep_buffer_cache && (r->nsubexp == rx_buffer_size)) {
5666 maybep_buffer_cache = maybep;
5667 }
5668 if (match_stack && !match_stack_buffer_cache)
5669 match_stack_buffer_cache = match_stack;
5670
5671 if (last_bytes) {
5672 Scheme_Object *a[2];
5673 a[0] = dropped;
5674 a[1] = last_bytes_str;
5675 return scheme_values(2, a);
5676 } else
5677 return dropped;
5678 }
5679
compare(int argc,Scheme_Object * argv[])5680 static Scheme_Object *compare(int argc, Scheme_Object *argv[])
5681 {
5682 return gen_compare("regexp-match", 0, argc, argv, 0, 0, 0);
5683 }
5684
compare_end(int argc,Scheme_Object * argv[])5685 static Scheme_Object *compare_end(int argc, Scheme_Object *argv[])
5686 {
5687 return gen_compare("regexp-match/end", 0, argc, argv, 0, 0, 1);
5688 }
5689
positions(int argc,Scheme_Object * argv[])5690 static Scheme_Object *positions(int argc, Scheme_Object *argv[])
5691 {
5692 return gen_compare("regexp-match-positions", 1, argc, argv, 0, 0, 0);
5693 }
5694
positions_end(int argc,Scheme_Object * argv[])5695 static Scheme_Object *positions_end(int argc, Scheme_Object *argv[])
5696 {
5697 return gen_compare("regexp-match-positions/end", 1, argc, argv, 0, 0, 1);
5698 }
5699
compare_bool(int argc,Scheme_Object * argv[])5700 static Scheme_Object *compare_bool(int argc, Scheme_Object *argv[])
5701 {
5702 return gen_compare("regexp-match?", 2, argc, argv, 0, 0, 0);
5703 }
5704
scheme_regexp_match_p(Scheme_Object * regexp,Scheme_Object * target)5705 int scheme_regexp_match_p(Scheme_Object *regexp, Scheme_Object *target)
5706 {
5707 Scheme_Object *a[2];
5708 a[0] = regexp;
5709 a[1] = target;
5710 return SCHEME_TRUEP(compare_bool(2, a));
5711 }
5712
compare_peek(int argc,Scheme_Object * argv[])5713 static Scheme_Object *compare_peek(int argc, Scheme_Object *argv[])
5714 {
5715 return gen_compare("regexp-match-peek", 0, argc, argv, 1, 0, 0);
5716 }
5717
positions_peek(int argc,Scheme_Object * argv[])5718 static Scheme_Object *positions_peek(int argc, Scheme_Object *argv[])
5719 {
5720 return gen_compare("regexp-match-peek-positions", 1, argc, argv, 1, 0, 0);
5721 }
5722
positions_peek_end(int argc,Scheme_Object * argv[])5723 static Scheme_Object *positions_peek_end(int argc, Scheme_Object *argv[])
5724 {
5725 return gen_compare("regexp-match-peek-positions/end", 1, argc, argv, 1, 0, 1);
5726 }
5727
compare_peek_nonblock(int argc,Scheme_Object * argv[])5728 static Scheme_Object *compare_peek_nonblock(int argc, Scheme_Object *argv[])
5729 {
5730 return gen_compare("regexp-match-peek-immediate", 0, argc, argv, 1, 1, 0);
5731 }
5732
positions_peek_nonblock(int argc,Scheme_Object * argv[])5733 static Scheme_Object *positions_peek_nonblock(int argc, Scheme_Object *argv[])
5734 {
5735 return gen_compare("regexp-match-peek-positions-immediate", 1, argc, argv, 1, 1, 0);
5736 }
5737
positions_peek_nonblock_end(int argc,Scheme_Object * argv[])5738 static Scheme_Object *positions_peek_nonblock_end(int argc, Scheme_Object *argv[])
5739 {
5740 return gen_compare("regexp-match-peek-positions-immediate/end", 1, argc, argv, 1, 1, 1);
5741 }
5742
build_call_name(const char * n)5743 static char *build_call_name(const char *n)
5744 {
5745 char *m;
5746 int l;
5747 l = strlen(n);
5748 m = (char *)scheme_malloc_atomic(l + 42);
5749 memcpy(m, n, l);
5750 strcpy(m XFORM_OK_PLUS l, " (calling given filter procedure)");
5751 return m;
5752 }
5753
initial_char_len(unsigned char * source,intptr_t start,intptr_t end)5754 static int initial_char_len(unsigned char *source, intptr_t start, intptr_t end)
5755 {
5756 intptr_t i;
5757
5758 for (i = start + 1; i <= end; i++) {
5759 if (scheme_utf8_decode_count(source, start, i, NULL, 1, 1)) {
5760 return i - start;
5761 }
5762 }
5763
5764 return 1;
5765 }
5766
gen_replace(const char * name,int argc,Scheme_Object * argv[],int all)5767 static Scheme_Object *gen_replace(const char *name, int argc, Scheme_Object *argv[], int all)
5768 {
5769 Scheme_Object *orig;
5770 regexp *r;
5771 char *source, *prefix = NULL, *deststr;
5772 rxpos *startp, *maybep, *endp, minpos;
5773 int prefix_len = 0, prefix_offset = 0, sourcelen, srcoffset = 0, was_non_byte, destlen;
5774 char *result_pre = NULL;
5775 int result_pre_len = 0, result_pre_size = 0;
5776 int cannot_match_more = 0;
5777
5778 if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
5779 && !SCHEME_BYTE_STRINGP(argv[0])
5780 && !SCHEME_CHAR_STRINGP(argv[0]))
5781 scheme_wrong_contract(name, "(or/c regexp? byte-regexp? string? bytes?)", 0, argc, argv);
5782 if (!SCHEME_BYTE_STRINGP(argv[1])
5783 && !SCHEME_CHAR_STRINGP(argv[1]))
5784 scheme_wrong_contract(name, "(or/c string? bytes?)", 1, argc, argv);
5785 if (!SCHEME_BYTE_STRINGP(argv[2])
5786 && !SCHEME_CHAR_STRINGP(argv[2])
5787 && !SCHEME_PROCP(argv[2]))
5788 scheme_wrong_contract(name, "(or/c string? bytes? procedure?)", 2, argc, argv);
5789
5790 if (SCHEME_BYTE_STRINGP(argv[2])) {
5791 if (SCHEME_CHAR_STRINGP(argv[0])
5792 || ((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
5793 && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))) {
5794 if (SCHEME_CHAR_STRINGP(argv[1])) {
5795 scheme_contract_error(name, "cannot replace a string with a byte string",
5796 "string-matching regexp", 1, argv[0],
5797 "byte string", 1, argv[2],
5798 NULL);
5799 }
5800 }
5801 }
5802
5803 if (SCHEME_BYTE_STRINGP(argv[0])
5804 || SCHEME_CHAR_STRINGP(argv[0]))
5805 r = regcomp_object(argv[0]);
5806 else
5807 r = (regexp *)argv[0];
5808
5809 if (0) {
5810 /* This check is disabled for backward compatibility, because
5811 `regexp-replace*` in `racket/private/string` does not check,
5812 and that was the exported `regexp-replace*` for a long time. */
5813 if (SCHEME_PROCP(argv[2])) {
5814 if (!scheme_check_proc_arity(NULL, r->nsubexp, 2, argc, argv)) {
5815 scheme_contract_error(name,
5816 "replace procedure's arity does not include regexp's match count",
5817 "regexp", 1, r,
5818 "regexp match count", 1, scheme_make_integer(r->nsubexp),
5819 "replace procedure", 1, argv[2],
5820 NULL);
5821 }
5822 }
5823 }
5824
5825 if (argc > 3) {
5826 if (!SCHEME_BYTE_STRINGP(argv[3]))
5827 scheme_wrong_contract(name, "bytes?", 3, argc, argv);
5828 prefix = SCHEME_BYTE_STR_VAL(argv[3]);
5829 prefix_len = SCHEME_BYTE_STRLEN_VAL(argv[3]);
5830 prefix_offset = 0;
5831 }
5832
5833 if (SCHEME_CHAR_STRINGP(argv[1])) {
5834 orig = scheme_char_string_to_byte_string(argv[1]);
5835 if (r->flags & REGEXP_IS_UTF8)
5836 was_non_byte = 1;
5837 else
5838 was_non_byte = 0;
5839 } else {
5840 orig = argv[1];
5841 was_non_byte = 0;
5842 }
5843 source = SCHEME_BYTE_STR_VAL(orig);
5844 sourcelen = SCHEME_BYTE_STRTAG_VAL(orig);
5845 deststr = NULL;
5846 destlen = 0;
5847
5848 startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5849 if (r->nsubexp > 1)
5850 maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5851 else
5852 maybep = NULL;
5853 endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5854
5855 minpos = -prefix_len;
5856
5857 while (1) {
5858 int m;
5859
5860 if (cannot_match_more)
5861 m = 0;
5862 else
5863 m = regexec(name, r, source, srcoffset, sourcelen - srcoffset, 0, NULL,
5864 startp, maybep, endp, NULL,
5865 NULL, NULL, 0,
5866 NULL, 0, 0, 0, NULL, NULL, NULL, NULL,
5867 prefix, prefix_len, prefix_offset);
5868
5869 if (m) {
5870 char *insert;
5871 intptr_t len, end, startpd, endpd;
5872
5873 if (SCHEME_PROCP(argv[2])) {
5874 int i;
5875 Scheme_Object *m, **args, *quick_args[5];
5876
5877 if (r->nsubexp <= 5) {
5878 args = quick_args;
5879 } else {
5880 args = MALLOC_N(Scheme_Object*, r->nsubexp);
5881 }
5882
5883 for (i = r->nsubexp; i--; ) {
5884 if (startp[i] < minpos) {
5885 args[i] = scheme_false;
5886 } else {
5887 intptr_t len;
5888 len = endp[i] - startp[i];
5889 if (startp[i] >= 0) {
5890 if (was_non_byte) {
5891 m = scheme_make_sized_offset_utf8_string(source, startp[i], len);
5892 } else {
5893 m = scheme_make_sized_offset_byte_string(source, startp[i], len, 1);
5894 }
5895 } else {
5896 /* at least some of prefix is included */
5897 int pre_len = len;
5898
5899 if (endp[i] > 0)
5900 pre_len -= endp[i];
5901 m = scheme_make_sized_offset_byte_string(prefix,
5902 prefix_offset + prefix_len + startp[i],
5903 pre_len,
5904 1);
5905 if (was_non_byte)
5906 m = scheme_byte_string_to_char_string(m);
5907
5908 if (endp[i] > 0) {
5909 Scheme_Object *m2;
5910 if (was_non_byte) {
5911 m2 = scheme_make_sized_offset_utf8_string(source, 0, endp[i]);
5912 m = scheme_append_char_string(m, m2);
5913 } else {
5914 m2 = scheme_make_sized_offset_byte_string(source, 0, endp[i], 1);
5915 m = scheme_append_byte_string(m, m2);
5916 }
5917 }
5918 }
5919 args[i] = m;
5920 }
5921 }
5922
5923 m = _scheme_apply(argv[2], r->nsubexp, args);
5924
5925 if (!was_non_byte) {
5926 if (!SCHEME_BYTE_STRINGP(m)) {
5927 args[0] = m;
5928 scheme_wrong_contract(build_call_name(name), "bytes?", -1, -1, args);
5929 }
5930 insert = SCHEME_BYTE_STR_VAL(m);
5931 len = SCHEME_BYTE_STRLEN_VAL(m);
5932 } else {
5933 if (!SCHEME_CHAR_STRINGP(m)) {
5934 args[0] = m;
5935 scheme_wrong_contract(build_call_name(name), "string?", -1, -1, args);
5936 }
5937 len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
5938 SCHEME_CHAR_STRLEN_VAL(m),
5939 NULL, 0, 0 /* not UTF-16 */);
5940 insert = (char *)scheme_malloc_atomic(len);
5941 scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
5942 SCHEME_CHAR_STRLEN_VAL(m),
5943 (unsigned char *)insert, 0, 0 /* not UTF-16 */);
5944 }
5945 } else {
5946 if (!deststr) {
5947 if (SCHEME_CHAR_STRINGP(argv[2])) {
5948 Scheme_Object *bs;
5949 bs = scheme_char_string_to_byte_string(argv[2]);
5950 deststr = SCHEME_BYTE_STR_VAL(bs);
5951 destlen = SCHEME_BYTE_STRTAG_VAL(bs);
5952 } else {
5953 deststr = SCHEME_BYTE_STR_VAL(argv[2]);
5954 destlen = SCHEME_BYTE_STRTAG_VAL(argv[2]);
5955 }
5956 }
5957 insert = regsub(r, deststr, destlen, &len, source, startp, endp,
5958 minpos, prefix, prefix_offset);
5959 }
5960
5961 end = sourcelen;
5962
5963 startpd = startp[0];
5964 endpd = endp[0];
5965
5966 if (!startpd && (endpd == end) && !result_pre && !all) {
5967 if (was_non_byte)
5968 return scheme_make_sized_utf8_string(insert, len);
5969 else
5970 return scheme_make_sized_byte_string(insert, len, 0);
5971 } else if (!all) {
5972 char *result;
5973 intptr_t total;
5974
5975 total = len + (startpd - srcoffset) + (end - endpd);
5976
5977 result = (char *)scheme_malloc_atomic(total + 1);
5978 memcpy(result, source + srcoffset, startpd - srcoffset);
5979 memcpy(result + (startpd - srcoffset), insert, len);
5980 memcpy(result + (startpd - srcoffset) + len, source + endpd, (end - endpd) + 1);
5981
5982 if (was_non_byte)
5983 return scheme_make_sized_utf8_string(result, total);
5984 else
5985 return scheme_make_sized_byte_string(result, total, 0);
5986 } else {
5987 intptr_t total;
5988 int more;
5989
5990 if (startpd == endpd) {
5991 if (endpd == end) {
5992 more = 0;
5993 cannot_match_more = 1;
5994 } else if (was_non_byte)
5995 more = initial_char_len((unsigned char *)source, startpd, sourcelen);
5996 else
5997 more = 1;
5998 } else
5999 more = 0;
6000
6001 total = len + (startpd - srcoffset) + more;
6002 if (!result_pre || (total + result_pre_len > result_pre_size)) {
6003 char *naya;
6004 intptr_t new_size = (2 * ((result_pre_len < total) ? total : result_pre_len)) + 1;
6005 naya = (char *)scheme_malloc_atomic(new_size);
6006 if (result_pre_size > 0)
6007 memcpy(naya, result_pre, result_pre_len);
6008 result_pre = naya;
6009 result_pre_size = new_size;
6010 }
6011
6012 memcpy(result_pre + result_pre_len, source + srcoffset, startpd - srcoffset);
6013 memcpy(result_pre + result_pre_len + (startpd - srcoffset), insert, len);
6014 if (more) {
6015 memcpy(result_pre + result_pre_len + (endpd - srcoffset) + len, source + startpd, more);
6016 }
6017 result_pre_len += total;
6018
6019 srcoffset = endpd + more;
6020 }
6021 } else if (!result_pre) {
6022 if (was_non_byte)
6023 return argv[1];
6024 else
6025 return orig;
6026 } else {
6027 char *result;
6028 intptr_t total, slen;
6029
6030 slen = sourcelen - srcoffset;
6031 total = result_pre_len + slen;
6032
6033 result = (char *)scheme_malloc_atomic(total + 1);
6034 memcpy(result, result_pre, result_pre_len);
6035 memcpy(result + result_pre_len, source + srcoffset, slen);
6036 result[result_pre_len + slen] = 0;
6037
6038 if (was_non_byte)
6039 return scheme_make_sized_utf8_string(result, total);
6040 else
6041 return scheme_make_sized_byte_string(result, total, 0);
6042 }
6043
6044 SCHEME_USE_FUEL(1);
6045 }
6046 }
6047
replace(int argc,Scheme_Object * argv[])6048 static Scheme_Object *replace(int argc, Scheme_Object *argv[])
6049 {
6050 return gen_replace("regexp-replace", argc, argv, 0);
6051 }
6052
replace_star(int argc,Scheme_Object * argv[])6053 static Scheme_Object *replace_star(int argc, Scheme_Object *argv[])
6054 {
6055 return gen_replace("regexp-replace*", argc, argv, 1);
6056 }
6057
regexp_p(int argc,Scheme_Object * argv[])6058 static Scheme_Object *regexp_p(int argc, Scheme_Object *argv[])
6059 {
6060 return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6061 && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
6062 ? scheme_true
6063 : scheme_false);
6064 }
6065
byte_regexp_p(int argc,Scheme_Object * argv[])6066 static Scheme_Object *byte_regexp_p(int argc, Scheme_Object *argv[])
6067 {
6068 return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6069 && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
6070 ? scheme_true
6071 : scheme_false);
6072 }
6073
pregexp_p(int argc,Scheme_Object * argv[])6074 static Scheme_Object *pregexp_p(int argc, Scheme_Object *argv[])
6075 {
6076 return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6077 && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
6078 && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
6079 ? scheme_true
6080 : scheme_false);
6081 }
6082
byte_pregexp_p(int argc,Scheme_Object * argv[])6083 static Scheme_Object *byte_pregexp_p(int argc, Scheme_Object *argv[])
6084 {
6085 return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6086 && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
6087 && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
6088 ? scheme_true
6089 : scheme_false);
6090 }
6091
regexp_lookbehind(int argc,Scheme_Object * argv[])6092 Scheme_Object *regexp_lookbehind(int argc, Scheme_Object *argv[])
6093 {
6094 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type))
6095 scheme_wrong_contract("regexp-max-lookbehind", "(or/c regexp? byte-regexp?)", 0, argc, argv);
6096 return scheme_make_integer(((regexp *)argv[0])->maxlookback);
6097 }
6098
scheme_regexp_source(Scheme_Object * re)6099 Scheme_Object *scheme_regexp_source(Scheme_Object *re)
6100 {
6101 return ((regexp *)re)->source;
6102 }
6103
scheme_regexp_is_byte(Scheme_Object * re)6104 int scheme_regexp_is_byte(Scheme_Object *re)
6105 {
6106 return !(((regexp *)re)->flags & REGEXP_IS_UTF8);
6107 }
6108
scheme_regexp_is_pregexp(Scheme_Object * re)6109 int scheme_regexp_is_pregexp(Scheme_Object *re)
6110 {
6111 return !!(((regexp *)re)->flags & REGEXP_IS_PCRE);
6112 }
6113
6114 #ifdef MZ_PRECISE_GC
6115 START_XFORM_SKIP;
6116 #include "mzmark_regexp.inc"
6117 END_XFORM_SKIP;
6118 #endif
6119
scheme_regexp_initialize(Scheme_Startup_Env * env)6120 void scheme_regexp_initialize(Scheme_Startup_Env *env)
6121 {
6122 #ifdef MZ_PRECISE_GC
6123 GC_REG_TRAV(scheme_regexp_type, mark_regexp);
6124 GC_REG_TRAV(scheme_rt_regwork, mark_regwork);
6125 GC_REG_TRAV(scheme_rt_rx_lazy_string, mark_lazy_string);
6126 #endif
6127
6128 REGISTER_SO(empty_byte_string);
6129 empty_byte_string = scheme_alloc_byte_string(0, 0);
6130
6131 ADD_PRIM_W_ARITY("byte-regexp", make_regexp, 1, 2, env);
6132 ADD_PRIM_W_ARITY("regexp", make_utf8_regexp, 1, 2, env);
6133 ADD_PRIM_W_ARITY("byte-pregexp", make_pregexp, 1, 2, env);
6134 ADD_PRIM_W_ARITY("pregexp", make_utf8_pregexp, 1, 2, env);
6135 ADD_PRIM_W_ARITY("regexp-match", compare, 2, 6, env);
6136 ADD_PRIM_W_ARITY2("regexp-match/end", compare_end, 2, 7, 2, 2, env);
6137 ADD_PRIM_W_ARITY("regexp-match-positions", positions, 2, 6, env);
6138 ADD_PRIM_W_ARITY2("regexp-match-positions/end", positions_end, 2, 7, 2, 2, env);
6139 ADD_PRIM_W_ARITY("regexp-match?", compare_bool, 2, 6, env);
6140 ADD_PRIM_W_ARITY("regexp-match-peek", compare_peek, 2, 6, env);
6141 ADD_PRIM_W_ARITY("regexp-match-peek-positions", positions_peek, 2, 6, env);
6142 ADD_PRIM_W_ARITY2("regexp-match-peek-positions/end", positions_peek_end, 2, 7, 2, 2, env);
6143 ADD_PRIM_W_ARITY("regexp-match-peek-immediate", compare_peek_nonblock, 2, 6, env);
6144 ADD_PRIM_W_ARITY("regexp-match-peek-positions-immediate", positions_peek_nonblock, 2, 6, env);
6145 ADD_PRIM_W_ARITY2("regexp-match-peek-positions-immediate/end", positions_peek_nonblock_end, 2, 7, 2, 2, env);
6146 ADD_PRIM_W_ARITY("regexp-replace", replace, 3, 4, env);
6147 ADD_PRIM_W_ARITY("regexp-replace*", replace_star, 3, 4, env);
6148
6149 ADD_FOLDING_PRIM("regexp?", regexp_p, 1, 1, 1, env);
6150 ADD_FOLDING_PRIM("byte-regexp?", byte_regexp_p, 1, 1, 1, env);
6151 ADD_FOLDING_PRIM("pregexp?", pregexp_p, 1, 1, 1, env);
6152 ADD_FOLDING_PRIM("byte-pregexp?", byte_pregexp_p, 1, 1, 1, env);
6153
6154 ADD_FOLDING_PRIM("regexp-max-lookbehind", regexp_lookbehind, 1, 1, 1, env);
6155 }
6156
scheme_init_regexp_places()6157 void scheme_init_regexp_places()
6158 {
6159 REGISTER_SO(regparsestr);
6160 REGISTER_SO(regstr);
6161 REGISTER_SO(regbackknown);
6162 REGISTER_SO(regbackdepends);
6163 REGISTER_SO(regerrorproc);
6164 REGISTER_SO(regerrorval);
6165 }
6166