1 /*
2  * @(#)regexp.c 1.3 of 18 April 87
3  * Revised for Racket starting 1995
4  *
5  * Copyright (c) 1986 by University of Toronto.
6  * Written by Henry Spencer.  Not derived from licensed software.
7  *
8  * Permission is granted to anyone to use this software for any
9  * purpose on any computer system, and to redistribute it freely,
10  * subject to the following restrictions:
11  *
12  * 1. The author is not responsible for the consequences of use of
13  *    this software, no matter how awful, even if they arise
14  *    from defects in it.
15  *
16  * 2. The origin of this software must not be misrepresented, either
17  *    by explicit claim or by omission.
18  *
19  * 3. Altered versions must be plainly marked as such, and must not
20  *    be misrepresented as being the original software.
21  *
22  * Beware that some of this code is subtly aware of the way operator
23  * precedence is structured in regular expressions.  Serious changes in
24  * regular-expression syntax might require a total rethink.
25  *
26  * Notable changes for Racket:
27  *   Removed hardwired limits on parenthesis nesting
28  *   Changed to index-based instead of pointer-based (better for GC)
29  *   Added non-greedy operators *?, +?, and ??
30  *   Added (?:...) grouping without reporting the group match
31  *   Added (?=...), (?!...), (?<=...), and (?<!...) lookahead and lookback
32  *   Added \n backreferences
33  *   Added numeric quantifiers
34  *   Added case-insensitive and multi-line modes
35  *   Added Racket glue
36  *
37  * from Vladimir Tsyshevsky:
38  *  additional optional parameter `offset' in `regexp-match'
39  *  and `regexp-match-positions'
40  */
41 
42 #include "schpriv.h"
43 #include "schmach.h"
44 #include "schgencat.h"
45 #include "schrx.h"
46 
47 #include <stdio.h>
48 #include <string.h>
49 
50 #ifdef SIXTY_FOUR_BIT_INTEGERS
51 # define BIGGEST_RXPOS 0x7FFFFFFFFFFFFFFF
52 #else
53 # define BIGGEST_RXPOS 0x7FFFFFFF
54 #endif
55 
56 # define rOP(o) OP(o, regstr)
57 # define rNEXT(o) NEXT(o, regstr)
58 # define rOPLEN(o) OPLEN(o, regstr)
59 # define rOPRNGS(o) OPRNGS(o, regstr)
60 # define NEXT_OP(scan) (scan + rNEXT(scan))
61 
62 static regexp *regcomp(char *, rxpos, int, int, Scheme_Object*);
63 /* static int regexec(regexp *, char *, int, int, rxpos *, rxpos * ...); */
64 
65 /*
66  * Global work variables for regcomp().
67  */
68 THREAD_LOCAL_DECL(static char *regstr);
69 THREAD_LOCAL_DECL(static char *regparsestr);
70 THREAD_LOCAL_DECL(static int regmatchmin);
71 THREAD_LOCAL_DECL(static int regmatchmax);
72 THREAD_LOCAL_DECL(static int regmaxbackposn);
73 THREAD_LOCAL_DECL(static int regsavepos);
74 
75 THREAD_LOCAL_DECL(static Scheme_Hash_Table *regbackknown); /* known/assumed backreference [non-]empty */
76 THREAD_LOCAL_DECL(static Scheme_Hash_Table *regbackdepends); /* backreferences required to be non-empty for the current to be non-empty */
77 
78 THREAD_LOCAL_DECL(static rxpos regparse);
79 THREAD_LOCAL_DECL(static rxpos regparse_end); /* Input-scan pointer. */
80 THREAD_LOCAL_DECL(static int regnpar);       /* () count. */
81 THREAD_LOCAL_DECL(static int regncounter);   /* {} count */
82 THREAD_LOCAL_DECL(static rxpos regcode) ;    /* Code-emit pointer, if less than regcodesize */
83 THREAD_LOCAL_DECL(static rxpos regcodesize);
84 THREAD_LOCAL_DECL(static rxpos regcodemax);
85 THREAD_LOCAL_DECL(static intptr_t regmaxlookback);
86 
87 THREAD_LOCAL_DECL(static const char *regerrorwho);
88 THREAD_LOCAL_DECL(static Scheme_Object *regerrorproc); /* error handler for regexp construction */
89 THREAD_LOCAL_DECL(static Scheme_Object *regerrorval);  /* result of error handler for failed regexp construction */
90 
91 /* caches to avoid gc */
92 THREAD_LOCAL_DECL(static intptr_t rx_buffer_size);
93 THREAD_LOCAL_DECL(static rxpos *startp_buffer_cache);
94 THREAD_LOCAL_DECL(static rxpos *endp_buffer_cache);
95 THREAD_LOCAL_DECL(static rxpos *maybep_buffer_cache);
96 THREAD_LOCAL_DECL(static rxpos *match_stack_buffer_cache);
97 
98 #define MATCH_STACK_SIZE 24
99 
100 /*
101  * Forward declarations for regcomp()'s friends.
102  */
103 static rxpos reg(int, int *, int, int, int);
104 static rxpos regbranch(int *, int, int);
105 static rxpos regpiece(int *, int, int);
106 static rxpos regatom(int *, int, int);
107 static rxpos regranges(int parse_flags, int at_start);
108 static rxpos regunicode(int invert);
109 static int regdigit();
110 static rxpos regnode(char);
111 static void regarg(int);
112 static rxpos regnext(rxpos);
113 static void regc(char);
114 static void reginsert(char, rxpos);
115 static rxpos reginsertwithop(char, rxpos, int);
116 static rxpos reginsertwithopop(char, rxpos, int, int);
117 static void regtail(rxpos, rxpos);
118 static void regoptail(rxpos, rxpos);
119 static int regstrcspn(char *, char *, char *);
120 static unsigned char *extract_regstart(rxpos scan, int *_anch);
121 
122 static int check_and_propagate_depends(void);
123 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src);
124 
125 READ_ONLY static Scheme_Object *empty_byte_string;
126 
127 #define	FAIL(m)	{ regcomperror(m); return 0; }
128 
129 static void
regerror(char * s)130 regerror(char *s)
131 {
132   if (!regerrorval) {
133     if (SCHEME_FALSEP(regerrorproc)) {
134       const char *who = regerrorwho;
135       regerrorwho = NULL;
136       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
137                        "%s: %s",
138                        (who ? who : "regexp"),
139                        s);
140     } else {
141       Scheme_Object *a[1];
142       a[0] = scheme_make_utf8_string(s);
143       regerrorval = scheme_apply_multi(regerrorproc, 1, a);
144     }
145   }
146 }
147 
148 THREAD_LOCAL_DECL(const char *failure_msg_for_read);
149 
150 static void
regcomperror(char * s)151 regcomperror(char *s)
152 {
153   if (failure_msg_for_read) {
154     failure_msg_for_read = s;
155     scheme_longjmp(scheme_error_buf, 1);
156   } else
157     regerror(s);
158 }
159 
160 /*
161  - regcomp - compile a regular expression into internal code
162  *
163  * We can't allocate space until we know how big the compiled form will be,
164  * but we can't compile it (and thus know how big it is) until we've got a
165  * place to put the code.  So we cheat:  we compile it twice, once with code
166  * generation turned off and size counting turned on, and once "for real".
167  * This also means that we don't allocate space until we are sure that the
168  * thing really will compile successfully, and we never have to move the
169  * code and thus invalidate pointers into it.  (Note that it has to be in
170  * one piece because free() must be able to free it all.)
171  *
172  * Beware that the optimization-preparation code in here knows about some
173  * of the structure of the compiled regexp.
174  */
175 static regexp *
regcomp(char * expstr,rxpos exp,int explen,int pcre,Scheme_Object * handler)176 regcomp(char *expstr, rxpos exp, int explen, int pcre, Scheme_Object *handler)
177 {
178   regexp *r;
179   rxpos scan, next;
180   rxpos longest;
181   int len, longest_is_ci;
182   int flags;
183 
184   /* First pass: determine size, legality. */
185   regstr = NULL;
186   regparsestr = expstr;
187   regparse = exp;
188   regparse_end = exp + explen;
189   regnpar = 1;
190   regncounter = 0;
191   regmaxlookback = 0;
192   regcode = 1;
193   regcodesize = 0;
194   regcodemax = 0;
195   regmaxbackposn = 0;
196   regbackknown = NULL;
197   regbackdepends = NULL;
198   regerrorproc = handler;
199   regerrorval = NULL;
200   regc((char)MAGIC);
201   if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
202     if (regerrorval)
203       return NULL;
204     FAIL("unknown regexp failure");
205   }
206 
207   /* Small enough for pointer-storage convention? */
208   if (regcodemax >= 32767L)		/* Probably could be 65535L. */
209     FAIL("regexp too big");
210 
211   if (regmaxbackposn >= regnpar)
212     FAIL("backreference number is larger than the highest-numbered cluster");
213 
214   /* Allocate space. */
215   r = (regexp *)scheme_malloc_tagged(sizeof(regexp) + N_ITO_SPACE((unsigned)regcodemax));
216   r->type = scheme_regexp_type;
217 
218 #ifdef INDIRECT_TO_PROGRAM
219   r->program = (char *)scheme_malloc_atomic((unsigned)regcodemax + 1);
220 #endif
221 
222   r->regsize = regcodemax;
223 
224   r->nsubexp = regnpar;
225   r->ncounter = regncounter;
226   r->maxlookback = regmaxlookback;
227 
228   /* Second pass: emit code. */
229   regparse = exp;
230   regparse_end = exp + explen;
231   regnpar = 1;
232   regncounter = 0;
233   regcodesize = regcodemax;
234 #ifdef INDIRECT_TO_PROGRAM
235   regstr = r->program;
236   regcode = 0;
237 #else
238   regstr = (char *)r;
239   regcode = (char *)r->program - (char *)r;
240 #endif
241   regcodesize += regcode;
242   regcodemax = 0;
243   regbackknown = NULL;
244   regbackdepends = NULL;
245   regc((char)MAGIC);
246   if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
247     FAIL("unknown regexp failure (late)");
248   }
249 
250   if (regcode >= regcodesize) {
251     FAIL("wrote too far");
252   }
253 
254   /* Dig out information for optimizations. */
255   r->regstart = NULL;	/* Worst-case defaults. */
256   r->regmust = -1;
257   r->regmlen = 0;
258   scan = N_ITO_DELTA(r->program, 1, (char *)r);    /* First BRANCH. */
259   {
260     unsigned char *rs;
261     int anch = 0;
262     rs = extract_regstart(scan, &anch);
263     r->regstart = rs;
264     if (anch)
265       r->flags |= REGEXP_ANCH;
266   }
267   next = regnext(scan);
268   if (rOP(next) == END) {	/* Only one top-level choice. */
269     scan = OPERAND(scan);
270     /*
271      * If there's something expensive in the r.e., find the
272      * longest literal string that must appear and make it the
273      * regmust.  Resolve ties in favor of later strings, since
274      * the regstart check works with the beginning of the r.e.
275      * and avoiding duplication strengthens checking.  Not a
276      * strong reason, but sufficient in the absence of others.
277      */
278     if (flags&SPSTART) {
279       int prev_op = 0;
280       longest = 0;
281       longest_is_ci = 0;
282       len = 0;
283       for (; scan != 0; ) {
284         int mscan = scan;
285         while (1) {
286           int mop;
287           mop = rOP(mscan);
288           if ((mop == LOOKF) || (mop == LOOKBF)) {
289             /* skip over part that we don't want to match */
290             mscan = mscan + rOPLEN(OPERAND(mscan));
291             mscan = NEXT_OP(mscan);
292           } else if (((mop == EXACTLY) || (mop == EXACTLY_CI))
293               && rOPLEN(OPERAND(mscan)) >= len) {
294             /* Skip regmust if it contains a null character: */
295             rxpos ls = OPSTR(OPERAND(mscan));
296             int ll = rOPLEN(OPERAND(mscan)), i;
297             for (i = 0; i < ll; i++) {
298               if (!regstr[ls + i])
299                 break;
300             }
301             if (i >= ll) {
302               longest = ls;
303               len = ll;
304               longest_is_ci = (rOP(mscan) == EXACTLY_CI);
305             }
306             break;
307           } else if ((mop == EXACTLY1) && (1 >= len)) {
308             /* Skip if it's a null character */
309             if (regstr[OPERAND(mscan)]) {
310               longest = OPERAND(mscan);
311               len = 1;
312               longest_is_ci = 0;
313             }
314             break;
315           } else if ((mop == BRANCH) && (prev_op != BRANCH)) {
316             int mnext;
317             mnext = NEXT_OP(mscan);
318             if (rOP(mnext) != BRANCH) {
319               /* A branch with only one choice */
320               mscan = OPERAND(mscan);
321             } else
322               break;
323           } else
324             break;
325         }
326         prev_op = rOP(scan);
327         if ((prev_op == LOOKF) || (prev_op == LOOKBF)) {
328           /* skip over part that we don't want to match */
329           scan = scan + rOPLEN(OPERAND(scan));
330           scan = NEXT_OP(scan);
331         } else {
332           scan = regnext(scan);
333         }
334       }
335       if (longest) {
336 	r->regmust = longest;
337 	if (longest_is_ci)
338 	  r->flags |= REGEXP_MUST_CI;
339 	r->regmlen = len;
340       }
341     }
342   }
343 
344 #if 0
345   if (regcode > r->regsize + sizeof(regexp))
346     scheme_signal_error("regexp too large!");
347 #endif
348 
349   return(r);
350 }
351 
map_create(unsigned char * map)352 static unsigned char *map_create(unsigned char *map)
353 {
354   if (!map) {
355     map = (unsigned char *)scheme_malloc_atomic(32);
356     memset(map, 0, 32);
357   }
358   return map;
359 }
360 
map_start(unsigned char * map,int c)361 static unsigned char *map_start(unsigned char *map, int c)
362 {
363   map = map_create(map);
364   map[c >> 3] |= ((unsigned char)1 << (c & 0x7));
365   return map;
366 }
367 
map_copy(unsigned char * map,char * s,int pos)368 static unsigned char *map_copy(unsigned char *map, char *s, int pos)
369 {
370   map = map_create(map);
371   memcpy(map, s XFORM_OK_PLUS pos, 32);
372   return map;
373 }
374 
map_range(unsigned char * map,char * s,int pos,int invert)375 static unsigned char *map_range(unsigned char *map, char *s, int pos, int invert)
376 {
377   int rs, re;
378 
379   rs = UCHAR(s[pos++]);
380   re = UCHAR(s[pos++]);
381 
382   if (!invert) {
383     while (rs <= re) {
384       map = map_start(map, rs);
385       rs++;
386     }
387   } else {
388     while (rs > 0) {
389       map = map_start(map, rs - 1);
390       --rs;
391     }
392     while (re < 255) {
393       map = map_start(map, re + 1);
394       re++;
395     }
396   }
397 
398   return map;
399 }
400 
extract_regstart(rxpos scan,int * _anch)401 static unsigned char *extract_regstart(rxpos scan, int *_anch)
402 {
403   rxpos next;
404   int retry, the_op;
405   unsigned char *map = NULL;
406 
407   do {
408     retry = 0;
409 
410     the_op = rOP(scan);
411     switch (the_op) {
412     case BOL:
413     case EOL:
414     case NOTHING:
415     case SAVECONST:
416     case MAYBECONST:
417     case COUNTINIT:
418     case COUNTOVER:
419     case COUNTUNDER:
420       /* We can ignore zero-length things when finding starting info */
421       scan = regnext(scan);
422       retry = 1;
423       break;
424     case LOOKT:
425     case LOOKF:
426     case LOOKBT:
427     case LOOKBF:
428       /* Zero-length, but continuation in an unusual place */
429       scan += rOPLEN(OPERAND(scan));
430       scan = regnext(scan);
431       retry = 1;
432       break;
433     case LOOKTX:
434       scan = regnext(scan);
435       retry = 1;
436       break;
437     case PLUS:
438     case PLUS2:
439       scan = OPERAND(scan);
440       retry = 1;
441       break;
442     case STAR3:
443     case STAR4:
444       if (rOPLEN(OPERAND(scan)) > 0) {
445 	scan = OPERAND3(scan);
446 	retry = 1;
447       }
448       break;
449     case EXACTLY:
450       map = map_start(map, UCHAR(regstr[OPSTR(OPERAND(scan))]));
451       break;
452     case EXACTLY_CI:
453       {
454 	int c = UCHAR(regstr[OPSTR(OPERAND(scan))]);
455 	map = map_start(map, c);
456 	map = map_start(map, rx_toupper(c));
457       }
458       break;
459     case ANYOF:
460       map = map_copy(map, regstr, OPERAND(scan));
461       break;
462     case EXACTLY1:
463       map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
464       break;
465     case EXACTLY2:
466       map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
467       map = map_start(map, UCHAR(regstr[OPERAND(scan)+1]));
468       break;
469     case RANGE:
470       map = map_range(map, regstr, OPERAND(scan), 0);
471       break;
472     case NOTRANGE:
473       map = map_range(map, regstr, OPERAND(scan), 1);
474       break;
475     case BOI:
476       if (_anch)
477 	*_anch = 1;
478       break;
479     case BRANCH:
480       next = regnext(scan);
481       if (!next || (rOP(next) == END) || (rOP(next) == LOOKE)) {
482 	/* Only one branch */
483 	scan = OPERAND(scan);
484 	retry = 1;
485       }
486       break;
487     default:
488       if ((the_op == OPENN) || (the_op >= OPEN && the_op < CLOSE)) {
489 	scan = NEXT_OP(scan);
490 	retry = 1;
491       }
492       break;
493     }
494   } while (retry);
495 
496   return map;
497 }
498 
499 #ifdef DO_STACK_CHECK
500 
reg_k(void)501 static Scheme_Object *reg_k(void)
502 {
503   Scheme_Thread *p = scheme_current_thread;
504   int *flagp = (int *)p->ku.k.p1;
505   int res;
506 
507   p->ku.k.p1 = NULL;
508 
509   res = reg(p->ku.k.i1, flagp, p->ku.k.i2, p->ku.k.i3, p->ku.k.i4);
510 
511   return scheme_make_integer(res);
512 }
513 
514 #endif
515 
516 /*
517    - reg - regular expression, i.e. main body or parenthesized thing
518    *
519    * Caller must absorb opening parenthesis.
520    *
521    * Combining parenthesis handling with the base level of regular expression
522    * is a trifle forced, but the need to tie the tails of the branches to what
523    * follows makes it hard to avoid.
524    */
525 static rxpos
reg(int paren,int * flagp,int paren_set,int lookahead,int parse_flags)526 reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
527 {
528   rxpos ret;
529   rxpos br;
530   rxpos ender;
531   int parno = 0;
532   int flags, matchmin, matchmax, maxlookback, brcount;
533   Scheme_Hash_Table *backdepends;
534 
535 #ifdef DO_STACK_CHECK
536   {
537 # include "mzstkchk.h"
538     {
539       Scheme_Thread *p = scheme_current_thread;
540       Scheme_Object *ov;
541       p->ku.k.i1 = paren;
542       p->ku.k.p1 = (void *)flagp;
543       p->ku.k.i2 = paren_set;
544       p->ku.k.i3 = lookahead;
545       p->ku.k.i4 = parse_flags;
546       ov = scheme_handle_stack_overflow(reg_k);
547       return SCHEME_INT_VAL(ov);
548     }
549   }
550 #endif
551 
552   *flagp = HASWIDTH;		/* Tentatively. */
553 
554   /* Make an OPEN node, if parenthesized. */
555   if (paren) {
556     if (lookahead) {
557       parno = 0;
558       ret = regnode(lookahead);
559       regarg(0); /* space for LOOKE pointer */
560       if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
561 	regarg(0); /* space for min count */
562 	regarg(0); /* space for max count */
563       }
564     } else if (paren_set) {
565       parno = regnpar;
566       regnpar++;
567       if (OPEN + parno >= CLOSE) {
568 	ret = regnode(OPENN);
569 	regarg(parno);
570       } else {
571 	ret = regnode(OPEN+parno);
572       }
573     } else
574       ret = 0;
575   } else
576     ret = 0;
577 
578   /* Pick up the branches, linking them together. */
579   br = regbranch(&flags, parse_flags, 0);
580   if (br == 0)
581     FAIL("branch failed!?");
582   if (ret != 0)
583     regtail(ret, br);		/* OPEN -> first. */
584   else
585     ret = br;
586   if (!(flags&HASWIDTH)) {
587     *flagp &= ~HASWIDTH;
588     backdepends = NULL;
589   } else if (regbackdepends) {
590     backdepends = regbackdepends;
591     regbackdepends = NULL;
592   } else
593     backdepends = NULL;
594   *flagp |= flags&(SPSTART|SPFIXED);
595   matchmin = regmatchmin;
596   matchmax = regmatchmax;
597   maxlookback = regmaxlookback;
598   brcount = 1;
599   while (regparsestr[regparse] == '|') {
600     brcount++;
601     regparse++;
602     br = regbranch(&flags, parse_flags, 0);
603     if (br == 0)
604       FAIL("next branch failed!?");
605     regtail(ret, br);		/* BRANCH -> BRANCH. */
606     if (!(flags&HASWIDTH))
607       *flagp &= ~HASWIDTH;
608     else if ((*flagp) & HASWIDTH) {
609       if (regbackdepends) {
610 	if (backdepends)
611 	  merge_tables(backdepends, regbackdepends);
612 	else
613 	  backdepends = regbackdepends;
614 	regbackdepends = NULL;
615       } else
616 	backdepends = NULL;
617     }
618     *flagp |= flags&SPSTART;
619     if (!(flags&SPFIXED))
620       *flagp &= ~SPFIXED;
621     else {
622       if (regmatchmin < matchmin)
623 	matchmin = regmatchmin;
624       if (regmatchmax > matchmax)
625 	matchmax = regmatchmax;
626       if (regmaxlookback > maxlookback)
627         maxlookback = regmaxlookback;
628     }
629   }
630   regbackdepends = backdepends;
631   regmatchmin = matchmin;
632   regmatchmax = matchmax;
633   regmaxlookback = maxlookback;
634 
635   if (paren && paren_set) {
636     Scheme_Object *assumed;
637 
638     if (!regbackknown)
639       regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
640     assumed = scheme_hash_get(regbackknown, scheme_make_integer(parno));
641 
642     if (!((*flagp) & HASWIDTH)) {
643       if (assumed && !SCHEME_FALSEP(assumed)) {
644 	FAIL("`*', `+', or `{...,}' operand can be empty due to backreference");
645       }
646       scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_false);
647     } else {
648       if (!backdepends)
649 	scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_true);
650       else {
651 	if (assumed) {
652 	  check_and_propagate_depends();
653 	} else
654 	  scheme_hash_set(regbackknown, scheme_make_integer(parno), (Scheme_Object *)backdepends);
655       }
656     }
657   }
658 
659   if ((brcount == 1)
660       && paren
661       && (!paren_set || ((flags & SPFIXED)
662 			 && (regmatchmin == regmatchmax)
663 			 && (regmatchmax < 0x7FFFF)))
664       && !lookahead) {
665     /* Simplify to just the single branch: */
666     if (br + 3 < regcodesize) {
667       int top;
668       if (regcode <= regcodesize)
669 	top = regcode;
670       else
671 	top = regcodesize;
672       memmove(regstr + ret, regstr + br + 3, top - (br + 3));
673     }
674     *flagp = flags;
675     regcode -= (br + 3 - ret);
676     if (paren_set) {
677       /* Collude with regpiece: */
678       *flagp |= NEEDSAVECONST;
679       *flagp &= ~SPNOTHING;
680       regsavepos = parno;
681     }
682   } else {
683     if (lookahead) {
684       if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
685 	if (!((*flagp) & SPFIXED))
686 	  FAIL("lookbehind pattern does not match a bounded byte width");
687 	if (matchmax > 0x7FFF)
688 	  FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
689         regmaxlookback = matchmax + maxlookback;
690 	if (ret + 8 < regcodesize) {
691 	  regstr[ret + 5] = (matchmin >> 8);
692 	  regstr[ret + 6] = (matchmin & 255);
693 	  regstr[ret + 7] = (matchmax >> 8);
694 	  regstr[ret + 8] = (matchmax & 255);
695 	}
696       }
697     }
698 
699     /* Make a closing node, and hook it on the end. */
700     if (paren) {
701       if (lookahead) {
702 	ender = regnode(LOOKE);
703 	if (ret + 4 < regcodesize) {
704 	  int delta = (ender - ret);
705 	  regstr[ret + 3] = (delta >> 8);
706 	  regstr[ret + 4] = (delta & 255);
707 	}
708       } else if (paren_set) {
709 	if (OPEN + parno >= CLOSE) {
710 	  ender = regcode;
711 	  regarg(parno);
712 	  reginsert(CLOSEN, ender);
713 	} else
714 	  ender = regnode(CLOSE+parno);
715       } else {
716 	ender = regnode(NOTHING);
717       }
718     } else {
719       ender = regnode(END);
720     }
721     regtail(ret, ender);
722 
723     /* Hook the tails of the branches to the closing node. */
724     if (regcodesize) {
725       for (br = ret; br != 0; br = regnext(br)) {
726 	regoptail(br, ender);
727       }
728     }
729   }
730 
731   /* Check for proper termination. */
732   if (paren && regparsestr[regparse++] != ')') {
733     FAIL("missing closing parenthesis in pattern");
734   } else if (!paren && regparse != regparse_end) {
735     if (regparsestr[regparse] == ')') {
736       FAIL("extra closing parenthesis in pattern");
737     } else
738       FAIL("junk on end");	/* "Can't happen". */
739     /* NOTREACHED */
740   }
741 
742   return ret;
743 }
744 
745 /*
746    - regbranch - one alternative of an | operator
747    *
748    * Implements the concatenation operator.
749    */
750 static rxpos
regbranch(int * flagp,int parse_flags,int without_branch_node)751 regbranch(int *flagp, int parse_flags, int without_branch_node)
752 {
753   rxpos ret;
754   rxpos chain, latest;
755   int flags = 0, matchmin = 0, matchmax = 0, maxlookback = 0, pcount = 0, save_flags;
756 
757   *flagp = (WORST|SPFIXED);		/* Tentatively. */
758 
759   if (!without_branch_node)
760     ret = regnode(BRANCH);
761   else
762     ret = 0;
763   chain = 0;
764   while (regparse != regparse_end
765 	 && regparsestr[regparse] != '|'
766 	 && regparsestr[regparse] != ')') {
767     save_flags = flags;
768     latest = regpiece(&flags, parse_flags, !chain && !without_branch_node);
769     if (latest == 0)
770       FAIL("piece failed!?");
771     if (flags & SPNOTHING) {
772       /* no need to match nothing */
773       regcode = latest; /* throw away dead code */
774       flags = save_flags; /* in case all but the first is discarded */
775     } else {
776       pcount++;
777       *flagp |= flags&HASWIDTH;
778       if (chain == 0) {		/* First piece. */
779 	*flagp |= flags&SPSTART;
780 	if (without_branch_node)
781 	  ret = latest;
782       } else
783 	regtail(chain, latest);
784       if (!(flags&SPFIXED))
785 	*flagp &= ~SPFIXED;
786       if ((regmaxlookback - matchmin) > maxlookback)
787         maxlookback = regmaxlookback - matchmin;
788       matchmin += regmatchmin;
789       matchmax += regmatchmax;
790       if (matchmax > 0x7FFF)
791 	matchmax = 0x10000;
792       chain = latest;
793     }
794   }
795   regmatchmin = matchmin;
796   regmatchmax = matchmax;
797   regmaxlookback = maxlookback;
798   if (chain == 0) {  /* Loop ran zero times. */
799     latest = regnode(NOTHING);
800     if (without_branch_node)
801       ret = latest;
802     *flagp = SIMPLE|SPNOTHING|SPFIXED;
803     regmatchmin = regmatchmax = 0;
804   }
805 
806   if (pcount == 1) {
807     *flagp = flags; /* BRANCH will be deleted if simplicity is relevant */
808   }
809 
810   return(ret);
811 }
812 
813 /*
814    - regpiece - something followed by possible [*+?]
815    *
816    * Note that the branching code sequences used for ? and the general cases
817    * of * and + are somewhat optimized:  they use the same NOTHING node as
818    * both the endmarker for their branch list and the body of the last branch.
819    * It might seem that this node could be dispensed with entirely, but the
820    * endmarker role is not redundant.
821    */
822 static rxpos
regpiece(int * flagp,int parse_flags,int at_start)823 regpiece(int *flagp, int parse_flags, int at_start)
824 {
825   rxpos ret;
826   char op;
827   rxpos next;
828   int flags, greedy;
829   int minreps = 0, maxreps = 0, counter;
830   int origsavepos, origmatchmin, origmatchmax;
831 
832   ret = regatom(&flags, parse_flags, at_start);
833   if (ret == 0)
834     FAIL("atom failed!?");
835 
836   origsavepos = regsavepos;
837   origmatchmin = regmatchmin;
838   origmatchmax = regmatchmax;
839 
840   op = regparsestr[regparse];
841   if (!ISMULT(op, parse_flags)) {
842     *flagp = (flags & ~NEEDSAVECONST);
843   } else {
844     if (op == '{') {
845       int ch, maxspec = 0;
846       minreps = maxreps = 0;
847       regparse++;
848       do {
849 	ch = regparsestr[regparse];
850 	if ((ch >= '0') && (ch <= '9')) {
851 	  minreps = (minreps * 10) + (ch - '0');
852 	  if (minreps > 0x7FFF)
853 	    FAIL("minimum repetition count too large");
854 	  regparse++;
855 	} else if (ch == ',' || ch == '}')
856 	  break;
857 	else {
858 	  FAIL("expected digit, comma, or `}' to end repetition specification started with `{'");
859 	}
860       } while (1);
861       if (ch == ',') {
862 	regparse++;
863 	do {
864 	  ch = regparsestr[regparse];
865 	  if ((ch >= '0') && (ch <= '9')) {
866 	    maxspec = 1;
867 	    maxreps = (maxreps * 10) + (ch - '0');
868 	    if (maxreps > 0x7FFF)
869 	      FAIL("maximum repetition count too large");
870 	    regparse++;
871 	  } else if (ch == '}')
872 	    break;
873 	  else {
874 	    FAIL("expected digit or `}' to end repetition specification started with `{'");
875 	  }
876 	} while (1);
877       } else {
878 	maxspec = 1;
879 	maxreps = minreps;
880       }
881       if (maxspec && (maxreps < minreps)) {
882 	FAIL("maximum repetition count is less than maximum repetition count");
883       }
884       if (maxspec && !maxreps) {
885 	/* Match 0 instances */
886 	regparse++;
887 	if (regparsestr[regparse] == '?')
888 	  regparse++; /* non-greedy */
889 	if (ISMULT(regparsestr[regparse], parse_flags))
890 	  FAIL("nested `*', `?', `+', or `{...}' in pattern");
891 	regcode = ret; /* throw away dead code */
892 	*flagp = SPFIXED|SPNOTHING;
893 	regmatchmin = regmatchmax = 0;
894 	return regnode(NOTHING);
895       }
896       op = '*';
897       if (maxreps || minreps)
898 	counter = regncounter++;
899       else
900 	counter = 0;
901     } else
902       counter = 0;
903 
904     if (!(flags&HASWIDTH) && (op != '?')) {
905       FAIL("`*', `+', or `{...}' operand could be empty");
906     }
907 
908     if (regbackdepends) {
909       /* Operand has width only if the indicated backreferences have width. */
910       check_and_propagate_depends();
911       /* Assumptions are registered, so we no longer need these backdepends: */
912       regbackdepends = NULL;
913     }
914 
915     if (maxreps || minreps) {
916       if (minreps > 0)
917 	*flagp = HASWIDTH;
918       if ((flags & SPFIXED) && maxreps) {
919 	regmatchmin = (origmatchmin * minreps);
920 	regmatchmax = (origmatchmax * maxreps);
921 	if (regmatchmax > 0x7FFF)
922 	  regmatchmax = 0x10000;
923 	*flagp |= SPFIXED;
924       }
925     } else {
926       *flagp = (op != '+') ? WORST : HASWIDTH;
927       if ((op == '*') || (op == '?'))
928         regmatchmin = 0;
929     }
930     *flagp |= SPSTART;
931     if ((op == '?') && (flags & SPFIXED)) {
932       *flagp |= SPFIXED;
933       regmatchmin = 0;
934     }
935 
936     if (regparsestr[regparse+1] == '?') {
937       greedy = 0;
938       regparse++;
939     } else
940       greedy = 1;
941 
942     if (op == '*' && (flags&SIMPLE)) {
943       if (!minreps && !maxreps)
944 	reginsert(greedy ? STAR : STAR2, ret);
945       else
946 	reginsertwithopop(greedy ? STAR3 : STAR4, ret, minreps, maxreps);
947     } else if (op == '*' && greedy) {
948       /* Emit x* as (x&|), where & means "self".
949 	 If minreps or maxreps, also insert counter-managing
950 	 nodes. This counter detects empty matches, too.
951 	 The code is a little difficult to read because it often
952 	 uses reginsert, which puts nodes before existing nodes.
953 	 So, you almost have to read it backward. */
954       rxpos br, nothing;
955       if (minreps || maxreps) {
956 	/* Increment iteration counter, and fail if it's
957 	   already at the max: */
958 	rxpos x;
959 	x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
960 	regtail(ret, x);
961       }
962       reginsert(BRANCH, ret);	/* Either x */
963       if (minreps || maxreps) {
964 	/* Initialize the iteration counter on entry: */
965 	br = reginsertwithop(COUNTINIT, ret, counter);
966 	regtail(ret, br);
967       } else
968 	br = ret;
969       regoptail(br, regnode(BACK)); /* and loop */
970       regoptail(br, br);	/* back */
971       regtail(br, regnode(BRANCH)); /* or */
972       nothing = regnode(NOTHING);
973       if (minreps) {
974 	/* Fail to match if the counter isn't big enough, yet: */
975 	rxpos n;
976 	n = reginsertwithopop(COUNTOVER, nothing, counter, minreps);
977 	regtail(nothing, n);
978       }
979       if (minreps || maxreps) {
980 	/* We incremented the counter for an x match, but now
981 	   we're backtracking, so decrement it: */
982 	rxpos n;
983 	n = reginsertwithop(COUNTBACK, nothing, counter);
984 	regtail(nothing, n);
985       }
986       regtail(br, nothing); /* null. */
987     } else if (op == '*') {
988       /* Emit x*? as (|x&), where & means "self".
989 	 With a counter, we need (|(x|-)&), where - reverts
990 	 the iteration count and fails. */
991       rxpos br, nothing, x, next_to_x;
992       if (minreps || maxreps) {
993 	/* Increment iteration counter, and fail if it's
994 	   already at the max: */
995 	rxpos fail;
996 	x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
997 	regtail(ret, x);
998 
999 	fail = regnode(BRANCH);
1000 	regnode(COUNTBACKFAIL);
1001 	regarg(counter);
1002 	reginsert(BRANCH, ret);
1003 	fail += 3;
1004 	regtail(ret, fail);
1005 	x += 3;
1006       } else
1007 	x = ret;
1008       reginsert(BRANCH, ret);  /* = next */
1009       next = ret;
1010       next_to_x = (x - next) + 3;
1011       reginsert(NOTHING, ret); /* = nothing */
1012       next += 3;
1013       nothing = ret;
1014       if (minreps) {
1015 	/* Fail to match if the counter isn't big enough, yet: */
1016 	nothing = reginsertwithopop(COUNTOVER, ret, counter, minreps);
1017 	regtail(ret, nothing); /* chain countover -> nothing */
1018 	next += (nothing - ret);
1019       }
1020       reginsert(BRANCH, ret); /* b3 */
1021       next += 3;
1022       nothing += 3;
1023       if (minreps || maxreps) {
1024 	/* Initialize the iteration counter on entry: */
1025 	br = reginsertwithop(COUNTINIT, ret, counter);
1026 	regtail(ret, br); /* chain countinit to b3 */
1027 	next += (br - ret);
1028 	nothing += (br - ret);
1029       } else
1030 	br = ret;
1031       regtail(br, next); /* chain b3 to next */
1032       x = next + next_to_x;
1033       regtail(x, regnode(BACK)); /* loop */
1034       regtail(x, br); /* back. */
1035       regtail(next, regnode(BACK)); /* chain next to nothing */
1036       regtail(next, nothing);
1037     } else if (op == '+' && (flags&SIMPLE))
1038       reginsert(greedy ? PLUS : PLUS2, ret);
1039     else if (op == '+' && greedy) {
1040       /* Emit x+ as x(&|), where & means "self". */
1041       next = regnode(BRANCH);	/* Either */
1042       regtail(ret, next);
1043       regtail(regnode(BACK), ret); /* loop back */
1044       regtail(next, regnode(BRANCH)); /* or */
1045       regtail(ret, regnode(NOTHING)); /* null. */
1046     } else if (op == '+') {
1047       /* Emit x+? as x(|&), where & means "self". */
1048       next = regnode(BRANCH);	/* Either */
1049       regtail(ret, next);
1050       regnode(NOTHING); /* op */
1051       regtail(next, regnode(BRANCH)); /* or */
1052       regtail(regnode(BACK), ret); /* loop back. */
1053       regtail(next, regnode(BACK));
1054       regtail(next, next + 3);
1055     } else if (op == '?' && greedy) {
1056       /* Emit x? as (x|) */
1057       reginsert(BRANCH, ret);	/* Either x */
1058       regtail(ret, regnode(BRANCH)); /* or */
1059       next = regnode(NOTHING);	/* null. */
1060       regtail(ret, next);
1061       regoptail(ret, next);
1062     } else if (op == '?') {
1063       /* Emit x?? as (|x) */
1064       reginsert(BRANCH, ret);  /* will be next... */
1065       reginsert(NOTHING, ret);
1066       reginsert(BRANCH, ret);
1067       regtail(ret, ret + 6);
1068       next = regnode(BACK);
1069       regtail(ret + 6, next);
1070       regoptail(ret + 6, next);
1071       regoptail(ret + 6, ret + 3);
1072     }
1073     regparse++;
1074     if (ISMULT(regparsestr[regparse], parse_flags))
1075       FAIL("nested `*', `?', `+', or `{...}' in pattern");
1076   }
1077 
1078   if (flags & NEEDSAVECONST) {
1079     rxpos sv;
1080     sv = regnode(SAVECONST);
1081     regarg(origsavepos);
1082     regarg(origmatchmax);
1083     regtail(ret, sv);
1084     if (origmatchmax) {
1085       sv = reginsertwithop(MAYBECONST, ret, origsavepos);
1086       regtail(ret, sv);
1087     }
1088     *flagp &= ~SIMPLE;
1089   }
1090 
1091   return(ret);
1092 }
1093 
1094 /*
1095    - regatom - the lowest level
1096    *
1097    * Optimization:  gobbles an entire sequence of ordinary characters so that
1098    * it can turn them into a single node, which is smaller to store and
1099    * faster to run.  Backslashed characters are exceptions, each becoming a
1100    * separate node; the code is simpler that way and it's not worth fixing.
1101  */
1102 static rxpos
regatom(int * flagp,int parse_flags,int at_start)1103 regatom(int *flagp, int parse_flags, int at_start)
1104 {
1105   rxpos ret;
1106   int flags;
1107 
1108   *flagp = (WORST|SPFIXED);		/* Tentatively. */
1109   regmatchmin = regmatchmax = 1;
1110   regmaxlookback = 0;
1111 
1112   switch (regparsestr[regparse++]) {
1113   case '^':
1114     if (parse_flags & PARSE_SINGLE_LINE)
1115       ret = regnode(BOI);
1116     else
1117       ret = regnode(BOL);
1118     regmaxlookback = 1;
1119     regmatchmin = regmatchmax = 0;
1120     break;
1121   case '$':
1122     if (parse_flags & PARSE_SINGLE_LINE)
1123       ret = regnode(EOI);
1124     else
1125       ret = regnode(EOL);
1126     regmatchmin = regmatchmax = 0;
1127     break;
1128   case '.':
1129     --regparse;
1130     ret = regranges(parse_flags, at_start);
1131     *flagp |= HASWIDTH|SIMPLE;
1132     break;
1133   case '[':
1134     --regparse;
1135     ret = regranges(parse_flags, at_start);
1136     *flagp |= HASWIDTH|SIMPLE;
1137     break;
1138   case '(':
1139     {
1140       if (regparsestr[regparse] == '?') {
1141 	int moded = 0;
1142 
1143 	while (1) {
1144 	  if (regparsestr[regparse+1] == 'i') {
1145 	    parse_flags &= ~PARSE_CASE_SENS;
1146 	    regparse++;
1147 	    moded = 1;
1148 	  } else if (regparsestr[regparse+1] == 'm') {
1149 	    parse_flags &= ~PARSE_SINGLE_LINE;
1150 	    regparse++;
1151 	    moded = 1;
1152 	  } else if (regparsestr[regparse+1] == 's') {
1153 	    parse_flags |= PARSE_SINGLE_LINE;
1154 	    regparse++;
1155 	    moded = 1;
1156 	  } else if ((regparsestr[regparse+1] == '-')
1157 		     && (regparsestr[regparse+2] == 'i')) {
1158 	    parse_flags |= PARSE_CASE_SENS;
1159 	    regparse += 2;
1160 	    moded = 1;
1161 	  } else if ((regparsestr[regparse+1] == '-')
1162 		     && (regparsestr[regparse+2] == 'm')) {
1163 	    parse_flags |= PARSE_SINGLE_LINE;
1164 	    regparse += 2;
1165 	    moded = 1;
1166 	  } else if ((regparsestr[regparse+1] == '-')
1167 		     && (regparsestr[regparse+2] == 's')) {
1168 	    parse_flags &= ~PARSE_SINGLE_LINE;
1169 	    regparse += 2;
1170 	    moded = 1;
1171 	  } else {
1172 	    break;
1173 	  }
1174 	}
1175 
1176 	if (regparsestr[regparse+1] == ':') {
1177 	  regparse += 2;
1178 	  ret = reg(1, &flags, 0, 0, parse_flags);
1179 	  *flagp = flags;
1180 	} else if (moded) {
1181 	  FAIL("expected `:' or another mode after `(?' and a mode sequence (where a mode is `i', `-i', `m', `-m', `s', or `-s')");
1182 	} else if (regparsestr[regparse+1] == '(') {
1183 	  /* Conditional */
1184 	  if (((regparsestr[regparse+2] >= '0')
1185 	       && (regparsestr[regparse+2] <= '9'))
1186 	      || ((regparsestr[regparse+2] == '?')
1187 		  && ((regparsestr[regparse+3] == '=')
1188 		      || (regparsestr[regparse+3] == '!')
1189 		      || (regparsestr[regparse+3] == '<')))) {
1190 	    rxpos test, tbr, fbr, ender;
1191 	    int flags, matchmin, matchmax;
1192 	    Scheme_Hash_Table *backdepends;
1193 
1194 	    regparse++;
1195 	    ret = regnode(CONDITIONAL);
1196 	    regarg(0); /* space for then */
1197 	    regarg(0); /* space for else */
1198 	    if (regparsestr[regparse+1] != '?') {
1199 	      int posn;
1200 	      regparse++;
1201 	      posn = regdigit();
1202 	      test = regnode(BACKREF);
1203 	      regarg(posn);
1204 	      if (regparsestr[regparse] == ')') {
1205 		regparse++;
1206 	      } else {
1207 		FAIL("expected `)' after `(?(' followed by a digit");
1208 	      }
1209 	    } else {
1210 	      test = regatom(&flags, parse_flags, 1);
1211 	    }
1212 	    if (test != OPERAND3(ret)) {
1213 	      FAIL("test went to wrong place!?");
1214 	    }
1215 	    regtail(test, regnode(END));
1216 	    if (regparsestr[regparse] == ')') {
1217 	      FAIL("expected an expression after test in `(?(...))'");
1218 	    }
1219 
1220 	    regbackdepends = NULL;
1221 	    *flagp |= HASWIDTH; /* tentatively */
1222 
1223 	    tbr = regbranch(&flags, parse_flags, 1);
1224 
1225 	    if (!(flags&HASWIDTH)) {
1226 	      *flagp &= ~HASWIDTH;
1227 	      backdepends = NULL;
1228 	    } else if (regbackdepends) {
1229 	      backdepends = regbackdepends;
1230 	      regbackdepends = NULL;
1231 	    } else
1232 	      backdepends = NULL;
1233 
1234 	    if (!(flags & SPFIXED))
1235 	      *flagp &= ~SPFIXED;
1236 	    matchmin = regmatchmin;
1237 	    matchmax = regmatchmax;
1238 
1239 	    if (regparsestr[regparse] == ')') {
1240 	      fbr = regnode(NOTHING);
1241 	      *flagp &= ~HASWIDTH;
1242 	      matchmin = 0;
1243 	    } else if (regparsestr[regparse] != '|') {
1244 	      FAIL("expected `)' or `|' after first branch of `(?(...)...)'");
1245 	    } else {
1246 	      regparse++;
1247 	      fbr = regbranch(&flags, parse_flags, 1);
1248 	      if (regparsestr[regparse] != ')') {
1249 		FAIL("expected `)' to close `(?(...)...' after second branch");
1250 	      }
1251 
1252 	      if (!(flags&HASWIDTH)) {
1253 		*flagp &= ~HASWIDTH;
1254 		backdepends = NULL;
1255 	      } else if (regbackdepends) {
1256 		if (backdepends)
1257 		  merge_tables(backdepends, regbackdepends);
1258 		else
1259 		  backdepends = regbackdepends;
1260 	      }
1261 
1262 	      if (!(flags & SPFIXED))
1263 		*flagp &= ~SPFIXED;
1264 	      else {
1265 		if (regmatchmin < matchmin)
1266 		  matchmin = regmatchmin;
1267 		if (regmatchmax > matchmax)
1268 		  matchmax = regmatchmax;
1269 	      }
1270 	    }
1271 
1272 	    regmatchmax = matchmax;
1273 	    regmatchmin = matchmin;
1274 	    regbackdepends = backdepends;
1275 
1276 	    if (OPERAND2(ret) + 1 < regcodesize) {
1277 	      int delta;
1278 	      delta = tbr - ret;
1279 	      regstr[OPERAND(ret)] = delta >> 8;
1280 	      regstr[OPERAND(ret)+1] = delta & 255;
1281 	      delta = fbr - ret;
1282 	      regstr[OPERAND2(ret)] = delta >> 8;
1283 	      regstr[OPERAND2(ret)+1] = delta & 255;
1284 	    }
1285 	    ender = regnode(NOTHING);
1286 	    regtail(tbr, ender);
1287 	    regtail(fbr, ender);
1288 	    regtail(ret, ender);
1289 	    regparse++;
1290 	  } else
1291 	    FAIL("expected `(?=', `(?!', `(?<', or digit after `(?('");
1292 	} else if (regparsestr[regparse+1] == '>') {
1293 	  regparse += 2;
1294 	  ret = reg(1, &flags, 0, LOOKTX, parse_flags);
1295 	  *flagp = flags;
1296 	} else {
1297 	  if (regparsestr[regparse+1] == '=') {
1298 	    regparse += 2;
1299 	    ret = reg(1, &flags, 0, LOOKT, parse_flags);
1300 	  } else if (regparsestr[regparse+1] == '!') {
1301 	    regparse += 2;
1302 	    ret = reg(1, &flags, 0, LOOKF, parse_flags);
1303 	  } else if ((regparsestr[regparse+1] == '<')
1304 		     && (regparsestr[regparse+2] == '=')) {
1305 	    regparse += 3;
1306 	    ret = reg(1, &flags, 0, LOOKBT, parse_flags);
1307 	  } else if ((regparsestr[regparse+1] == '<')
1308 		     && (regparsestr[regparse+2] == '!')) {
1309 	    regparse += 3;
1310 	    ret = reg(1, &flags, 0, LOOKBF, parse_flags);
1311 	  } else {
1312 	    FAIL("expected `:', `=', `!', `<=', `<!', `i', `-i', `m', `-m', `s', or `-s' after `(?'");
1313 	  }
1314 	  regmatchmin = regmatchmax = 0;
1315 	  *flagp = SPFIXED;
1316 	  regbackdepends = NULL;
1317 	}
1318       } else {
1319 	ret = reg(1, &flags, 1, 0, parse_flags);
1320 	if (flags & NEEDSAVECONST) {
1321 	  *flagp = flags;
1322 	} else {
1323 	  *flagp |= flags&(HASWIDTH|SPSTART);
1324 	  if (!(flags&SPFIXED))
1325 	    *flagp &= ~SPFIXED;
1326 	}
1327       }
1328       /* otherwise, regmatchmin/regmatchmax is set */
1329       if (ret == 0)
1330 	FAIL("cluster failed!?");
1331     }
1332     break;
1333   case '|':
1334   case ')':
1335     FAIL("internal urp");	/* Supposed to be caught earlier. */
1336     break;
1337   case '?':
1338     FAIL("`?' follows nothing in pattern");
1339     break;
1340   case '+':
1341     FAIL("`+' follows nothing in pattern");
1342     break;
1343   case '*':
1344     FAIL("`*' follows nothing in pattern");
1345     break;
1346   case '\\':
1347     {
1348       int c;
1349       if (regparse == regparse_end)
1350 	FAIL("trailing backslash in pattern");
1351       c = regparsestr[regparse++];
1352       if ((parse_flags & PARSE_PCRE) && (c == 'b')) {
1353 	ret = regnode(WORDBOUND);
1354 	regmatchmin = regmatchmax = 0;
1355         regmaxlookback = 1;
1356       } else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
1357 	ret = regnode(NOTWORDBOUND);
1358 	regmatchmin = regmatchmax = 0;
1359         regmaxlookback = 1;
1360       } else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
1361 	ret = regunicode(0);
1362 	regmatchmax = MAX_UTF8_CHAR_BYTES;
1363 	*flagp |= HASWIDTH;
1364       } else if ((parse_flags & PARSE_PCRE) && (c == 'P')) {
1365 	ret = regunicode(1);
1366 	regmatchmax = MAX_UTF8_CHAR_BYTES;
1367 	*flagp |= HASWIDTH;
1368       } else if ((parse_flags & PARSE_PCRE) && (c >= '0') && (c <= '9')) {
1369 	int posn;
1370 	--regparse;
1371 	posn = regdigit();
1372 	if (parse_flags & PARSE_CASE_SENS)
1373 	  ret = regnode(BACKREF);
1374 	else
1375 	  ret = regnode(BACKREF_CI);
1376 	regarg(posn);
1377 	*flagp &= ~SPFIXED;
1378 	/* Set HASWIDTH flag: */
1379 	{
1380 	  Scheme_Object *f;
1381 	  if (regbackknown)
1382 	    f = scheme_hash_get(regbackknown, scheme_make_integer(posn));
1383 	  else
1384 	    f = NULL;
1385 	  if (f) {
1386 	    if (SCHEME_TRUEP(f))
1387 	      *flagp |= HASWIDTH;
1388 	  } else {
1389 	    *flagp |= HASWIDTH;
1390 	    if (!regbackdepends)
1391 	      regbackdepends = scheme_make_hash_table(SCHEME_hash_ptr);
1392 	    scheme_hash_set(regbackdepends, scheme_make_integer(posn), scheme_true);
1393 	  }
1394 	}
1395       } else {
1396 	regparse -= 2;
1397 	ret = regranges(parse_flags, at_start);
1398 	*flagp |= HASWIDTH|SIMPLE;
1399       }
1400     }
1401     break;
1402   default:
1403     {
1404       int len, ilen, c;
1405       char ender;
1406 
1407       regparse--;
1408 
1409       if (parse_flags & PARSE_PCRE) {
1410 	if (regparsestr[regparse] == '{')
1411 	  FAIL("`{' follows nothing in pattern");
1412 	if (regparsestr[regparse] == '}')
1413 	  FAIL("unmatched `}' in pattern");
1414 	if (regparsestr[regparse] == ']')
1415 	  FAIL("unmatched `]' in pattern");
1416       }
1417 
1418       for (len = ilen = 0; regparse + ilen < regparse_end; len++, ilen++) {
1419 	if (regparsestr[regparse + ilen] == '\\') {
1420 	  if (regparse + ilen + 1 >= regparse_end)
1421 	    break;
1422 	  c = regparsestr[regparse + ilen + 1];
1423 	  if (((c >= 'a') && (c <= 'z'))
1424 	      || ((c >= 'A') && (c <= 'Z'))
1425 	      || ((c >= '0') && (c <= '9')))
1426 	    break;
1427 	  ilen++;
1428 	} else if (regstrcspn(regparsestr + regparse + ilen, regparsestr + regparse + ilen + 1,
1429 			      (parse_flags & PARSE_PCRE) ? PCRE_META : META) < 1)
1430 	  break;
1431       }
1432       if (len <= 0)
1433 	FAIL("internal disaster");
1434 
1435       if ((len == 1) && at_start) {
1436 	/* Maybe convert "x|y" to "[xy]", etc.: */
1437 	ret = regranges(parse_flags, at_start);
1438 	*flagp |= HASWIDTH|SIMPLE;
1439       } else {
1440 	if (!(parse_flags & PARSE_CASE_SENS)) {
1441 	  /* Need case insensitivity? */
1442 	  int i;
1443 	  for (i = 0; i < ilen; i++) {
1444 	    c = regparsestr[regparse + i];
1445 	    if ((rx_toupper(c) != c)
1446 		|| (rx_tolower(c) != c)) {
1447 	      break;
1448 	    }
1449 	  }
1450 	  if (i >= ilen)
1451 	    parse_flags |= PARSE_CASE_SENS;
1452 	}
1453 
1454 	ender = regparsestr[regparse+ilen];
1455 	if (len > 1 && ISMULT(ender, parse_flags)) {
1456 	  /* Back off from ?+* operand. */
1457 	  len--;
1458 	  ilen--;
1459 	  if (regparsestr[regparse + ilen] == '\\')
1460 	    --ilen;
1461 	}
1462 	*flagp |= HASWIDTH;
1463 	if (len == 1)
1464 	  *flagp |= SIMPLE;
1465 	regmatchmin = regmatchmax = len;
1466 	ret = regnode((parse_flags & PARSE_CASE_SENS) ? EXACTLY : EXACTLY_CI);
1467 	regarg(len);
1468 	while (len > 0) {
1469 	  c = regparsestr[regparse++];
1470 	  if (c == '\\')
1471 	    c = regparsestr[regparse++];
1472 	  if (!(parse_flags & PARSE_CASE_SENS))
1473 	    c = rx_tolower(c);
1474 	  regc(c);
1475 	  len--;
1476 	}
1477       }
1478     }
1479     break;
1480   }
1481 
1482   if (!ret)
1483     FAIL("failed!?");
1484 
1485   return ret;
1486 }
1487 
regcharclass(int c,char * map,int * _non_ascii)1488 static int regcharclass(int c, char *map, int *_non_ascii)
1489 {
1490   switch(c) {
1491   case 'd':
1492     for (c = 0; c < 10; c++) {
1493       map['0' + c] = 1;
1494     }
1495     break;
1496   case 'D':
1497     for (c = 0; c < '0'; c++) {
1498       map[c] = 1;
1499     }
1500     for (c = '9' + 1; c < (_non_ascii ? 128 : 256); c++) {
1501       map[c] = 1;
1502     }
1503     if (_non_ascii)
1504       *_non_ascii = 1;
1505     break;
1506   case 'w':
1507     for (c = 0; c < 26; c++) {
1508       map['a' + c] = 1;
1509       map['A' + c] = 1;
1510     }
1511     for (c = 0; c < 10; c++) {
1512       map['0' + c] = 1;
1513     }
1514     map['_'] = 1;
1515     break;
1516   case 'W':
1517     for (c = 0; c < '0'; c++) {
1518       map[c] = 1;
1519     }
1520     for (c = '9' + 1; c < 'A'; c++) {
1521       map[c] = 1;
1522     }
1523     for (c = 'Z' + 1; c < '_'; c++) {
1524       map[c] = 1;
1525     }
1526     for (c = 'z' + 1; c < (_non_ascii ? 128 : 256); c++) {
1527       map[c] = 1;
1528     }
1529     if (_non_ascii)
1530       *_non_ascii = 1;
1531     break;
1532   case 's':
1533     map['\t'] = 1;
1534     map['\n'] = 1;
1535     map['\f'] = 1;
1536     map['\r'] = 1;
1537     map[' '] = 1;
1538     break;
1539   case 'S':
1540     for (c = 0; c < (_non_ascii ? 128 : 256); c++) {
1541       switch (c) {
1542       case '\t':
1543       case '\n':
1544       case '\f':
1545       case '\r':
1546       case ' ':
1547 	break;
1548       default:
1549 	map[c] = 1;
1550 	break;
1551       }
1552     }
1553     if (_non_ascii)
1554       *_non_ascii = 1;
1555     break;
1556   default:
1557     if (((c >= 'a') && (c <= 'z'))
1558 	|| ((c >= 'A') && (c <= 'Z'))) {
1559       FAIL("illegal alphabetic escape");
1560     }
1561     map[c] = 1;
1562     break;
1563   }
1564 
1565   return 1;
1566 }
1567 
is_posix_char_class(char * str,int pos,int len,char * map)1568 static int is_posix_char_class(char *str, int pos, int len, char *map)
1569 {
1570   int c;
1571 
1572   if (pos + 8 <= len) {
1573     if (!scheme_strncmp(":alnum:]", str XFORM_OK_PLUS pos, 8)) {
1574       if (map) {
1575         regcharclass('d', map, NULL);
1576         for (c = 'a'; c <= 'z'; c++) {
1577           map[c] = 1;
1578           map[c - ('a' - 'A')] = 1;
1579         }
1580       }
1581       return 1;
1582     } else if (!scheme_strncmp(":alpha:]", str XFORM_OK_PLUS pos, 8)) {
1583       if (map) {
1584         for (c = 'a'; c <= 'z'; c++) {
1585           map[c] = 1;
1586           map[c - ('a' - 'A')] = 1;
1587         }
1588       }
1589       return 1;
1590     } else if (!scheme_strncmp(":ascii:]", str XFORM_OK_PLUS pos, 8)) {
1591       if (map) {
1592         for (c = 0; c <= 127; c++) {
1593           map[c] = 1;
1594         }
1595       }
1596       return 1;
1597     } else if (!scheme_strncmp(":blank:]", str XFORM_OK_PLUS pos, 8)) {
1598       if (map) {
1599         map[' '] = 1;
1600         map['\t'] = 1;
1601       }
1602       return 1;
1603     } else if (!scheme_strncmp(":cntrl:]", str XFORM_OK_PLUS pos, 8)) {
1604       if (map) {
1605         for (c = 0; c <= 31; c++) {
1606           map[c] = 1;
1607         }
1608       }
1609       return 1;
1610     } else if (!scheme_strncmp(":digit:]", str XFORM_OK_PLUS pos, 8)) {
1611       if (map) {
1612         regcharclass('d', map, NULL);
1613       }
1614       return 1;
1615     } else if (!scheme_strncmp(":graph:]", str XFORM_OK_PLUS pos, 8)) {
1616       if (map) {
1617         for (c = 0; c <= 127; c++) {
1618           if (scheme_isgraphic(c))
1619             map[c] = 1;
1620         }
1621       }
1622       return 1;
1623     } else if (!scheme_strncmp(":lower:]", str XFORM_OK_PLUS pos, 8)) {
1624       if (map) {
1625         for (c = 'a'; c <= 'z'; c++) {
1626           map[c] = 1;
1627         }
1628       }
1629       return 1;
1630     } else if (!scheme_strncmp(":print:]", str XFORM_OK_PLUS pos, 8)) {
1631       if (map) {
1632         for (c = 0; c <= 127; c++) {
1633           if (scheme_isgraphic(c))
1634             map[c] = 1;
1635         }
1636         map[' '] = 1;
1637         map['\t'] = 1;
1638       }
1639       return 1;
1640     } else if (!scheme_strncmp(":space:]", str XFORM_OK_PLUS pos, 8)) {
1641       if (map) {
1642         regcharclass('s', map, NULL);
1643       }
1644       return 1;
1645     } else if (!scheme_strncmp(":upper:]", str XFORM_OK_PLUS pos, 8)) {
1646       if (map) {
1647         for (c = 'A'; c <= 'Z'; c++) {
1648           map[c] = 1;
1649         }
1650       }
1651       return 1;
1652     }
1653   }
1654 
1655   if ((pos + 7 <= len)
1656       && !scheme_strncmp(":word:]", str XFORM_OK_PLUS pos, 7)) {
1657     if (map) {
1658       regcharclass('w', map, NULL);
1659     }
1660     return 1;
1661   }
1662 
1663   if ((pos + 9 <= len)
1664       && !scheme_strncmp(":xdigit:]", str XFORM_OK_PLUS pos, 9)) {
1665     if (map) {
1666       regcharclass('d', map, NULL);
1667       for (c = 'a'; c <= 'f'; c++) {
1668         map[c] = 1;
1669         map[c - ('a' - 'A')] = 1;
1670       }
1671     }
1672     return 1;
1673   }
1674 
1675   return 0;
1676 }
1677 
is_posix_char_class_in_unicode(mzchar * str,int pos,int len,char * map)1678 static int is_posix_char_class_in_unicode(mzchar *str, int pos, int len, char *map)
1679 {
1680   int ulen;
1681   int i;
1682   char buf[10];
1683 
1684   if (pos + 7 > len)
1685     return 0;
1686 
1687   ulen = len - pos;
1688   if (ulen > 9)
1689     ulen = 9;
1690 
1691   for (i = 0; i < ulen; i++) {
1692     if (str[pos + i] > 127)
1693       return 0;
1694     buf[i] = (char)str[pos + i];
1695   }
1696 
1697   return is_posix_char_class(buf, 0, ulen, map);
1698 }
1699 
regrange(int parse_flags,char * map)1700 static char *regrange(int parse_flags, char *map)
1701 /* [ is already consumed; result is an array of 256 bytes of included chars */
1702 {
1703   int xclass, c;
1704   int classend, can_range = 0;
1705   int exclude = 0;
1706 
1707   if (regparsestr[regparse] == '^') { /* Complement of range. */
1708     exclude = 1;
1709     regparse++;
1710   }
1711 
1712   if (regparsestr[regparse] == ']' || regparsestr[regparse] == '-') {
1713     c = regparsestr[regparse];
1714     map[c] = 1;
1715     regparse++;
1716   }
1717   while (regparse != regparse_end && regparsestr[regparse] != ']') {
1718     if (regparsestr[regparse] == '-') {
1719       regparse++;
1720       if (regparsestr[regparse] == ']' || regparse == regparse_end) {
1721 	map['-'] = 1;
1722       } else {
1723 	if (!can_range) {
1724 	  FAIL("misplaced hyphen within square brackets in pattern");
1725 	} else {
1726 	  xclass = UCHAR(regparsestr[regparse-2])+1;
1727 	  classend = UCHAR(regparsestr[regparse]);
1728 	  if (classend == '-') {
1729 	    FAIL("misplaced hyphen within square brackets in pattern");
1730 	  }
1731 	  if ((classend == '\\') && (parse_flags & PARSE_PCRE)) {
1732 	    if (regparse+1 == regparse_end) {
1733 	      FAIL("escaping backslash at end pattern (within square brackets)");
1734 	    }
1735 	    regparse++;
1736 	    classend = UCHAR(regparsestr[regparse]);
1737 	    if (((classend >= 'a') && (classend <= 'z'))
1738 		|| ((classend >= 'A') && (classend <= 'Z'))) {
1739 	      FAIL("misplaced hyphen within square brackets in pattern");
1740 	    }
1741 	  }
1742 	  if (xclass > classend+1)
1743 	    FAIL("invalid range within square brackets in pattern");
1744 	  for (; xclass <= classend; xclass++) {
1745 	    c = xclass;
1746 	    map[c] = 1;
1747 	    if (!(parse_flags & PARSE_CASE_SENS)) {
1748 	      c = rx_toupper(c);
1749 	      map[c] = 1;
1750 	      c = rx_tolower(c);
1751 	      map[c] = 1;
1752 	    }
1753 	  }
1754 	  regparse++;
1755 	}
1756       }
1757       can_range = 0;
1758     } else if ((regparsestr[regparse] == '\\') && (parse_flags & PARSE_PCRE)) {
1759       c = UCHAR(regparsestr[regparse + 1]);
1760       if (((c >= 'a') && (c <= 'z'))
1761 	  || ((c >= 'A') && (c <= 'Z'))) {
1762 	regcharclass(c, map, NULL);
1763 	can_range = 0;
1764       } else {
1765 	map[c] = 1;
1766 	can_range = 1;
1767       }
1768       regparse += 2;
1769     } else if ((regparsestr[regparse] == '[')
1770 	       && (parse_flags & PARSE_PCRE)
1771 	       && (regparsestr[regparse+1] == ':')
1772 	       && is_posix_char_class(regparsestr, regparse + 1, regparse_end, map)) {
1773       regparse += 2;
1774       while (regparsestr[regparse] != ']') {
1775 	regparse++;
1776       }
1777       regparse++;
1778       can_range = 0;
1779     } else {
1780       c = UCHAR(regparsestr[regparse++]);
1781       map[c] = 1;
1782       if (!(parse_flags & PARSE_CASE_SENS)) {
1783 	c = rx_tolower(c);
1784 	map[c] = 1;
1785 	c = rx_toupper(c);
1786 	map[c] = 1;
1787       }
1788       can_range = 1;
1789     }
1790   }
1791 
1792   if (exclude) {
1793     for (c = 0; c < 256; c++) {
1794       map[c] = !map[c];
1795     }
1796   }
1797 
1798   if (regparsestr[regparse] != ']')
1799     FAIL("missing closing square bracket in pattern");
1800   regparse++;
1801 
1802   return map;
1803 }
1804 
1805 static rxpos
regranges(int parse_flags,int at_start)1806 regranges(int parse_flags, int at_start)
1807 {
1808   int c;
1809   rxpos ret, save_regparse = 0;
1810   int count, off_ranges, on_ranges, now_on, last_on, prev_last_on;
1811 #ifdef COUNT_CI_CHARS
1812   /* These could be used to pick an encoding as a _CI variant, but
1813      _CI variants are not picked currently: */
1814   int all_ci, num_ci;
1815 #endif
1816   char *new_map = NULL, *accum_map = NULL;
1817 
1818   count = 0;
1819   while (1) {
1820     /* This loop can end up parsing a range and not using the result,
1821        so that the range is parsed twice. That's ok, because there's
1822        no nesting (and therefore no exponential explosion). */
1823 
1824     if (!new_map)
1825       new_map = (char *)scheme_malloc_atomic(256);
1826     memset(new_map, 0, 256);
1827 
1828     if (regparsestr[regparse] == '\\'
1829 	&& (regparse + 1 < regparse_end)) {
1830       /* \<char> */
1831       c = UCHAR(regparsestr[++regparse]);
1832       if (parse_flags & PARSE_PCRE) {
1833 	if ((c >= '0') && (c <= '9'))
1834 	  break;
1835 	if (((c >= 'a') && (c <= 'z'))
1836 	    || ((c >= 'A') && (c <= 'Z'))) {
1837           if ((c == 'p') || (c == 'P')) {
1838             /* unicode char class; give up */
1839             break;
1840           }
1841 	  regcharclass(regparsestr[regparse], new_map, NULL);
1842 
1843 	} else
1844 	  new_map[c] = 1;
1845       } else
1846 	new_map[c] = 1;
1847       regparse++;
1848     } else if (regstrcspn(regparsestr + regparse, regparsestr + regparse + 1,
1849 			  (parse_flags & PARSE_PCRE) ? PCRE_META : META)) {
1850       /* <char> */
1851       c = UCHAR(regparsestr[regparse]);
1852       new_map[c] = 1;
1853       if (!(parse_flags & PARSE_CASE_SENS))  {
1854 	c = rx_tolower(c);
1855 	new_map[c] = 1;
1856 	c = rx_toupper(c);
1857 	new_map[c] = 1;
1858       }
1859       regparse++;
1860     } else if (regparsestr[regparse] == '.') {
1861       /* . */
1862       for (c = 0; c < 256; c++) {
1863 	new_map[c] = 1;
1864       }
1865       if (!(parse_flags & PARSE_SINGLE_LINE))
1866 	new_map['\n'] = 0;
1867       regparse++;
1868     } else if (regparsestr[regparse] == '[') {
1869       /* [...] */
1870       regparse++;
1871       regrange(parse_flags, new_map);
1872     } else
1873       break;
1874 
1875     /* If the most recently parsed range is not
1876        continued by a branch or the end of a sub-sequence,
1877        then abandon it, because it actually belongs
1878        with a new sequence. */
1879     if (accum_map
1880 	&& (regparse < regparse_end)
1881 	&& (regparsestr[regparse] != '|')
1882 	&& (regparsestr[regparse] != ')'))
1883       break;
1884 
1885     /* We'll keep it. Merge char maps so far: */
1886     if (accum_map) {
1887       for (c = 0; c < 256; c++) {
1888 	accum_map[c] |= new_map[c];
1889       }
1890     } else {
1891       accum_map = new_map;
1892       new_map = NULL;
1893     }
1894     save_regparse = regparse;
1895 
1896     /* If we're at the end, or if we can only do one, then we're done. */
1897     if (!at_start
1898 	|| (regparsestr[regparse] != '|')
1899 	|| (regparse >= regparse_end)
1900 	|| (regparsestr[regparse] == ')'))
1901       break;
1902 
1903     regparse++;
1904     if (regparse == regparse_end)
1905       break;
1906   }
1907 
1908   regparse = save_regparse;
1909 
1910   if (!accum_map)
1911     FAIL("should have found one range!");
1912 
1913   while (1) {
1914     /* Collect stats to pick the best run-time implementation for a range.
1915        We may do this twice if we decide to use a _CI variant. */
1916     count = 0;
1917 #ifdef COUNT_CI_CHARS
1918     num_ci = 0;
1919     all_ci = 1;
1920 #endif
1921     on_ranges = 0;
1922     off_ranges = 0;
1923     now_on = 0;
1924     last_on = -1;
1925     prev_last_on = -1;
1926     for (c = 0; c < 256; c++) {
1927       if (accum_map[c]) {
1928 	if (now_on < 0)
1929 	  off_ranges++;
1930 	now_on = 1;
1931 	count++;
1932 	prev_last_on = last_on;
1933 	last_on = c;
1934 
1935 #ifdef COUNT_CI_CHARS
1936 	if (c != rx_tolower(c)) {
1937 	  if (accum_map[rx_tolower(c)] != accum_map[c])
1938 	    all_ci = 0;
1939 	  num_ci++;
1940 	} else if (c != rx_toupper(c)) {
1941 	  if (accum_map[rx_toupper(c)] != accum_map[c])
1942 	    all_ci = 0;
1943 	  num_ci++;
1944 	}
1945 #endif
1946       } else {
1947 	if (now_on > 0)
1948 	  on_ranges++;
1949 	now_on = -1;
1950       }
1951     }
1952     if (now_on > 0)
1953       on_ranges++;
1954     else
1955       off_ranges++;
1956 
1957     /* Pick the best run-time implementation for a range. */
1958     if (count == 256) {
1959       return regnode(ANY);
1960     } else if ((count == 255) && !accum_map['\n']) {
1961       return regnode(ANYL);
1962     } else if (count == 1) {
1963       ret = regnode(EXACTLY1);
1964       regc(last_on);
1965       return ret;
1966     } else if (count == 2) {
1967       ret = regnode(EXACTLY2);
1968       regc(last_on);
1969       regc(prev_last_on);
1970       return ret;
1971     } else if ((on_ranges == 1)
1972 	       ||  (off_ranges == 1)) {
1973       int rs = 255, re = 255, on;
1974 
1975       if (on_ranges == 1)
1976 	on = 1;
1977       else
1978 	on = 0;
1979 
1980       for (c = 0; c < 256; c++) {
1981 	if (!!accum_map[c] == on) {
1982 	  rs = c;
1983 	  break;
1984 	}
1985       }
1986       for (c++; c < 256; c++) {
1987 	if ((!accum_map[c]) == on) {
1988 	  re = c - 1;
1989 	  break;
1990 	}
1991       }
1992 
1993       if (on)
1994 	ret = regnode(RANGE);
1995       else
1996 	ret = regnode(NOTRANGE);
1997       regc(rs);
1998       regc(re);
1999       return ret;
2000     } else {
2001       rxpos a;
2002 
2003       ret = regnode(ANYOF);
2004       a = regcode;
2005       for (c = 0; c < 32; c++) {
2006 	regc(0);
2007       }
2008 
2009       if (regcode <= regcodesize) {
2010 	for (c = 0; c < 256; c++) {
2011 	  if (accum_map[c]) {
2012 	    regstr[a + (c >> 3)] |= (1 << (c & 0x7));
2013 	  }
2014 	}
2015       }
2016 
2017       return ret;
2018     }
2019   }
2020 }
2021 
2022 READ_ONLY static const char *prop_names[] = { "Cn",
2023 			      "Cc",
2024 			      "Cf",
2025 			      "Cs",
2026 			      "Co",
2027 			      "Ll",
2028 			      "Lu",
2029 			      "Lt",
2030 			      "Lm",
2031 			      "Lo",
2032 			      "Nd",
2033 			      "Nl",
2034 			      "No",
2035 			      "Ps",
2036 			      "Pe",
2037 			      "Pi",
2038 			      "Pf",
2039 			      "Pc",
2040 			      "Pd",
2041 			      "Po",
2042 			      "Mn",
2043 			      "Mc",
2044 			      "Me",
2045 			      "Sc",
2046 			      "Sk",
2047 			      "Sm",
2048 			      "So",
2049 			      "Zl",
2050 			      "Zp",
2051 			      "Zs",
2052 			      NULL};
2053 
2054 static rxpos
regunicode(int negate)2055 regunicode(int negate)
2056 {
2057   rxpos ret;
2058   int len, bottom, top, i;
2059 
2060   if (regparsestr[regparse] != '{') {
2061     FAIL("expected { after \\p or \\P");
2062   }
2063   regparse++;
2064   if (regparsestr[regparse] == '^') {
2065     negate = !negate;
2066     regparse++;
2067   }
2068 
2069   len = 0;
2070   while ((regparsestr[regparse + len] != '}')
2071 	 && (regparse + len < regparse_end)) {
2072     len++;
2073   }
2074 
2075   if (regparse + len >= regparse_end) {
2076     FAIL("missing } to close \\p{ or \\P{");
2077   }
2078 
2079   bottom = top = -1;
2080   if (len == 2) {
2081     for (i = 0; prop_names[i]; i++) {
2082       if ((regparsestr[regparse] == prop_names[i][0])
2083 	  && (regparsestr[regparse+1] == prop_names[i][1])) {
2084 	bottom = top = i;
2085 	break;
2086       }
2087     }
2088     if (bottom == -1) {
2089       if ((regparsestr[regparse] == 'L')
2090 	  && (regparsestr[regparse+1] == '&')) {
2091 	bottom = mzu_Ll;
2092 	top = mzu_Lm;
2093       }
2094     }
2095   } else if (len == 1) {
2096     if (regparsestr[regparse] == '.') {
2097       bottom = 0;
2098       top = mzu_LAST;
2099     } else {
2100       for (i = 0; prop_names[i]; i++) {
2101 	if (regparsestr[regparse] == prop_names[i][0]) {
2102 	  bottom = i;
2103 	  while (prop_names[i+1]) {
2104 	    if (regparsestr[regparse] != prop_names[i+1][0])
2105 	      break;
2106 	    i++;
2107 	  }
2108 	  top = i;
2109 	  break;
2110 	}
2111       }
2112     }
2113   }
2114 
2115   if (bottom < 0) {
2116     FAIL("unrecognized property name in \\p{} or \\P{}");
2117   }
2118 
2119   regparse += len + 1;
2120 
2121   ret = regnode(UNIPROP);
2122   /* This encoding accommodates up to 63 categories: */
2123   regarg((negate << 13) | (bottom << 6) | top);
2124 
2125   return ret;
2126 }
2127 
regdigit()2128 static int regdigit()
2129 {
2130   int posn, c;
2131   c = regparsestr[regparse++];
2132   posn = c - '0';
2133   while (regparse < regparse_end) {
2134     c = regparsestr[regparse];
2135     if ((c >= '0') && (c <= '9')) {
2136       posn = (posn * 10) + (c - '0');
2137       if (posn > 0x7FFF)
2138 	FAIL("backreference number is too large");
2139       regparse++;
2140     } else
2141       break;
2142   }
2143   if (posn > regmaxbackposn)
2144     regmaxbackposn = posn;
2145   return posn;
2146 }
2147 
2148 /*
2149    - regnode - emit a node
2150    */
2151 static rxpos 			/* Location. */
regnode(char op)2152 regnode(char op)
2153 {
2154   rxpos ret;
2155   rxpos ptr;
2156 
2157   ret = regcode;
2158   if (regcode + 3 >= regcodesize) {
2159     regcode += 3;
2160     if (regcode > regcodemax)
2161       regcodemax = regcode;
2162     return ret;
2163   }
2164 
2165   ptr = ret;
2166   regstr[ptr++] = op;
2167   regstr[ptr++] = '\0';		/* Null "next" pointer. */
2168   regstr[ptr++] = '\0';
2169   regcode = ptr;
2170 
2171   if (regcode > regcodemax)
2172     regcodemax = regcode;
2173 
2174   return ret;
2175 }
2176 
2177 /*
2178    - regc - emit (if appropriate) a byte of code
2179    */
2180 static void
regc(char b)2181 regc(char b)
2182 {
2183   if (regcode + 1 < regcodesize)
2184     regstr[regcode] = b;
2185   regcode++;
2186   if (regcode > regcodemax)
2187     regcodemax = regcode;
2188 }
2189 
2190 static void
regarg(int v)2191 regarg(int v)
2192 {
2193   regc(v >> 8);
2194   regc(v & 255);
2195 }
2196 
2197 /*
2198    - reginsert - insert an operator in front of already-emitted operand
2199    *
2200    * Means relocating the operand.
2201    */
2202 static void
regshift(int amt,rxpos opnd)2203 regshift(int amt, rxpos opnd)
2204 {
2205   if (regcode + amt < regcodesize) {
2206     memmove(regstr XFORM_OK_PLUS opnd + amt,
2207 	    regstr XFORM_OK_PLUS opnd,
2208 	    regcode - opnd);
2209   }
2210   regcode += amt;
2211   if (regcode > regcodemax)
2212     regcodemax = regcode;
2213 }
2214 
2215 static void
reginsert(char op,rxpos opnd)2216 reginsert(char op, rxpos opnd)
2217 {
2218   regshift(3, opnd);
2219 
2220   if (opnd + 3 >= regcodesize) {
2221     return;
2222   }
2223 
2224   regstr[opnd++] = op;
2225   regstr[opnd++] = '\0'; /* tail */
2226   regstr[opnd++] = '\0';
2227 }
2228 
2229 static rxpos
reginsertwithop(char op,rxpos opnd,int arg)2230 reginsertwithop(char op, rxpos opnd, int arg)
2231 {
2232   regshift(5, opnd);
2233 
2234   if (opnd + 5 >= regcodesize) {
2235     return opnd + 5;
2236   }
2237 
2238   regstr[opnd++] = op;
2239   regstr[opnd++] = '\0'; /* tail */
2240   regstr[opnd++] = '\0';
2241   regstr[opnd++] = (arg >> 8);
2242   regstr[opnd++] = (arg & 255);
2243 
2244   return opnd;
2245 }
2246 
2247 static rxpos
reginsertwithopop(char op,rxpos opnd,int arg,int arg2)2248 reginsertwithopop(char op, rxpos opnd, int arg, int arg2)
2249 {
2250   regshift(7, opnd);
2251 
2252   if (opnd + 7 >= regcodesize) {
2253     return opnd + 7;
2254   }
2255 
2256   regstr[opnd++] = op;
2257   regstr[opnd++] = '\0'; /* tail */
2258   regstr[opnd++] = '\0';
2259   regstr[opnd++] = (arg >> 8);
2260   regstr[opnd++] = (arg & 255);
2261   regstr[opnd++] = (arg2 >> 8);
2262   regstr[opnd++] = (arg2 & 255);
2263 
2264   return opnd;
2265 }
2266 
2267 /*
2268    - regtail - set the next-pointer at the end of a node chain
2269    */
2270 static void
regtail(rxpos p,rxpos val)2271 regtail(rxpos p, rxpos val)
2272 {
2273   rxpos scan;
2274   rxpos temp;
2275   int offset;
2276 
2277   /* Find last node. */
2278   scan = p;
2279   for (;;) {
2280     if (scan + 2 >= regcodesize) {
2281       return;
2282     }
2283     temp = regnext(scan);
2284     if (temp == 0)
2285       break;
2286     scan = temp;
2287   }
2288 
2289   if (scan + 2 >= regcodesize) {
2290     return;
2291   }
2292 
2293   if (rOP(scan) == BACK)
2294     offset = scan - val;
2295   else
2296     offset = val - scan;
2297   regstr[scan+1] = (offset>>8)&255;
2298   regstr[scan+2] = offset&255;
2299 }
2300 
2301 /*
2302    - regoptail - regtail on operand of first argument; nop if operandless
2303    */
2304 static void
regoptail(rxpos p,rxpos val)2305 regoptail(rxpos p, rxpos val)
2306 {
2307   /* "Operandless" and "op != BRANCH" are synonymous in practice. */
2308   if (p == 0 || (p >= regcodesize) || rOP(p) != BRANCH) {
2309     return;
2310   }
2311   regtail(OPERAND(p), val);
2312 }
2313 
merge_tables(Scheme_Hash_Table * dest,Scheme_Hash_Table * src)2314 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src)
2315 {
2316   int i;
2317 
2318   for (i = src->size; i--; ) {
2319     if (src->vals[i]) {
2320       scheme_hash_set(dest, src->keys[i], src->vals[i]);
2321     }
2322   }
2323 
2324   return 1;
2325 }
2326 
check_and_propagate_depends(void)2327 static int check_and_propagate_depends(void)
2328 {
2329   int i, j;
2330   Scheme_Hash_Table *backdepends = regbackdepends, *ht, *next_ht = NULL;
2331   Scheme_Object *v;
2332 
2333   while (backdepends) {
2334     for (i = backdepends->size; i--; ) {
2335       if (backdepends->vals[i]) {
2336 	if (regbackknown)
2337 	  v = scheme_hash_get(regbackknown, backdepends->keys[i]);
2338 	else
2339 	  v = NULL;
2340 	if (v) {
2341 	  /* Check assumption: */
2342 	  if (SCHEME_FALSEP(v)) {
2343 	    FAIL("*, +, or {...,} operand could be empty (via empty backreference)");
2344 	  }
2345 	  if (SCHEME_HASHTP(v)) {
2346 	    /* Check/propagate assumption. The fixpoint direction is
2347 	       determined by assuming "true" while recursively checking. */
2348 	    scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
2349 	    if (!next_ht)
2350 	      next_ht = scheme_make_hash_table(SCHEME_hash_ptr);
2351 	    ht = (Scheme_Hash_Table *)v;
2352 	    for (j = ht->size; j--; ) {
2353 	      if (ht->vals[j]) {
2354 		scheme_hash_set(next_ht, ht->keys[j], ht->vals[j]);
2355 	      }
2356 	    }
2357 	  }
2358 	} else {
2359 	  /* Add assumption */
2360 	  if (!regbackknown)
2361 	    regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
2362 	  scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
2363 	}
2364       }
2365     }
2366     backdepends = next_ht;
2367     next_ht = NULL;
2368   }
2369 
2370   return 1;
2371 }
2372 
l_strchr(char * str,rxpos a,int l,int c)2373 static MZ_INLINE rxpos l_strchr(char *str, rxpos a, int l, int c)
2374 {
2375   int i;
2376 
2377   for (i = 0; i < l; i++) {
2378     if (str[a + i] == c)
2379       return a + i;
2380   }
2381 
2382   return -1;
2383 }
2384 
l_strchr_ci(char * str,rxpos a,int l,int c)2385 static MZ_INLINE rxpos l_strchr_ci(char *str, rxpos a, int l, int c)
2386 {
2387   int i, ch;
2388 
2389   for (i = 0; i < l; i++) {
2390     ch = str[a + i];
2391     ch = rx_tolower(ch);
2392     if (ch == c)
2393       return a + i;
2394   }
2395 
2396   return -1;
2397 }
2398 
2399 #if 0
2400 static MZ_INLINE int in_ranges(char *str, rxpos a, int l, int c)
2401 {
2402   int i;
2403 
2404   l *= 2;
2405 
2406   for (i = 0; i < l; i += 2) {
2407     if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
2408       return 1;
2409   }
2410 
2411   return 0;
2412 }
2413 
2414 static MZ_INLINE int in_ranges_ci(char *str, rxpos a, int l, int c)
2415 {
2416   int i;
2417 
2418   l *= 2;
2419 
2420   c = rx_tolower(c);
2421 
2422   for (i = 0; i < l; i += 2) {
2423     if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
2424       return 1;
2425   }
2426 
2427   return 0;
2428 }
2429 #endif
2430 
2431 /*
2432  * regexec and friends
2433  */
2434 
2435 /*
2436  * Forwards.
2437  */
2438 static int regtry(regexp *, char *, int, int, rx_lazy_str_t *, rxpos *, rxpos *, rxpos *, rxpos *, int *, Regwork *rw, rxpos,
2439                   char *, rxpos, rxpos, int);
2440 static int regtry_port(regexp *, Scheme_Object *, Scheme_Object *, int nonblock,
2441 		       rxpos *, rxpos *, rxpos *, rxpos *, int *,
2442 		       char **, rxpos *, rxpos *, rxpos, Scheme_Object*, Scheme_Object*, rxpos,
2443                        char*, rxpos, rxpos,
2444 		       int, int *);
2445 static int regmatch(Regwork *rw, rxpos);
2446 static int regrepeat(Regwork *rw, rxpos, int);
2447 
stack_room(Regwork * rw,int amt)2448 static void stack_room(Regwork *rw, int amt)
2449 {
2450   if (rw->rewind_stack_count + amt > rw->rewind_stack_size) {
2451     int sz;
2452     rxpos *p;
2453     sz = rw->rewind_stack_size * 2;
2454     if (!sz) sz = MATCH_STACK_SIZE;
2455     p = (rxpos *)scheme_malloc_atomic(sizeof(rxpos)*sz);
2456     if (rw->rewind_stack_size)
2457       memcpy(p, rw->rewind_stack, rw->rewind_stack_size * sizeof(rxpos));
2458     rw->rewind_stack = p;
2459     rw->rewind_stack_size = sz;
2460   }
2461 }
2462 
match_push(Regwork * rw)2463 static int match_push(Regwork *rw)
2464 {
2465   if (rw->non_tail >= 0) {
2466     int pos;
2467 
2468     rw->non_tail++;
2469     pos = rw->rewind_stack_count;
2470     rw->rewind_stack_prompt = pos;
2471 
2472     return pos;
2473   } else
2474     return 0;
2475 }
2476 
match_pop(Regwork * rw,int pos,int matched)2477 static void match_pop(Regwork *rw, int pos, int matched)
2478 {
2479   if (rw->non_tail >= 0) {
2480     --rw->non_tail;
2481 
2482     if (matched) {
2483       /* Save elements on stack in case an enclosing match
2484          needs to rewind. Area between prompt and pos are
2485          mapping that don't need to be re-recorded. */
2486       rw->rewind_stack_prompt = pos;
2487     } else {
2488       int i, no;
2489       for (i = rw->rewind_stack_count; i > pos; i -= 3) {
2490         no = rw->rewind_stack[i-3];
2491         if (no < 0) {
2492           rw->maybep[-no] = rw->rewind_stack[i-2];
2493         } else {
2494           rw->startp[no] = rw->rewind_stack[i-2];
2495           rw->endp[no] = rw->rewind_stack[i-1];
2496         }
2497       }
2498       rw->rewind_stack_count = pos;
2499       rw->rewind_stack_prompt = pos;
2500     }
2501   }
2502 }
2503 
match_set(Regwork * rw,int no,rxpos start,rxpos end)2504 static void match_set(Regwork *rw, int no, rxpos start, rxpos end)
2505 {
2506   int i, count;
2507 
2508   if (rw->non_tail > 0) {
2509     count = rw->rewind_stack_count;
2510     for (i = rw->rewind_stack_prompt; i < count; i += 3) {
2511       if (rw->rewind_stack[i] == no)
2512         break;
2513     }
2514 
2515     if (i >= count) {
2516       stack_room(rw, 3);
2517       i = count;
2518       rw->rewind_stack[i++] = no;
2519       rw->rewind_stack[i++] = rw->startp[no];
2520       rw->rewind_stack[i++] = rw->endp[no];
2521       rw->rewind_stack_count = i;
2522     }
2523   }
2524 
2525   rw->startp[no] = start;
2526   rw->endp[no] = end;
2527 }
2528 
match_maybe(Regwork * rw,int no,rxpos pos)2529 static void match_maybe(Regwork *rw, int no, rxpos pos)
2530 {
2531   int i, count;
2532 
2533   if (rw->non_tail > 0) {
2534     count = rw->rewind_stack_count;
2535     for (i = rw->rewind_stack_prompt; i < count; i += 3) {
2536       if (rw->rewind_stack[i] == (- no))
2537         break;
2538     }
2539 
2540     if (i >= count) {
2541       stack_room(rw, 3);
2542       i = count;
2543       rw->rewind_stack[i++] = (- no);
2544       rw->rewind_stack[i++] = rw->maybep[no];
2545       rw->rewind_stack[i++] = 0;
2546       rw->rewind_stack_count = i;
2547     }
2548   }
2549 
2550   rw->maybep[no] = pos;
2551 }
2552 
2553 #ifdef DEBUG
2554 int regnarrate = 0;
2555 void regdump();
2556 static char *regprop();
2557 #endif
2558 
2559 #define REGPORT_FLUSH_THRESHOLD 256
2560 
2561 /*
2562    - regexec - match a regexp against a string
2563    */
2564 static int
regexec(const char * who,regexp * prog,char * string,int stringpos,int stringlen,int stringorigin,rx_lazy_str_t * lazy_string,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,Scheme_Object * port,Scheme_Object * unless_evt,int nonblock,char ** stringp,int peek,int get_offsets,intptr_t save_prior,Scheme_Object * discard_oport,Scheme_Object * portstart,Scheme_Object * portend,Scheme_Object ** _dropped,char * prefix,rxpos prefix_len,rxpos prefix_offset)2565 regexec(const char *who,
2566 	regexp *prog, char *string,
2567 	/* Used only for (bytes) strings: */
2568 	int stringpos, int stringlen, int stringorigin,
2569         /* For lazy strings: */
2570         rx_lazy_str_t *lazy_string,
2571 	/* Always used: */
2572 	rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack,
2573         /* For port mode: */
2574 	Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
2575 	/* Used only when port is non-NULL: */
2576 	char **stringp, int peek, int get_offsets, intptr_t save_prior,
2577 	Scheme_Object *discard_oport,
2578 	Scheme_Object *portstart, Scheme_Object *portend, Scheme_Object **_dropped,
2579         char *prefix, rxpos prefix_len, rxpos prefix_offset)
2580 {
2581   int spos;
2582   int *counters;
2583   Scheme_Object *dropped = NULL, *peekskip = NULL; /* used for ports, only */
2584 
2585   /* Check validity of program. */
2586   if (UCHAR(prog->program[0]) != MAGIC) {
2587     regerror("corrupted program");
2588     return(0);
2589   }
2590 
2591   /* If there is a "must appear" string, look for it. */
2592   if (!port && !lazy_string && (prog->regmust >= 0)) {
2593     spos = stringpos;
2594     while (1) {
2595       int i, l = prog->regmlen, ch, pos;
2596       GC_CAN_IGNORE char *p;
2597 
2598       if ((spos - stringpos) + l <= stringlen) {
2599 	if (prog->flags & REGEXP_MUST_CI)
2600 	  pos = l_strchr_ci(string, spos, stringlen - (spos - stringpos) - (l - 1),
2601 			    (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
2602 	else
2603 	  pos = l_strchr(string, spos, stringlen - (spos - stringpos) - (l - 1),
2604 			 (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
2605 	if (pos == -1)
2606 	  return 0; /* Not present. */
2607       } else
2608 	return 0; /* Not present, since there's not enough room left. */
2609 
2610       /* ASSUMING NO GC HERE! */
2611       p = (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust);
2612       if (prog->flags & REGEXP_MUST_CI) {
2613 	for (i = 0; i < l; i++) {
2614 	  ch = string[pos + i];
2615 	  ch = rx_tolower(ch);
2616 	  if (ch != p[i])
2617 	    break;
2618 	}
2619       } else {
2620 	for (i = 0; i < l; i++) {
2621 	  if (string[pos + i] != p[i])
2622 	    break;
2623 	}
2624       }
2625       if (i >= l)
2626 	break; /* Found it. */
2627       spos = pos + 1;
2628     }
2629   }
2630 
2631   if (prog->ncounter) {
2632     counters = (int *)scheme_malloc_atomic(sizeof(int) * prog->ncounter);
2633   } else
2634     counters = NULL;
2635 
2636   if (port) {
2637     if (peek) {
2638       peekskip = portstart;
2639       dropped = portstart;
2640       /* Make sure that's there's not an EOF before peekskip: */
2641       if (!SAME_OBJ(peekskip, scheme_make_integer(0))) {
2642         char tmp[1];
2643         intptr_t got;
2644         got = scheme_get_byte_string_unless("regexp-match", port,
2645                                             tmp, 0, 1, 1,
2646                                             1, scheme_bin_minus(peekskip, scheme_make_integer(1)),
2647                                             unless_evt);
2648         if (got == EOF) {
2649           /* Hit EOF before peekstart, so cannot match */
2650           return 0;
2651         }
2652       }
2653     } else {
2654       /* In non-peek port mode, skip over portstart chars: */
2655       intptr_t amt, got;
2656 
2657       if (SCHEME_INTP(portstart)) {
2658 	amt = SCHEME_INT_VAL(portstart);
2659 	if (amt > 4096)
2660 	  amt = 4096;
2661       } else
2662 	amt = 4096;
2663 
2664       dropped = scheme_make_integer(0);
2665 
2666       if (amt) {
2667 	char *drain;
2668 
2669 	drain = (char *)scheme_malloc_atomic(amt);
2670 
2671 	do {
2672 	  got = scheme_get_byte_string(who, port, drain, 0, amt, 0, 0, 0);
2673 	  if (got != EOF) {
2674 	    Scheme_Object *delta;
2675 
2676 	    if (discard_oport)
2677 	      scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
2678 
2679 	    dropped = scheme_bin_plus(dropped, scheme_make_integer(got));
2680 	    delta = scheme_bin_minus(portstart, dropped);
2681 	    if (scheme_bin_gt(scheme_make_integer(amt), delta))
2682 	      amt = SCHEME_INT_VAL(delta);
2683 	  }
2684 	} while ((got != EOF) && amt);
2685 	if (amt)
2686 	  return 0; /* can't skip far enough, so it fails */
2687       }
2688     }
2689 
2690     if (portend)
2691       portend = scheme_bin_minus(portend, dropped);
2692   }
2693 
2694   /* Simplest case:  anchored match need be tried only once. */
2695   if (prog->flags & REGEXP_ANCH) {
2696     if (port) {
2697       rxpos len = 0, space = 0;
2698       int aborted = 0;
2699 
2700       *stringp = NULL;
2701       if (regtry_port(prog, port, unless_evt, nonblock,
2702 		      startp, maybep, endp, match_stack, counters, stringp, &len, &space, 0,
2703 		      portend, peekskip, 0, prefix, prefix_len, prefix_offset, 0,
2704                       &aborted)) {
2705 	if (!peek) {
2706 	  /* Need to consume matched chars: */
2707 	  char *drain;
2708 
2709 	  if (discard_oport && *startp)
2710 	    scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
2711 
2712 	  if (get_offsets)
2713 	    drain = *stringp;
2714 	  else
2715 	    /* Allocate fresh in case we get different results from previous peek: */
2716 	    drain = (char *)scheme_malloc_atomic(*endp);
2717 	  (void)scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
2718 	}
2719 
2720 	*_dropped = dropped;
2721 
2722 	return 1;
2723       } else {
2724 	if (!peek) {
2725 	  /* Need to consume all chars, up to portend */
2726 	  char *drain;
2727 	  intptr_t got;
2728 
2729 	  if (portend && SCHEME_INTP(portend) && SCHEME_INT_VAL(portend) < 4096) {
2730 	    got = SCHEME_INT_VAL(portend);
2731 	  } else
2732 	    got = 4096;
2733 
2734 	  drain = (char *)scheme_malloc_atomic(got);
2735 
2736 	  while ((got = scheme_get_byte_string(who, port, drain, 0, got, 0, 0, 0)) != EOF) {
2737 	    if (discard_oport)
2738 	      scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
2739 
2740 	    if (portend) {
2741 	      portend = scheme_bin_minus(portend, scheme_make_integer(got));
2742 	      if (SCHEME_INTP(portend)) {
2743 		got = SCHEME_INT_VAL(portend);
2744 		if (!got)
2745 		  break;
2746 		else if (got > 4096)
2747 		  got = 4096;
2748 	      }
2749 	    } else
2750 	      got = 4096;
2751 	  }
2752 	}
2753 	return 0;
2754       }
2755     } else
2756       return regtry(prog, string, stringpos, stringlen, lazy_string, startp, maybep, endp,
2757                     match_stack, counters, 0,
2758 		    stringorigin, prefix, prefix_len, prefix_offset, 0);
2759   }
2760 
2761   /* Messy cases:  unanchored match. */
2762   if (port) {
2763     rxpos len = 0, skip = 0, space = 0;
2764     *stringp = NULL;
2765 
2766     do {
2767       int discard = skip - prog->maxlookback;
2768       int aborted = 0;
2769 
2770       if (discard > skip - save_prior)
2771         discard = skip - save_prior;
2772 
2773       if (discard >= REGPORT_FLUSH_THRESHOLD) {
2774 	if (!peek) {
2775 	  if (discard_oport)
2776 	    scheme_put_byte_string(who, discard_oport, *stringp, 0, discard, 0);
2777 
2778 	  scheme_get_byte_string(who, port, *stringp, 0, discard, 0, 0, 0);
2779 
2780 	  if (portend)
2781 	    portend = scheme_bin_minus(portend, scheme_make_integer(discard));
2782 	} else {
2783 	  peekskip = scheme_bin_plus(peekskip, scheme_make_integer(discard));
2784 	}
2785 
2786 	dropped = scheme_bin_plus(dropped, scheme_make_integer(discard));
2787 
2788 	len -= discard;
2789 	skip -= discard;
2790 	memmove(*stringp, *stringp + discard, len);
2791 
2792         prefix = NULL;
2793         prefix_len = 0;
2794       }
2795 
2796       if (regtry_port(prog, port, unless_evt, nonblock,
2797 		      startp, maybep, endp, match_stack, counters, stringp, &len, &space, skip,
2798 		      portend, peekskip, 0, prefix, prefix_len, prefix_offset, 1,
2799                       &aborted)) {
2800 	if (!peek) {
2801 	  char *drain;
2802 
2803 	  if (discard_oport && *startp)
2804 	    scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
2805 
2806 	  if (get_offsets)
2807 	    drain = *stringp;
2808 	  else
2809 	    /* Allocate fresh in case we get different results from previous peek: */
2810 	    drain = (char *)scheme_malloc_atomic(*endp);
2811 
2812 	  scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
2813 	}
2814 
2815 	*_dropped = dropped;
2816 
2817 	return 1;
2818       } else if (aborted)
2819         return 0;
2820       skip++;
2821     } while (len >= skip);
2822 
2823     if (!peek) {
2824       /* If we get here, there must be `len' leftover characters in the port,
2825 	 and `*stringp' must hold the characters: */
2826       if (len > 0) {
2827 	if (discard_oport)
2828 	  scheme_put_byte_string(who, discard_oport, *stringp, 0, len, 0);
2829 	scheme_get_byte_string(who, port, *stringp, 0, len, 0, 0, 0);
2830       }
2831     }
2832   } else {
2833     if (regtry(prog, string, stringpos, stringlen, lazy_string,
2834 	       startp, maybep, endp, match_stack, counters,
2835 	       0, stringorigin, prefix, prefix_len, prefix_offset, 1))
2836       return 1;
2837   }
2838 
2839   /* Failure. */
2840   return 0;
2841 }
2842 
2843 #define NEED_INPUT(rw, v, n) if (rw->port && (((v) + (n)) > rw->input_end)) read_more_from_regport(rw, (v) + (n))
2844 static void read_more_from_regport(Regwork *rw, rxpos need_total);
2845 
2846 /*
2847    - regtry - try match at specific point
2848    */
2849 static int			/* 0 failure, 1 success */
regtry(regexp * prog,char * string,int stringpos,int stringlen,rx_lazy_str_t * lazy_string,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,int * counters,Regwork * rw,rxpos stringorigin,char * prefix,rxpos prefix_len,rxpos prefix_offset,int unanchored)2850 regtry(regexp *prog, char *string, int stringpos, int stringlen, rx_lazy_str_t *lazy_string,
2851        rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack, int *counters,
2852        Regwork *rw, rxpos stringorigin,
2853        char *prefix, rxpos prefix_len, rxpos prefix_offset,
2854        int unanchored)
2855 /* stringpos: where to start looking;
2856    stringlen: available bytes, counting from stringpos;
2857    stringorigin: start of input, after prefix
2858    prefix: bytes to appear before the origin to count as input */
2859 {
2860   int i;
2861   Regwork _rw;
2862 
2863   if (!rw) {
2864     rw = &_rw;
2865     rw->port = NULL;
2866   }
2867   rw->instr = string;
2868   rw->input = stringpos;
2869   rw->input_end = stringpos + stringlen;
2870   rw->input_start = stringorigin;
2871   rw->input_min = stringorigin - prefix_len;
2872   rw->startp = startp;
2873   rw->maybep = maybep;
2874   rw->endp = endp;
2875   rw->counters = counters;
2876   rw->prefix = prefix;
2877   rw->prefix_len = prefix_len;
2878   rw->prefix_delta = prefix_len + prefix_offset - stringorigin;
2879   rw->boi = stringorigin - prefix_len;
2880   rw->rewind_stack_size = (match_stack ? MATCH_STACK_SIZE : 0);
2881   rw->rewind_stack_count = 0;
2882   rw->rewind_stack_prompt = 0;
2883   rw->rewind_stack = match_stack;
2884   if (prog->nsubexp < 2)
2885     rw->non_tail = -1;
2886   else
2887     rw->non_tail = 0;
2888   rw->lazy_string = lazy_string;
2889   if (lazy_string)
2890     rw->port = scheme_true; /* hack to make NEED_INPUT() work */
2891 
2892   for (i = prog->nsubexp; i--; ) {
2893     startp[i] = rw->input_min - 1;
2894     endp[i] = rw->input_min - 1;
2895   }
2896 
2897 #ifdef INDIRECT_TO_PROGRAM
2898   regstr = prog->program;
2899 #else
2900   regstr = (char *)prog;
2901 #endif
2902 
2903   while (1) {
2904     int found;
2905 
2906     found = regmatch(rw, N_ITO_DELTA(prog->program, 1, (char *)prog));
2907 
2908     if (found) {
2909       startp[0] = stringpos;
2910       endp[0] = rw->input;
2911       return 1;
2912     } else if (unanchored) {
2913       if (lazy_string) {
2914         NEED_INPUT(rw, stringpos, 1);
2915         stringlen = rw->input_end - stringpos;
2916       }
2917       if (!stringlen)
2918 	return 0;
2919       stringpos++;
2920       --stringlen;
2921       if (prog->regstart) {
2922 	unsigned char *rs = prog->regstart;
2923 	int c;
2924 	while (1) {
2925 	  if (lazy_string) {
2926             NEED_INPUT(rw, stringpos, 1);
2927             stringlen = rw->input_end - stringpos;
2928             string = rw->instr;
2929           }
2930 
2931 	  if (!stringlen)
2932 	    return 0;
2933 
2934           c = UCHAR(string[stringpos]);
2935 	  if (rs[c >> 3] & (1 << (c & 0x7)))
2936 	    break;
2937 	  stringpos++;
2938 	  --stringlen;
2939 	}
2940       }
2941       rw->input = stringpos;
2942       for (i = prog->nsubexp; i--; ) {
2943         startp[i] = rw->input_min - 1;
2944         endp[i] = rw->input_min - 1;
2945       }
2946       /* try again... */
2947     } else
2948       return 0;
2949   }
2950 }
2951 
2952 #define LAZY_STRING_CHUNK_SIZE 32
2953 
read_more_from_lazy_string(Regwork * rw,rxpos need_total)2954 static void read_more_from_lazy_string(Regwork *rw, rxpos need_total)
2955 {
2956   rx_lazy_str_t *ls = rw->lazy_string;
2957 
2958   if (ls->start + ls->done < ls->end) {
2959     intptr_t amt = ls->done, blen, tlen;
2960     char *s;
2961 
2962     amt = amt ? (2 * amt) : LAZY_STRING_CHUNK_SIZE;
2963     if (ls->done + amt < need_total)
2964       amt = need_total - ls->done;
2965     if (ls->start + ls->done + amt > ls->end)
2966       amt = ls->end - ls->start - ls->done;
2967 
2968     blen = scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
2969                               NULL, 0,
2970                               0 /* not UTF-16 */);
2971     tlen = blen + ls->blen;
2972     s = (char *)scheme_malloc_atomic(tlen);
2973     if (ls->blen)
2974       memcpy(s, ls->s, ls->blen);
2975     scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
2976                        (unsigned char *)s, ls->blen,
2977                        0 /* not UTF-16 */);
2978 
2979     ls->blen = tlen;
2980     ls->s = s;
2981     ls->done += amt;
2982 
2983     rw->instr = s;
2984     rw->input_end = tlen;
2985   } else {
2986     /* turn off further port reading */
2987     rw->port = NULL;
2988   }
2989 }
2990 
read_more_from_regport(Regwork * rw,rxpos need_total)2991 static void read_more_from_regport(Regwork *rw, rxpos need_total)
2992      /* Called when we're about to look past our read-ahead */
2993 {
2994   intptr_t got;
2995   Scheme_Object *peekskip;
2996 
2997   if (rw->lazy_string) {
2998     read_more_from_lazy_string(rw, need_total);
2999     return;
3000   }
3001 
3002   /* limit reading by rw->input_maxend: */
3003   if (need_total > rw->input_maxend) {
3004     need_total = rw->input_maxend;
3005     if (need_total <= rw->input_end) {
3006       rw->port = NULL; /* turn off further port reading */
3007       return;
3008     }
3009   }
3010 
3011   if (rw->instr_size < need_total) {
3012     char *naya;
3013     intptr_t size = rw->instr_size;
3014 
3015     size = size * 2;
3016     if (size < need_total)
3017       size += need_total;
3018     if (size < 16)
3019       size = 16;
3020 
3021     naya = (char *)scheme_malloc_atomic(size);
3022     if (rw->input_end)
3023       memcpy(naya, rw->instr, rw->input_end);
3024 
3025     rw->instr = naya;
3026     rw->instr_size = size;
3027   }
3028 
3029   rw->str = regstr; /* get_string can swap threads */
3030 
3031   if (rw->input_maxend < rw->instr_size)
3032     got = rw->input_maxend - rw->input_end;
3033   else
3034     got = rw->instr_size - rw->input_end;
3035 
3036   if (rw->peekskip)
3037     peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
3038   else
3039     peekskip = scheme_make_integer(rw->input_end);
3040 
3041   /* Fill as much of our buffer as possible: */
3042   got = scheme_get_byte_string_unless("regexp-match", rw->port,
3043 				      rw->instr, rw->input_end, got,
3044 				      (rw->nonblock
3045 				       ? 2   /* non-blocking read, as much as possible */
3046 				       : 1), /* read at least one char, and as much as possible */
3047 				      1, peekskip,
3048 				      rw->unless_evt);
3049 
3050   regstr = rw->str;
3051 
3052   if (got < 1) {
3053     /* EOF, special, or 0-due-to-unless/nonblock */
3054     if (!got)
3055       rw->aborted = 1;
3056     rw->port = NULL; /* turn off further port reading */
3057     rw->unless_evt = NULL;
3058   } else {
3059     rw->input_end += got;
3060 
3061     /* Non-blocking read got enough? If not, try again in blocking mode: */
3062     while (need_total > rw->input_end) {
3063       if (rw->peekskip)
3064         peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
3065       else
3066         peekskip = scheme_make_integer(rw->input_end);
3067 
3068       rw->str = regstr; /* get_string can swap threads */
3069       got = scheme_get_byte_string_unless("regexp-match", rw->port,
3070                                           rw->instr, rw->input_end, need_total - rw->input_end,
3071                                           (rw->nonblock ? 2 : 0), /* blocking mode */
3072                                           1, peekskip,
3073                                           rw->unless_evt);
3074       regstr = rw->str;
3075 
3076       if (!got && rw->nonblock) {
3077         rw->port = NULL; /* turn off further port reading */
3078         rw->unless_evt = NULL;
3079         rw->aborted = 1;
3080         break;
3081       } else if (got == EOF) {
3082         rw->port = NULL; /* turn off further port reading */
3083         rw->unless_evt = NULL;
3084         break;
3085       } else {
3086         rw->input_end += got;
3087         if (!rw->nonblock)
3088           break;
3089       }
3090     }
3091   }
3092 }
3093 
3094 /*
3095    - regtry - try match in a port
3096    */
3097 static int
regtry_port(regexp * prog,Scheme_Object * port,Scheme_Object * unless_evt,int nonblock,rxpos * startp,rxpos * maybep,rxpos * endp,rxpos * match_stack,int * counters,char ** work_string,rxpos * len,rxpos * size,rxpos skip,Scheme_Object * maxlen,Scheme_Object * peekskip,rxpos origin,char * prefix,rxpos prefix_len,rxpos prefix_offset,int read_at_least_one,int * _aborted)3098 regtry_port(regexp *prog, Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
3099 	    rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack, int *counters,
3100 	    char **work_string, rxpos *len, rxpos *size, rxpos skip,
3101 	    Scheme_Object *maxlen, Scheme_Object *peekskip,
3102 	    rxpos origin, char *prefix, rxpos prefix_len, rxpos prefix_offset,
3103 	    int read_at_least_one, int *_aborted)
3104 {
3105   int m;
3106   Regwork rw;
3107 
3108   rw.port = port;
3109   rw.unless_evt = unless_evt;
3110   rw.nonblock = (short)nonblock;
3111   rw.aborted = 0;
3112   rw.instr_size = *size;
3113   if (maxlen && SCHEME_INTP(maxlen))
3114     rw.input_maxend = SCHEME_INT_VAL(maxlen);
3115   else
3116     rw.input_maxend = BIGGEST_RXPOS;
3117   rw.peekskip = peekskip;
3118 
3119   m = regtry(prog, *work_string, skip, (*len) - skip, NULL,
3120 	     startp, maybep, endp, match_stack, counters,
3121 	     &rw, origin, prefix, prefix_len, prefix_offset, 0);
3122 
3123   if (read_at_least_one
3124       && !rw.aborted
3125       && (rw.input_end == skip)
3126       && rw.port) {
3127     read_more_from_regport(&rw, rw.input_end + 1);
3128   }
3129 
3130   *work_string = rw.instr;
3131   *len = rw.input_end;
3132   *size = rw.instr_size;
3133 
3134   if (rw.aborted) {
3135     *_aborted = 1;
3136     return 0;
3137   } else
3138     return m;
3139 }
3140 
3141 #ifdef DO_STACK_CHECK
3142 
regmatch_k(void)3143 static Scheme_Object *regmatch_k(void)
3144 {
3145   Scheme_Thread *p = scheme_current_thread;
3146   Regwork *rw = (Regwork *)p->ku.k.p1;
3147   int res;
3148 
3149   p->ku.k.p1 = NULL;
3150 
3151   regstr = rw->str; /* in case of thread swap */
3152 
3153   res = regmatch(rw, p->ku.k.i1);
3154 
3155   return (res ? scheme_true : scheme_false);
3156 }
3157 
3158 #endif
3159 
INPUT_REF_S(Regwork * rw,rxpos c,rxpos input_start)3160 XFORM_NONGCING static MZ_INLINE char INPUT_REF_S(Regwork *rw, rxpos c, rxpos input_start)
3161 {
3162   if (c < input_start)
3163     return rw->prefix[rw->prefix_delta + c];
3164   else
3165     return rw->instr[c];
3166 }
3167 
INPUT_REF(Regwork * rw,int c)3168 XFORM_NONGCING static MZ_INLINE char INPUT_REF(Regwork *rw, int c)
3169 {
3170   return INPUT_REF_S(rw, c, rw->input_start);
3171 }
3172 
3173 /*
3174    - regmatch - main matching routine
3175    *
3176    * Conceptually the strategy is simple:  check to see whether the current
3177    * node matches, call self recursively to see whether the rest matches,
3178    * and then act accordingly.  In practice we make some effort to avoid
3179    * recursion, in particular by going through "ordinary" nodes (that don't
3180    * need to know whether the rest of the match failed) by a loop instead of
3181    * by recursion.
3182    */
3183 static int			/* 0 failure, 1 success */
regmatch(Regwork * rw,rxpos prog)3184 regmatch(Regwork *rw, rxpos prog)
3185 {
3186   rxpos scan;		/* Current node. */
3187   rxpos is;		/* Input string pos */
3188   int the_op;
3189 
3190 #ifdef DO_STACK_CHECK
3191   {
3192 # include "mzstkchk.h"
3193     {
3194       Scheme_Thread *p = scheme_current_thread;
3195       Regwork *rw2;
3196       Scheme_Object *res;
3197 
3198       /* rw is likely be stack allocated, so copy out to
3199 	 the heap and then copy result back in on return. */
3200       rw2 = MALLOC_ONE_RT(Regwork);
3201       memcpy(rw2, rw, sizeof(Regwork));
3202 #ifdef MZTAG_REQUIRED
3203       rw2->type = scheme_rt_regwork;
3204 #endif
3205 
3206       rw2->str = regstr; /* in case of thread swap */
3207       p->ku.k.p1 = rw2;
3208       p->ku.k.i1 = prog;
3209       res = scheme_handle_stack_overflow(regmatch_k);
3210 
3211       memcpy(rw, rw2, sizeof(Regwork));
3212 
3213       return SCHEME_TRUEP(res);
3214     }
3215   }
3216 #endif
3217 
3218   if (DECREMENT_FUEL(scheme_fuel_counter, 1) <= 0) {
3219     char *rs;
3220     rs = regstr;
3221     scheme_out_of_fuel();
3222     regstr = rs;
3223   }
3224 
3225   is = rw->input;
3226   scan = prog;
3227   while (scan != 0) {
3228     the_op = rOP(scan);
3229     switch (the_op) {
3230     case BOI:
3231       if (is != rw->boi)
3232 	return(0);
3233       scan = NEXT_OP(scan);
3234       break;
3235     case EOI:
3236       NEED_INPUT(rw, is, 1);
3237       if (is != rw->input_end)
3238 	return(0);
3239       scan = NEXT_OP(scan);
3240       break;
3241     case BOL:
3242       if ((is != rw->boi)
3243 	  && ((is <= rw->input_min)
3244 	      || (INPUT_REF(rw, is - 1) != '\n')))
3245 	return(0);
3246       scan = NEXT_OP(scan);
3247       break;
3248     case EOL:
3249       NEED_INPUT(rw, is, 1);
3250       if (is != rw->input_end) {
3251 	if (INPUT_REF(rw, is) != '\n')
3252 	  return(0);
3253       }
3254       scan = NEXT_OP(scan);
3255       break;
3256     case ANY:
3257       NEED_INPUT(rw, is, 1);
3258       if (is == rw->input_end)
3259 	return(0);
3260       is++;
3261       scan = NEXT_OP(scan);
3262       break;
3263     case ANYL:
3264       NEED_INPUT(rw, is, 1);
3265       if (is == rw->input_end)
3266 	return(0);
3267       if (INPUT_REF(rw, is) == '\n')
3268 	return 0;
3269       is++;
3270       scan = NEXT_OP(scan);
3271       break;
3272     case EXACTLY:
3273       {
3274 	int len, i;
3275 	rxpos opnd;
3276 
3277 	opnd = OPSTR(OPERAND(scan));
3278 	len = rOPLEN(OPERAND(scan));
3279 	if (rw->port) {
3280 	  /* Like the other branch, but demand chars one at a time, as
3281 	     we need them */
3282           rxpos input_start = rw->input_start;
3283 	  for (i = 0; i < len; i++) {
3284 	    NEED_INPUT(rw, is + i, 1);
3285 	    if (is + i >= rw->input_end)
3286 	      return 0;
3287 	    if (regstr[opnd+i] != INPUT_REF_S(rw, is+i, input_start))
3288 	      return 0;
3289 	  }
3290 	} else {
3291           rxpos input_start;
3292 	  if (len > rw->input_end - is)
3293 	    return 0;
3294           input_start = rw->input_start;
3295 	  for (i = 0; i < len; i++) {
3296 	    if (regstr[opnd+i] != INPUT_REF_S(rw, is+i, input_start))
3297 	      return 0;
3298 	  }
3299 	}
3300 	is += len;
3301       }
3302       scan = NEXT_OP(scan);
3303       break;
3304     case EXACTLY_CI:
3305       {
3306 	int len, i;
3307 	char c;
3308 	rxpos opnd;
3309 
3310 	opnd = OPSTR(OPERAND(scan));
3311 	len = rOPLEN(OPERAND(scan));
3312 	if (rw->port) {
3313 	  /* Like the other branch, but demand chars one at a time, as
3314 	     we need them */
3315 	  for (i = 0; i < len; i++) {
3316 	    NEED_INPUT(rw, is + i, 1);
3317 	    if (is + i >= rw->input_end)
3318 	      return 0;
3319 	    c = INPUT_REF(rw, is+i);
3320 	    c = rx_tolower(c);
3321 	    if (regstr[opnd+i] != c)
3322 	      return 0;
3323 	  }
3324 	} else {
3325           rxpos input_start;
3326 	  if (len > rw->input_end - is)
3327 	    return 0;
3328           input_start = rw->input_start;
3329 	  for (i = 0; i < len; i++) {
3330 	    c = INPUT_REF_S(rw, is+i, input_start);
3331 	    c = rx_tolower(c);
3332 	    if (regstr[opnd+i] != c)
3333 	      return 0;
3334 	  }
3335 	}
3336 	is += len;
3337       }
3338       scan = NEXT_OP(scan);
3339       break;
3340     case ANYOF:
3341       {
3342 	int c;
3343 	NEED_INPUT(rw, is, 1);
3344 	if (is == rw->input_end)
3345 	  return 0;
3346 	c = UCHAR(INPUT_REF(rw, is));
3347 	if (!(regstr[OPERAND(scan) + (c >> 3)] & (1 << (c & 0x7))))
3348 	  return(0);
3349 	is++;
3350 	scan = NEXT_OP(scan);
3351       }
3352       break;
3353     case EXACTLY1:
3354       NEED_INPUT(rw, is, 1);
3355       if (is == rw->input_end)
3356 	return 0;
3357       if (INPUT_REF(rw, is) != regstr[OPERAND(scan)])
3358 	return 0;
3359       is++;
3360       scan = NEXT_OP(scan);
3361       break;
3362     case EXACTLY2:
3363       NEED_INPUT(rw, is, 1);
3364       if (is == rw->input_end)
3365 	return 0;
3366       if (INPUT_REF(rw, is) != regstr[OPERAND(scan)])
3367         if (INPUT_REF(rw, is) != regstr[OPERAND(scan)+1])
3368           return 0;
3369       is++;
3370       scan = NEXT_OP(scan);
3371       break;
3372     case RANGE:
3373       {
3374 	int c;
3375 	NEED_INPUT(rw, is, 1);
3376 	if (is == rw->input_end)
3377 	  return 0;
3378 	c = UCHAR(INPUT_REF(rw, is));
3379 	if ((c < UCHAR(regstr[OPERAND(scan)]))
3380 	    || (c > UCHAR(regstr[OPERAND(scan)+1])))
3381 	  return(0);
3382 	is++;
3383 	scan = NEXT_OP(scan);
3384       }
3385       break;
3386     case NOTRANGE:
3387       {
3388 	int c;
3389 	NEED_INPUT(rw, is, 1);
3390 	if (is == rw->input_end)
3391 	  return 0;
3392 	c = UCHAR(INPUT_REF(rw, is));
3393 	if ((c >= UCHAR(regstr[OPERAND(scan)]))
3394 	    && (c <= UCHAR(regstr[OPERAND(scan)+1])))
3395 	  return(0);
3396 	is++;
3397 	scan = NEXT_OP(scan);
3398       }
3399       break;
3400     case NOTHING:
3401       scan = NEXT_OP(scan);
3402       break;
3403     case BACK:
3404       scan = scan - rNEXT(scan);
3405       break;
3406     case BRANCH:
3407       {
3408         rxpos delta;
3409 	rxpos next;  /* Next node. */
3410         int stack_pos, ok;
3411 
3412 	next = NEXT_OP(scan);
3413 
3414         if (rOP(next) != BRANCH) /* No choice. */
3415           scan = OPERAND(scan);	/* Avoid recursion. */
3416         else {
3417           do {
3418             rw->input = is;
3419             stack_pos = match_push(rw);
3420             ok = regmatch(rw, OPERAND(scan));
3421             match_pop(rw, stack_pos, ok);
3422 
3423             if (ok) return(1);
3424 
3425 	    scan = next;
3426 	    delta = rNEXT(scan);
3427             if (!delta)
3428 	      break;
3429             next = scan + delta;
3430           } while (rOP(next) == BRANCH);
3431 	  scan = OPERAND(scan);
3432         }
3433       }
3434       break;
3435     case STAR:
3436     case PLUS:
3437     case STAR2:
3438     case PLUS2:
3439     case STAR3:
3440     case STAR4:
3441       {
3442 	char nextch;
3443 	int no;
3444 	rxpos save, body;
3445 	int min, maxc;
3446 	int nongreedy = (the_op == STAR2 || the_op == PLUS2 || the_op == STAR4);
3447 	rxpos next;  /* Next node. */
3448         int stack_pos, ok;
3449 
3450 	/*
3451 	 * Lookahead to avoid useless match attempts
3452 	 * when we know what character comes next.
3453 	 */
3454 	nextch = '\0';
3455 	next = NEXT_OP(scan);
3456 	if (rOP(next) == EXACTLY)
3457 	  nextch = regstr[OPSTR(OPERAND(next))];
3458 	if ((the_op == STAR3) || (the_op == STAR4)) {
3459 	  min = rOPLEN(OPERAND(scan));
3460 	  maxc = rOPLEN(OPERAND2(scan));
3461 	  body = OPERAND3(scan);
3462 	} else {
3463 	  body = OPERAND(scan);
3464 	  min = ((the_op == STAR) || (the_op == STAR2)) ? 0 : 1;
3465 	  maxc = 0;
3466 	}
3467 	save = is;
3468 
3469 	rw->input = is;
3470 	if (nongreedy && rw->port) {
3471 	  /* Get at least `min' bytes, but then don't
3472 	     let regrepeat pull in arbitrary bytes: */
3473 	  Scheme_Object *saveport;
3474 	  NEED_INPUT(rw, save, min ? min : 1);
3475 	  saveport = rw->port;
3476 	  rw->port = NULL;
3477 	  no = regrepeat(rw, body, maxc);
3478 	  rw->port = saveport;
3479 	  nongreedy = 2;
3480 	} else
3481 	  no = regrepeat(rw, body, maxc);
3482 
3483 	if (!nongreedy) {
3484 	  if (nextch)
3485 	    NEED_INPUT(rw, save + no, 1);
3486 	  while (no >= min) {
3487 	    /* If it could work, try it. */
3488 	    if (nextch == '\0' || ((save + no < rw->input_end)
3489 				   && (INPUT_REF(rw, save + no) == nextch))) {
3490 	      rw->input = is + no;
3491               stack_pos = match_push(rw);
3492 	      ok = regmatch(rw, next);
3493               match_pop(rw, stack_pos, ok);
3494               if (ok) return(1);
3495 	    }
3496 	    /* Couldn't or didn't -- back up. */
3497 	    no--;
3498 	  }
3499 	} else {
3500 	  int i;
3501 	  for (i = min; i <= no; i++) {
3502 	    /* If it could work, try it. */
3503 	    if (nextch)
3504 	      NEED_INPUT(rw, save + i, 1);
3505 	    if (nextch == '\0' || ((save+i < rw->input_end)
3506 				   && (INPUT_REF(rw, save+i) == nextch))) {
3507 	      rw->input = save + i;
3508               stack_pos = match_push(rw);
3509               ok = regmatch(rw, next);
3510               match_pop(rw, stack_pos, ok);
3511               if (ok) return(1);
3512 	    }
3513 
3514 	    if ((i == no) && (nongreedy == 2)) {
3515 	      /* Maybe regrepeat can match more if we let it read from
3516 		 the port. */
3517 	      if ((rw->input_end - save) > no) {
3518 		/* We have pulled-in chars to try. */
3519 		int moreno;
3520 		Scheme_Object *saveport;
3521 
3522 		saveport = rw->port;
3523 		rw->port = NULL;
3524 		is = save + no;
3525 		rw->input = is;
3526 		moreno = regrepeat(rw, body, maxc ? maxc - no : 0);
3527 		rw->port = saveport;
3528 
3529 		if (!moreno)
3530 		  nongreedy = 1;
3531 		else
3532 		  no += moreno;
3533 	      }
3534 	    }
3535 	  }
3536 	}
3537 	return(0);
3538       }
3539       break;
3540     case END:
3541     case LOOKE:
3542       rw->input = is;
3543       return(1);		/* Success! */
3544       break;
3545     case BACKREF:
3546       {
3547 	int no, len, start, i;
3548 	no = rOPLEN(OPERAND(scan));
3549 	if (rw->endp[no] < rw->input_min)
3550 	  return 0;
3551 
3552 	start = rw->startp[no];
3553 	len = rw->endp[no] - start;
3554 
3555 	if (rw->port) {
3556 	  /* Like the other branch, but demand chars one at a time, as
3557 	     we need them */
3558           rxpos input_start = rw->input_start;
3559 	  for (i = 0; i < len; i++) {
3560 	    NEED_INPUT(rw, is + i, 1);
3561 	    if (is + i >= rw->input_end)
3562 	      return 0;
3563 	    if (INPUT_REF_S(rw, start+i, input_start) != INPUT_REF_S(rw, is+i, input_start))
3564 	      return 0;
3565 	  }
3566 	} else {
3567           rxpos input_start = rw->input_start;
3568 	  if (len > rw->input_end - is)
3569 	    return 0;
3570 	  for (i = 0; i < len; i++) {
3571 	    if (INPUT_REF_S(rw, start+i, input_start) != INPUT_REF_S(rw, is+i, input_start))
3572 	      return 0;
3573 	  }
3574 	}
3575 	is += len;
3576 	scan = NEXT_OP(scan);
3577 	break;
3578       }
3579     case BACKREF_CI:
3580       {
3581 	int no, len, start, i, c1, c2;
3582 	no = rOPLEN(OPERAND(scan));
3583 	if (rw->endp[no] < rw->input_min)
3584 	  return 0;
3585 
3586 	start = rw->startp[no];
3587 	len = rw->endp[no] - start;
3588 
3589 	if (rw->port) {
3590 	  /* Like the other branch, but demand chars one at a time, as
3591 	     we need them */
3592           rxpos input_start = rw->input_start;
3593 	  for (i = 0; i < len; i++) {
3594 	    NEED_INPUT(rw, is + i, 1);
3595 	    if (is + i >= rw->input_end)
3596 	      return 0;
3597 	    c1 = INPUT_REF_S(rw, start+i, input_start);
3598 	    c1 = rx_tolower(c1);
3599 	    c2 = INPUT_REF_S(rw, is+i, input_start);
3600 	    c2 = rx_tolower(c2);
3601 	    if (c1 != c2)
3602 	      return 0;
3603 	  }
3604 	} else {
3605           rxpos input_start = rw->input_start;
3606 	  if (len > rw->input_end - is)
3607 	    return 0;
3608 	  for (i = 0; i < len; i++) {
3609 	    c1 = INPUT_REF_S(rw, start+i, input_start);
3610 	    c1 = rx_tolower(c1);
3611 	    c2 = INPUT_REF_S(rw, is+i, input_start);
3612 	    c2 = rx_tolower(c2);
3613 	    if (c1 != c2)
3614 	      return 0;
3615 	  }
3616 	}
3617 	is += len;
3618 	scan = NEXT_OP(scan);
3619 	break;
3620       }
3621     case LOOKT:
3622     case LOOKF:
3623     case LOOKTX:
3624     case LOOKBT:
3625     case LOOKBF:
3626       {
3627 	int t, no, no_start, no_end;
3628 	rxpos save, next;
3629         int stack_pos, ok;
3630 	next = NEXT_OP(scan);
3631 	t = ((the_op != LOOKF) && (the_op != LOOKBF));
3632 	if ((the_op == LOOKBT)  || (the_op == LOOKBF)) {
3633 	  no_start = rOPLEN(OPERAND2(scan));
3634 	  no_end = rOPLEN(OPERAND3(scan));
3635 	} else
3636 	  no_start = no_end = 0;
3637 	save = is;
3638 	if (no_end) {
3639           /* lookbehind */
3640           int found = 0;
3641 	  for (no = no_start; no <= no_end; no++) {
3642 	    if (is - rw->input_min >= no) {
3643 	      rw->input = save - no;
3644               stack_pos = match_push(rw);
3645               ok = regmatch(rw, next);
3646               match_pop(rw, stack_pos, ok);
3647 	      if (ok) {
3648 		if (rw->input == save) {
3649 		  /* Match */
3650 		  if (!t) return 0;
3651                   found = 1;
3652 		  break;
3653 		}
3654 	      }
3655 	    } else
3656 	      break;
3657 	  }
3658 	  if (!found) {
3659 	    /* No matches */
3660 	    if (t) return 0;
3661 	  }
3662 	} else {
3663           /* lookahead */
3664 	  rw->input = is;
3665           stack_pos = match_push(rw);
3666           ok = regmatch(rw, next);
3667           match_pop(rw, stack_pos, ok);
3668 	  if (ok) {
3669 	    if (!t) return 0;
3670 	  } else {
3671 	    if (t) return 0;
3672 	  }
3673 	  if (the_op == LOOKTX)
3674 	    is = rw->input;
3675 	}
3676 	scan = scan + rOPLEN(OPERAND(scan));
3677 	scan = NEXT_OP(scan);
3678       }
3679       break;
3680     case COUNTINIT:
3681       {
3682 	int no;
3683 	no = rOPLEN(OPERAND(scan));
3684 	rw->counters[no] = 0;
3685 	scan = NEXT_OP(scan);
3686       }
3687       break;
3688     case COUNTBACK:
3689       {
3690 	int no;
3691 	no = rOPLEN(OPERAND(scan));
3692 	rw->counters[no] -= 1;
3693 	scan = NEXT_OP(scan);
3694       }
3695       break;
3696     case COUNTBACKFAIL:
3697       {
3698 	int no;
3699 	no = rOPLEN(OPERAND(scan));
3700 	rw->counters[no] -= 1;
3701 	return 0;
3702       }
3703       break;
3704     case COUNTUNDER:
3705       {
3706 	int no, maxreps;
3707 	no = rOPLEN(OPERAND(scan));
3708 	maxreps = rOPLEN(OPERAND2(scan));
3709 	rw->counters[no]++;
3710 	if (maxreps && (rw->counters[no] > maxreps))
3711 	  return 0;
3712 	scan = NEXT_OP(scan);
3713       }
3714       break;
3715     case COUNTOVER:
3716       {
3717 	int no, minreps;
3718 	no = rOPLEN(OPERAND(scan));
3719 	minreps = rOPLEN(OPERAND2(scan));
3720 	if (rw->counters[no] < minreps)
3721 	  return 0;
3722 	scan = NEXT_OP(scan);
3723       }
3724       break;
3725     case SAVECONST:
3726       {
3727 	int no, len;
3728 	no = rOPLEN(OPERAND(scan));
3729 	len = rOPLEN(OPERAND2(scan));
3730 	/* Check that the match happened more than 0 times: */
3731 	if (!len || (is > rw->maybep[no])) {
3732           match_set(rw, no, is-len, is);
3733 	} else {
3734           match_set(rw, no, rw->input_min - 1, rw->input_min - 1);
3735 	}
3736 	scan = NEXT_OP(scan);
3737       }
3738       break;
3739     case MAYBECONST:
3740       {
3741 	int no;
3742 	no = rOPLEN(OPERAND(scan));
3743         match_maybe(rw, no, is);
3744 	scan = NEXT_OP(scan);
3745       }
3746       break;
3747     case WORDBOUND:
3748       {
3749         int c, w1, w2;
3750 	NEED_INPUT(rw, is, 1);
3751 	if (is > rw->input_min) {
3752           c = INPUT_REF(rw, is - 1);
3753           w1 = rx_isword(c);
3754         } else
3755           w1 = 0;
3756         if (is < rw->input_end) {
3757           c = INPUT_REF(rw, is);
3758           w2 = rx_isword(c);
3759         } else
3760           w2 = 0;
3761         if (w1 == w2) return 0;
3762 	scan = NEXT_OP(scan);
3763       }
3764       break;
3765     case NOTWORDBOUND:
3766       {
3767         int c, w1, w2;
3768 	NEED_INPUT(rw, is, 1);
3769 	if (is > rw->input_min) {
3770           c = INPUT_REF(rw, is - 1);
3771           w1 = rx_isword(c);
3772         } else
3773           w1 = 0;
3774         if (is < rw->input_end) {
3775           c = INPUT_REF(rw, is);
3776           w2 = rx_isword(c);
3777         } else
3778           w2 = 0;
3779         if (w1 != w2) return 0;
3780 	scan = NEXT_OP(scan);
3781       }
3782       break;
3783     case UNIPROP:
3784       {
3785 	unsigned char buf[MAX_UTF8_CHAR_BYTES];
3786 	mzchar us[1];
3787 	int c, data;
3788 	int v, pos;
3789 	int negate, bottom, top;
3790 
3791 	data = rOPLEN(OPERAND(scan));
3792 
3793 	negate = data >> 13;
3794 	bottom = (data >> 6) & 0x3F;
3795 	top = data & 0x3F;
3796 
3797 	NEED_INPUT(rw, is, 1);
3798 	if (is < rw->input_end) {
3799 	  c = UCHAR(INPUT_REF(rw, is));
3800 	  if (c < 128) {
3801 	    v = c;
3802 	    pos = 1;
3803 	  } else {
3804 	    pos = 1;
3805 	    buf[0] = c;
3806 	    while (1) {
3807 	      v = scheme_utf8_decode_prefix(buf, pos, us, 0);
3808 	      if (v == 1) {
3809 		v = us[0];
3810 		break;
3811 	      } else if (v < -1)
3812 		return 0;
3813 	      NEED_INPUT(rw, is, pos+1);
3814 	      if (is + pos < rw->input_end) {
3815 		buf[pos] = INPUT_REF(rw, is + pos);
3816 		pos++;
3817 	      } else
3818 		return 0;
3819 	    }
3820 	  }
3821 	} else
3822 	  return 0;
3823 
3824 	is += pos;
3825 
3826 	v = scheme_general_category(v);
3827 
3828 	if (negate) {
3829 	  if ((v >= bottom) && (v <= top))
3830 	    return 0;
3831 	} else {
3832 	  if ((v < bottom) || (v > top))
3833 	    return 0;
3834 	}
3835 
3836 	scan = NEXT_OP(scan);
3837       }
3838       break;
3839     case CONDITIONAL:
3840       {
3841 	rxpos test = OPERAND3(scan);
3842 	int t;
3843         int stack_pos;
3844 
3845 	if (rOP(test) == BACKREF) {
3846 	  int no;
3847 	  no = rOPLEN(OPERAND(test));
3848 	  t = (rw->endp[no] > rw->input_min);
3849 	} else {
3850 	  rw->input = is;
3851           stack_pos = match_push(rw);
3852 	  t = regmatch(rw, test);
3853           match_pop(rw, stack_pos, t);
3854 	}
3855 
3856 	if (t)
3857 	  scan = scan + rOPLEN(OPERAND(scan));
3858 	else
3859 	  scan = scan + rOPLEN(OPERAND2(scan));
3860       }
3861       break;
3862     default:
3863       {
3864 	int isopen;
3865 	int no;
3866 
3867 	switch (the_op) {
3868 	case OPENN:
3869 	  isopen = 1;
3870 	  no = rOPLEN(OPERAND(scan));
3871 	  if (!no)
3872 	    no = -1; /* => don't set in result array */
3873 	  break;
3874 	case CLOSEN:
3875 	  isopen = 0;
3876 	  no = rOPLEN(OPERAND(scan));
3877 	  if (!no)
3878 	    no = -1; /* => don't set in result array */
3879 	  break;
3880 	default:
3881 	  if (the_op < CLOSE) {
3882 	    isopen = 1;
3883 	    no = the_op - OPEN;
3884 	  } else {
3885 	    isopen = 0;
3886 	    no = the_op - CLOSE;
3887 	  }
3888 	}
3889 
3890 	if (no < 0) {
3891 	  /* Nothing to set */
3892 	} else if (isopen) {
3893           /* Storing the position in maybep instead of startp
3894              allows a backreference to refer to a match from a
3895              previous iteration in patterns like `(a|\1x)*'. */
3896           match_maybe(rw, no, is);
3897         } else
3898           match_set(rw, no, rw->maybep[no], is);
3899 
3900         scan = NEXT_OP(scan);
3901       }
3902       break;
3903     }
3904   }
3905 
3906   /*
3907    * We get here only if there's trouble -- normally "case END" is
3908    * the terminating point.
3909    */
3910   regerror("corrupted pointers");
3911   return(0);
3912 }
3913 
3914 /*
3915    - regrepeat - repeatedly match something simple, report how many
3916    */
3917 static int
regrepeat(Regwork * rw,rxpos p,int maxc)3918 regrepeat(Regwork *rw, rxpos p, int maxc)
3919 {
3920   int count = 0;
3921   rxpos scan;
3922   rxpos opnd;
3923 
3924   scan = rw->input;
3925   opnd = OPERAND(p);
3926   switch (rOP(p)) {
3927   case ANY:
3928     if (rw->port) {
3929       if (maxc) {
3930 	while (rw->port && (rw->input_end < scan + maxc)) {
3931 	  read_more_from_regport(rw, scan + maxc);
3932 	}
3933       } else {
3934 	/* need all port input: */
3935 	while (rw->port) {
3936 	  read_more_from_regport(rw, rw->input_end + 4096);
3937 	}
3938       }
3939     }
3940     count = rw->input_end - scan;
3941     if (maxc && (count > maxc))
3942       count = maxc;
3943     scan += count;
3944     break;
3945   case ANYL:
3946     {
3947       rxpos input_start = rw->input_start;
3948       NEED_INPUT(rw, scan, 1);
3949       while (scan != rw->input_end
3950 	     && (INPUT_REF_S(rw, scan, input_start) != '\n')) {
3951 	count++;
3952 	scan++;
3953 	if (maxc) { maxc--; if (!maxc) break; }
3954 	NEED_INPUT(rw, scan, 1);
3955       }
3956     }
3957     break;
3958   case EXACTLY:
3959     {
3960       rxpos input_start = rw->input_start;
3961       rxpos opnd2 = OPSTR(opnd);
3962       NEED_INPUT(rw, scan, 1);
3963       while (scan != rw->input_end
3964 	     && (regstr[opnd2] == INPUT_REF_S(rw, scan, input_start))) {
3965 	count++;
3966 	scan++;
3967 	if (maxc) { maxc--; if (!maxc) break; }
3968 	NEED_INPUT(rw, scan, 1);
3969       }
3970     }
3971     break;
3972   case EXACTLY_CI:
3973     {
3974       char c;
3975       rxpos input_start = rw->input_start;
3976       rxpos opnd2 = OPSTR(opnd);
3977       NEED_INPUT(rw, scan, 1);
3978       while (scan != rw->input_end) {
3979 	c = INPUT_REF_S(rw, scan, input_start);
3980 	c = rx_tolower(c);
3981 	if (regstr[opnd2] != c)
3982 	  break;
3983 	count++;
3984 	scan++;
3985 	if (maxc) { maxc--; if (!maxc) break; }
3986 	NEED_INPUT(rw, scan, 1);
3987       }
3988     }
3989     break;
3990   case ANYOF:
3991     {
3992       int c;
3993       rxpos input_start = rw->input_start;
3994       rxpos init = scan;
3995       if (rw->port || maxc) {
3996 	/* Slow but general version */
3997 	NEED_INPUT(rw, scan, 1);
3998 	while (scan != rw->input_end) {
3999 	  c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4000 	  if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
4001 	    break;
4002 	  scan++;
4003 	  if (maxc) { maxc--; if (!maxc) break; }
4004 	  NEED_INPUT(rw, scan, 1);
4005 	}
4006       } else {
4007 	/* Fast version */
4008 	int e = rw->input_end;
4009 	while (scan != e) {
4010 	  c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4011 	  if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
4012 	    break;
4013 	  scan++;
4014 	}
4015       }
4016       count = scan - init;
4017     }
4018     break;
4019   case EXACTLY1:
4020     {
4021       rxpos init = scan;
4022       rxpos input_start = rw->input_start;
4023       char c;
4024       c = regstr[opnd];
4025       if (rw->port || maxc) {
4026 	/* Slow but general version */
4027 	NEED_INPUT(rw, scan, 1);
4028 	while ((scan != rw->input_end)
4029 	       && (INPUT_REF_S(rw, scan, input_start) == c)) {
4030 	  scan++;
4031 	  if (maxc) { maxc--; if (!maxc) break; }
4032 	  NEED_INPUT(rw, scan, 1);
4033 	}
4034       } else {
4035 	/* Fast version */
4036 	int e = rw->input_end;
4037 	while ((scan != e)
4038 	       && (INPUT_REF(rw, scan) == c)) {
4039 	  scan++;
4040 	}
4041       }
4042       count = scan - init;
4043     }
4044     break;
4045   case EXACTLY2:
4046     {
4047       rxpos init = scan;
4048       rxpos input_start = rw->input_start;
4049       char c1, c2;
4050       c1 = regstr[opnd];
4051       c2 = regstr[opnd+1];
4052       if (rw->port || maxc) {
4053 	/* Slow but general version */
4054 	NEED_INPUT(rw, scan, 1);
4055 	while ((scan != rw->input_end)
4056 	       && ((INPUT_REF_S(rw, scan, input_start) == c1)
4057                    || (INPUT_REF_S(rw, scan, input_start) == c2))) {
4058 	  scan++;
4059 	  if (maxc) { maxc--; if (!maxc) break; }
4060 	  NEED_INPUT(rw, scan, 1);
4061 	}
4062       } else {
4063 	/* Fast version */
4064 	int e = rw->input_end;
4065 	while ((scan != e)
4066 	       && ((INPUT_REF(rw, scan) == c1)
4067                    || (INPUT_REF(rw, scan) == c2))) {
4068 	  scan++;
4069 	}
4070       }
4071       count = scan - init;
4072     }
4073     break;
4074   case RANGE:
4075     {
4076       rxpos init = scan;
4077       rxpos input_start = rw->input_start;
4078       int c, sr, er;
4079       NEED_INPUT(rw, scan, 1);
4080       sr = UCHAR(regstr[opnd]);
4081       er = UCHAR(regstr[opnd + 1]);
4082       if (rw->port || maxc) {
4083 	/* Slow but general version */
4084 	while (scan != rw->input_end) {
4085 	  c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4086 	  if ((c < sr) || (c > er))
4087 	    break;
4088 	  scan++;
4089 	  if (maxc) { maxc--; if (!maxc) break; }
4090 	  NEED_INPUT(rw, scan, 1);
4091 	}
4092       } else {
4093 	/* Fast version */
4094 	int e = rw->input_end;
4095 	while (scan != e) {
4096 	  c = UCHAR(INPUT_REF(rw, scan));
4097 	  if ((c < sr) || (c > er))
4098 	    break;
4099 	  scan++;
4100 	}
4101       }
4102       count = scan - init;
4103     }
4104     break;
4105   case NOTRANGE:
4106     {
4107       rxpos input_start = rw->input_start;
4108       rxpos init = scan;
4109       int c, sr, er;
4110       NEED_INPUT(rw, scan, 1);
4111       sr = UCHAR(regstr[opnd]);
4112       er = UCHAR(regstr[opnd + 1]);
4113       if (rw->port || maxc) {
4114 	/* Slow but general version */
4115 	while (scan != rw->input_end) {
4116 	  c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4117 	  if ((c >= sr) && (c <= er))
4118 	    break;
4119 	  scan++;
4120 	  if (maxc) { maxc--; if (!maxc) break; }
4121 	  NEED_INPUT(rw, scan, 1);
4122 	}
4123       } else {
4124 	/* Fast version */
4125 	int e = rw->input_end;
4126 	while (scan != e) {
4127 	  c = UCHAR(INPUT_REF_S(rw, scan, input_start));
4128 	  if ((c >= sr) && (c <= er))
4129 	    break;
4130 	  scan++;
4131 	}
4132       }
4133       count = scan - init;
4134     }
4135     break;
4136   default:			/* Oh dear.  Called inappropriately. */
4137     regerror("internal foulup");
4138     count = 0;			/* Best compromise. */
4139     break;
4140   }
4141   rw->input = scan;
4142 
4143   return(count);
4144 }
4145 
4146 /*
4147    - regnext - dig the "next" pointer out of a node
4148    */
4149 static rxpos
regnext(rxpos p)4150 regnext(rxpos p)
4151 {
4152   int offset;
4153 
4154   if (p + 2 >= regcodesize)
4155     return 0;
4156 
4157   offset = rNEXT(p);
4158   if (offset == 0)
4159     return 0;
4160 
4161   if (rOP(p) == BACK)
4162     return (p-offset);
4163   else
4164     return (p+offset);
4165 }
4166 
4167 /*
4168  * strcspn - find length of initial segment of s1 consisting entirely
4169  * of characters not from s2
4170  */
4171 
4172 static int
regstrcspn(char * s1,char * e1,char * s2)4173 regstrcspn(char *s1, char *e1, char *s2)
4174 {
4175   char *scan1;
4176   char *scan2;
4177   int count;
4178 
4179   count = 0;
4180   for (scan1 = s1; scan1 != e1; scan1++) {
4181     for (scan2 = s2; *scan2 != '\0';) { /* ++ moved down. */
4182       if (*scan1 == *scan2++)
4183 	return(count);
4184     }
4185     count++;
4186   }
4187   return(count);
4188 }
4189 
4190 #ifndef strncpy
4191   extern char *strncpy();
4192 #endif
4193 
4194 /*
4195    - regsub - perform substitutions after a regexp match
4196    */
4197 static
regsub(regexp * prog,char * src,int sourcelen,intptr_t * lenout,char * insrc,rxpos * startp,rxpos * endp,rxpos minpos,char * prefix,rxpos prefix_offset)4198 char *regsub(regexp *prog, char *src, int sourcelen, intptr_t *lenout, char *insrc,
4199              rxpos *startp, rxpos *endp, rxpos minpos,
4200              char *prefix, rxpos prefix_offset)
4201 {
4202   char *dest;
4203   char c;
4204   intptr_t no;
4205   intptr_t len;
4206   intptr_t destalloc, destlen, srcpos;
4207 
4208   destalloc = 2 * sourcelen;
4209   destlen = 0;
4210   dest = (char *)scheme_malloc_atomic(destalloc + 1);
4211 
4212   srcpos = 0;
4213   while (srcpos < sourcelen) {
4214     c = src[srcpos++];
4215     if (c == '&')
4216       no = 0;
4217     else if (c == '\\') {
4218       if (src[srcpos] == '\\' || src[srcpos] == '&')
4219 	no = -1;
4220       else if (src[srcpos] == '$') {
4221 	no = prog->nsubexp + 1; /* Gives the empty string */
4222 	srcpos++;
4223       } else {
4224 	no = 0;
4225 	while ('0' <= src[srcpos] && src[srcpos] <= '9') {
4226 	  no = (no * 10) + (src[srcpos++] - '0');
4227 	}
4228       }
4229     } else
4230       no = -1;
4231 
4232 
4233     if (no < 0) {		/* Ordinary character. */
4234       if (c == '\\' && (src[srcpos] == '\\' || src[srcpos] == '&'))
4235 	c = src[srcpos++];
4236       if (destlen + 1 >= destalloc) {
4237 	char *old = dest;
4238 	destalloc *= 2;
4239 	dest = (char *)scheme_malloc_atomic(destalloc + 1);
4240 	memcpy(dest, old, destlen);
4241       }
4242       dest[destlen++] = c;
4243     } else if (no >= prog->nsubexp) {
4244       /* Number too big; prentend it's the empty string */
4245     } else if ((startp[no] >= minpos) && (endp[no] >= minpos)) {
4246       len = endp[no] - startp[no];
4247       if (len + destlen >= destalloc) {
4248 	char *old = dest;
4249 	destalloc = 2 * destalloc + len + destlen;
4250 	dest = (char *)scheme_malloc_atomic(destalloc + 1);
4251 	memcpy(dest, old, destlen);
4252       }
4253       if (startp[no] >= 0)
4254         memcpy(dest + destlen, insrc + startp[no], len);
4255       else if (endp[no] <= 0)
4256         memcpy(dest + destlen, prefix + prefix_offset + (startp[no] - minpos), len);
4257       else {
4258         intptr_t pre_len = 0 - startp[no];
4259         memcpy(dest + destlen, prefix + prefix_offset + (startp[no] - minpos), pre_len);
4260         memcpy(dest + destlen + pre_len, insrc, len - pre_len);
4261       }
4262       destlen += len;
4263     }
4264   }
4265   dest[destlen] = '\0';
4266 
4267   if (lenout)
4268     *lenout = destlen;
4269 
4270   return dest;
4271 }
4272 
4273 /************************************************************/
4274 /*              UTF-8 -> per-byte translation               */
4275 /* Translate a UTF-8-encode regexp over the language of     */
4276 /* unicode code points into a per-byte regexp that matches  */
4277 /* equivalent portionals of a UTF-8-encoded sequences of    */
4278 /* code points.                                             */
4279 /************************************************************/
4280 
4281 #ifdef MZ_XFORM
4282 START_XFORM_SKIP;
4283 #endif
4284 #include "../gc2/my_qsort.c"
4285 #ifdef MZ_XFORM
4286 END_XFORM_SKIP;
4287 #endif
4288 
compare_ranges(const void * a,const void * b)4289 static int compare_ranges(const void *a, const void *b)
4290 {
4291   unsigned int av, bv;
4292   av = *(unsigned int *)a;
4293   bv = *(unsigned int *)b;
4294   if (av == bv)
4295     return 0;
4296   else if (av < bv)
4297     return -1;
4298   else
4299     return 1;
4300 }
4301 
4302 /* For allocating the traslated string, as we go. When writing an
4303    original char (or something that takes its place), there's always
4304    space, but call make_room() before adding new content. */
4305 typedef struct {
4306   int i;         /* number of original chars written */
4307   int orig_len;  /* original length */
4308   int size;      /* allocated size */
4309 } RoomState;
4310 
make_room(unsigned char * r,int j,int need_extra,RoomState * rs)4311 static unsigned char *make_room(unsigned char *r, int j, int need_extra, RoomState *rs)
4312 {
4313   int nrs;
4314   unsigned char *nr;
4315 
4316   if ((rs->size - j - (rs->orig_len - rs->i)) < need_extra) {
4317     nrs = ((rs->size) * 2) + need_extra;
4318     nr = (unsigned char *)scheme_malloc_atomic(nrs+1);
4319     memcpy(nr, r, j);
4320     r = nr;
4321     rs->size = nrs;
4322   }
4323 
4324   return r;
4325 }
4326 
add_byte_range(const unsigned char * lo,const unsigned char * hi,int count,unsigned char * r,int * _j,RoomState * rs,int did_alt,int wrap_alts)4327 static unsigned char *add_byte_range(const unsigned char *lo, const unsigned char *hi, int count,
4328 				     unsigned char *r, int *_j, RoomState *rs,
4329 				     /* did_alt => no need to start with "|" */
4330 				     int did_alt,
4331 				     /* wrap_alts => wrap "(?:...)" around multiple alts */
4332 				     int wrap_alts)
4333      /* Adds alternatives for matching valid UTF-8 encodings lo
4334 	through hi lexicographically. See add_range to get started. */
4335 {
4336   int same_chars, j, i;
4337   const unsigned char *lowest = (unsigned char *)"\200\200\200\200\200";
4338   const unsigned char *highest = (unsigned char *)"\277\277\277\277\277";
4339   unsigned char p, q;
4340 
4341   /* Look for a common prefix: */
4342   for (same_chars = 0; same_chars < count; same_chars++) {
4343     if (lo[same_chars] != hi[same_chars])
4344       break;
4345   }
4346 
4347   j = *_j;
4348 
4349   /* Match exactly the part that's the same for hi and lo */
4350   if (same_chars) {
4351     r = make_room(r, j, 4 + same_chars, rs);
4352     if (!did_alt) {
4353       r[j++] = '|';
4354       did_alt = 1;
4355     }
4356     for (i = 0; i < same_chars; i++) {
4357       r[j++] = lo[i];
4358     }
4359   }
4360 
4361   if (same_chars < count) {
4362     /* We have something like nxxxx to mxxxx where n < m.
4363        Find p such that p >= n and p0000 >= nxxxx, and
4364        find q such that q0000 <= mxxxx */
4365     int choices = 0;
4366 
4367     /* If the xxxxs in nxxxx are 0, then p is n,
4368        otherwise it's n + 1 */
4369     for (i = same_chars + 1; i < count; i++) {
4370       if (lo[i] != 128)
4371 	break;
4372     }
4373     if (i == count)
4374       p = lo[same_chars];
4375     else {
4376       p = lo[same_chars] + 1;
4377       choices++;
4378     }
4379 
4380     /* If the xxxxs in mxxxx are 0, then q is m,
4381        otherwise it's m - 1 */
4382     for (i = same_chars + 1; i < count; i++) {
4383       if (hi[i] != 191)
4384 	break;
4385     }
4386     if (i == count)
4387       q = hi[same_chars];
4388     else {
4389       q = hi[same_chars] - 1;
4390       choices++;
4391     }
4392 
4393     if (p <= q)
4394       choices++;
4395 
4396     if ((wrap_alts || same_chars) && (choices > 1)) {
4397       r = make_room(r, j, 4, rs);
4398       if (!did_alt) {
4399 	r[j++] = '|';
4400 	did_alt = 1;
4401       }
4402       r[j++] = '(';
4403       r[j++] = '?';
4404       r[j++] = ':';
4405     }
4406 
4407     /* Fill out [nxxxx, p0000) */
4408     if (p > lo[same_chars]) {
4409       r = make_room(r, j, 2, rs);
4410       if (!did_alt)
4411 	r[j++] = '|';
4412       r[j++] = lo[same_chars];
4413       *_j = j;
4414       r = add_byte_range(lo XFORM_OK_PLUS same_chars + 1, highest, count - same_chars - 1,
4415 			 r, _j, rs, 1, 1);
4416       j = *_j;
4417       p = lo[same_chars] + 1;
4418       did_alt = 0;
4419     }
4420 
4421     /* Fill out [m0000, mxxxx] */
4422     if (q < hi[same_chars]) {
4423       r = make_room(r, j, 2, rs);
4424       if (!did_alt)
4425 	r[j++] = '|';
4426       r[j++] = hi[same_chars];
4427       *_j = j;
4428       r = add_byte_range(lowest, hi  XFORM_OK_PLUS same_chars + 1, count - same_chars - 1,
4429 			 r, _j, rs, 1, 1);
4430       j = *_j;
4431       did_alt = 0;
4432 
4433       q = hi[same_chars] - 1;
4434     }
4435 
4436     /* Fill out [p0000,m0000) */
4437     if (p <= q) {
4438       /* Make the alternative that lets the initial digit vary,
4439 	 since there's room between the lo and hi leading digit */
4440       const char *any_str = "[\200-\277]";
4441       const int any_len = 5;
4442 
4443       r = make_room(r, j, 6 + ((count - same_chars - 1) * any_len), rs);
4444       if (!did_alt)
4445 	r[j++] = '|';
4446 
4447       if (p == q) {
4448 	r[j++] = p;
4449       } else {
4450 	r[j++] = '[';
4451 	r[j++] = p;
4452 	r[j++] = '-';
4453 	r[j++] = q;
4454 	r[j++] = ']';
4455       }
4456       for (i = same_chars + 1; i < count; i++) {
4457 	memcpy(r + j, any_str, any_len);
4458 	j += any_len;
4459       }
4460     }
4461 
4462     if ((wrap_alts || same_chars) && (choices > 1)) {
4463       /* Close out the grouping */
4464       r = make_room(r, j, 1, rs);
4465       r[j++] = ')';
4466     }
4467   }
4468 
4469   *_j = j;
4470   return r;
4471 }
4472 
add_range(unsigned char * r,int * _j,RoomState * rs,unsigned int start,unsigned int end,int did_alt)4473 static unsigned char *add_range(unsigned char *r, int *_j, RoomState *rs,
4474 				unsigned int start, unsigned int end, int did_alt)
4475 {
4476   unsigned int top;
4477   int count;
4478   unsigned char lo[6], hi[6];
4479 
4480   /* If this range spans different-sized encodings, split it up
4481      with a recursive call. */
4482   if (start <= 0x7FF) {
4483     top = 0x7FF;
4484     count = 2;
4485   } else if (start <= 0xFFFF) {
4486     top = 0xFFFF;
4487     count = 3;
4488   } else if (start <= 0x1FFFFF) {
4489     top = 0x1FFFFF;
4490     count = 4;
4491   } else if (start <= 0x3FFFFFF) {
4492     top = 0x3FFFFFF;
4493     count = 5;
4494   } else {
4495     top = 0x7FFFFFFF;
4496     count = 6;
4497   }
4498 
4499   if (end > top) {
4500     r = add_range(r, _j, rs, top + 1, end, did_alt);
4501     end = top;
4502     did_alt = 0;
4503   }
4504 
4505   /* At this point, the situation is much like creating a
4506      regexp to match decimal digits. If we wanted to match the
4507      range 28 to 75 (inclusive), we'd need three parts:
4508 
4509           2[8-9]|[3-6][0-9]|7[0-5]
4510 
4511      It gets more complex with three digits, say
4512      128 to 715:
4513 
4514        12[8-9]|1[3-6][0-9]|[2-6][0-9][0-9]|7[0-0][0-9]|71[0-5]
4515 
4516      but you get the idea. Note that any_str takes the place of
4517      [0-9].
4518 
4519      This same idea works with UTF-8 "digits", so first encode
4520      our code-point numbers in UTF-8: */
4521 
4522   scheme_utf8_encode_all(&start, 1, lo);
4523   scheme_utf8_encode_all(&end, 1, hi);
4524 
4525   return add_byte_range(lo, hi, count, r, _j, rs, did_alt, 0);
4526 }
4527 
need_ci_alternates(unsigned char * s,int delta,int len)4528 static int need_ci_alternates(unsigned char *s, int delta, int len)
4529 {
4530   mzchar us[1], c;
4531 
4532   scheme_utf8_decode(s, delta, len, us, 0, 1, NULL, 0, 0);
4533   c = us[0];
4534 
4535   return ((c != scheme_toupper(c))
4536 	  || (c != scheme_tolower(c))
4537 	  || (c != scheme_tofold(c))
4538 	  || (c != scheme_totitle(c)));
4539 }
4540 
translate(unsigned char * s,int len,char ** result,int pcre)4541 static int translate(unsigned char *s, int len, char **result, int pcre)
4542 {
4543   int j, parse_flags = PARSE_CASE_SENS | PARSE_SINGLE_LINE;
4544   RoomState rs;
4545   unsigned char *r;
4546   Scheme_Object *parse_params = NULL;
4547 
4548   rs.orig_len = len;
4549   rs.size = len;
4550 
4551   r = (unsigned char *)scheme_malloc_atomic(rs.size + 1);
4552 
4553   /* We need to translate if the pattern contains any use of ".", if
4554      there's a big character in a range, if there's a not-range, or if
4555      there's a big character before '+', '*', or '?'. */
4556 
4557   for (rs.i = j = 0; rs.i < len;) {
4558     if (s[rs.i] == '[') {
4559       int k = rs.i + 1, saw_big = 0;
4560       int not_mode = 0;
4561 
4562       /* First, check whether we need to translate this particular
4563 	 range. */
4564 
4565       /* Caret at start is special: */
4566       if ((k < len) && (s[k] == '^')) {
4567 	not_mode = 1;
4568 	k++;
4569       }
4570       /* Close bracket start is special: */
4571       if ((k < len) && (s[k] == ']'))
4572 	k++;
4573       while ((k < len) && (s[k] != ']')) {
4574 	if (s[k] > 127)
4575 	  saw_big = 1;
4576 	else if (pcre && (s[k] == '\\') && (k + 1 < len)) {
4577           if ((s[k+1] == 'D') || (s[k+1] == 'W') || (s[k+1] == 'S'))
4578             saw_big = 1;
4579 	  k++;
4580         } else if (pcre
4581                  && (s[k] == '[')
4582                  && (k + 1 < len)
4583                  && (s[k+1] == ':')
4584                  && is_posix_char_class((char *)s, k + 1, len, NULL)) {
4585           while (s[k] != ']') {
4586             k++;
4587           }
4588         }
4589 	k++;
4590       }
4591       if ((k >= len) || (!saw_big && !not_mode)) {
4592 	/* No translation necessary. */
4593 	while (rs.i <= k) {
4594 	  r[j++] = s[rs.i++];
4595 	}
4596       } else {
4597 	/* Need to translate. */
4598 	char *simple_on;
4599         int non_ascii = 0;
4600 	Scheme_Object *ranges;
4601 	unsigned int *us, *range_array;
4602 	int ulen, on_count, range_len, rp, p;
4603 
4604 	ulen = scheme_utf8_decode(s, rs.i + 1, k, NULL, 0, -1, NULL, 0, 0);
4605 	us = (unsigned int *)scheme_malloc_atomic(ulen * sizeof(unsigned int));
4606 	scheme_utf8_decode(s, rs.i + 1, k, us, 0, -1, NULL, 0, 0);
4607 
4608 	/* The simple_on array lists ASCII chars to (not) find
4609 	   for the match, and `non_ascii` virtually extends
4610            to the rest of Unicode */
4611 	simple_on = (char *)scheme_malloc_atomic(128);
4612 	memset(simple_on, 0, 128);
4613 	/* The ranges list is pairs of larger ranges */
4614 	ranges = scheme_null;
4615 
4616 	p = 0;
4617 	if (not_mode)
4618 	  p++;
4619 	if (us[p] == '-') {
4620 	  simple_on['-'] = 1;
4621 	  p++;
4622 	}
4623 
4624 	while (p < ulen) {
4625 	  if (((p + 2) < ulen)
4626 	      && us[p+1] == '-'
4627 	      && (!pcre || ((us[p] != '\\') && (us[p+2] != '\\')))) {
4628 	    int beg = us[p], end = us[p+2];
4629 	    if (end == '-') {
4630 	      FAIL("misplaced hyphen within square brackets in pattern");
4631 	    }
4632 	    if (end < beg) {
4633 	      /* Bad regexp */
4634 	      FAIL("invalid range within square brackets in pattern");
4635 	    }
4636 
4637 	    if ((beg > 127) || (end > 127)) {
4638 	      /* A big-char range */
4639 	      ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg),
4640 							 scheme_make_integer_value_from_unsigned(end)),
4641 					ranges);
4642 	      if (!(parse_flags & PARSE_CASE_SENS)) {
4643 		/* Try to build up parallel ranges, though they may
4644 		   not turn out to be parallel. If the ranges overlap,
4645 		   we'll clean them up in the final sort-and-merge
4646 		   pass for the whole ranges list. */
4647 		int c, beg2, end2, c2, mode;
4648 		for (mode = 0; mode < 4; mode++) {
4649 		  for (c = beg; c <= end; c++) {
4650 		    switch (mode) {
4651 		    case 0:
4652 		      beg2 = scheme_tofold(c);
4653 		      break;
4654 		    case 1:
4655 		      beg2 = scheme_tolower(c);
4656 		      break;
4657 		    case 2:
4658 		      beg2 = scheme_toupper(c);
4659 		      break;
4660 		    case 3:
4661 		    default:
4662 		      beg2 = scheme_totitle(c);
4663 		      break;
4664 		    }
4665 		    if (c != beg2) {
4666 		      end2 = beg2;
4667 		      for (; c <= end; c++) {
4668 			switch (mode) {
4669 			case 0:
4670 			  c2 = scheme_tofold(c);
4671 			  break;
4672 			case 1:
4673 			  c2 = scheme_tolower(c);
4674 			  break;
4675 			case 2:
4676 			  c2 = scheme_toupper(c);
4677 			  break;
4678 			case 3:
4679 			default:
4680 			  c2 = scheme_totitle(c);
4681 			  break;
4682 			}
4683 			if ((c2 == c) || (c2 != end2 + 1))
4684 			  break;
4685 		      }
4686 		      ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg2),
4687 								 scheme_make_integer_value_from_unsigned(end2)),
4688 						ranges);
4689 		    }
4690 		  }
4691 		}
4692 	      }
4693 	    } else {
4694 	      /* Small range */
4695 	      int w;
4696 	      for (w = beg; w <= end; w++) {
4697 		simple_on[w] = 1;
4698 	      }
4699 	    }
4700 	    p += 3;
4701 	  } else if (pcre && (us[p] == '\\')) {
4702 	    if ((p + 1) < ulen) {
4703 	      int c = us[p + 1];
4704 	      if (((c >= 'a') && (c <= 'z'))
4705 		  || ((c >= 'A') && (c <= 'Z'))) {
4706 		regcharclass(c, simple_on, &non_ascii);
4707 		p += 2;
4708 	      } else if (c < 128) {
4709 		simple_on[c] = 1;
4710 		p += 2;
4711 	      } else {
4712 		/* Let next iteration handle it.
4713 		   (There's no danger of using it as a meta-character.) */
4714                 p++;
4715 	      }
4716 	    } else
4717 	      FAIL("trailing \\ in pattern");
4718 	  } else if (us[p] > 127) {
4719 	    int c = us[p];
4720 	    ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c),
4721 						       scheme_make_integer_value_from_unsigned(c)),
4722 				      ranges);
4723 	    if (!(parse_flags & PARSE_CASE_SENS)) {
4724 	      int mode, c2;
4725 	      for (mode = 0; mode < 4; mode++) {
4726 		switch (mode) {
4727 		case 0:
4728 		  c2 = scheme_tofold(c);
4729 		  break;
4730 		case 1:
4731 		  c2 = scheme_tolower(c);
4732 		  break;
4733 		case 2:
4734 		  c2 = scheme_toupper(c);
4735 		  break;
4736 		case 3:
4737 		default:
4738 		  c2 = scheme_totitle(c);
4739 		  break;
4740 		}
4741 		if (c2 != c) {
4742 		  ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c2),
4743 							     scheme_make_integer_value_from_unsigned(c2)),
4744 					    ranges);
4745 		}
4746 	      }
4747 	    }
4748 	    p++;
4749           } else if (pcre
4750                      && (us[p] == '[')
4751                      && ((p + 1) < ulen)
4752                      && (us[p+1] == ':')
4753                      && is_posix_char_class_in_unicode(us, p + 1, ulen, simple_on)) {
4754             while (us[p] != ']') {
4755               p++;
4756             }
4757             p++;
4758 	  } else {
4759 	    if (((p + 1) < ulen) && (us[p] == '-')) {
4760 	      FAIL("misplaced hyphen within square brackets in pattern");
4761 	      return 0;
4762 	    }
4763 	    simple_on[us[p]] = 1;
4764 	    p++;
4765 	  }
4766 	}
4767 
4768         if (non_ascii) {
4769           /* Replace the ranges array to cover all non-ASCII characters */
4770           ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer(128),
4771                                                      scheme_make_integer(0x10FFFF)),
4772                                     scheme_null);
4773         }
4774 
4775 	/* Turn the ranges list into an array */
4776 	range_len = scheme_list_length(ranges);
4777 	range_array = (unsigned int *)scheme_malloc_atomic(2 * range_len * sizeof(unsigned int));
4778 	for (rp = 0; SCHEME_PAIRP(ranges); ranges = SCHEME_CDR(ranges), rp += 2) {
4779 	  uintptr_t hi, lo;
4780 	  scheme_get_unsigned_int_val(SCHEME_CAAR(ranges), &lo);
4781 	  scheme_get_unsigned_int_val(SCHEME_CDR(SCHEME_CAR(ranges)), &hi);
4782 	  range_array[rp] = (unsigned int)lo;
4783 	  range_array[rp+1] = (unsigned int)hi;
4784 	}
4785 	range_len *= 2;
4786 	/* Sort the ranges by the starting index. */
4787 	my_qsort(range_array, range_len >> 1, 2 * sizeof(unsigned int), compare_ranges);
4788 
4789 	/* If a range starts below 128, fill in the simple array */
4790 	for (rp = 0; rp < range_len; rp += 2) {
4791 	  if (range_array[rp] < 128) {
4792 	    for (p = range_array[rp]; p < 128; p++) {
4793 	      simple_on[p] = 1;
4794 	    }
4795 	    range_array[rp] = 128;
4796 	  }
4797 	}
4798 
4799 	if (!(parse_flags & PARSE_CASE_SENS)) {
4800 	  for (p = 'a'; p <= 'z'; p++) {
4801 	    if (simple_on[p])
4802 	      simple_on[rx_toupper(p)] = 1;
4803 	    if (simple_on[rx_toupper(p)])
4804 	      simple_on[p] = 1;
4805 	  }
4806 	}
4807 
4808 	/* Count simples that are on */
4809 	on_count = 0;
4810 	for (p = 0; p < 128; p++) {
4811 	  if (simple_on[p])
4812 	    on_count++;
4813 	}
4814 
4815 	if (not_mode) {
4816 	  /* "Not" mode. We produce something in regular mode */
4817 	  /* Start with "(?:[...]|" for simples. */
4818 	  unsigned int last_end;
4819 	  int did_alt;
4820 	  r = make_room(r, j, 6 + (128 - on_count) + ((pcre && !simple_on['\\']) ? 1 : 0), &rs);
4821 	  r[j++] = '(';
4822 	  r[j++] = '?';
4823 	  r[j++] = ':';
4824 	  if (on_count < 128) {
4825 	    if (!on_count) {
4826 	      r[j++] = '[';
4827 	      r[j++] = 0;
4828 	      r[j++] = '-';
4829 	      r[j++] = 127;
4830 	      r[j++] = ']';
4831 	    } else {
4832 	      r[j++] = '[';
4833 	      if (!simple_on[']'])
4834 		r[j++] = ']';
4835 	      for (p = 0; p < 128; p++) {
4836 		if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
4837 		  if (!simple_on[p])
4838 		    r[j++] = p;
4839 	      }
4840 	      if (pcre && !simple_on['\\']) {
4841 		r[j++] = '\\';
4842 		r[j++] = '\\';
4843 	      }
4844 	      if (!simple_on['-'])
4845 		r[j++] = '-';
4846 	      r[j++] = ']';
4847 	    }
4848 	    did_alt = 0;
4849 	  } else
4850 	    did_alt = 1;
4851 	  last_end = 128;
4852 	  for (rp = 0; rp < range_len; rp += 2) {
4853 	    if (range_array[rp] > last_end) {
4854 	      r = add_range(r, &j, &rs, last_end, range_array[rp] - 1, did_alt);
4855 	      did_alt = 0;
4856 	    }
4857 	    if ((range_array[rp + 1] + 1) > last_end)
4858 	      last_end = range_array[rp + 1] + 1;
4859 	  }
4860 	  if (last_end <= 0x10FFFF) {
4861 	    if (last_end < 0xD800) {
4862 	      r = add_range(r, &j, &rs, last_end, 0xD7FF, did_alt);
4863 	      did_alt = 0;
4864 	      r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, did_alt);
4865 	    } else {
4866 	      r = add_range(r, &j, &rs, last_end, 0x10FFFF, did_alt);
4867 	      did_alt = 0;
4868 	    }
4869 	  }
4870 	  r = make_room(r, j, 1, &rs);
4871 	  r[j++] = ')';
4872 	} else {
4873 	  /* Normal mode */
4874 	  /* Start with "(?:[...]|" for simples. */
4875 	  int p, did_alt;
4876 	  r = make_room(r, j, 5 + on_count + ((pcre && simple_on['\\']) ? 1 : 0), &rs);
4877 	  r[j++] = '(';
4878 	  r[j++] = '?';
4879 	  r[j++] = ':';
4880 	  if (on_count) {
4881 	    if (on_count == 128) {
4882 	      r[j++] = '[';
4883 	      r[j++] = 0;
4884 	      r[j++] = '-';
4885 	      r[j++] = 127;
4886 	      r[j++] = ']';
4887 	    } else {
4888 	      r[j++] = '[';
4889 	      if (simple_on[']'])
4890 		r[j++] = ']';
4891 	      for (p = 0; p < 128; p++) {
4892 		if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
4893 		  if (simple_on[p])
4894 		    r[j++] = p;
4895 	      }
4896 	      if (pcre && simple_on['\\']) {
4897 		r[j++] = '\\';
4898 		r[j++] = '\\';
4899 	      }
4900 	      if (simple_on['-'])
4901 		r[j++] = '-';
4902 	      r[j++] = ']';
4903 	    }
4904 	    did_alt = 0;
4905 	  } else
4906 	    did_alt = 1;
4907 	  for (rp = 0; rp < range_len; rp += 2) {
4908 	    r = add_range(r, &j, &rs, range_array[rp], range_array[rp+1], did_alt);
4909 	    did_alt = 0;
4910 	  }
4911 	  r = make_room(r, j, 1, &rs);
4912 	  r[j++] = ')';
4913 	}
4914       }
4915       rs.i = k + 1;
4916     } else if (s[rs.i] == '\\') {
4917       if (pcre
4918           && (rs.i+1 < len)
4919           && ((s[rs.i+1] == 'D')
4920               || (s[rs.i+1] == 'W')
4921               || (s[rs.i+1] == 'S'))) {
4922         /* matches non-ASCII characters, so convert */
4923         char *simple_on;
4924         int non_ascii;
4925         int n;
4926 
4927         simple_on = (char *)scheme_malloc_atomic(128);
4928 	memset(simple_on, 0, 128);
4929 
4930         rs.i++;
4931         regcharclass(s[rs.i++], simple_on, &non_ascii);
4932 
4933         r = make_room(r, j, 4, &rs);
4934         r[j++] = '(';
4935         r[j++] = '?';
4936         r[j++] = ':';
4937         r[j++] = '[';
4938 
4939         for (n = 0; n < 128; ) {
4940           if (simple_on[n]) {
4941             int m;
4942             for (m = n + 1; (m < 128) && simple_on[m]; m++) {
4943             }
4944             r = make_room(r, j, 3, &rs);
4945             r[j++] = n;
4946             r[j++] = '-';
4947             r[j++] = m-1;
4948             n = m;
4949           } else
4950             n++;
4951         }
4952         r = make_room(r, j, 1, &rs);
4953         r[j++] = ']';
4954 
4955         if (non_ascii) { /* we expect this to be true! */
4956           r = add_range(r, &j, &rs, 128, 0x10FFFF, 0);
4957           r = make_room(r, j, 1, &rs);
4958         }
4959         r[j++] = ')';
4960       } else {
4961         /* Skip over next char, possibly big: */
4962         r[j++] = s[rs.i++];
4963         if ((rs.i < len)
4964             && (s[rs.i] > 127)) {
4965           r[j++] = s[rs.i++];
4966           while ((rs.i < len) && ((s[rs.i] & 0xC0) == 0x80)) {
4967             r[j++] = s[rs.i++];
4968           }
4969         } else
4970           r[j++] = s[rs.i++];
4971       }
4972     } else if ((s[rs.i] == '.')
4973 	       && (!pcre
4974 		   || (rs.i < 3)
4975 		   || (s[rs.i-1] != '{')
4976 		   || ((s[rs.i-2] == 'p')
4977 		       && (s[rs.i-2] == 'P'))
4978 		   || (s[rs.i-3] != '\\'))) {
4979       /* "." has to be expanded. */
4980       r = make_room(r, j, (parse_flags & PARSE_SINGLE_LINE) ? 9 : 8, &rs);
4981       r[j++] = '(';
4982       r[j++] = '?';
4983       r[j++] = ':';
4984       r[j++] = '[';
4985       r[j++] = '\00';
4986       r[j++] = '-';
4987       if (!(parse_flags & PARSE_SINGLE_LINE)) {
4988 	r[j++] = '\n' - 1;
4989 	r[j++] = '\n' + 1;
4990 	r[j++] = '-';
4991       }
4992       r[j++] = '\177';
4993       r[j++] = ']';
4994       r = add_range(r, &j, &rs, 128, 0xD7FF, 0);
4995       r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, 0);
4996       r = make_room(r, j, 1, &rs);
4997       r[j++] = ')';
4998       rs.i++;
4999     } else if (s[rs.i] > 127) {
5000       int k = rs.i + 1;
5001       /* Look for *, +, or ? after this big char */
5002       while ((k < len) && ((s[k] & 0xC0) == 0x80)) {
5003 	k++;
5004       }
5005       if ((k < len) && ((s[k] == '+')
5006 			|| (s[k] == '*')
5007 			|| (s[k] == '?')
5008 			|| (!(parse_flags & PARSE_CASE_SENS)
5009 			    && need_ci_alternates(s, rs.i, k)))) {
5010 	/* Need to translate; wrap char in (?: ...) */
5011 	int orig_i;
5012 	r = make_room(r, j, 4, &rs);
5013 	r[j++] = '(';
5014 	r[j++] = '?';
5015 	r[j++] = ':';
5016 	orig_i = rs.i;
5017 	while (rs.i < k) {
5018 	  r[j++] = s[rs.i++];
5019 	}
5020 	if (!(parse_flags & PARSE_CASE_SENS)) {
5021 	  /* Add alternates for different cases: */
5022 	  mzchar us[1], c0, c1, wrote[4];
5023 	  int clen, ci, num_wrote = 1, mode;
5024 	  unsigned char s2[MAX_UTF8_CHAR_BYTES];
5025 
5026 	  scheme_utf8_decode(s, orig_i, k, us, 0, 1, NULL, 0, 0);
5027 	  c0 = us[0];
5028 	  wrote[0] = c0;
5029 	  for (mode = 0; mode < 4; mode++) {
5030 	    switch (mode) {
5031 	    case 0:
5032 	      c1 = scheme_tofold(c0);
5033 	      break;
5034 	    case 1:
5035 	      c1 = scheme_tolower(c0);
5036 	      break;
5037 	    case 2:
5038 	      c1 = scheme_toupper(c0);
5039 	      break;
5040 	    case 3:
5041 	    default:
5042 	      c1 = scheme_totitle(c0);
5043 	      break;
5044 	    }
5045 	    for (ci = 0; ci < num_wrote; ci++) {
5046 	      if (c1 == wrote[ci])
5047 		break;
5048 	    }
5049 	    if (ci >= num_wrote) {
5050 	      wrote[num_wrote++] = c1;
5051 	      us[0] = c1;
5052 	      clen = scheme_utf8_encode(us, 0, 1, s2, 0, 0);
5053 	      r = make_room(r, j, clen + 1, &rs);
5054 	      r[j++] = '|';
5055 	      for (ci = 0; ci < clen; ci++) {
5056 		r[j++] = s2[ci];
5057 	      }
5058 	    }
5059 	  }
5060 	}
5061 	r[j++] = ')';
5062       } else {
5063 	/* No translation. */
5064 	while (rs.i < k) {
5065 	  r[j++] = s[rs.i++];
5066 	}
5067       }
5068     } else {
5069       /* The translation needs to know about case-insensitive
5070 	 and single-line modes, so track parens: */
5071       if (s[rs.i] == '(') {
5072 	int old_flags = parse_flags;
5073 	if ((rs.i + 1 < len) && (s[rs.i + 1] == '?')) {
5074 	  int k;
5075 	  for (k = rs.i + 2; k < len; k++) {
5076 	    if ((s[k] == ':')
5077 		|| (s[k] == '<')
5078 		|| (s[k] == '>')
5079 		|| (s[k] == '=')
5080 		|| (s[k] == '!'))
5081 	      break;
5082 	    if (s[k] == 'i') {
5083 	      parse_flags &= ~PARSE_CASE_SENS;
5084 	    } else if (s[k] == 's') {
5085 	      parse_flags |= PARSE_SINGLE_LINE;
5086 	    } else if (s[k] == 'm') {
5087 	      parse_flags &= ~PARSE_SINGLE_LINE;
5088 	    } else if (s[k] == '-') {
5089 	      if (k + 1 < len) {
5090 		k++;
5091 		if (s[k] == 'i') {
5092 		  parse_flags |= PARSE_CASE_SENS;
5093 		} else if (s[k] == 's') {
5094 		  parse_flags &= ~PARSE_SINGLE_LINE;
5095 		} else if (s[k] == 'm') {
5096 		  parse_flags |= PARSE_SINGLE_LINE;
5097 		}
5098 	      }
5099 	    }
5100 	  }
5101 	}
5102 	if (parse_params || (parse_flags != old_flags)) {
5103 	  parse_params = scheme_make_raw_pair(scheme_make_integer(old_flags),
5104 					      parse_params);
5105 	}
5106       } else if (s[rs.i] == ')') {
5107 	if (parse_params) {
5108 	  parse_flags = SCHEME_INT_VAL(SCHEME_CAR(parse_params));
5109 	  parse_params = SCHEME_CDR(parse_params);
5110 	}
5111       }
5112       r[j++] = s[rs.i++];
5113     }
5114   }
5115 
5116   r[j] = 0;
5117   *result = (char *)r;
5118   return j;
5119 }
5120 
5121 /************************************************************/
5122 /*                   Racket front end                       */
5123 /************************************************************/
5124 
scheme_is_pregexp(Scheme_Object * o)5125 int scheme_is_pregexp(Scheme_Object *o)
5126 {
5127   return !!(((regexp *)o)->flags & REGEXP_IS_PCRE);
5128 }
5129 
do_make_regexp(const char * who,int is_byte,int pcre,int argc,Scheme_Object * argv[])5130 static Scheme_Object *do_make_regexp(const char *who, int is_byte, int pcre, int argc, Scheme_Object *argv[])
5131 {
5132   Scheme_Object *re, *bs;
5133   Scheme_Object *handler;
5134   char *s;
5135   int slen;
5136 
5137   if (is_byte) {
5138     if (!SCHEME_BYTE_STRINGP(argv[0]))
5139       scheme_wrong_contract(who, "bytes?", 0, argc, argv);
5140     bs = argv[0];
5141   } else {
5142     if (!SCHEME_CHAR_STRINGP(argv[0]))
5143       scheme_wrong_contract(who, "string?", 0, argc, argv);
5144     bs = scheme_char_string_to_byte_string(argv[0]);
5145   }
5146 
5147   if (argc >= 2) {
5148     if (!scheme_check_proc_arity2(who, 1, 1, argc, argv, 1))
5149       scheme_wrong_contract(who, "(or/c #f (string? -> any))", 1, argc, argv);
5150     handler = argv[1];
5151   } else
5152     handler = scheme_false;
5153 
5154   s = SCHEME_BYTE_STR_VAL(bs);
5155   slen = SCHEME_BYTE_STRTAG_VAL(bs);
5156 
5157   if (!is_byte) {
5158     slen = translate((unsigned char *)s, slen, &s, pcre);
5159 #if 0
5160     /* Debugging, to see the translated regexp: */
5161     {
5162       char *cp;
5163       int i;
5164       cp = (char *)scheme_malloc_atomic(slen + 1);
5165       memcpy(cp, s, slen + 1);
5166       for (i = 0; i < slen; i++) {
5167 	if (!cp[i]) cp[i] = '0';
5168       }
5169       printf("%d %s\n", slen, scheme_write_to_string(scheme_make_byte_string(cp), 0));
5170     }
5171 #endif
5172   }
5173 
5174   regerrorwho = who;
5175   re = (Scheme_Object *)regcomp(s, 0, slen, pcre, handler);
5176   regerrorwho = NULL;
5177 
5178   /* passed a handler and regexp compilation failed */
5179   if (!re) {
5180     return regerrorval;
5181   }
5182 
5183   if (!is_byte)
5184     ((regexp *)re)->flags |= REGEXP_IS_UTF8;
5185   if (pcre)
5186     ((regexp *)re)->flags |= REGEXP_IS_PCRE;
5187 
5188   if (SCHEME_IMMUTABLEP(argv[0]))
5189     ((regexp *)re)->source = argv[0];
5190   else if (is_byte) {
5191     Scheme_Object *src;
5192     src = scheme_make_immutable_sized_byte_string(SCHEME_BYTE_STR_VAL(argv[0]),
5193 						  SCHEME_BYTE_STRTAG_VAL(argv[0]),
5194 						  1);
5195     ((regexp *)re)->source = src;
5196   } else {
5197     Scheme_Object *src;
5198     src = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(argv[0]),
5199 						  SCHEME_CHAR_STRTAG_VAL(argv[0]),
5200 						  1);
5201     ((regexp *)re)->source = src;
5202   }
5203 
5204   return re;
5205 }
5206 
make_regexp(int argc,Scheme_Object * argv[])5207 static Scheme_Object *make_regexp(int argc, Scheme_Object *argv[])
5208 {
5209   return do_make_regexp("byte-regexp", 1, 0, argc, argv);
5210 }
5211 
make_utf8_regexp(int argc,Scheme_Object * argv[])5212 static Scheme_Object *make_utf8_regexp(int argc, Scheme_Object *argv[])
5213 {
5214   return do_make_regexp("regexp", 0, 0, argc, argv);
5215 }
5216 
make_pregexp(int argc,Scheme_Object * argv[])5217 static Scheme_Object *make_pregexp(int argc, Scheme_Object *argv[])
5218 {
5219   return do_make_regexp("byte-pregexp", 1, 1, argc, argv);
5220 }
5221 
make_utf8_pregexp(int argc,Scheme_Object * argv[])5222 static Scheme_Object *make_utf8_pregexp(int argc, Scheme_Object *argv[])
5223 {
5224   return do_make_regexp("pregexp", 0, 1, argc, argv);
5225 }
5226 
scheme_make_regexp(Scheme_Object * str,int is_byte,int pcre,int * volatile result_is_err_string)5227 Scheme_Object *scheme_make_regexp(Scheme_Object *str, int is_byte, int pcre, int * volatile result_is_err_string)
5228 {
5229   mz_jmp_buf * volatile save, newbuf;
5230   Scheme_Object * volatile result;
5231 
5232   *result_is_err_string = 0;
5233 
5234   /* we rely on single-threaded, non-blocking regexp compilation: */
5235   save = scheme_current_thread->error_buf;
5236   scheme_current_thread->error_buf = &newbuf;
5237   failure_msg_for_read = "yes";
5238   if (!scheme_setjmp(newbuf)) {
5239     if (is_byte) {
5240       if (pcre)
5241 	result = make_pregexp(1, &str);
5242       else
5243 	result = make_regexp(1, &str);
5244     } else {
5245       if (pcre)
5246 	result = make_utf8_pregexp(1, &str);
5247       else
5248 	result = make_utf8_regexp(1, &str);
5249     }
5250   } else {
5251     result = (Scheme_Object *)failure_msg_for_read;
5252     *result_is_err_string = 1;
5253   }
5254 
5255   failure_msg_for_read = NULL;
5256   scheme_current_thread->error_buf = save;
5257   return result;
5258 }
5259 
regcomp_object(Scheme_Object * str)5260 static regexp *regcomp_object(Scheme_Object *str)
5261 {
5262   if (SCHEME_BYTE_STRINGP(str))
5263     return (regexp *)make_regexp(1, &str);
5264   else
5265     return (regexp *)make_utf8_regexp(1, &str);
5266 }
5267 
scheme_clear_rx_buffers(void)5268 void scheme_clear_rx_buffers(void)
5269 {
5270   startp_buffer_cache = NULL;
5271   endp_buffer_cache = NULL;
5272   maybep_buffer_cache = NULL;
5273   match_stack_buffer_cache = NULL;
5274 }
5275 
gen_compare(char * name,int pos,int argc,Scheme_Object * argv[],int peek,int nonblock,int last_bytes)5276 static Scheme_Object *gen_compare(char *name, int pos,
5277 				  int argc, Scheme_Object *argv[],
5278 				  int peek, int nonblock, int last_bytes)
5279 {
5280   regexp *r;
5281   char *full_s, *prefix = NULL;
5282   rxpos *startp, *maybep, *endp, *match_stack, prefix_len = 0, prefix_offset = 0, minpos;
5283   int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
5284   Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
5285   Scheme_Object *last_bytes_str = scheme_false, *srcin;
5286   rx_lazy_str_t *lazy_string = NULL;
5287 
5288   if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
5289       && !SCHEME_BYTE_STRINGP(argv[0])
5290       && !SCHEME_CHAR_STRINGP(argv[0]))
5291     scheme_wrong_contract(name, "(or/c regexp? byte-regexp? string? bytes?)", 0, argc, argv);
5292   if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
5293       && !SCHEME_INPUT_PORTP(argv[1])
5294       && !SCHEME_PATHP(argv[1]))
5295     scheme_wrong_contract(name, peek ? "input-port?" : "(or/c string? bytes? path? input-port?)", 1, argc, argv);
5296 
5297   srcin = argv[1];
5298   if (SCHEME_PATHP(srcin)) {
5299     if (SCHEME_BYTE_STRINGP(argv[0])
5300         || (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type)
5301             && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)))
5302       srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin),
5303                                             SCHEME_PATH_LEN(srcin),
5304                                             1);
5305     else
5306       srcin = scheme_path_to_char_string(srcin);
5307   }
5308 
5309   if (SCHEME_CHAR_STRINGP(srcin)) {
5310     iport = NULL;
5311     endset = SCHEME_CHAR_STRLEN_VAL(srcin);
5312   } else if (SCHEME_INPUT_PORTP(srcin)) {
5313     iport = srcin;
5314     endset = -2;
5315   } else {
5316     iport = NULL;
5317     endset = SCHEME_BYTE_STRLEN_VAL(srcin);
5318   }
5319 
5320   if (argc > 2) {
5321     int len = endset;
5322 
5323     offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
5324 
5325     if (!iport && (offset > len)) {
5326       scheme_out_of_range(name, NULL, "offset ", argv[2], srcin, 0, len);
5327       return NULL;
5328     } else if (offset < 0) {
5329       /* argument was a bignum */
5330       offset = 0x7FFFFFFF;
5331     }
5332     startv = argv[2];
5333 
5334     if (argc > 3) {
5335       if (!SCHEME_FALSEP(argv[3])) {
5336 	endset = scheme_extract_index(name, 3, argc, argv, len + 1, 1);
5337 
5338 	if (iport) {
5339 	  if (endset < 0) {
5340 	    /* argument was a bignum */
5341 	    endset = 0x7FFFFFFF;
5342 	  }
5343 	  /* compare numbers */
5344 	  if (scheme_bin_lt(argv[3], argv[2])) {
5345             scheme_contract_error(name,
5346                                   "ending index is smaller than starting index",
5347                                   "starting index", 1, argv[2],
5348                                   "ending index", 1, argv[3],
5349                                   NULL);
5350 	    return NULL;
5351 	  }
5352 	} else if (endset < offset || endset > len) {
5353 	  scheme_out_of_range(name, NULL, "ending ", argv[3], srcin, offset, len);
5354 	  return NULL;
5355 	}
5356 	endv = argv[3];
5357       }
5358 
5359       if (argc > 4) {
5360 	if (peek) {
5361 	  if (!SCHEME_FALSEP(argv[4])) {
5362 	    unless_evt = argv[4];
5363 	    if (!SAME_TYPE(SCHEME_TYPE(unless_evt), scheme_progress_evt_type)) {
5364 	      scheme_wrong_contract(name, "(or/c progress-evt? #f)", 4, argc, argv);
5365 	      return NULL;
5366 	    }
5367 	    if (!iport) {
5368 	      scheme_contract_error(name,
5369                                     "progress evt cannot be used with string input",
5370                                     "progress evt", 1, unless_evt,
5371                                     NULL);
5372 	    } else if (!SAME_OBJ(iport, SCHEME_PTR1_VAL(unless_evt))) {
5373 	      scheme_contract_error(name,
5374                                     "evt is not a progress evt for the given port",
5375                                     "progress evt", 1, unless_evt,
5376                                     "port", 1, iport,
5377                                     NULL);
5378 	      return NULL;
5379 	    }
5380 	  }
5381 	} else {
5382 	  if (SCHEME_TRUEP(argv[4])) {
5383 	    if (!SCHEME_OUTPUT_PORTP(argv[4]))
5384 	      scheme_wrong_contract(name, "(or/c output-port? #f)", 4, argc, argv);
5385 	    oport = argv[4];
5386 	  }
5387 	}
5388       }
5389 
5390       if (argc > 5) {
5391         if (!SCHEME_BYTE_STRINGP(argv[5]))
5392           scheme_wrong_contract(name, "bytes?", 5, argc, argv);
5393         prefix = SCHEME_BYTE_STR_VAL(argv[5]);
5394         prefix_len = SCHEME_BYTE_STRLEN_VAL(argv[5]);
5395         prefix_offset = 0;
5396 
5397         if (argc > 6) {
5398           if (!scheme_nonneg_exact_p(argv[6]))
5399             scheme_wrong_contract(name, "exact-nonnegative-integer?", 6, argc, argv);
5400           if (SCHEME_INTP(argv[6]))
5401             last_bytes_count = SCHEME_INT_VAL(argv[6]);
5402           else
5403             last_bytes_count = -1; /* => as many as available */
5404         }
5405       }
5406     }
5407   }
5408 
5409   if (iport && !startv)
5410     startv = scheme_make_integer(0);
5411 
5412   if (SCHEME_BYTE_STRINGP(argv[0])
5413       || SCHEME_CHAR_STRINGP(argv[0]))
5414     r = regcomp_object(argv[0]);
5415   else
5416     r = (regexp *)argv[0];
5417 
5418   was_non_byte = 0;
5419   orig_offset = 0; /* extra offset */
5420   if (!iport) {
5421     if (SCHEME_BYTE_STRINGP(srcin))
5422       full_s = SCHEME_BYTE_STR_VAL(srcin);
5423     else {
5424       /* Extract substring and UTF-8 encode: */
5425       if (endset - offset < LAZY_STRING_CHUNK_SIZE) {
5426         /* String is short enough to decode in one go: */
5427         int blen;
5428         blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
5429                                   NULL, 0,
5430                                   0 /* not UTF-16 */);
5431         full_s = (char *)scheme_malloc_atomic(blen);
5432         scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
5433                            (unsigned char *)full_s, 0,
5434                            0 /* not UTF-16 */);
5435         orig_offset = offset;
5436         offset = 0;
5437         endset = blen;
5438       } else {
5439         /* Handle extremely long strings by decoding lazily: */
5440         lazy_string = MALLOC_ONE_RT(rx_lazy_str_t);
5441 #ifdef MZTAG_REQUIRED
5442         lazy_string->type = scheme_rt_rx_lazy_string;
5443 #endif
5444         lazy_string->start = offset;
5445         lazy_string->end = endset;
5446         lazy_string->done = 0;
5447         lazy_string->blen = 0;
5448         lazy_string->s = NULL;
5449         lazy_string->chars = SCHEME_CHAR_STR_VAL(srcin);
5450         full_s = NULL;
5451         orig_offset = offset;
5452         offset = 0;
5453         endset = 0;
5454       }
5455       if (r->flags & REGEXP_IS_UTF8)
5456 	was_non_byte = 1;
5457       else {
5458 	/* Convert orig_offset into encoded bytes */
5459 	orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset,
5460 					 NULL, 0,
5461 					 0);
5462       }
5463     }
5464   } else
5465     full_s = NULL;
5466 
5467   if (startp_buffer_cache && (r->nsubexp <= rx_buffer_size)) {
5468     startp = startp_buffer_cache;
5469     maybep = maybep_buffer_cache;
5470     endp = endp_buffer_cache;
5471     startp_buffer_cache = NULL;
5472   } else {
5473     startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5474     maybep = NULL;
5475     match_stack = NULL;
5476     endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5477   }
5478   if ((r->nsubexp > 1) && !maybep) {
5479     maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5480 
5481     if (match_stack_buffer_cache) {
5482       match_stack = match_stack_buffer_cache;
5483       match_stack_buffer_cache = NULL;
5484     } else
5485       match_stack = MALLOC_N_ATOMIC(rxpos, MATCH_STACK_SIZE);
5486   } else {
5487     match_stack = NULL;
5488   }
5489 
5490   dropped = scheme_make_integer(0);
5491 
5492   m = regexec(name, r, full_s,
5493               offset, (endset < 0 ? endset : endset - offset),
5494               offset, lazy_string,
5495               startp, maybep, endp, match_stack,
5496 	      iport, unless_evt, nonblock,
5497 	      &full_s, peek, pos, last_bytes_count, oport,
5498 	      startv, endv, &dropped,
5499               prefix, prefix_len, prefix_offset);
5500 
5501   if (lazy_string) {
5502     full_s = lazy_string->s;
5503     endset = lazy_string->blen;
5504   }
5505 
5506   if (iport) {
5507     minpos = -prefix_len;
5508     offset = 0;
5509   } else
5510     minpos = offset - prefix_len;
5511 
5512   if (m) {
5513     int i;
5514     Scheme_Object *l = scheme_null, *rs;
5515 
5516     if (oport && !iport)
5517       scheme_put_byte_string(name, oport, full_s, 0, *startp, 0);
5518 
5519     if (last_bytes) {
5520       rxpos frompos, tooffset;
5521 
5522       if ((last_bytes_count < 0)
5523           || (endp[0] - minpos < last_bytes_count))
5524         last_bytes_count = endp[0] - minpos;
5525 
5526       if (!last_bytes_count) {
5527         last_bytes_str = empty_byte_string;
5528       } else {
5529         frompos = endp[0] - last_bytes_count;
5530         tooffset = 0;
5531 
5532         last_bytes_str = scheme_alloc_byte_string(last_bytes_count, 0);
5533         if (frompos < offset) {
5534           /* draw from prefix: */
5535           rxpos amt = last_bytes_count;
5536           if (frompos + last_bytes_count > offset)
5537             amt = offset - frompos;
5538           memcpy(SCHEME_BYTE_STR_VAL(last_bytes_str) XFORM_OK_PLUS tooffset,
5539                  prefix + prefix_offset + prefix_len - (offset - frompos),
5540                  amt);
5541           frompos += amt;
5542           tooffset += amt;
5543           last_bytes_count -= amt;
5544         }
5545         memcpy(SCHEME_BYTE_STR_VAL(last_bytes_str) XFORM_OK_PLUS tooffset,
5546                full_s + frompos,
5547                last_bytes_count);
5548       }
5549     }
5550 
5551     if (pos > 1) {
5552       /* pos == 2 => just get true or false */
5553       dropped = scheme_true;
5554     } else {
5555       for (i = r->nsubexp; i--; ) {
5556 	if (startp[i] >= minpos) {
5557 	  if (pos) {
5558 	    Scheme_Object *startpd, *endpd;
5559 
5560 	    if (was_non_byte) {
5561 	      /* Need to figure out how startpd and endpd correspond to
5562 		 code points. Note that the input regexp matches only
5563 		 unicode chars, so the start and end points can't be in
5564 		 the middle of encoded characters. */
5565 	      int uspd, uepd;
5566               if (startp[i] >= offset)
5567                 uspd = scheme_utf8_decode((const unsigned char *)full_s, offset, startp[i],
5568                                           NULL, 0, -1,
5569                                           NULL, 0, 0);
5570               else {
5571                 uspd = scheme_utf8_decode((const unsigned char *)prefix,
5572                                           prefix_offset + prefix_len + (startp[i] - offset),
5573                                           prefix_offset + prefix_len,
5574                                           NULL, 0, -1,
5575                                           NULL, 0, 0);
5576                 uspd = offset - uspd;
5577               }
5578 	      uspd += orig_offset;
5579 	      startpd = scheme_make_integer(uspd);
5580               if (startp[i] >= offset) {
5581                 uepd = scheme_utf8_decode((const unsigned char *)full_s, startp[i], endp[i],
5582                                           NULL, 0, -1,
5583                                           NULL, 0, 0);
5584                 uepd += uspd;
5585               } else if (endp[i] <= offset) {
5586                 uepd = scheme_utf8_decode((const unsigned char *)prefix,
5587                                           prefix_offset + prefix_len + (endp[i] - offset),
5588                                           prefix_offset + prefix_len,
5589                                           NULL, 0, -1,
5590                                           NULL, 0, 0);
5591                 uepd = offset - uepd;
5592                 uepd += orig_offset;
5593               } else {
5594                 uepd = scheme_utf8_decode((const unsigned char *)full_s, 0, endp[i],
5595                                           NULL, 0, -1,
5596                                           NULL, 0, 0);
5597                 uepd += orig_offset;
5598               }
5599 	      endpd = scheme_make_integer(uepd);
5600 	    } else {
5601 	      int v;
5602 	      v = startp[i] + orig_offset;
5603 	      startpd = scheme_make_integer(v);
5604 	      v = endp[i] + orig_offset;
5605 	      endpd = scheme_make_integer(v);
5606 
5607 	      if (iport) {
5608 		/* Increment by drop count: */
5609 		startpd = scheme_bin_plus(startpd, dropped);
5610 		endpd = scheme_bin_plus(endpd, dropped);
5611 	      }
5612 	    }
5613 
5614 	    l = scheme_make_pair(scheme_make_pair(startpd, endpd),
5615 				 l);
5616 	  } else {
5617 	    intptr_t len;
5618 	    len = endp[i] - startp[i];
5619             if (startp[i] >= offset) {
5620               if (was_non_byte) {
5621                 rs = scheme_make_sized_offset_utf8_string(full_s, startp[i], len);
5622               } else {
5623                 rs = scheme_make_sized_offset_byte_string(full_s, startp[i], len, 1);
5624               }
5625             } else if (endp[i] <= offset) {
5626               /* all in prefix */
5627               rs = scheme_make_sized_offset_byte_string(prefix,
5628                                                         prefix_offset + (startp[i] - minpos),
5629                                                         endp[i] - startp[i],
5630                                                         1);
5631               if (was_non_byte)
5632                 rs = scheme_byte_string_to_char_string(rs);
5633             } else {
5634               /* span both */
5635               Scheme_Object *rs2;
5636               rs = scheme_make_sized_offset_byte_string(prefix,
5637                                                         prefix_offset + (startp[i] - minpos),
5638                                                         offset - startp[i],
5639                                                         1);
5640               rs2 = scheme_make_sized_offset_byte_string(full_s, offset, endp[i] - offset, 1);
5641               rs = scheme_append_byte_string(rs, rs2);
5642               if (was_non_byte)
5643                 rs = scheme_byte_string_to_char_string(rs);
5644             }
5645 	    l = scheme_make_pair(rs, l);
5646 	  }
5647 	} else
5648 	  l = scheme_make_pair(scheme_false, l);
5649       }
5650       dropped = l;
5651     }
5652   } else {
5653     if (oport && !iport)
5654       scheme_put_byte_string(name, oport, full_s, 0, endset, 0);
5655 
5656     dropped = scheme_false;
5657     last_bytes_str = scheme_false;
5658   }
5659 
5660   if (!startp_buffer_cache || (r->nsubexp > rx_buffer_size)) {
5661     rx_buffer_size = r->nsubexp;
5662     startp_buffer_cache = startp;
5663     maybep_buffer_cache = maybep;
5664     endp_buffer_cache = endp;
5665   } else if (maybep && !maybep_buffer_cache && (r->nsubexp == rx_buffer_size)) {
5666     maybep_buffer_cache = maybep;
5667   }
5668   if (match_stack && !match_stack_buffer_cache)
5669     match_stack_buffer_cache = match_stack;
5670 
5671   if (last_bytes) {
5672     Scheme_Object *a[2];
5673     a[0] = dropped;
5674     a[1] = last_bytes_str;
5675     return scheme_values(2, a);
5676   } else
5677     return dropped;
5678 }
5679 
compare(int argc,Scheme_Object * argv[])5680 static Scheme_Object *compare(int argc, Scheme_Object *argv[])
5681 {
5682   return gen_compare("regexp-match", 0, argc, argv, 0, 0, 0);
5683 }
5684 
compare_end(int argc,Scheme_Object * argv[])5685 static Scheme_Object *compare_end(int argc, Scheme_Object *argv[])
5686 {
5687   return gen_compare("regexp-match/end", 0, argc, argv, 0, 0, 1);
5688 }
5689 
positions(int argc,Scheme_Object * argv[])5690 static Scheme_Object *positions(int argc, Scheme_Object *argv[])
5691 {
5692   return gen_compare("regexp-match-positions", 1, argc, argv, 0, 0, 0);
5693 }
5694 
positions_end(int argc,Scheme_Object * argv[])5695 static Scheme_Object *positions_end(int argc, Scheme_Object *argv[])
5696 {
5697   return gen_compare("regexp-match-positions/end", 1, argc, argv, 0, 0, 1);
5698 }
5699 
compare_bool(int argc,Scheme_Object * argv[])5700 static Scheme_Object *compare_bool(int argc, Scheme_Object *argv[])
5701 {
5702   return gen_compare("regexp-match?", 2, argc, argv, 0, 0, 0);
5703 }
5704 
scheme_regexp_match_p(Scheme_Object * regexp,Scheme_Object * target)5705 int scheme_regexp_match_p(Scheme_Object *regexp, Scheme_Object *target)
5706 {
5707   Scheme_Object *a[2];
5708   a[0] = regexp;
5709   a[1] = target;
5710   return SCHEME_TRUEP(compare_bool(2, a));
5711 }
5712 
compare_peek(int argc,Scheme_Object * argv[])5713 static Scheme_Object *compare_peek(int argc, Scheme_Object *argv[])
5714 {
5715   return gen_compare("regexp-match-peek", 0, argc, argv, 1, 0, 0);
5716 }
5717 
positions_peek(int argc,Scheme_Object * argv[])5718 static Scheme_Object *positions_peek(int argc, Scheme_Object *argv[])
5719 {
5720   return gen_compare("regexp-match-peek-positions", 1, argc, argv, 1, 0, 0);
5721 }
5722 
positions_peek_end(int argc,Scheme_Object * argv[])5723 static Scheme_Object *positions_peek_end(int argc, Scheme_Object *argv[])
5724 {
5725   return gen_compare("regexp-match-peek-positions/end", 1, argc, argv, 1, 0, 1);
5726 }
5727 
compare_peek_nonblock(int argc,Scheme_Object * argv[])5728 static Scheme_Object *compare_peek_nonblock(int argc, Scheme_Object *argv[])
5729 {
5730   return gen_compare("regexp-match-peek-immediate", 0, argc, argv, 1, 1, 0);
5731 }
5732 
positions_peek_nonblock(int argc,Scheme_Object * argv[])5733 static Scheme_Object *positions_peek_nonblock(int argc, Scheme_Object *argv[])
5734 {
5735   return gen_compare("regexp-match-peek-positions-immediate", 1, argc, argv, 1, 1, 0);
5736 }
5737 
positions_peek_nonblock_end(int argc,Scheme_Object * argv[])5738 static Scheme_Object *positions_peek_nonblock_end(int argc, Scheme_Object *argv[])
5739 {
5740   return gen_compare("regexp-match-peek-positions-immediate/end", 1, argc, argv, 1, 1, 1);
5741 }
5742 
build_call_name(const char * n)5743 static char *build_call_name(const char *n)
5744 {
5745   char *m;
5746   int l;
5747   l = strlen(n);
5748   m = (char *)scheme_malloc_atomic(l + 42);
5749   memcpy(m, n, l);
5750   strcpy(m XFORM_OK_PLUS l, " (calling given filter procedure)");
5751   return m;
5752 }
5753 
initial_char_len(unsigned char * source,intptr_t start,intptr_t end)5754 static int initial_char_len(unsigned char *source, intptr_t start, intptr_t end)
5755 {
5756   intptr_t i;
5757 
5758   for (i = start + 1; i <= end; i++) {
5759     if (scheme_utf8_decode_count(source, start, i, NULL, 1, 1)) {
5760       return i - start;
5761     }
5762   }
5763 
5764   return 1;
5765 }
5766 
gen_replace(const char * name,int argc,Scheme_Object * argv[],int all)5767 static Scheme_Object *gen_replace(const char *name, int argc, Scheme_Object *argv[], int all)
5768 {
5769   Scheme_Object *orig;
5770   regexp *r;
5771   char *source, *prefix = NULL, *deststr;
5772   rxpos *startp, *maybep, *endp, minpos;
5773   int prefix_len = 0, prefix_offset = 0, sourcelen, srcoffset = 0, was_non_byte, destlen;
5774   char *result_pre = NULL;
5775   int result_pre_len = 0, result_pre_size = 0;
5776   int cannot_match_more = 0;
5777 
5778   if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
5779       && !SCHEME_BYTE_STRINGP(argv[0])
5780       && !SCHEME_CHAR_STRINGP(argv[0]))
5781     scheme_wrong_contract(name, "(or/c regexp? byte-regexp? string? bytes?)", 0, argc, argv);
5782   if (!SCHEME_BYTE_STRINGP(argv[1])
5783       && !SCHEME_CHAR_STRINGP(argv[1]))
5784     scheme_wrong_contract(name, "(or/c string? bytes?)", 1, argc, argv);
5785   if (!SCHEME_BYTE_STRINGP(argv[2])
5786       && !SCHEME_CHAR_STRINGP(argv[2])
5787       && !SCHEME_PROCP(argv[2]))
5788     scheme_wrong_contract(name, "(or/c string? bytes? procedure?)", 2, argc, argv);
5789 
5790   if (SCHEME_BYTE_STRINGP(argv[2])) {
5791     if (SCHEME_CHAR_STRINGP(argv[0])
5792 	|| ((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
5793 	    && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))) {
5794       if (SCHEME_CHAR_STRINGP(argv[1])) {
5795 	scheme_contract_error(name, "cannot replace a string with a byte string",
5796                               "string-matching regexp", 1, argv[0],
5797                               "byte string", 1, argv[2],
5798                               NULL);
5799       }
5800     }
5801   }
5802 
5803   if (SCHEME_BYTE_STRINGP(argv[0])
5804       || SCHEME_CHAR_STRINGP(argv[0]))
5805     r = regcomp_object(argv[0]);
5806   else
5807     r = (regexp *)argv[0];
5808 
5809   if (0) {
5810     /* This check is disabled for backward compatibility, because
5811        `regexp-replace*` in `racket/private/string` does not check,
5812        and that was the exported `regexp-replace*` for a long time. */
5813     if (SCHEME_PROCP(argv[2])) {
5814       if (!scheme_check_proc_arity(NULL, r->nsubexp, 2, argc, argv)) {
5815         scheme_contract_error(name,
5816                               "replace procedure's arity does not include regexp's match count",
5817                               "regexp", 1, r,
5818                               "regexp match count", 1, scheme_make_integer(r->nsubexp),
5819                               "replace procedure", 1, argv[2],
5820                               NULL);
5821       }
5822     }
5823   }
5824 
5825   if (argc > 3) {
5826     if (!SCHEME_BYTE_STRINGP(argv[3]))
5827       scheme_wrong_contract(name, "bytes?", 3, argc, argv);
5828     prefix = SCHEME_BYTE_STR_VAL(argv[3]);
5829     prefix_len = SCHEME_BYTE_STRLEN_VAL(argv[3]);
5830     prefix_offset = 0;
5831   }
5832 
5833   if (SCHEME_CHAR_STRINGP(argv[1])) {
5834     orig = scheme_char_string_to_byte_string(argv[1]);
5835     if (r->flags & REGEXP_IS_UTF8)
5836       was_non_byte = 1;
5837     else
5838       was_non_byte = 0;
5839   } else {
5840     orig = argv[1];
5841     was_non_byte = 0;
5842   }
5843   source = SCHEME_BYTE_STR_VAL(orig);
5844   sourcelen = SCHEME_BYTE_STRTAG_VAL(orig);
5845   deststr = NULL;
5846   destlen = 0;
5847 
5848   startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5849   if (r->nsubexp > 1)
5850     maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5851   else
5852     maybep = NULL;
5853   endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
5854 
5855   minpos = -prefix_len;
5856 
5857   while (1) {
5858     int m;
5859 
5860     if (cannot_match_more)
5861       m = 0;
5862     else
5863       m = regexec(name, r, source, srcoffset, sourcelen - srcoffset, 0, NULL,
5864                   startp, maybep, endp, NULL,
5865                   NULL, NULL, 0,
5866                   NULL, 0, 0, 0, NULL, NULL, NULL, NULL,
5867                   prefix, prefix_len, prefix_offset);
5868 
5869     if (m) {
5870       char *insert;
5871       intptr_t len, end, startpd, endpd;
5872 
5873       if (SCHEME_PROCP(argv[2])) {
5874         int i;
5875         Scheme_Object *m, **args, *quick_args[5];
5876 
5877 	if (r->nsubexp <= 5) {
5878 	  args = quick_args;
5879 	} else {
5880 	  args = MALLOC_N(Scheme_Object*, r->nsubexp);
5881 	}
5882 
5883         for (i = r->nsubexp; i--; ) {
5884           if (startp[i] < minpos) {
5885             args[i] = scheme_false;
5886           } else {
5887             intptr_t len;
5888             len = endp[i] - startp[i];
5889             if (startp[i] >= 0) {
5890               if (was_non_byte) {
5891                 m = scheme_make_sized_offset_utf8_string(source, startp[i], len);
5892               } else {
5893                 m = scheme_make_sized_offset_byte_string(source, startp[i], len, 1);
5894               }
5895             } else {
5896               /* at least some of prefix is included */
5897               int pre_len = len;
5898 
5899               if (endp[i] > 0)
5900                 pre_len -= endp[i];
5901               m = scheme_make_sized_offset_byte_string(prefix,
5902                                                        prefix_offset + prefix_len + startp[i],
5903                                                        pre_len,
5904                                                        1);
5905               if (was_non_byte)
5906                 m = scheme_byte_string_to_char_string(m);
5907 
5908               if (endp[i] > 0) {
5909                 Scheme_Object *m2;
5910                 if (was_non_byte) {
5911                   m2 = scheme_make_sized_offset_utf8_string(source, 0, endp[i]);
5912                   m = scheme_append_char_string(m, m2);
5913                 } else {
5914                   m2 = scheme_make_sized_offset_byte_string(source, 0, endp[i], 1);
5915                   m = scheme_append_byte_string(m, m2);
5916                 }
5917               }
5918             }
5919             args[i] = m;
5920           }
5921         }
5922 
5923         m = _scheme_apply(argv[2], r->nsubexp, args);
5924 
5925 	if (!was_non_byte) {
5926           if (!SCHEME_BYTE_STRINGP(m)) {
5927 	    args[0] = m;
5928 	    scheme_wrong_contract(build_call_name(name), "bytes?", -1, -1, args);
5929 	  }
5930 	  insert = SCHEME_BYTE_STR_VAL(m);
5931           len = SCHEME_BYTE_STRLEN_VAL(m);
5932         } else {
5933 	  if (!SCHEME_CHAR_STRINGP(m)) {
5934 	    args[0] = m;
5935 	    scheme_wrong_contract(build_call_name(name), "string?", -1, -1, args);
5936 	  }
5937           len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
5938                                    SCHEME_CHAR_STRLEN_VAL(m),
5939                                    NULL, 0, 0 /* not UTF-16 */);
5940           insert = (char *)scheme_malloc_atomic(len);
5941           scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
5942                              SCHEME_CHAR_STRLEN_VAL(m),
5943                              (unsigned char *)insert, 0, 0 /* not UTF-16 */);
5944         }
5945       } else {
5946 	if (!deststr) {
5947 	  if (SCHEME_CHAR_STRINGP(argv[2])) {
5948 	    Scheme_Object *bs;
5949 	    bs = scheme_char_string_to_byte_string(argv[2]);
5950 	    deststr = SCHEME_BYTE_STR_VAL(bs);
5951 	    destlen = SCHEME_BYTE_STRTAG_VAL(bs);
5952 	  } else {
5953 	    deststr = SCHEME_BYTE_STR_VAL(argv[2]);
5954 	    destlen = SCHEME_BYTE_STRTAG_VAL(argv[2]);
5955 	  }
5956 	}
5957 	insert = regsub(r, deststr, destlen, &len, source, startp, endp,
5958                         minpos, prefix, prefix_offset);
5959       }
5960 
5961       end = sourcelen;
5962 
5963       startpd = startp[0];
5964       endpd = endp[0];
5965 
5966       if (!startpd && (endpd == end) && !result_pre && !all) {
5967 	if (was_non_byte)
5968 	  return scheme_make_sized_utf8_string(insert, len);
5969 	else
5970 	  return scheme_make_sized_byte_string(insert, len, 0);
5971       } else if (!all) {
5972 	char *result;
5973 	intptr_t total;
5974 
5975 	total = len + (startpd - srcoffset) + (end - endpd);
5976 
5977 	result = (char *)scheme_malloc_atomic(total + 1);
5978 	memcpy(result, source + srcoffset, startpd - srcoffset);
5979 	memcpy(result + (startpd - srcoffset), insert, len);
5980 	memcpy(result + (startpd - srcoffset) + len, source + endpd, (end - endpd) + 1);
5981 
5982 	if (was_non_byte)
5983 	  return scheme_make_sized_utf8_string(result, total);
5984 	else
5985 	  return scheme_make_sized_byte_string(result, total, 0);
5986       } else {
5987 	intptr_t total;
5988         int more;
5989 
5990         if (startpd == endpd)  {
5991           if (endpd == end) {
5992             more = 0;
5993             cannot_match_more = 1;
5994           } else if (was_non_byte)
5995             more = initial_char_len((unsigned char *)source, startpd, sourcelen);
5996           else
5997             more = 1;
5998         } else
5999           more = 0;
6000 
6001 	total = len + (startpd - srcoffset) + more;
6002         if (!result_pre || (total + result_pre_len > result_pre_size)) {
6003           char *naya;
6004           intptr_t new_size = (2 * ((result_pre_len < total) ? total : result_pre_len)) + 1;
6005           naya = (char *)scheme_malloc_atomic(new_size);
6006           if (result_pre_size > 0)
6007             memcpy(naya, result_pre, result_pre_len);
6008           result_pre = naya;
6009           result_pre_size = new_size;
6010         }
6011 
6012 	memcpy(result_pre + result_pre_len, source + srcoffset, startpd - srcoffset);
6013 	memcpy(result_pre + result_pre_len + (startpd - srcoffset), insert, len);
6014         if (more) {
6015           memcpy(result_pre + result_pre_len + (endpd - srcoffset) + len, source + startpd, more);
6016         }
6017         result_pre_len += total;
6018 
6019 	srcoffset = endpd + more;
6020       }
6021     } else if (!result_pre) {
6022       if (was_non_byte)
6023 	return argv[1];
6024       else
6025 	return orig;
6026     } else {
6027       char *result;
6028       intptr_t total, slen;
6029 
6030       slen = sourcelen - srcoffset;
6031       total = result_pre_len + slen;
6032 
6033       result = (char *)scheme_malloc_atomic(total + 1);
6034       memcpy(result, result_pre, result_pre_len);
6035       memcpy(result + result_pre_len, source + srcoffset, slen);
6036       result[result_pre_len + slen] = 0;
6037 
6038       if (was_non_byte)
6039 	return scheme_make_sized_utf8_string(result, total);
6040       else
6041 	return scheme_make_sized_byte_string(result, total, 0);
6042     }
6043 
6044     SCHEME_USE_FUEL(1);
6045   }
6046 }
6047 
replace(int argc,Scheme_Object * argv[])6048 static Scheme_Object *replace(int argc, Scheme_Object *argv[])
6049 {
6050   return gen_replace("regexp-replace", argc, argv, 0);
6051 }
6052 
replace_star(int argc,Scheme_Object * argv[])6053 static Scheme_Object *replace_star(int argc, Scheme_Object *argv[])
6054 {
6055   return gen_replace("regexp-replace*", argc, argv, 1);
6056 }
6057 
regexp_p(int argc,Scheme_Object * argv[])6058 static Scheme_Object *regexp_p(int argc, Scheme_Object *argv[])
6059 {
6060   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6061 	   && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
6062 	  ? scheme_true
6063 	  : scheme_false);
6064 }
6065 
byte_regexp_p(int argc,Scheme_Object * argv[])6066 static Scheme_Object *byte_regexp_p(int argc, Scheme_Object *argv[])
6067 {
6068   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6069 	   && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
6070 	  ? scheme_true
6071 	  : scheme_false);
6072 }
6073 
pregexp_p(int argc,Scheme_Object * argv[])6074 static Scheme_Object *pregexp_p(int argc, Scheme_Object *argv[])
6075 {
6076   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6077 	   && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
6078 	   && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
6079 	  ? scheme_true
6080 	  : scheme_false);
6081 }
6082 
byte_pregexp_p(int argc,Scheme_Object * argv[])6083 static Scheme_Object *byte_pregexp_p(int argc, Scheme_Object *argv[])
6084 {
6085   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
6086 	   && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
6087 	   && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
6088 	  ? scheme_true
6089 	  : scheme_false);
6090 }
6091 
regexp_lookbehind(int argc,Scheme_Object * argv[])6092 Scheme_Object *regexp_lookbehind(int argc, Scheme_Object *argv[])
6093 {
6094   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type))
6095     scheme_wrong_contract("regexp-max-lookbehind", "(or/c regexp? byte-regexp?)", 0, argc, argv);
6096   return scheme_make_integer(((regexp *)argv[0])->maxlookback);
6097 }
6098 
scheme_regexp_source(Scheme_Object * re)6099 Scheme_Object *scheme_regexp_source(Scheme_Object *re)
6100 {
6101   return ((regexp *)re)->source;
6102 }
6103 
scheme_regexp_is_byte(Scheme_Object * re)6104 int scheme_regexp_is_byte(Scheme_Object *re)
6105 {
6106   return !(((regexp *)re)->flags & REGEXP_IS_UTF8);
6107 }
6108 
scheme_regexp_is_pregexp(Scheme_Object * re)6109 int scheme_regexp_is_pregexp(Scheme_Object *re)
6110 {
6111   return !!(((regexp *)re)->flags & REGEXP_IS_PCRE);
6112 }
6113 
6114 #ifdef MZ_PRECISE_GC
6115 START_XFORM_SKIP;
6116 #include "mzmark_regexp.inc"
6117 END_XFORM_SKIP;
6118 #endif
6119 
scheme_regexp_initialize(Scheme_Startup_Env * env)6120 void scheme_regexp_initialize(Scheme_Startup_Env *env)
6121 {
6122 #ifdef MZ_PRECISE_GC
6123   GC_REG_TRAV(scheme_regexp_type, mark_regexp);
6124   GC_REG_TRAV(scheme_rt_regwork, mark_regwork);
6125   GC_REG_TRAV(scheme_rt_rx_lazy_string, mark_lazy_string);
6126 #endif
6127 
6128   REGISTER_SO(empty_byte_string);
6129   empty_byte_string = scheme_alloc_byte_string(0, 0);
6130 
6131   ADD_PRIM_W_ARITY("byte-regexp",                           make_regexp,             1, 2, env);
6132   ADD_PRIM_W_ARITY("regexp",                                make_utf8_regexp,        1, 2, env);
6133   ADD_PRIM_W_ARITY("byte-pregexp",                          make_pregexp,            1, 2, env);
6134   ADD_PRIM_W_ARITY("pregexp",                               make_utf8_pregexp,       1, 2, env);
6135   ADD_PRIM_W_ARITY("regexp-match",                          compare,                 2, 6, env);
6136   ADD_PRIM_W_ARITY2("regexp-match/end",                      compare_end,            2, 7, 2, 2, env);
6137   ADD_PRIM_W_ARITY("regexp-match-positions",                positions,               2, 6, env);
6138   ADD_PRIM_W_ARITY2("regexp-match-positions/end",            positions_end,          2, 7, 2, 2, env);
6139   ADD_PRIM_W_ARITY("regexp-match?",                         compare_bool,            2, 6, env);
6140   ADD_PRIM_W_ARITY("regexp-match-peek",                     compare_peek,            2, 6, env);
6141   ADD_PRIM_W_ARITY("regexp-match-peek-positions",           positions_peek,          2, 6, env);
6142   ADD_PRIM_W_ARITY2("regexp-match-peek-positions/end",       positions_peek_end,     2, 7, 2, 2, env);
6143   ADD_PRIM_W_ARITY("regexp-match-peek-immediate",           compare_peek_nonblock,   2, 6, env);
6144   ADD_PRIM_W_ARITY("regexp-match-peek-positions-immediate", positions_peek_nonblock, 2, 6, env);
6145   ADD_PRIM_W_ARITY2("regexp-match-peek-positions-immediate/end", positions_peek_nonblock_end, 2, 7, 2, 2, env);
6146   ADD_PRIM_W_ARITY("regexp-replace",                        replace,                 3, 4, env);
6147   ADD_PRIM_W_ARITY("regexp-replace*",                       replace_star,            3, 4, env);
6148 
6149   ADD_FOLDING_PRIM("regexp?",                               regexp_p,        1, 1, 1, env);
6150   ADD_FOLDING_PRIM("byte-regexp?",                          byte_regexp_p,   1, 1, 1, env);
6151   ADD_FOLDING_PRIM("pregexp?",                              pregexp_p,       1, 1, 1, env);
6152   ADD_FOLDING_PRIM("byte-pregexp?",                         byte_pregexp_p,  1, 1, 1, env);
6153 
6154   ADD_FOLDING_PRIM("regexp-max-lookbehind",                 regexp_lookbehind, 1, 1, 1, env);
6155 }
6156 
scheme_init_regexp_places()6157 void scheme_init_regexp_places()
6158 {
6159   REGISTER_SO(regparsestr);
6160   REGISTER_SO(regstr);
6161   REGISTER_SO(regbackknown);
6162   REGISTER_SO(regbackdepends);
6163   REGISTER_SO(regerrorproc);
6164   REGISTER_SO(regerrorval);
6165 }
6166