1 /*
2  * Regular Expression Engine
3  *
4  * Copyright (c) 2017-2018 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <inttypes.h>
28 #include <string.h>
29 #include <assert.h>
30 
31 #include "cutils.h"
32 #include "libregexp.h"
33 
34 /*
35   TODO:
36 
37   - Add full unicode canonicalize rules for character ranges (not
38     really useful but needed for exact "ignorecase" compatibility).
39 
40   - Add a lock step execution mode (=linear time execution guaranteed)
41     when the regular expression is "simple" i.e. no backreference nor
42     complicated lookahead. The opcodes are designed for this execution
43     model.
44 */
45 
46 #if defined(TEST)
47 #define DUMP_REOP
48 #endif
49 
50 typedef enum {
51 #define DEF(id, size) REOP_ ## id,
52 #include "libregexp-opcode.h"
53 #undef DEF
54     REOP_COUNT,
55 } REOPCodeEnum;
56 
57 #define CAPTURE_COUNT_MAX 255
58 #define STACK_SIZE_MAX 255
59 
60 /* unicode code points */
61 #define CP_LS   0x2028
62 #define CP_PS   0x2029
63 
64 #define TMP_BUF_SIZE 128
65 
66 typedef struct {
67     DynBuf byte_code;
68     const uint8_t *buf_ptr;
69     const uint8_t *buf_end;
70     const uint8_t *buf_start;
71     int re_flags;
72     BOOL is_utf16;
73     BOOL ignore_case;
74     BOOL dotall;
75     int capture_count;
76     int total_capture_count; /* -1 = not computed yet */
77     int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
78     void *mem_opaque;
79     DynBuf group_names;
80     union {
81         char error_msg[TMP_BUF_SIZE];
82         char tmp_buf[TMP_BUF_SIZE];
83     } u;
84 } REParseState;
85 
86 typedef struct {
87 #ifdef DUMP_REOP
88     const char *name;
89 #endif
90     uint8_t size;
91 } REOpCode;
92 
93 static const REOpCode reopcode_info[REOP_COUNT] = {
94 #ifdef DUMP_REOP
95 #define DEF(id, size) { #id, size },
96 #else
97 #define DEF(id, size) { size },
98 #endif
99 #include "libregexp-opcode.h"
100 #undef DEF
101 };
102 
103 #define RE_HEADER_FLAGS         0
104 #define RE_HEADER_CAPTURE_COUNT 1
105 #define RE_HEADER_STACK_SIZE    2
106 
107 #define RE_HEADER_LEN 7
108 
is_digit(int c)109 static inline int is_digit(int c) {
110     return c >= '0' && c <= '9';
111 }
112 
113 /* insert 'len' bytes at position 'pos' */
dbuf_insert(DynBuf * s,int pos,int len)114 static void dbuf_insert(DynBuf *s, int pos, int len)
115 {
116     dbuf_realloc(s, s->size + len);
117     memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
118     s->size += len;
119 }
120 
121 /* canonicalize with the specific JS regexp rules */
lre_canonicalize(uint32_t c,BOOL is_utf16)122 static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
123 {
124     uint32_t res[LRE_CC_RES_LEN_MAX];
125     int len;
126     if (is_utf16) {
127         if (likely(c < 128)) {
128             if (c >= 'A' && c <= 'Z')
129                 c = c - 'A' + 'a';
130         } else {
131             lre_case_conv(res, c, 2);
132             c = res[0];
133         }
134     } else {
135         if (likely(c < 128)) {
136             if (c >= 'a' && c <= 'z')
137                 c = c - 'a' + 'A';
138         } else {
139             /* legacy regexp: to upper case if single char >= 128 */
140             len = lre_case_conv(res, c, FALSE);
141             if (len == 1 && res[0] >= 128)
142                 c = res[0];
143         }
144     }
145     return c;
146 }
147 
148 static const uint16_t char_range_d[] = {
149     1,
150     0x0030, 0x0039 + 1,
151 };
152 
153 /* code point ranges for Zs,Zl or Zp property */
154 static const uint16_t char_range_s[] = {
155     10,
156     0x0009, 0x000D + 1,
157     0x0020, 0x0020 + 1,
158     0x00A0, 0x00A0 + 1,
159     0x1680, 0x1680 + 1,
160     0x2000, 0x200A + 1,
161     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
162     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
163     0x2028, 0x2029 + 1,
164     0x202F, 0x202F + 1,
165     0x205F, 0x205F + 1,
166     0x3000, 0x3000 + 1,
167     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
168     0xFEFF, 0xFEFF + 1,
169 };
170 
lre_is_space(int c)171 BOOL lre_is_space(int c)
172 {
173     int i, n, low, high;
174     n = (countof(char_range_s) - 1) / 2;
175     for(i = 0; i < n; i++) {
176         low = char_range_s[2 * i + 1];
177         if (c < low)
178             return FALSE;
179         high = char_range_s[2 * i + 2];
180         if (c < high)
181             return TRUE;
182     }
183     return FALSE;
184 }
185 
186 uint32_t const lre_id_start_table_ascii[4] = {
187     /* $ A-Z _ a-z */
188     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
189 };
190 
191 uint32_t const lre_id_continue_table_ascii[4] = {
192     /* $ 0-9 A-Z _ a-z */
193     0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
194 };
195 
196 
197 static const uint16_t char_range_w[] = {
198     4,
199     0x0030, 0x0039 + 1,
200     0x0041, 0x005A + 1,
201     0x005F, 0x005F + 1,
202     0x0061, 0x007A + 1,
203 };
204 
205 #define CLASS_RANGE_BASE 0x40000000
206 
207 typedef enum {
208     CHAR_RANGE_d,
209     CHAR_RANGE_D,
210     CHAR_RANGE_s,
211     CHAR_RANGE_S,
212     CHAR_RANGE_w,
213     CHAR_RANGE_W,
214 } CharRangeEnum;
215 
216 static const uint16_t *char_range_table[] = {
217     char_range_d,
218     char_range_s,
219     char_range_w,
220 };
221 
cr_init_char_range(REParseState * s,CharRange * cr,uint32_t c)222 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
223 {
224     BOOL invert;
225     const uint16_t *c_pt;
226     int len, i;
227 
228     invert = c & 1;
229     c_pt = char_range_table[c >> 1];
230     len = *c_pt++;
231     cr_init(cr, s->mem_opaque, lre_realloc);
232     for(i = 0; i < len * 2; i++) {
233         if (cr_add_point(cr, c_pt[i]))
234             goto fail;
235     }
236     if (invert) {
237         if (cr_invert(cr))
238             goto fail;
239     }
240     return 0;
241  fail:
242     cr_free(cr);
243     return -1;
244 }
245 
cr_canonicalize(CharRange * cr)246 static int cr_canonicalize(CharRange *cr)
247 {
248     CharRange a;
249     uint32_t pt[2];
250     int i, ret;
251 
252     cr_init(&a, cr->mem_opaque, lre_realloc);
253     pt[0] = 'a';
254     pt[1] = 'z' + 1;
255     ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
256     if (ret)
257         goto fail;
258     /* convert to upper case */
259     /* XXX: the generic unicode case would be much more complicated
260        and not really useful */
261     for(i = 0; i < a.len; i++) {
262         a.points[i] += 'A' - 'a';
263     }
264     /* Note: for simplicity we keep the lower case ranges */
265     ret = cr_union1(cr, a.points, a.len);
266  fail:
267     cr_free(&a);
268     return ret;
269 }
270 
271 #ifdef DUMP_REOP
lre_dump_bytecode(const uint8_t * buf,int buf_len)272 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
273                                                      int buf_len)
274 {
275     int pos, len, opcode, bc_len, re_flags, i;
276     uint32_t val;
277 
278     assert(buf_len >= RE_HEADER_LEN);
279 
280     re_flags=  buf[0];
281     bc_len = get_u32(buf + 3);
282     assert(bc_len + RE_HEADER_LEN <= buf_len);
283     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
284            re_flags, buf[1], buf[2]);
285     if (re_flags & LRE_FLAG_NAMED_GROUPS) {
286         const char *p;
287         p = (char *)buf + RE_HEADER_LEN + bc_len;
288         printf("named groups: ");
289         for(i = 1; i < buf[1]; i++) {
290             if (i != 1)
291                 printf(",");
292             printf("<%s>", p);
293             p += strlen(p) + 1;
294         }
295         printf("\n");
296         assert(p == (char *)(buf + buf_len));
297     }
298     printf("bytecode_len=%d\n", bc_len);
299 
300     buf += RE_HEADER_LEN;
301     pos = 0;
302     while (pos < bc_len) {
303         printf("%5u: ", pos);
304         opcode = buf[pos];
305         len = reopcode_info[opcode].size;
306         if (opcode >= REOP_COUNT) {
307             printf(" invalid opcode=0x%02x\n", opcode);
308             break;
309         }
310         if ((pos + len) > bc_len) {
311             printf(" buffer overflow (opcode=0x%02x)\n", opcode);
312             break;
313         }
314         printf("%s", reopcode_info[opcode].name);
315         switch(opcode) {
316         case REOP_char:
317             val = get_u16(buf + pos + 1);
318             if (val >= ' ' && val <= 126)
319                 printf(" '%c'", val);
320             else
321                 printf(" 0x%04x", val);
322             break;
323         case REOP_char32:
324             val = get_u32(buf + pos + 1);
325             if (val >= ' ' && val <= 126)
326                 printf(" '%c'", val);
327             else
328                 printf(" 0x%08x", val);
329             break;
330         case REOP_goto:
331         case REOP_split_goto_first:
332         case REOP_split_next_first:
333         case REOP_loop:
334         case REOP_lookahead:
335         case REOP_negative_lookahead:
336         case REOP_bne_char_pos:
337             val = get_u32(buf + pos + 1);
338             val += (pos + 5);
339             printf(" %u", val);
340             break;
341         case REOP_simple_greedy_quant:
342             printf(" %u %u %u %u",
343                    get_u32(buf + pos + 1) + (pos + 17),
344                    get_u32(buf + pos + 1 + 4),
345                    get_u32(buf + pos + 1 + 8),
346                    get_u32(buf + pos + 1 + 12));
347             break;
348         case REOP_save_start:
349         case REOP_save_end:
350         case REOP_back_reference:
351         case REOP_backward_back_reference:
352             printf(" %u", buf[pos + 1]);
353             break;
354         case REOP_save_reset:
355             printf(" %u %u", buf[pos + 1], buf[pos + 2]);
356             break;
357         case REOP_push_i32:
358             val = get_u32(buf + pos + 1);
359             printf(" %d", val);
360             break;
361         case REOP_range:
362             {
363                 int n, i;
364                 n = get_u16(buf + pos + 1);
365                 len += n * 4;
366                 for(i = 0; i < n * 2; i++) {
367                     val = get_u16(buf + pos + 3 + i * 2);
368                     printf(" 0x%04x", val);
369                 }
370             }
371             break;
372         case REOP_range32:
373             {
374                 int n, i;
375                 n = get_u16(buf + pos + 1);
376                 len += n * 8;
377                 for(i = 0; i < n * 2; i++) {
378                     val = get_u32(buf + pos + 3 + i * 4);
379                     printf(" 0x%08x", val);
380                 }
381             }
382             break;
383         default:
384             break;
385         }
386         printf("\n");
387         pos += len;
388     }
389 }
390 #endif
391 
re_emit_op(REParseState * s,int op)392 static void re_emit_op(REParseState *s, int op)
393 {
394     dbuf_putc(&s->byte_code, op);
395 }
396 
397 /* return the offset of the u32 value */
re_emit_op_u32(REParseState * s,int op,uint32_t val)398 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
399 {
400     int pos;
401     dbuf_putc(&s->byte_code, op);
402     pos = s->byte_code.size;
403     dbuf_put_u32(&s->byte_code, val);
404     return pos;
405 }
406 
re_emit_goto(REParseState * s,int op,uint32_t val)407 static int re_emit_goto(REParseState *s, int op, uint32_t val)
408 {
409     int pos;
410     dbuf_putc(&s->byte_code, op);
411     pos = s->byte_code.size;
412     dbuf_put_u32(&s->byte_code, val - (pos + 4));
413     return pos;
414 }
415 
re_emit_op_u8(REParseState * s,int op,uint32_t val)416 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
417 {
418     dbuf_putc(&s->byte_code, op);
419     dbuf_putc(&s->byte_code, val);
420 }
421 
re_emit_op_u16(REParseState * s,int op,uint32_t val)422 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
423 {
424     dbuf_putc(&s->byte_code, op);
425     dbuf_put_u16(&s->byte_code, val);
426 }
427 
428 #if defined(_MSC_VER)
re_parse_error(REParseState * s,const char * fmt,...)429 static int re_parse_error(REParseState *s, const char *fmt, ...)
430 #else
431 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
432 #endif
433 {
434     va_list ap;
435     va_start(ap, fmt);
436     vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
437     va_end(ap);
438     return -1;
439 }
440 
441 /* Return -1 in case of overflow */
parse_digits(const uint8_t ** pp)442 static int parse_digits(const uint8_t **pp)
443 {
444     const uint8_t *p;
445     uint64_t v;
446     int c;
447 
448     p = *pp;
449     v = 0;
450     for(;;) {
451         c = *p;
452         if (c < '0' || c > '9')
453             break;
454         v = v * 10 + c - '0';
455         if (v >= INT32_MAX)
456             return -1;
457         p++;
458     }
459     *pp = p;
460     return v;
461 }
462 
re_parse_expect(REParseState * s,const uint8_t ** pp,int c)463 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
464 {
465     const uint8_t *p;
466     p = *pp;
467     if (*p != c)
468         return re_parse_error(s, "expecting '%c'", c);
469     p++;
470     *pp = p;
471     return 0;
472 }
473 
474 /* Parse an escape sequence, *pp points after the '\':
475    allow_utf16 value:
476    0 : no UTF-16 escapes allowed
477    1 : UTF-16 escapes allowed
478    2 : UTF-16 escapes allowed and escapes of surrogate pairs are
479    converted to a unicode character (unicode regexp case).
480 
481    Return the unicode char and update *pp if recognized,
482    return -1 if malformed escape,
483    return -2 otherwise. */
lre_parse_escape(const uint8_t ** pp,int allow_utf16)484 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
485 {
486     const uint8_t *p;
487     uint32_t c;
488 
489     p = *pp;
490     c = *p++;
491     switch(c) {
492     case 'b':
493         c = '\b';
494         break;
495     case 'f':
496         c = '\f';
497         break;
498     case 'n':
499         c = '\n';
500         break;
501     case 'r':
502         c = '\r';
503         break;
504     case 't':
505         c = '\t';
506         break;
507     case 'v':
508         c = '\v';
509         break;
510     case 'x':
511     case 'u':
512         {
513             int h, n, i;
514             uint32_t c1;
515 
516             if (*p == '{' && allow_utf16) {
517                 p++;
518                 c = 0;
519                 for(;;) {
520                     h = from_hex(*p++);
521                     if (h < 0)
522                         return -1;
523                     c = (c << 4) | h;
524                     if (c > 0x10FFFF)
525                         return -1;
526                     if (*p == '}')
527                         break;
528                 }
529                 p++;
530             } else {
531                 if (c == 'x') {
532                     n = 2;
533                 } else {
534                     n = 4;
535                 }
536 
537                 c = 0;
538                 for(i = 0; i < n; i++) {
539                     h = from_hex(*p++);
540                     if (h < 0) {
541                         return -1;
542                     }
543                     c = (c << 4) | h;
544                 }
545                 if (c >= 0xd800 && c < 0xdc00 &&
546                     allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
547                     /* convert an escaped surrogate pair into a
548                        unicode char */
549                     c1 = 0;
550                     for(i = 0; i < 4; i++) {
551                         h = from_hex(p[2 + i]);
552                         if (h < 0)
553                             break;
554                         c1 = (c1 << 4) | h;
555                     }
556                     if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
557                         p += 6;
558                         c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
559                     }
560                 }
561             }
562         }
563         break;
564 #if defined(_MSC_VER)
565     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
566 #else
567     case '0' ... '7':
568 #endif
569         c -= '0';
570         if (allow_utf16 == 2) {
571             /* only accept \0 not followed by digit */
572             if (c != 0 || is_digit(*p))
573                 return -1;
574         } else {
575             /* parse a legacy octal sequence */
576             uint32_t v;
577             v = *p - '0';
578             if (v > 7)
579                 break;
580             c = (c << 3) | v;
581             p++;
582             if (c >= 32)
583                 break;
584             v = *p - '0';
585             if (v > 7)
586                 break;
587             c = (c << 3) | v;
588             p++;
589         }
590         break;
591     default:
592         return -2;
593     }
594     *pp = p;
595     return c;
596 }
597 
598 #ifdef CONFIG_ALL_UNICODE
599 /* XXX: we use the same chars for name and value */
is_unicode_char(int c)600 static BOOL is_unicode_char(int c)
601 {
602     return ((c >= '0' && c <= '9') ||
603             (c >= 'A' && c <= 'Z') ||
604             (c >= 'a' && c <= 'z') ||
605             (c == '_'));
606 }
607 
parse_unicode_property(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL is_inv)608 static int parse_unicode_property(REParseState *s, CharRange *cr,
609                                   const uint8_t **pp, BOOL is_inv)
610 {
611     const uint8_t *p;
612     char name[64], value[64];
613     char *q;
614     BOOL script_ext;
615     int ret;
616 
617     p = *pp;
618     if (*p != '{')
619         return re_parse_error(s, "expecting '{' after \\p");
620     p++;
621     q = name;
622     while (is_unicode_char(*p)) {
623         if ((q - name) > sizeof(name) - 1)
624             goto unknown_property_name;
625         *q++ = *p++;
626     }
627     *q = '\0';
628     q = value;
629     if (*p == '=') {
630         p++;
631         while (is_unicode_char(*p)) {
632             if ((q - value) > sizeof(value) - 1)
633                 return re_parse_error(s, "unknown unicode property value");
634             *q++ = *p++;
635         }
636     }
637     *q = '\0';
638     if (*p != '}')
639         return re_parse_error(s, "expecting '}'");
640     p++;
641     //    printf("name=%s value=%s\n", name, value);
642 
643     if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
644         script_ext = FALSE;
645         goto do_script;
646     } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
647         script_ext = TRUE;
648     do_script:
649         cr_init(cr, s->mem_opaque, lre_realloc);
650         ret = unicode_script(cr, value, script_ext);
651         if (ret) {
652             cr_free(cr);
653             if (ret == -2)
654                 return re_parse_error(s, "unknown unicode script");
655             else
656                 goto out_of_memory;
657         }
658     } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
659         cr_init(cr, s->mem_opaque, lre_realloc);
660         ret = unicode_general_category(cr, value);
661         if (ret) {
662             cr_free(cr);
663             if (ret == -2)
664                 return re_parse_error(s, "unknown unicode general category");
665             else
666                 goto out_of_memory;
667         }
668     } else if (value[0] == '\0') {
669         cr_init(cr, s->mem_opaque, lre_realloc);
670         ret = unicode_general_category(cr, name);
671         if (ret == -1) {
672             cr_free(cr);
673             goto out_of_memory;
674         }
675         if (ret < 0) {
676             ret = unicode_prop(cr, name);
677             if (ret) {
678                 cr_free(cr);
679                 if (ret == -2)
680                     goto unknown_property_name;
681                 else
682                     goto out_of_memory;
683             }
684         }
685     } else {
686     unknown_property_name:
687         return re_parse_error(s, "unknown unicode property name");
688     }
689 
690     if (is_inv) {
691         if (cr_invert(cr)) {
692             cr_free(cr);
693             return -1;
694         }
695     }
696     *pp = p;
697     return 0;
698  out_of_memory:
699     return re_parse_error(s, "out of memory");
700 }
701 #endif /* CONFIG_ALL_UNICODE */
702 
703 /* return -1 if error otherwise the character or a class range
704    (CLASS_RANGE_BASE). In case of class range, 'cr' is
705    initialized. Otherwise, it is ignored. */
get_class_atom(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL inclass)706 static int get_class_atom(REParseState *s, CharRange *cr,
707                           const uint8_t **pp, BOOL inclass)
708 {
709     const uint8_t *p;
710     uint32_t c;
711     int ret;
712 
713     p = *pp;
714 
715     c = *p;
716     switch(c) {
717     case '\\':
718         p++;
719         if (p >= s->buf_end)
720             goto unexpected_end;
721         c = *p++;
722         switch(c) {
723         case 'd':
724             c = CHAR_RANGE_d;
725             goto class_range;
726         case 'D':
727             c = CHAR_RANGE_D;
728             goto class_range;
729         case 's':
730             c = CHAR_RANGE_s;
731             goto class_range;
732         case 'S':
733             c = CHAR_RANGE_S;
734             goto class_range;
735         case 'w':
736             c = CHAR_RANGE_w;
737             goto class_range;
738         case 'W':
739             c = CHAR_RANGE_W;
740         class_range:
741             if (cr_init_char_range(s, cr, c))
742                 return -1;
743             c = CLASS_RANGE_BASE;
744             break;
745         case 'c':
746             c = *p;
747             if ((c >= 'a' && c <= 'z') ||
748                 (c >= 'A' && c <= 'Z') ||
749                 (((c >= '0' && c <= '9') || c == '_') &&
750                  inclass && !s->is_utf16)) {   /* Annex B.1.4 */
751                 c &= 0x1f;
752                 p++;
753             } else if (s->is_utf16) {
754                 goto invalid_escape;
755             } else {
756                 /* otherwise return '\' and 'c' */
757                 p--;
758                 c = '\\';
759             }
760             break;
761 #ifdef CONFIG_ALL_UNICODE
762         case 'p':
763         case 'P':
764             if (s->is_utf16) {
765                 if (parse_unicode_property(s, cr, &p, (c == 'P')))
766                     return -1;
767                 c = CLASS_RANGE_BASE;
768                 break;
769             }
770             /* fall thru */
771 #endif
772         default:
773             p--;
774             ret = lre_parse_escape(&p, s->is_utf16 * 2);
775             if (ret >= 0) {
776                 c = ret;
777             } else {
778                 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
779                     /* always valid to escape these characters */
780                     goto normal_char;
781                 } else if (s->is_utf16) {
782                 invalid_escape:
783                     return re_parse_error(s, "invalid escape sequence in regular expression");
784                 } else {
785                     /* just ignore the '\' */
786                     goto normal_char;
787                 }
788             }
789             break;
790         }
791         break;
792     case '\0':
793         if (p >= s->buf_end) {
794         unexpected_end:
795             return re_parse_error(s, "unexpected end");
796         }
797         /* fall thru */
798     default:
799     normal_char:
800         /* normal char */
801         if (c >= 128) {
802             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
803             if ((unsigned)c > 0xffff && !s->is_utf16) {
804                 /* XXX: should handle non BMP-1 code points */
805                 return re_parse_error(s, "malformed unicode char");
806             }
807         } else {
808             p++;
809         }
810         break;
811     }
812     *pp = p;
813     return c;
814 }
815 
re_emit_range(REParseState * s,const CharRange * cr)816 static int re_emit_range(REParseState *s, const CharRange *cr)
817 {
818     int len, i;
819     uint32_t high;
820 
821     len = (unsigned)cr->len / 2;
822     if (len >= 65535)
823         return re_parse_error(s, "too many ranges");
824     if (len == 0) {
825         /* not sure it can really happen. Emit a match that is always
826            false */
827         re_emit_op_u32(s, REOP_char32, -1);
828     } else {
829         high = cr->points[cr->len - 1];
830         if (high == UINT32_MAX)
831             high = cr->points[cr->len - 2];
832         if (high <= 0xffff) {
833             /* can use 16 bit ranges with the conversion that 0xffff =
834                infinity */
835             re_emit_op_u16(s, REOP_range, len);
836             for(i = 0; i < cr->len; i += 2) {
837                 dbuf_put_u16(&s->byte_code, cr->points[i]);
838                 high = cr->points[i + 1] - 1;
839                 if (high == UINT32_MAX - 1)
840                     high = 0xffff;
841                 dbuf_put_u16(&s->byte_code, high);
842             }
843         } else {
844             re_emit_op_u16(s, REOP_range32, len);
845             for(i = 0; i < cr->len; i += 2) {
846                 dbuf_put_u32(&s->byte_code, cr->points[i]);
847                 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
848             }
849         }
850     }
851     return 0;
852 }
853 
re_parse_char_class(REParseState * s,const uint8_t ** pp)854 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
855 {
856     const uint8_t *p;
857     uint32_t c1, c2;
858     CharRange cr_s, *cr = &cr_s;
859     CharRange cr1_s, *cr1 = &cr1_s;
860     BOOL invert;
861 
862     cr_init(cr, s->mem_opaque, lre_realloc);
863     p = *pp;
864     p++;    /* skip '[' */
865     invert = FALSE;
866     if (*p == '^') {
867         p++;
868         invert = TRUE;
869     }
870     for(;;) {
871         if (*p == ']')
872             break;
873         c1 = get_class_atom(s, cr1, &p, TRUE);
874         if ((int)c1 < 0)
875             goto fail;
876         if (*p == '-' && p[1] != ']') {
877             const uint8_t *p0 = p + 1;
878             if (c1 >= CLASS_RANGE_BASE) {
879                 if (s->is_utf16) {
880                     cr_free(cr1);
881                     goto invalid_class_range;
882                 }
883                 /* Annex B: match '-' character */
884                 goto class_atom;
885             }
886             c2 = get_class_atom(s, cr1, &p0, TRUE);
887             if ((int)c2 < 0)
888                 goto fail;
889             if (c2 >= CLASS_RANGE_BASE) {
890                 cr_free(cr1);
891                 if (s->is_utf16) {
892                     goto invalid_class_range;
893                 }
894                 /* Annex B: match '-' character */
895                 goto class_atom;
896             }
897             p = p0;
898             if (c2 < c1) {
899             invalid_class_range:
900                 re_parse_error(s, "invalid class range");
901                 goto fail;
902             }
903             if (cr_union_interval(cr, c1, c2))
904                 goto memory_error;
905         } else {
906         class_atom:
907             if (c1 >= CLASS_RANGE_BASE) {
908                 int ret;
909                 ret = cr_union1(cr, cr1->points, cr1->len);
910                 cr_free(cr1);
911                 if (ret)
912                     goto memory_error;
913             } else {
914                 if (cr_union_interval(cr, c1, c1))
915                     goto memory_error;
916             }
917         }
918     }
919     if (s->ignore_case) {
920         if (cr_canonicalize(cr))
921             goto memory_error;
922     }
923     if (invert) {
924         if (cr_invert(cr))
925             goto memory_error;
926     }
927     if (re_emit_range(s, cr))
928         goto fail;
929     cr_free(cr);
930     p++;    /* skip ']' */
931     *pp = p;
932     return 0;
933  memory_error:
934     re_parse_error(s, "out of memory");
935  fail:
936     cr_free(cr);
937     return -1;
938 }
939 
940 /* Return:
941    1 if the opcodes in bc_buf[] always advance the character pointer.
942    0 if the character pointer may not be advanced.
943    -1 if the code may depend on side effects of its previous execution (backreference)
944 */
re_check_advance(const uint8_t * bc_buf,int bc_buf_len)945 static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
946 {
947     int pos, opcode, ret, len, i;
948     uint32_t val, last;
949     BOOL has_back_reference;
950     uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
951 
952     ret = -2; /* not known yet */
953     pos = 0;
954     has_back_reference = FALSE;
955     memset(capture_bitmap, 0, sizeof(capture_bitmap));
956 
957     while (pos < bc_buf_len) {
958         opcode = bc_buf[pos];
959         len = reopcode_info[opcode].size;
960         switch(opcode) {
961         case REOP_range:
962             val = get_u16(bc_buf + pos + 1);
963             len += val * 4;
964             goto simple_char;
965         case REOP_range32:
966             val = get_u16(bc_buf + pos + 1);
967             len += val * 8;
968             goto simple_char;
969         case REOP_char:
970         case REOP_char32:
971         case REOP_dot:
972         case REOP_any:
973         simple_char:
974             if (ret == -2)
975                 ret = 1;
976             break;
977         case REOP_line_start:
978         case REOP_line_end:
979         case REOP_push_i32:
980         case REOP_push_char_pos:
981         case REOP_drop:
982         case REOP_word_boundary:
983         case REOP_not_word_boundary:
984         case REOP_prev:
985             /* no effect */
986             break;
987         case REOP_save_start:
988         case REOP_save_end:
989             val = bc_buf[pos + 1];
990             capture_bitmap[val] |= 1;
991             break;
992         case REOP_save_reset:
993             {
994                 val = bc_buf[pos + 1];
995                 last = bc_buf[pos + 2];
996                 while (val < last)
997                     capture_bitmap[val++] |= 1;
998             }
999             break;
1000         case REOP_back_reference:
1001         case REOP_backward_back_reference:
1002             val = bc_buf[pos + 1];
1003             capture_bitmap[val] |= 2;
1004             has_back_reference = TRUE;
1005             break;
1006         default:
1007             /* safe behvior: we cannot predict the outcome */
1008             if (ret == -2)
1009                 ret = 0;
1010             break;
1011         }
1012         pos += len;
1013     }
1014     if (has_back_reference) {
1015         /* check if there is back reference which references a capture
1016            made in the some code */
1017         for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
1018             if (capture_bitmap[i] == 3)
1019                 return -1;
1020         }
1021     }
1022     if (ret == -2)
1023         ret = 0;
1024     return ret;
1025 }
1026 
1027 /* return -1 if a simple quantifier cannot be used. Otherwise return
1028    the number of characters in the atom. */
re_is_simple_quantifier(const uint8_t * bc_buf,int bc_buf_len)1029 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1030 {
1031     int pos, opcode, len, count;
1032     uint32_t val;
1033 
1034     count = 0;
1035     pos = 0;
1036     while (pos < bc_buf_len) {
1037         opcode = bc_buf[pos];
1038         len = reopcode_info[opcode].size;
1039         switch(opcode) {
1040         case REOP_range:
1041             val = get_u16(bc_buf + pos + 1);
1042             len += val * 4;
1043             goto simple_char;
1044         case REOP_range32:
1045             val = get_u16(bc_buf + pos + 1);
1046             len += val * 8;
1047             goto simple_char;
1048         case REOP_char:
1049         case REOP_char32:
1050         case REOP_dot:
1051         case REOP_any:
1052         simple_char:
1053             count++;
1054             break;
1055         case REOP_line_start:
1056         case REOP_line_end:
1057         case REOP_word_boundary:
1058         case REOP_not_word_boundary:
1059             break;
1060         default:
1061             return -1;
1062         }
1063         pos += len;
1064     }
1065     return count;
1066 }
1067 
1068 /* '*pp' is the first char after '<' */
re_parse_group_name(char * buf,int buf_size,const uint8_t ** pp,BOOL is_utf16)1069 static int re_parse_group_name(char *buf, int buf_size,
1070                                const uint8_t **pp, BOOL is_utf16)
1071 {
1072     const uint8_t *p;
1073     uint32_t c;
1074     char *q;
1075 
1076     p = *pp;
1077     q = buf;
1078     for(;;) {
1079         c = *p;
1080         if (c == '\\') {
1081             p++;
1082             if (*p != 'u')
1083                 return -1;
1084             c = lre_parse_escape(&p, is_utf16 * 2);
1085         } else if (c == '>') {
1086             break;
1087         } else if (c >= 128) {
1088             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1089         } else {
1090             p++;
1091         }
1092         if (c > 0x10FFFF)
1093             return -1;
1094         if (q == buf) {
1095             if (!lre_js_is_ident_first(c))
1096                 return -1;
1097         } else {
1098             if (!lre_js_is_ident_next(c))
1099                 return -1;
1100         }
1101         if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1102             return -1;
1103         if (c < 128) {
1104             *q++ = c;
1105         } else {
1106             q += unicode_to_utf8((uint8_t*)q, c);
1107         }
1108     }
1109     if (q == buf)
1110         return -1;
1111     *q = '\0';
1112     p++;
1113     *pp = p;
1114     return 0;
1115 }
1116 
1117 /* if capture_name = NULL: return the number of captures + 1.
1118    Otherwise, return the capture index corresponding to capture_name
1119    or -1 if none */
re_parse_captures(REParseState * s,int * phas_named_captures,const char * capture_name)1120 static int re_parse_captures(REParseState *s, int *phas_named_captures,
1121                              const char *capture_name)
1122 {
1123     const uint8_t *p;
1124     int capture_index;
1125     char name[TMP_BUF_SIZE];
1126 
1127     capture_index = 1;
1128     *phas_named_captures = 0;
1129     for (p = s->buf_start; p < s->buf_end; p++) {
1130         switch (*p) {
1131         case '(':
1132             if (p[1] == '?') {
1133                 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1134                     *phas_named_captures = 1;
1135                     /* potential named capture */
1136                     if (capture_name) {
1137                         p += 3;
1138                         if (re_parse_group_name(name, sizeof(name), &p,
1139                                                 s->is_utf16) == 0) {
1140                             if (!strcmp(name, capture_name))
1141                                 return capture_index;
1142                         }
1143                     }
1144                     capture_index++;
1145                 }
1146             } else {
1147                 capture_index++;
1148             }
1149             break;
1150         case '\\':
1151             p++;
1152             break;
1153         case '[':
1154             for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1155                 if (*p == '\\')
1156                     p++;
1157             }
1158             break;
1159         }
1160     }
1161     if (capture_name)
1162         return -1;
1163     else
1164         return capture_index;
1165 }
1166 
re_count_captures(REParseState * s)1167 static int re_count_captures(REParseState *s)
1168 {
1169     if (s->total_capture_count < 0) {
1170         s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1171                                                    NULL);
1172     }
1173     return s->total_capture_count;
1174 }
1175 
re_has_named_captures(REParseState * s)1176 static BOOL re_has_named_captures(REParseState *s)
1177 {
1178     if (s->has_named_captures < 0)
1179         re_count_captures(s);
1180     return s->has_named_captures;
1181 }
1182 
find_group_name(REParseState * s,const char * name)1183 static int find_group_name(REParseState *s, const char *name)
1184 {
1185     const char *p, *buf_end;
1186     size_t len, name_len;
1187     int capture_index;
1188 
1189     name_len = strlen(name);
1190     p = (char *)s->group_names.buf;
1191     buf_end = (char *)s->group_names.buf + s->group_names.size;
1192     capture_index = 1;
1193     while (p < buf_end) {
1194         len = strlen(p);
1195         if (len == name_len && memcmp(name, p, name_len) == 0)
1196             return capture_index;
1197         p += len + 1;
1198         capture_index++;
1199     }
1200     return -1;
1201 }
1202 
1203 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1204 
re_parse_term(REParseState * s,BOOL is_backward_dir)1205 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1206 {
1207     const uint8_t *p;
1208     int c, last_atom_start, quant_min, quant_max, last_capture_count;
1209     BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1210     CharRange cr_s, *cr = &cr_s;
1211 
1212     last_atom_start = -1;
1213     last_capture_count = 0;
1214     p = s->buf_ptr;
1215     c = *p;
1216     switch(c) {
1217     case '^':
1218         p++;
1219         re_emit_op(s, REOP_line_start);
1220         break;
1221     case '$':
1222         p++;
1223         re_emit_op(s, REOP_line_end);
1224         break;
1225     case '.':
1226         p++;
1227         last_atom_start = s->byte_code.size;
1228         last_capture_count = s->capture_count;
1229         if (is_backward_dir)
1230             re_emit_op(s, REOP_prev);
1231         re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1232         if (is_backward_dir)
1233             re_emit_op(s, REOP_prev);
1234         break;
1235     case '{':
1236         /* As an extension (see ES6 annex B), we accept '{' not
1237            followed by digits as a normal atom */
1238         if (!is_digit(p[1])) {
1239             if (s->is_utf16)
1240                 goto invalid_quant_count;
1241             goto parse_class_atom;
1242         }
1243         /* fall tru */
1244     case '*':
1245     case '+':
1246     case '?':
1247         return re_parse_error(s, "nothing to repeat");
1248     case '(':
1249         if (p[1] == '?') {
1250             if (p[2] == ':') {
1251                 p += 3;
1252                 last_atom_start = s->byte_code.size;
1253                 last_capture_count = s->capture_count;
1254                 s->buf_ptr = p;
1255                 if (re_parse_disjunction(s, is_backward_dir))
1256                     return -1;
1257                 p = s->buf_ptr;
1258                 if (re_parse_expect(s, &p, ')'))
1259                     return -1;
1260             } else if ((p[2] == '=' || p[2] == '!')) {
1261                 is_neg = (p[2] == '!');
1262                 is_backward_lookahead = FALSE;
1263                 p += 3;
1264                 goto lookahead;
1265             } else if (p[2] == '<' &&
1266                        (p[3] == '=' || p[3] == '!')) {
1267                 int pos;
1268                 is_neg = (p[3] == '!');
1269                 is_backward_lookahead = TRUE;
1270                 p += 4;
1271                 /* lookahead */
1272             lookahead:
1273                 /* Annex B allows lookahead to be used as an atom for
1274                    the quantifiers */
1275                 if (!s->is_utf16 && !is_backward_lookahead)  {
1276                     last_atom_start = s->byte_code.size;
1277                     last_capture_count = s->capture_count;
1278                 }
1279                 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1280                 s->buf_ptr = p;
1281                 if (re_parse_disjunction(s, is_backward_lookahead))
1282                     return -1;
1283                 p = s->buf_ptr;
1284                 if (re_parse_expect(s, &p, ')'))
1285                     return -1;
1286                 re_emit_op(s, REOP_match);
1287                 /* jump after the 'match' after the lookahead is successful */
1288                 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1289             } else if (p[2] == '<') {
1290                 p += 3;
1291                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1292                                         &p, s->is_utf16)) {
1293                     return re_parse_error(s, "invalid group name");
1294                 }
1295                 if (find_group_name(s, s->u.tmp_buf) > 0) {
1296                     return re_parse_error(s, "duplicate group name");
1297                 }
1298                 /* group name with a trailing zero */
1299                 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1300                          strlen(s->u.tmp_buf) + 1);
1301                 s->has_named_captures = 1;
1302                 goto parse_capture;
1303             } else {
1304                 return re_parse_error(s, "invalid group");
1305             }
1306         } else {
1307             int capture_index;
1308             p++;
1309             /* capture without group name */
1310             dbuf_putc(&s->group_names, 0);
1311         parse_capture:
1312             if (s->capture_count >= CAPTURE_COUNT_MAX)
1313                 return re_parse_error(s, "too many captures");
1314             last_atom_start = s->byte_code.size;
1315             last_capture_count = s->capture_count;
1316             capture_index = s->capture_count++;
1317             re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1318                           capture_index);
1319 
1320             s->buf_ptr = p;
1321             if (re_parse_disjunction(s, is_backward_dir))
1322                 return -1;
1323             p = s->buf_ptr;
1324 
1325             re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1326                           capture_index);
1327 
1328             if (re_parse_expect(s, &p, ')'))
1329                 return -1;
1330         }
1331         break;
1332     case '\\':
1333         switch(p[1]) {
1334         case 'b':
1335         case 'B':
1336             re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1337             p += 2;
1338             break;
1339         case 'k':
1340             {
1341                 const uint8_t *p1;
1342                 int dummy_res;
1343 
1344                 p1 = p;
1345                 if (p1[2] != '<') {
1346                     /* annex B: we tolerate invalid group names in non
1347                        unicode mode if there is no named capture
1348                        definition */
1349                     if (s->is_utf16 || re_has_named_captures(s))
1350                         return re_parse_error(s, "expecting group name");
1351                     else
1352                         goto parse_class_atom;
1353                 }
1354                 p1 += 3;
1355                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1356                                         &p1, s->is_utf16)) {
1357                     if (s->is_utf16 || re_has_named_captures(s))
1358                         return re_parse_error(s, "invalid group name");
1359                     else
1360                         goto parse_class_atom;
1361                 }
1362                 c = find_group_name(s, s->u.tmp_buf);
1363                 if (c < 0) {
1364                     /* no capture name parsed before, try to look
1365                        after (inefficient, but hopefully not common */
1366                     c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1367                     if (c < 0) {
1368                         if (s->is_utf16 || re_has_named_captures(s))
1369                             return re_parse_error(s, "group name not defined");
1370                         else
1371                             goto parse_class_atom;
1372                     }
1373                 }
1374                 p = p1;
1375             }
1376             goto emit_back_reference;
1377         case '0':
1378             p += 2;
1379             c = 0;
1380             if (s->is_utf16) {
1381                 if (is_digit(*p)) {
1382                     return re_parse_error(s, "invalid decimal escape in regular expression");
1383                 }
1384             } else {
1385                 /* Annex B.1.4: accept legacy octal */
1386                 if (*p >= '0' && *p <= '7') {
1387                     c = *p++ - '0';
1388                     if (*p >= '0' && *p <= '7') {
1389                         c = (c << 3) + *p++ - '0';
1390                     }
1391                 }
1392             }
1393             goto normal_char;
1394 #if defined(_MSC_VER)
1395         case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1396 #else
1397         case '1' ... '9':
1398 #endif
1399             {
1400                 const uint8_t *q = ++p;
1401 
1402                 c = parse_digits(&p);
1403                 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1404                     if (!s->is_utf16) {
1405                         /* Annex B.1.4: accept legacy octal */
1406                         p = q;
1407                         if (*p <= '7') {
1408                             c = 0;
1409                             if (*p <= '3')
1410                                 c = *p++ - '0';
1411                             if (*p >= '0' && *p <= '7') {
1412                                 c = (c << 3) + *p++ - '0';
1413                                 if (*p >= '0' && *p <= '7') {
1414                                     c = (c << 3) + *p++ - '0';
1415                                 }
1416                             }
1417                         } else {
1418                             c = *p++;
1419                         }
1420                         goto normal_char;
1421                     }
1422                     return re_parse_error(s, "back reference out of range in reguar expression");
1423                 }
1424             emit_back_reference:
1425                 last_atom_start = s->byte_code.size;
1426                 last_capture_count = s->capture_count;
1427                 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1428             }
1429             break;
1430         default:
1431             goto parse_class_atom;
1432         }
1433         break;
1434     case '[':
1435         last_atom_start = s->byte_code.size;
1436         last_capture_count = s->capture_count;
1437         if (is_backward_dir)
1438             re_emit_op(s, REOP_prev);
1439         if (re_parse_char_class(s, &p))
1440             return -1;
1441         if (is_backward_dir)
1442             re_emit_op(s, REOP_prev);
1443         break;
1444     case ']':
1445     case '}':
1446         if (s->is_utf16)
1447             return re_parse_error(s, "syntax error");
1448         goto parse_class_atom;
1449     default:
1450     parse_class_atom:
1451         c = get_class_atom(s, cr, &p, FALSE);
1452         if ((int)c < 0)
1453             return -1;
1454     normal_char:
1455         last_atom_start = s->byte_code.size;
1456         last_capture_count = s->capture_count;
1457         if (is_backward_dir)
1458             re_emit_op(s, REOP_prev);
1459         if (c >= CLASS_RANGE_BASE) {
1460             int ret;
1461             /* Note: canonicalization is not needed */
1462             ret = re_emit_range(s, cr);
1463             cr_free(cr);
1464             if (ret)
1465                 return -1;
1466         } else {
1467             if (s->ignore_case)
1468                 c = lre_canonicalize(c, s->is_utf16);
1469             if (c <= 0xffff)
1470                 re_emit_op_u16(s, REOP_char, c);
1471             else
1472                 re_emit_op_u32(s, REOP_char32, c);
1473         }
1474         if (is_backward_dir)
1475             re_emit_op(s, REOP_prev);
1476         break;
1477     }
1478 
1479     /* quantifier */
1480     if (last_atom_start >= 0) {
1481         c = *p;
1482         switch(c) {
1483         case '*':
1484             p++;
1485             quant_min = 0;
1486             quant_max = INT32_MAX;
1487             goto quantifier;
1488         case '+':
1489             p++;
1490             quant_min = 1;
1491             quant_max = INT32_MAX;
1492             goto quantifier;
1493         case '?':
1494             p++;
1495             quant_min = 0;
1496             quant_max = 1;
1497             goto quantifier;
1498         case '{':
1499             /* As an extension (see ES6 annex B), we accept '{' not
1500                followed by digits as a normal atom */
1501             if (!is_digit(p[1])) {
1502                 if (s->is_utf16)
1503                     goto invalid_quant_count;
1504                 break;
1505             }
1506             p++;
1507             quant_min = parse_digits(&p);
1508             if (quant_min < 0) {
1509             invalid_quant_count:
1510                 return re_parse_error(s, "invalid repetition count");
1511             }
1512             quant_max = quant_min;
1513             if (*p == ',') {
1514                 p++;
1515                 if (is_digit(*p)) {
1516                     quant_max = parse_digits(&p);
1517                     if (quant_max < 0 || quant_max < quant_min)
1518                         goto invalid_quant_count;
1519                 } else {
1520                     quant_max = INT32_MAX; /* infinity */
1521                 }
1522             }
1523             if (re_parse_expect(s, &p, '}'))
1524                 return -1;
1525         quantifier:
1526             greedy = TRUE;
1527             if (*p == '?') {
1528                 p++;
1529                 greedy = FALSE;
1530             }
1531             if (last_atom_start < 0) {
1532                 return re_parse_error(s, "nothing to repeat");
1533             }
1534             if (greedy) {
1535                 int len, pos;
1536 
1537                 if (quant_max > 0) {
1538                     /* specific optimization for simple quantifiers */
1539                     len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1540                                                  s->byte_code.size - last_atom_start);
1541                     if (len > 0) {
1542                         re_emit_op(s, REOP_match);
1543 
1544                         dbuf_insert(&s->byte_code, last_atom_start, 17);
1545                         pos = last_atom_start;
1546                         s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1547                         put_u32(&s->byte_code.buf[pos],
1548                                 s->byte_code.size - last_atom_start - 17);
1549                         pos += 4;
1550                         put_u32(&s->byte_code.buf[pos], quant_min);
1551                         pos += 4;
1552                         put_u32(&s->byte_code.buf[pos], quant_max);
1553                         pos += 4;
1554                         put_u32(&s->byte_code.buf[pos], len);
1555                         pos += 4;
1556                         goto done;
1557                     }
1558                 }
1559 
1560                 add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
1561                                                            s->byte_code.size - last_atom_start) == 0);
1562             } else {
1563                 add_zero_advance_check = FALSE;
1564             }
1565 
1566             {
1567                 int len, pos;
1568                 len = s->byte_code.size - last_atom_start;
1569                 if (quant_min == 0) {
1570                     /* need to reset the capture in case the atom is
1571                        not executed */
1572                     if (last_capture_count != s->capture_count) {
1573                         dbuf_insert(&s->byte_code, last_atom_start, 3);
1574                         s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1575                         s->byte_code.buf[last_atom_start++] = last_capture_count;
1576                         s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
1577                     }
1578                     if (quant_max == 0) {
1579                         s->byte_code.size = last_atom_start;
1580                     } else if (quant_max == 1) {
1581                         dbuf_insert(&s->byte_code, last_atom_start, 5);
1582                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1583                             greedy;
1584                         put_u32(s->byte_code.buf + last_atom_start + 1, len);
1585                     } else if (quant_max == INT32_MAX) {
1586                         dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check);
1587                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1588                             greedy;
1589                         put_u32(s->byte_code.buf + last_atom_start + 1,
1590                                 len + 5 + add_zero_advance_check);
1591                         if (add_zero_advance_check) {
1592                             /* avoid infinite loop by stoping the
1593                                recursion if no advance was made in the
1594                                atom (only works if the atom has no
1595                                side effect) */
1596                             s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1597                             re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
1598                         } else {
1599                             re_emit_goto(s, REOP_goto, last_atom_start);
1600                         }
1601                     } else {
1602                         dbuf_insert(&s->byte_code, last_atom_start, 10);
1603                         pos = last_atom_start;
1604                         s->byte_code.buf[pos++] = REOP_push_i32;
1605                         put_u32(s->byte_code.buf + pos, quant_max);
1606                         pos += 4;
1607                         s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1608                         put_u32(s->byte_code.buf + pos, len + 5);
1609                         re_emit_goto(s, REOP_loop, last_atom_start + 5);
1610                         re_emit_op(s, REOP_drop);
1611                     }
1612                 } else if (quant_min == 1 && quant_max == INT32_MAX &&
1613                            !add_zero_advance_check) {
1614                     re_emit_goto(s, REOP_split_next_first - greedy,
1615                                  last_atom_start);
1616                 } else {
1617                     if (quant_min == 1) {
1618                         /* nothing to add */
1619                     } else {
1620                         dbuf_insert(&s->byte_code, last_atom_start, 5);
1621                         s->byte_code.buf[last_atom_start] = REOP_push_i32;
1622                         put_u32(s->byte_code.buf + last_atom_start + 1,
1623                                 quant_min);
1624                         last_atom_start += 5;
1625                         re_emit_goto(s, REOP_loop, last_atom_start);
1626                         re_emit_op(s, REOP_drop);
1627                     }
1628                     if (quant_max == INT32_MAX) {
1629                         pos = s->byte_code.size;
1630                         re_emit_op_u32(s, REOP_split_goto_first + greedy,
1631                                        len + 5 + add_zero_advance_check);
1632                         if (add_zero_advance_check)
1633                             re_emit_op(s, REOP_push_char_pos);
1634                         /* copy the atom */
1635                         dbuf_put_self(&s->byte_code, last_atom_start, len);
1636                         if (add_zero_advance_check)
1637                             re_emit_goto(s, REOP_bne_char_pos, pos);
1638                         else
1639                             re_emit_goto(s, REOP_goto, pos);
1640                     } else if (quant_max > quant_min) {
1641                         re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1642                         pos = s->byte_code.size;
1643                         re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
1644                         /* copy the atom */
1645                         dbuf_put_self(&s->byte_code, last_atom_start, len);
1646 
1647                         re_emit_goto(s, REOP_loop, pos);
1648                         re_emit_op(s, REOP_drop);
1649                     }
1650                 }
1651                 last_atom_start = -1;
1652             }
1653             break;
1654         default:
1655             break;
1656         }
1657     }
1658  done:
1659     s->buf_ptr = p;
1660     return 0;
1661 }
1662 
re_parse_alternative(REParseState * s,BOOL is_backward_dir)1663 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1664 {
1665     const uint8_t *p;
1666     int ret;
1667     size_t start, term_start, end, term_size;
1668 
1669     start = s->byte_code.size;
1670     for(;;) {
1671         p = s->buf_ptr;
1672         if (p >= s->buf_end)
1673             break;
1674         if (*p == '|' || *p == ')')
1675             break;
1676         term_start = s->byte_code.size;
1677         ret = re_parse_term(s, is_backward_dir);
1678         if (ret)
1679             return ret;
1680         if (is_backward_dir) {
1681             /* reverse the order of the terms (XXX: inefficient, but
1682                speed is not really critical here) */
1683             end = s->byte_code.size;
1684             term_size = end - term_start;
1685             if (dbuf_realloc(&s->byte_code, end + term_size))
1686                 return -1;
1687             memmove(s->byte_code.buf + start + term_size,
1688                     s->byte_code.buf + start,
1689                     end - start);
1690             memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1691                    term_size);
1692         }
1693     }
1694     return 0;
1695 }
1696 
re_parse_disjunction(REParseState * s,BOOL is_backward_dir)1697 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1698 {
1699     int start, len, pos;
1700 
1701     start = s->byte_code.size;
1702     if (re_parse_alternative(s, is_backward_dir))
1703         return -1;
1704     while (*s->buf_ptr == '|') {
1705         s->buf_ptr++;
1706 
1707         len = s->byte_code.size - start;
1708 
1709         /* insert a split before the first alternative */
1710         dbuf_insert(&s->byte_code, start, 5);
1711         s->byte_code.buf[start] = REOP_split_next_first;
1712         put_u32(s->byte_code.buf + start + 1, len + 5);
1713 
1714         pos = re_emit_op_u32(s, REOP_goto, 0);
1715 
1716         if (re_parse_alternative(s, is_backward_dir))
1717             return -1;
1718 
1719         /* patch the goto */
1720         len = s->byte_code.size - (pos + 4);
1721         put_u32(s->byte_code.buf + pos, len);
1722     }
1723     return 0;
1724 }
1725 
1726 /* the control flow is recursive so the analysis can be linear */
compute_stack_size(const uint8_t * bc_buf,int bc_buf_len)1727 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1728 {
1729     int stack_size, stack_size_max, pos, opcode, len;
1730     uint32_t val;
1731 
1732     stack_size = 0;
1733     stack_size_max = 0;
1734     bc_buf += RE_HEADER_LEN;
1735     bc_buf_len -= RE_HEADER_LEN;
1736     pos = 0;
1737     while (pos < bc_buf_len) {
1738         opcode = bc_buf[pos];
1739         len = reopcode_info[opcode].size;
1740         assert(opcode < REOP_COUNT);
1741         assert((pos + len) <= bc_buf_len);
1742         switch(opcode) {
1743         case REOP_push_i32:
1744         case REOP_push_char_pos:
1745             stack_size++;
1746             if (stack_size > stack_size_max) {
1747                 if (stack_size > STACK_SIZE_MAX)
1748                     return -1;
1749                 stack_size_max = stack_size;
1750             }
1751             break;
1752         case REOP_drop:
1753         case REOP_bne_char_pos:
1754             assert(stack_size > 0);
1755             stack_size--;
1756             break;
1757         case REOP_range:
1758             val = get_u16(bc_buf + pos + 1);
1759             len += val * 4;
1760             break;
1761         case REOP_range32:
1762             val = get_u16(bc_buf + pos + 1);
1763             len += val * 8;
1764             break;
1765         }
1766         pos += len;
1767     }
1768     return stack_size_max;
1769 }
1770 
1771 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1772    Return NULL if error and allocate an error message in *perror_msg,
1773    otherwise the compiled bytecode and its length in plen.
1774 */
lre_compile(int * plen,char * error_msg,int error_msg_size,const char * buf,size_t buf_len,int re_flags,void * opaque)1775 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1776                      const char *buf, size_t buf_len, int re_flags,
1777                      void *opaque)
1778 {
1779     REParseState s_s, *s = &s_s;
1780     int stack_size;
1781     BOOL is_sticky;
1782 
1783     memset(s, 0, sizeof(*s));
1784     s->mem_opaque = opaque;
1785     s->buf_ptr = (const uint8_t *)buf;
1786     s->buf_end = s->buf_ptr + buf_len;
1787     s->buf_start = s->buf_ptr;
1788     s->re_flags = re_flags;
1789     s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1790     is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1791     s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
1792     s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
1793     s->capture_count = 1;
1794     s->total_capture_count = -1;
1795     s->has_named_captures = -1;
1796 
1797     dbuf_init2(&s->byte_code, opaque, lre_realloc);
1798     dbuf_init2(&s->group_names, opaque, lre_realloc);
1799 
1800     dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
1801     dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1802     dbuf_putc(&s->byte_code, 0); /* stack size */
1803     dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
1804 
1805     if (!is_sticky) {
1806         /* iterate thru all positions (about the same as .*?( ... ) )
1807            .  We do it without an explicit loop so that lock step
1808            thread execution will be possible in an optimized
1809            implementation */
1810         re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
1811         re_emit_op(s, REOP_any);
1812         re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
1813     }
1814     re_emit_op_u8(s, REOP_save_start, 0);
1815 
1816     if (re_parse_disjunction(s, FALSE)) {
1817     error:
1818         dbuf_free(&s->byte_code);
1819         dbuf_free(&s->group_names);
1820         pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1821         *plen = 0;
1822         return NULL;
1823     }
1824 
1825     re_emit_op_u8(s, REOP_save_end, 0);
1826 
1827     re_emit_op(s, REOP_match);
1828 
1829     if (*s->buf_ptr != '\0') {
1830         re_parse_error(s, "extraneous characters at the end");
1831         goto error;
1832     }
1833 
1834     if (dbuf_error(&s->byte_code)) {
1835         re_parse_error(s, "out of memory");
1836         goto error;
1837     }
1838 
1839     stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1840     if (stack_size < 0) {
1841         re_parse_error(s, "too many imbricated quantifiers");
1842         goto error;
1843     }
1844 
1845     s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1846     s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1847     put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
1848 
1849     /* add the named groups if needed */
1850     if (s->group_names.size > (s->capture_count - 1)) {
1851         dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1852         s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
1853     }
1854     dbuf_free(&s->group_names);
1855 
1856 #ifdef DUMP_REOP
1857     lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1858 #endif
1859 
1860     error_msg[0] = '\0';
1861     *plen = s->byte_code.size;
1862     return s->byte_code.buf;
1863 }
1864 
is_line_terminator(uint32_t c)1865 static BOOL is_line_terminator(uint32_t c)
1866 {
1867     return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
1868 }
1869 
is_word_char(uint32_t c)1870 static BOOL is_word_char(uint32_t c)
1871 {
1872     return ((c >= '0' && c <= '9') ||
1873             (c >= 'a' && c <= 'z') ||
1874             (c >= 'A' && c <= 'Z') ||
1875             (c == '_'));
1876 }
1877 
1878 #define GET_CHAR(c, cptr, cbuf_end)                                     \
1879     do {                                                                \
1880         if (cbuf_type == 0) {                                           \
1881             c = *cptr++;                                                \
1882         } else {                                                        \
1883             uint32_t __c1;                                              \
1884             c = *(uint16_t *)cptr;                                      \
1885             cptr += 2;                                                  \
1886             if (c >= 0xd800 && c < 0xdc00 &&                            \
1887                 cbuf_type == 2 && cptr < cbuf_end) {                    \
1888                 __c1 = *(uint16_t *)cptr;                               \
1889                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
1890                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1891                     cptr += 2;                                          \
1892                 }                                                       \
1893             }                                                           \
1894         }                                                               \
1895     } while (0)
1896 
1897 #define PEEK_CHAR(c, cptr, cbuf_end)             \
1898     do {                                         \
1899         if (cbuf_type == 0) {                    \
1900             c = cptr[0];                         \
1901         } else {                                 \
1902             uint32_t __c1;                                              \
1903             c = ((uint16_t *)cptr)[0];                                  \
1904             if (c >= 0xd800 && c < 0xdc00 &&                            \
1905                 cbuf_type == 2 && (cptr + 2) < cbuf_end) {              \
1906                 __c1 = ((uint16_t *)cptr)[1];                           \
1907                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
1908                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1909                 }                                                       \
1910             }                                                           \
1911         }                                        \
1912     } while (0)
1913 
1914 #define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
1915     do {                                         \
1916         if (cbuf_type == 0) {                    \
1917             c = cptr[-1];                        \
1918         } else {                                 \
1919             uint32_t __c1;                                              \
1920             c = ((uint16_t *)cptr)[-1];                                 \
1921             if (c >= 0xdc00 && c < 0xe000 &&                            \
1922                 cbuf_type == 2 && (cptr - 4) >= cbuf_start) {              \
1923                 __c1 = ((uint16_t *)cptr)[-2];                          \
1924                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
1925                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1926                 }                                                       \
1927             }                                                           \
1928         }                                                               \
1929     } while (0)
1930 
1931 #define GET_PREV_CHAR(c, cptr, cbuf_start)       \
1932     do {                                         \
1933         if (cbuf_type == 0) {                    \
1934             cptr--;                              \
1935             c = cptr[0];                         \
1936         } else {                                 \
1937             uint32_t __c1;                                              \
1938             cptr -= 2;                                                  \
1939             c = ((uint16_t *)cptr)[0];                                 \
1940             if (c >= 0xdc00 && c < 0xe000 &&                            \
1941                 cbuf_type == 2 && cptr > cbuf_start) {                  \
1942                 __c1 = ((uint16_t *)cptr)[-1];                          \
1943                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
1944                     cptr -= 2;                                          \
1945                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1946                 }                                                       \
1947             }                                                           \
1948         }                                                               \
1949     } while (0)
1950 
1951 #define PREV_CHAR(cptr, cbuf_start)       \
1952     do {                                  \
1953         if (cbuf_type == 0) {             \
1954             cptr--;                       \
1955         } else {                          \
1956             cptr -= 2;                          \
1957             if (cbuf_type == 2) {                                       \
1958                 c = ((uint16_t *)cptr)[0];                              \
1959                 if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) {   \
1960                     c = ((uint16_t *)cptr)[-1];                         \
1961                     if (c >= 0xd800 && c < 0xdc00)                      \
1962                         cptr -= 2;                                      \
1963                 }                                                       \
1964             }                                                           \
1965         }                                                               \
1966     } while (0)
1967 
1968 typedef uintptr_t StackInt;
1969 
1970 typedef enum {
1971     RE_EXEC_STATE_SPLIT,
1972     RE_EXEC_STATE_LOOKAHEAD,
1973     RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
1974     RE_EXEC_STATE_GREEDY_QUANT,
1975 } REExecStateEnum;
1976 
1977 typedef struct REExecState {
1978     REExecStateEnum type : 8;
1979     uint8_t stack_len;
1980     size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
1981     const uint8_t *cptr;
1982     const uint8_t *pc;
1983     void *buf[0];
1984 } REExecState;
1985 
1986 typedef struct {
1987     const uint8_t *cbuf;
1988     const uint8_t *cbuf_end;
1989     /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
1990     int cbuf_type;
1991     int capture_count;
1992     int stack_size_max;
1993     BOOL multi_line;
1994     BOOL ignore_case;
1995     BOOL is_utf16;
1996     void *opaque; /* used for stack overflow check */
1997 
1998     size_t state_size;
1999     uint8_t *state_stack;
2000     size_t state_stack_size;
2001     size_t state_stack_len;
2002 } REExecContext;
2003 
push_state(REExecContext * s,uint8_t ** capture,StackInt * stack,size_t stack_len,const uint8_t * pc,const uint8_t * cptr,REExecStateEnum type,size_t count)2004 static int push_state(REExecContext *s,
2005                       uint8_t **capture,
2006                       StackInt *stack, size_t stack_len,
2007                       const uint8_t *pc, const uint8_t *cptr,
2008                       REExecStateEnum type, size_t count)
2009 {
2010     REExecState *rs;
2011     uint8_t *new_stack;
2012     size_t new_size, i, n;
2013     StackInt *stack_buf;
2014 
2015     if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2016         /* reallocate the stack */
2017         new_size = s->state_stack_size * 3 / 2;
2018         if (new_size < 8)
2019             new_size = 8;
2020         new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2021         if (!new_stack)
2022             return -1;
2023         s->state_stack_size = new_size;
2024         s->state_stack = new_stack;
2025     }
2026     rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2027     s->state_stack_len++;
2028     rs->type = type;
2029     rs->count = count;
2030     rs->stack_len = stack_len;
2031     rs->cptr = cptr;
2032     rs->pc = pc;
2033     n = 2 * s->capture_count;
2034     for(i = 0; i < n; i++)
2035         rs->buf[i] = capture[i];
2036     stack_buf = (StackInt *)(rs->buf + n);
2037     for(i = 0; i < stack_len; i++)
2038         stack_buf[i] = stack[i];
2039     return 0;
2040 }
2041 
2042 /* return 1 if match, 0 if not match or -1 if error. */
lre_exec_backtrack(REExecContext * s,uint8_t ** capture,StackInt * stack,int stack_len,const uint8_t * pc,const uint8_t * cptr,BOOL no_recurse)2043 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2044                                    StackInt *stack, int stack_len,
2045                                    const uint8_t *pc, const uint8_t *cptr,
2046                                    BOOL no_recurse)
2047 {
2048     int opcode, ret;
2049     int cbuf_type;
2050     uint32_t val, c;
2051     const uint8_t *cbuf_end;
2052 
2053     cbuf_type = s->cbuf_type;
2054     cbuf_end = s->cbuf_end;
2055 
2056     for(;;) {
2057         //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2058         opcode = *pc++;
2059         switch(opcode) {
2060         case REOP_match:
2061             {
2062                 REExecState *rs;
2063                 if (no_recurse)
2064                     return (intptr_t)cptr;
2065                 ret = 1;
2066                 goto recurse;
2067             no_match:
2068                 if (no_recurse)
2069                     return 0;
2070                 ret = 0;
2071             recurse:
2072                 for(;;) {
2073                     if (s->state_stack_len == 0)
2074                         return ret;
2075                     rs = (REExecState *)(s->state_stack +
2076                                          (s->state_stack_len - 1) * s->state_size);
2077                     if (rs->type == RE_EXEC_STATE_SPLIT) {
2078                         if (!ret) {
2079                         pop_state:
2080                             memcpy(capture, rs->buf,
2081                                    sizeof(capture[0]) * 2 * s->capture_count);
2082                         pop_state1:
2083                             pc = rs->pc;
2084                             cptr = rs->cptr;
2085                             stack_len = rs->stack_len;
2086                             memcpy(stack, rs->buf + 2 * s->capture_count,
2087                                    stack_len * sizeof(stack[0]));
2088                             s->state_stack_len--;
2089                             break;
2090                         }
2091                     } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2092                         if (!ret) {
2093                             uint32_t char_count, i;
2094                             memcpy(capture, rs->buf,
2095                                    sizeof(capture[0]) * 2 * s->capture_count);
2096                             stack_len = rs->stack_len;
2097                             memcpy(stack, rs->buf + 2 * s->capture_count,
2098                                    stack_len * sizeof(stack[0]));
2099                             pc = rs->pc;
2100                             cptr = rs->cptr;
2101                             /* go backward */
2102                             char_count = get_u32(pc + 12);
2103                             for(i = 0; i < char_count; i++) {
2104                                 PREV_CHAR(cptr, s->cbuf);
2105                             }
2106                             pc = (pc + 16) + (int)get_u32(pc);
2107                             rs->cptr = cptr;
2108                             rs->count--;
2109                             if (rs->count == 0) {
2110                                 s->state_stack_len--;
2111                             }
2112                             break;
2113                         }
2114                     } else {
2115                         ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2116                                (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2117                         if (ret) {
2118                             /* keep the capture in case of positive lookahead */
2119                             if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2120                                 goto pop_state1;
2121                             else
2122                                 goto pop_state;
2123                         }
2124                     }
2125                     s->state_stack_len--;
2126                 }
2127             }
2128             break;
2129         case REOP_char32:
2130             val = get_u32(pc);
2131             pc += 4;
2132             goto test_char;
2133         case REOP_char:
2134             val = get_u16(pc);
2135             pc += 2;
2136         test_char:
2137             if (cptr >= cbuf_end)
2138                 goto no_match;
2139             GET_CHAR(c, cptr, cbuf_end);
2140             if (s->ignore_case) {
2141                 c = lre_canonicalize(c, s->is_utf16);
2142             }
2143             if (val != c)
2144                 goto no_match;
2145             break;
2146         case REOP_split_goto_first:
2147         case REOP_split_next_first:
2148             {
2149                 const uint8_t *pc1;
2150 
2151                 val = get_u32(pc);
2152                 pc += 4;
2153                 if (opcode == REOP_split_next_first) {
2154                     pc1 = pc + (int)val;
2155                 } else {
2156                     pc1 = pc;
2157                     pc = pc + (int)val;
2158                 }
2159                 ret = push_state(s, capture, stack, stack_len,
2160                                  pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2161                 if (ret < 0)
2162                     return -1;
2163                 break;
2164             }
2165         case REOP_lookahead:
2166         case REOP_negative_lookahead:
2167             val = get_u32(pc);
2168             pc += 4;
2169             ret = push_state(s, capture, stack, stack_len,
2170                              pc + (int)val, cptr,
2171                              RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2172                              0);
2173             if (ret < 0)
2174                 return -1;
2175             break;
2176 
2177         case REOP_goto:
2178             val = get_u32(pc);
2179             pc += 4 + (int)val;
2180             break;
2181         case REOP_line_start:
2182             if (cptr == s->cbuf)
2183                 break;
2184             if (!s->multi_line)
2185                 goto no_match;
2186             PEEK_PREV_CHAR(c, cptr, s->cbuf);
2187             if (!is_line_terminator(c))
2188                 goto no_match;
2189             break;
2190         case REOP_line_end:
2191             if (cptr == cbuf_end)
2192                 break;
2193             if (!s->multi_line)
2194                 goto no_match;
2195             PEEK_CHAR(c, cptr, cbuf_end);
2196             if (!is_line_terminator(c))
2197                 goto no_match;
2198             break;
2199         case REOP_dot:
2200             if (cptr == cbuf_end)
2201                 goto no_match;
2202             GET_CHAR(c, cptr, cbuf_end);
2203             if (is_line_terminator(c))
2204                 goto no_match;
2205             break;
2206         case REOP_any:
2207             if (cptr == cbuf_end)
2208                 goto no_match;
2209             GET_CHAR(c, cptr, cbuf_end);
2210             break;
2211         case REOP_save_start:
2212         case REOP_save_end:
2213             val = *pc++;
2214             assert(val < s->capture_count);
2215             capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2216             break;
2217         case REOP_save_reset:
2218             {
2219                 uint32_t val2;
2220                 val = pc[0];
2221                 val2 = pc[1];
2222                 pc += 2;
2223                 assert(val2 < s->capture_count);
2224                 while (val <= val2) {
2225                     capture[2 * val] = NULL;
2226                     capture[2 * val + 1] = NULL;
2227                     val++;
2228                 }
2229             }
2230             break;
2231         case REOP_push_i32:
2232             val = get_u32(pc);
2233             pc += 4;
2234             stack[stack_len++] = val;
2235             break;
2236         case REOP_drop:
2237             stack_len--;
2238             break;
2239         case REOP_loop:
2240             val = get_u32(pc);
2241             pc += 4;
2242             if (--stack[stack_len - 1] != 0) {
2243                 pc += (int)val;
2244             }
2245             break;
2246         case REOP_push_char_pos:
2247             stack[stack_len++] = (uintptr_t)cptr;
2248             break;
2249         case REOP_bne_char_pos:
2250             val = get_u32(pc);
2251             pc += 4;
2252             if (stack[--stack_len] != (uintptr_t)cptr)
2253                 pc += (int)val;
2254             break;
2255         case REOP_word_boundary:
2256         case REOP_not_word_boundary:
2257             {
2258                 BOOL v1, v2;
2259                 /* char before */
2260                 if (cptr == s->cbuf) {
2261                     v1 = FALSE;
2262                 } else {
2263                     PEEK_PREV_CHAR(c, cptr, s->cbuf);
2264                     v1 = is_word_char(c);
2265                 }
2266                 /* current char */
2267                 if (cptr >= cbuf_end) {
2268                     v2 = FALSE;
2269                 } else {
2270                     PEEK_CHAR(c, cptr, cbuf_end);
2271                     v2 = is_word_char(c);
2272                 }
2273                 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2274                     goto no_match;
2275             }
2276             break;
2277         case REOP_back_reference:
2278         case REOP_backward_back_reference:
2279             {
2280                 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2281                 uint32_t c1, c2;
2282 
2283                 val = *pc++;
2284                 if (val >= s->capture_count)
2285                     goto no_match;
2286                 cptr1_start = capture[2 * val];
2287                 cptr1_end = capture[2 * val + 1];
2288                 if (!cptr1_start || !cptr1_end)
2289                     break;
2290                 if (opcode == REOP_back_reference) {
2291                     cptr1 = cptr1_start;
2292                     while (cptr1 < cptr1_end) {
2293                         if (cptr >= cbuf_end)
2294                             goto no_match;
2295                         GET_CHAR(c1, cptr1, cptr1_end);
2296                         GET_CHAR(c2, cptr, cbuf_end);
2297                         if (s->ignore_case) {
2298                             c1 = lre_canonicalize(c1, s->is_utf16);
2299                             c2 = lre_canonicalize(c2, s->is_utf16);
2300                         }
2301                         if (c1 != c2)
2302                             goto no_match;
2303                     }
2304                 } else {
2305                     cptr1 = cptr1_end;
2306                     while (cptr1 > cptr1_start) {
2307                         if (cptr == s->cbuf)
2308                             goto no_match;
2309                         GET_PREV_CHAR(c1, cptr1, cptr1_start);
2310                         GET_PREV_CHAR(c2, cptr, s->cbuf);
2311                         if (s->ignore_case) {
2312                             c1 = lre_canonicalize(c1, s->is_utf16);
2313                             c2 = lre_canonicalize(c2, s->is_utf16);
2314                         }
2315                         if (c1 != c2)
2316                             goto no_match;
2317                     }
2318                 }
2319             }
2320             break;
2321         case REOP_range:
2322             {
2323                 int n;
2324                 uint32_t low, high, idx_min, idx_max, idx;
2325 
2326                 n = get_u16(pc); /* n must be >= 1 */
2327                 pc += 2;
2328                 if (cptr >= cbuf_end)
2329                     goto no_match;
2330                 GET_CHAR(c, cptr, cbuf_end);
2331                 if (s->ignore_case) {
2332                     c = lre_canonicalize(c, s->is_utf16);
2333                 }
2334                 idx_min = 0;
2335                 low = get_u16(pc + 0 * 4);
2336                 if (c < low)
2337                     goto no_match;
2338                 idx_max = n - 1;
2339                 high = get_u16(pc + idx_max * 4 + 2);
2340                 /* 0xffff in for last value means +infinity */
2341                 if (unlikely(c >= 0xffff) && high == 0xffff)
2342                     goto range_match;
2343                 if (c > high)
2344                     goto no_match;
2345                 while (idx_min <= idx_max) {
2346                     idx = (idx_min + idx_max) / 2;
2347                     low = get_u16(pc + idx * 4);
2348                     high = get_u16(pc + idx * 4 + 2);
2349                     if (c < low)
2350                         idx_max = idx - 1;
2351                     else if (c > high)
2352                         idx_min = idx + 1;
2353                     else
2354                         goto range_match;
2355                 }
2356                 goto no_match;
2357             range_match:
2358                 pc += 4 * n;
2359             }
2360             break;
2361         case REOP_range32:
2362             {
2363                 int n;
2364                 uint32_t low, high, idx_min, idx_max, idx;
2365 
2366                 n = get_u16(pc); /* n must be >= 1 */
2367                 pc += 2;
2368                 if (cptr >= cbuf_end)
2369                     goto no_match;
2370                 GET_CHAR(c, cptr, cbuf_end);
2371                 if (s->ignore_case) {
2372                     c = lre_canonicalize(c, s->is_utf16);
2373                 }
2374                 idx_min = 0;
2375                 low = get_u32(pc + 0 * 8);
2376                 if (c < low)
2377                     goto no_match;
2378                 idx_max = n - 1;
2379                 high = get_u32(pc + idx_max * 8 + 4);
2380                 if (c > high)
2381                     goto no_match;
2382                 while (idx_min <= idx_max) {
2383                     idx = (idx_min + idx_max) / 2;
2384                     low = get_u32(pc + idx * 8);
2385                     high = get_u32(pc + idx * 8 + 4);
2386                     if (c < low)
2387                         idx_max = idx - 1;
2388                     else if (c > high)
2389                         idx_min = idx + 1;
2390                     else
2391                         goto range32_match;
2392                 }
2393                 goto no_match;
2394             range32_match:
2395                 pc += 8 * n;
2396             }
2397             break;
2398         case REOP_prev:
2399             /* go to the previous char */
2400             if (cptr == s->cbuf)
2401                 goto no_match;
2402             PREV_CHAR(cptr, s->cbuf);
2403             break;
2404         case REOP_simple_greedy_quant:
2405             {
2406                 uint32_t next_pos, quant_min, quant_max;
2407                 size_t q;
2408                 intptr_t res;
2409                 const uint8_t *pc1;
2410 
2411                 next_pos = get_u32(pc);
2412                 quant_min = get_u32(pc + 4);
2413                 quant_max = get_u32(pc + 8);
2414                 pc += 16;
2415                 pc1 = pc;
2416                 pc += (int)next_pos;
2417 
2418                 q = 0;
2419                 for(;;) {
2420                     res = lre_exec_backtrack(s, capture, stack, stack_len,
2421                                              pc1, cptr, TRUE);
2422                     if (res == -1)
2423                         return res;
2424                     if (!res)
2425                         break;
2426                     cptr = (uint8_t *)res;
2427                     q++;
2428                     if (q >= quant_max && quant_max != INT32_MAX)
2429                         break;
2430                 }
2431                 if (q < quant_min)
2432                     goto no_match;
2433                 if (q > quant_min) {
2434                     /* will examine all matches down to quant_min */
2435                     ret = push_state(s, capture, stack, stack_len,
2436                                      pc1 - 16, cptr,
2437                                      RE_EXEC_STATE_GREEDY_QUANT,
2438                                      q - quant_min);
2439                     if (ret < 0)
2440                         return -1;
2441                 }
2442             }
2443             break;
2444         default:
2445             abort();
2446         }
2447     }
2448 }
2449 
2450 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
2451    starting position of the match and must be such as 0 <= cindex <=
2452    clen. */
lre_exec(uint8_t ** capture,const uint8_t * bc_buf,const uint8_t * cbuf,int cindex,int clen,int cbuf_type,void * opaque)2453 int lre_exec(uint8_t **capture,
2454              const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
2455              int cbuf_type, void *opaque)
2456 {
2457     REExecContext s_s, *s = &s_s;
2458     int re_flags, i, alloca_size, ret;
2459     StackInt *stack_buf;
2460 
2461     re_flags = bc_buf[RE_HEADER_FLAGS];
2462     s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2463     s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2464     s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2465     s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2466     s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2467     s->cbuf = cbuf;
2468     s->cbuf_end = cbuf + (clen << cbuf_type);
2469     s->cbuf_type = cbuf_type;
2470     if (s->cbuf_type == 1 && s->is_utf16)
2471         s->cbuf_type = 2;
2472     s->opaque = opaque;
2473 
2474     s->state_size = sizeof(REExecState) +
2475         s->capture_count * sizeof(capture[0]) * 2 +
2476         s->stack_size_max * sizeof(stack_buf[0]);
2477     s->state_stack = NULL;
2478     s->state_stack_len = 0;
2479     s->state_stack_size = 0;
2480 
2481     for(i = 0; i < s->capture_count * 2; i++)
2482         capture[i] = NULL;
2483     alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2484     stack_buf = alloca(alloca_size);
2485     ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2486                              cbuf + (cindex << cbuf_type), FALSE);
2487     lre_realloc(s->opaque, s->state_stack, 0);
2488     return ret;
2489 }
2490 
lre_get_capture_count(const uint8_t * bc_buf)2491 int lre_get_capture_count(const uint8_t *bc_buf)
2492 {
2493     return bc_buf[RE_HEADER_CAPTURE_COUNT];
2494 }
2495 
lre_get_flags(const uint8_t * bc_buf)2496 int lre_get_flags(const uint8_t *bc_buf)
2497 {
2498     return bc_buf[RE_HEADER_FLAGS];
2499 }
2500 
2501 #ifdef TEST
2502 
lre_check_stack_overflow(void * opaque,size_t alloca_size)2503 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2504 {
2505     return FALSE;
2506 }
2507 
lre_realloc(void * opaque,void * ptr,size_t size)2508 void *lre_realloc(void *opaque, void *ptr, size_t size)
2509 {
2510     return realloc(ptr, size);
2511 }
2512 
main(int argc,char ** argv)2513 int main(int argc, char **argv)
2514 {
2515     int len, ret, i;
2516     uint8_t *bc;
2517     char error_msg[64];
2518     uint8_t *capture[CAPTURE_COUNT_MAX * 2];
2519     const char *input;
2520     int input_len, capture_count;
2521 
2522     if (argc < 3) {
2523         printf("usage: %s regexp input\n", argv[0]);
2524         exit(1);
2525     }
2526     bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
2527                      strlen(argv[1]), 0, NULL);
2528     if (!bc) {
2529         fprintf(stderr, "error: %s\n", error_msg);
2530         exit(1);
2531     }
2532 
2533     input = argv[2];
2534     input_len = strlen(input);
2535 
2536     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
2537     printf("ret=%d\n", ret);
2538     if (ret == 1) {
2539         capture_count = lre_get_capture_count(bc);
2540         for(i = 0; i < 2 * capture_count; i++) {
2541             uint8_t *ptr;
2542             ptr = capture[i];
2543             printf("%d: ", i);
2544             if (!ptr)
2545                 printf("<nil>");
2546             else
2547                 printf("%u", (int)(ptr - (uint8_t *)input));
2548             printf("\n");
2549         }
2550     }
2551     return 0;
2552 }
2553 #endif
2554