1 /*
2 * Regular Expression Engine
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <inttypes.h>
28 #include <string.h>
29 #include <assert.h>
30
31 #include "cutils.h"
32 #include "libregexp.h"
33
34 /*
35 TODO:
36
37 - Add full unicode canonicalize rules for character ranges (not
38 really useful but needed for exact "ignorecase" compatibility).
39
40 - Add a lock step execution mode (=linear time execution guaranteed)
41 when the regular expression is "simple" i.e. no backreference nor
42 complicated lookahead. The opcodes are designed for this execution
43 model.
44 */
45
46 #if defined(TEST)
47 #define DUMP_REOP
48 #endif
49
50 typedef enum {
51 #define DEF(id, size) REOP_ ## id,
52 #include "libregexp-opcode.h"
53 #undef DEF
54 REOP_COUNT,
55 } REOPCodeEnum;
56
57 #define CAPTURE_COUNT_MAX 255
58 #define STACK_SIZE_MAX 255
59
60 /* unicode code points */
61 #define CP_LS 0x2028
62 #define CP_PS 0x2029
63
64 #define TMP_BUF_SIZE 128
65
66 typedef struct {
67 DynBuf byte_code;
68 const uint8_t *buf_ptr;
69 const uint8_t *buf_end;
70 const uint8_t *buf_start;
71 int re_flags;
72 BOOL is_utf16;
73 BOOL ignore_case;
74 BOOL dotall;
75 int capture_count;
76 int total_capture_count; /* -1 = not computed yet */
77 int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
78 void *mem_opaque;
79 DynBuf group_names;
80 union {
81 char error_msg[TMP_BUF_SIZE];
82 char tmp_buf[TMP_BUF_SIZE];
83 } u;
84 } REParseState;
85
86 typedef struct {
87 #ifdef DUMP_REOP
88 const char *name;
89 #endif
90 uint8_t size;
91 } REOpCode;
92
93 static const REOpCode reopcode_info[REOP_COUNT] = {
94 #ifdef DUMP_REOP
95 #define DEF(id, size) { #id, size },
96 #else
97 #define DEF(id, size) { size },
98 #endif
99 #include "libregexp-opcode.h"
100 #undef DEF
101 };
102
103 #define RE_HEADER_FLAGS 0
104 #define RE_HEADER_CAPTURE_COUNT 1
105 #define RE_HEADER_STACK_SIZE 2
106
107 #define RE_HEADER_LEN 7
108
is_digit(int c)109 static inline int is_digit(int c) {
110 return c >= '0' && c <= '9';
111 }
112
113 /* insert 'len' bytes at position 'pos' */
dbuf_insert(DynBuf * s,int pos,int len)114 static void dbuf_insert(DynBuf *s, int pos, int len)
115 {
116 dbuf_realloc(s, s->size + len);
117 memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
118 s->size += len;
119 }
120
121 /* canonicalize with the specific JS regexp rules */
lre_canonicalize(uint32_t c,BOOL is_utf16)122 static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
123 {
124 uint32_t res[LRE_CC_RES_LEN_MAX];
125 int len;
126 if (is_utf16) {
127 if (likely(c < 128)) {
128 if (c >= 'A' && c <= 'Z')
129 c = c - 'A' + 'a';
130 } else {
131 lre_case_conv(res, c, 2);
132 c = res[0];
133 }
134 } else {
135 if (likely(c < 128)) {
136 if (c >= 'a' && c <= 'z')
137 c = c - 'a' + 'A';
138 } else {
139 /* legacy regexp: to upper case if single char >= 128 */
140 len = lre_case_conv(res, c, FALSE);
141 if (len == 1 && res[0] >= 128)
142 c = res[0];
143 }
144 }
145 return c;
146 }
147
148 static const uint16_t char_range_d[] = {
149 1,
150 0x0030, 0x0039 + 1,
151 };
152
153 /* code point ranges for Zs,Zl or Zp property */
154 static const uint16_t char_range_s[] = {
155 10,
156 0x0009, 0x000D + 1,
157 0x0020, 0x0020 + 1,
158 0x00A0, 0x00A0 + 1,
159 0x1680, 0x1680 + 1,
160 0x2000, 0x200A + 1,
161 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
162 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
163 0x2028, 0x2029 + 1,
164 0x202F, 0x202F + 1,
165 0x205F, 0x205F + 1,
166 0x3000, 0x3000 + 1,
167 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
168 0xFEFF, 0xFEFF + 1,
169 };
170
lre_is_space(int c)171 BOOL lre_is_space(int c)
172 {
173 int i, n, low, high;
174 n = (countof(char_range_s) - 1) / 2;
175 for(i = 0; i < n; i++) {
176 low = char_range_s[2 * i + 1];
177 if (c < low)
178 return FALSE;
179 high = char_range_s[2 * i + 2];
180 if (c < high)
181 return TRUE;
182 }
183 return FALSE;
184 }
185
186 uint32_t const lre_id_start_table_ascii[4] = {
187 /* $ A-Z _ a-z */
188 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
189 };
190
191 uint32_t const lre_id_continue_table_ascii[4] = {
192 /* $ 0-9 A-Z _ a-z */
193 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
194 };
195
196
197 static const uint16_t char_range_w[] = {
198 4,
199 0x0030, 0x0039 + 1,
200 0x0041, 0x005A + 1,
201 0x005F, 0x005F + 1,
202 0x0061, 0x007A + 1,
203 };
204
205 #define CLASS_RANGE_BASE 0x40000000
206
207 typedef enum {
208 CHAR_RANGE_d,
209 CHAR_RANGE_D,
210 CHAR_RANGE_s,
211 CHAR_RANGE_S,
212 CHAR_RANGE_w,
213 CHAR_RANGE_W,
214 } CharRangeEnum;
215
216 static const uint16_t *char_range_table[] = {
217 char_range_d,
218 char_range_s,
219 char_range_w,
220 };
221
cr_init_char_range(REParseState * s,CharRange * cr,uint32_t c)222 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
223 {
224 BOOL invert;
225 const uint16_t *c_pt;
226 int len, i;
227
228 invert = c & 1;
229 c_pt = char_range_table[c >> 1];
230 len = *c_pt++;
231 cr_init(cr, s->mem_opaque, lre_realloc);
232 for(i = 0; i < len * 2; i++) {
233 if (cr_add_point(cr, c_pt[i]))
234 goto fail;
235 }
236 if (invert) {
237 if (cr_invert(cr))
238 goto fail;
239 }
240 return 0;
241 fail:
242 cr_free(cr);
243 return -1;
244 }
245
cr_canonicalize(CharRange * cr)246 static int cr_canonicalize(CharRange *cr)
247 {
248 CharRange a;
249 uint32_t pt[2];
250 int i, ret;
251
252 cr_init(&a, cr->mem_opaque, lre_realloc);
253 pt[0] = 'a';
254 pt[1] = 'z' + 1;
255 ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
256 if (ret)
257 goto fail;
258 /* convert to upper case */
259 /* XXX: the generic unicode case would be much more complicated
260 and not really useful */
261 for(i = 0; i < a.len; i++) {
262 a.points[i] += 'A' - 'a';
263 }
264 /* Note: for simplicity we keep the lower case ranges */
265 ret = cr_union1(cr, a.points, a.len);
266 fail:
267 cr_free(&a);
268 return ret;
269 }
270
271 #ifdef DUMP_REOP
lre_dump_bytecode(const uint8_t * buf,int buf_len)272 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
273 int buf_len)
274 {
275 int pos, len, opcode, bc_len, re_flags, i;
276 uint32_t val;
277
278 assert(buf_len >= RE_HEADER_LEN);
279
280 re_flags= buf[0];
281 bc_len = get_u32(buf + 3);
282 assert(bc_len + RE_HEADER_LEN <= buf_len);
283 printf("flags: 0x%x capture_count=%d stack_size=%d\n",
284 re_flags, buf[1], buf[2]);
285 if (re_flags & LRE_FLAG_NAMED_GROUPS) {
286 const char *p;
287 p = (char *)buf + RE_HEADER_LEN + bc_len;
288 printf("named groups: ");
289 for(i = 1; i < buf[1]; i++) {
290 if (i != 1)
291 printf(",");
292 printf("<%s>", p);
293 p += strlen(p) + 1;
294 }
295 printf("\n");
296 assert(p == (char *)(buf + buf_len));
297 }
298 printf("bytecode_len=%d\n", bc_len);
299
300 buf += RE_HEADER_LEN;
301 pos = 0;
302 while (pos < bc_len) {
303 printf("%5u: ", pos);
304 opcode = buf[pos];
305 len = reopcode_info[opcode].size;
306 if (opcode >= REOP_COUNT) {
307 printf(" invalid opcode=0x%02x\n", opcode);
308 break;
309 }
310 if ((pos + len) > bc_len) {
311 printf(" buffer overflow (opcode=0x%02x)\n", opcode);
312 break;
313 }
314 printf("%s", reopcode_info[opcode].name);
315 switch(opcode) {
316 case REOP_char:
317 val = get_u16(buf + pos + 1);
318 if (val >= ' ' && val <= 126)
319 printf(" '%c'", val);
320 else
321 printf(" 0x%04x", val);
322 break;
323 case REOP_char32:
324 val = get_u32(buf + pos + 1);
325 if (val >= ' ' && val <= 126)
326 printf(" '%c'", val);
327 else
328 printf(" 0x%08x", val);
329 break;
330 case REOP_goto:
331 case REOP_split_goto_first:
332 case REOP_split_next_first:
333 case REOP_loop:
334 case REOP_lookahead:
335 case REOP_negative_lookahead:
336 case REOP_bne_char_pos:
337 val = get_u32(buf + pos + 1);
338 val += (pos + 5);
339 printf(" %u", val);
340 break;
341 case REOP_simple_greedy_quant:
342 printf(" %u %u %u %u",
343 get_u32(buf + pos + 1) + (pos + 17),
344 get_u32(buf + pos + 1 + 4),
345 get_u32(buf + pos + 1 + 8),
346 get_u32(buf + pos + 1 + 12));
347 break;
348 case REOP_save_start:
349 case REOP_save_end:
350 case REOP_back_reference:
351 case REOP_backward_back_reference:
352 printf(" %u", buf[pos + 1]);
353 break;
354 case REOP_save_reset:
355 printf(" %u %u", buf[pos + 1], buf[pos + 2]);
356 break;
357 case REOP_push_i32:
358 val = get_u32(buf + pos + 1);
359 printf(" %d", val);
360 break;
361 case REOP_range:
362 {
363 int n, i;
364 n = get_u16(buf + pos + 1);
365 len += n * 4;
366 for(i = 0; i < n * 2; i++) {
367 val = get_u16(buf + pos + 3 + i * 2);
368 printf(" 0x%04x", val);
369 }
370 }
371 break;
372 case REOP_range32:
373 {
374 int n, i;
375 n = get_u16(buf + pos + 1);
376 len += n * 8;
377 for(i = 0; i < n * 2; i++) {
378 val = get_u32(buf + pos + 3 + i * 4);
379 printf(" 0x%08x", val);
380 }
381 }
382 break;
383 default:
384 break;
385 }
386 printf("\n");
387 pos += len;
388 }
389 }
390 #endif
391
re_emit_op(REParseState * s,int op)392 static void re_emit_op(REParseState *s, int op)
393 {
394 dbuf_putc(&s->byte_code, op);
395 }
396
397 /* return the offset of the u32 value */
re_emit_op_u32(REParseState * s,int op,uint32_t val)398 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
399 {
400 int pos;
401 dbuf_putc(&s->byte_code, op);
402 pos = s->byte_code.size;
403 dbuf_put_u32(&s->byte_code, val);
404 return pos;
405 }
406
re_emit_goto(REParseState * s,int op,uint32_t val)407 static int re_emit_goto(REParseState *s, int op, uint32_t val)
408 {
409 int pos;
410 dbuf_putc(&s->byte_code, op);
411 pos = s->byte_code.size;
412 dbuf_put_u32(&s->byte_code, val - (pos + 4));
413 return pos;
414 }
415
re_emit_op_u8(REParseState * s,int op,uint32_t val)416 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
417 {
418 dbuf_putc(&s->byte_code, op);
419 dbuf_putc(&s->byte_code, val);
420 }
421
re_emit_op_u16(REParseState * s,int op,uint32_t val)422 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
423 {
424 dbuf_putc(&s->byte_code, op);
425 dbuf_put_u16(&s->byte_code, val);
426 }
427
428 #if defined(_MSC_VER)
re_parse_error(REParseState * s,const char * fmt,...)429 static int re_parse_error(REParseState *s, const char *fmt, ...)
430 #else
431 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
432 #endif
433 {
434 va_list ap;
435 va_start(ap, fmt);
436 vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
437 va_end(ap);
438 return -1;
439 }
440
441 /* Return -1 in case of overflow */
parse_digits(const uint8_t ** pp)442 static int parse_digits(const uint8_t **pp)
443 {
444 const uint8_t *p;
445 uint64_t v;
446 int c;
447
448 p = *pp;
449 v = 0;
450 for(;;) {
451 c = *p;
452 if (c < '0' || c > '9')
453 break;
454 v = v * 10 + c - '0';
455 if (v >= INT32_MAX)
456 return -1;
457 p++;
458 }
459 *pp = p;
460 return v;
461 }
462
re_parse_expect(REParseState * s,const uint8_t ** pp,int c)463 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
464 {
465 const uint8_t *p;
466 p = *pp;
467 if (*p != c)
468 return re_parse_error(s, "expecting '%c'", c);
469 p++;
470 *pp = p;
471 return 0;
472 }
473
474 /* Parse an escape sequence, *pp points after the '\':
475 allow_utf16 value:
476 0 : no UTF-16 escapes allowed
477 1 : UTF-16 escapes allowed
478 2 : UTF-16 escapes allowed and escapes of surrogate pairs are
479 converted to a unicode character (unicode regexp case).
480
481 Return the unicode char and update *pp if recognized,
482 return -1 if malformed escape,
483 return -2 otherwise. */
lre_parse_escape(const uint8_t ** pp,int allow_utf16)484 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
485 {
486 const uint8_t *p;
487 uint32_t c;
488
489 p = *pp;
490 c = *p++;
491 switch(c) {
492 case 'b':
493 c = '\b';
494 break;
495 case 'f':
496 c = '\f';
497 break;
498 case 'n':
499 c = '\n';
500 break;
501 case 'r':
502 c = '\r';
503 break;
504 case 't':
505 c = '\t';
506 break;
507 case 'v':
508 c = '\v';
509 break;
510 case 'x':
511 case 'u':
512 {
513 int h, n, i;
514 uint32_t c1;
515
516 if (*p == '{' && allow_utf16) {
517 p++;
518 c = 0;
519 for(;;) {
520 h = from_hex(*p++);
521 if (h < 0)
522 return -1;
523 c = (c << 4) | h;
524 if (c > 0x10FFFF)
525 return -1;
526 if (*p == '}')
527 break;
528 }
529 p++;
530 } else {
531 if (c == 'x') {
532 n = 2;
533 } else {
534 n = 4;
535 }
536
537 c = 0;
538 for(i = 0; i < n; i++) {
539 h = from_hex(*p++);
540 if (h < 0) {
541 return -1;
542 }
543 c = (c << 4) | h;
544 }
545 if (c >= 0xd800 && c < 0xdc00 &&
546 allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
547 /* convert an escaped surrogate pair into a
548 unicode char */
549 c1 = 0;
550 for(i = 0; i < 4; i++) {
551 h = from_hex(p[2 + i]);
552 if (h < 0)
553 break;
554 c1 = (c1 << 4) | h;
555 }
556 if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
557 p += 6;
558 c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
559 }
560 }
561 }
562 }
563 break;
564 #if defined(_MSC_VER)
565 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
566 #else
567 case '0' ... '7':
568 #endif
569 c -= '0';
570 if (allow_utf16 == 2) {
571 /* only accept \0 not followed by digit */
572 if (c != 0 || is_digit(*p))
573 return -1;
574 } else {
575 /* parse a legacy octal sequence */
576 uint32_t v;
577 v = *p - '0';
578 if (v > 7)
579 break;
580 c = (c << 3) | v;
581 p++;
582 if (c >= 32)
583 break;
584 v = *p - '0';
585 if (v > 7)
586 break;
587 c = (c << 3) | v;
588 p++;
589 }
590 break;
591 default:
592 return -2;
593 }
594 *pp = p;
595 return c;
596 }
597
598 #ifdef CONFIG_ALL_UNICODE
599 /* XXX: we use the same chars for name and value */
is_unicode_char(int c)600 static BOOL is_unicode_char(int c)
601 {
602 return ((c >= '0' && c <= '9') ||
603 (c >= 'A' && c <= 'Z') ||
604 (c >= 'a' && c <= 'z') ||
605 (c == '_'));
606 }
607
parse_unicode_property(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL is_inv)608 static int parse_unicode_property(REParseState *s, CharRange *cr,
609 const uint8_t **pp, BOOL is_inv)
610 {
611 const uint8_t *p;
612 char name[64], value[64];
613 char *q;
614 BOOL script_ext;
615 int ret;
616
617 p = *pp;
618 if (*p != '{')
619 return re_parse_error(s, "expecting '{' after \\p");
620 p++;
621 q = name;
622 while (is_unicode_char(*p)) {
623 if ((q - name) > sizeof(name) - 1)
624 goto unknown_property_name;
625 *q++ = *p++;
626 }
627 *q = '\0';
628 q = value;
629 if (*p == '=') {
630 p++;
631 while (is_unicode_char(*p)) {
632 if ((q - value) > sizeof(value) - 1)
633 return re_parse_error(s, "unknown unicode property value");
634 *q++ = *p++;
635 }
636 }
637 *q = '\0';
638 if (*p != '}')
639 return re_parse_error(s, "expecting '}'");
640 p++;
641 // printf("name=%s value=%s\n", name, value);
642
643 if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
644 script_ext = FALSE;
645 goto do_script;
646 } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
647 script_ext = TRUE;
648 do_script:
649 cr_init(cr, s->mem_opaque, lre_realloc);
650 ret = unicode_script(cr, value, script_ext);
651 if (ret) {
652 cr_free(cr);
653 if (ret == -2)
654 return re_parse_error(s, "unknown unicode script");
655 else
656 goto out_of_memory;
657 }
658 } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
659 cr_init(cr, s->mem_opaque, lre_realloc);
660 ret = unicode_general_category(cr, value);
661 if (ret) {
662 cr_free(cr);
663 if (ret == -2)
664 return re_parse_error(s, "unknown unicode general category");
665 else
666 goto out_of_memory;
667 }
668 } else if (value[0] == '\0') {
669 cr_init(cr, s->mem_opaque, lre_realloc);
670 ret = unicode_general_category(cr, name);
671 if (ret == -1) {
672 cr_free(cr);
673 goto out_of_memory;
674 }
675 if (ret < 0) {
676 ret = unicode_prop(cr, name);
677 if (ret) {
678 cr_free(cr);
679 if (ret == -2)
680 goto unknown_property_name;
681 else
682 goto out_of_memory;
683 }
684 }
685 } else {
686 unknown_property_name:
687 return re_parse_error(s, "unknown unicode property name");
688 }
689
690 if (is_inv) {
691 if (cr_invert(cr)) {
692 cr_free(cr);
693 return -1;
694 }
695 }
696 *pp = p;
697 return 0;
698 out_of_memory:
699 return re_parse_error(s, "out of memory");
700 }
701 #endif /* CONFIG_ALL_UNICODE */
702
703 /* return -1 if error otherwise the character or a class range
704 (CLASS_RANGE_BASE). In case of class range, 'cr' is
705 initialized. Otherwise, it is ignored. */
get_class_atom(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL inclass)706 static int get_class_atom(REParseState *s, CharRange *cr,
707 const uint8_t **pp, BOOL inclass)
708 {
709 const uint8_t *p;
710 uint32_t c;
711 int ret;
712
713 p = *pp;
714
715 c = *p;
716 switch(c) {
717 case '\\':
718 p++;
719 if (p >= s->buf_end)
720 goto unexpected_end;
721 c = *p++;
722 switch(c) {
723 case 'd':
724 c = CHAR_RANGE_d;
725 goto class_range;
726 case 'D':
727 c = CHAR_RANGE_D;
728 goto class_range;
729 case 's':
730 c = CHAR_RANGE_s;
731 goto class_range;
732 case 'S':
733 c = CHAR_RANGE_S;
734 goto class_range;
735 case 'w':
736 c = CHAR_RANGE_w;
737 goto class_range;
738 case 'W':
739 c = CHAR_RANGE_W;
740 class_range:
741 if (cr_init_char_range(s, cr, c))
742 return -1;
743 c = CLASS_RANGE_BASE;
744 break;
745 case 'c':
746 c = *p;
747 if ((c >= 'a' && c <= 'z') ||
748 (c >= 'A' && c <= 'Z') ||
749 (((c >= '0' && c <= '9') || c == '_') &&
750 inclass && !s->is_utf16)) { /* Annex B.1.4 */
751 c &= 0x1f;
752 p++;
753 } else if (s->is_utf16) {
754 goto invalid_escape;
755 } else {
756 /* otherwise return '\' and 'c' */
757 p--;
758 c = '\\';
759 }
760 break;
761 #ifdef CONFIG_ALL_UNICODE
762 case 'p':
763 case 'P':
764 if (s->is_utf16) {
765 if (parse_unicode_property(s, cr, &p, (c == 'P')))
766 return -1;
767 c = CLASS_RANGE_BASE;
768 break;
769 }
770 /* fall thru */
771 #endif
772 default:
773 p--;
774 ret = lre_parse_escape(&p, s->is_utf16 * 2);
775 if (ret >= 0) {
776 c = ret;
777 } else {
778 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
779 /* always valid to escape these characters */
780 goto normal_char;
781 } else if (s->is_utf16) {
782 invalid_escape:
783 return re_parse_error(s, "invalid escape sequence in regular expression");
784 } else {
785 /* just ignore the '\' */
786 goto normal_char;
787 }
788 }
789 break;
790 }
791 break;
792 case '\0':
793 if (p >= s->buf_end) {
794 unexpected_end:
795 return re_parse_error(s, "unexpected end");
796 }
797 /* fall thru */
798 default:
799 normal_char:
800 /* normal char */
801 if (c >= 128) {
802 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
803 if ((unsigned)c > 0xffff && !s->is_utf16) {
804 /* XXX: should handle non BMP-1 code points */
805 return re_parse_error(s, "malformed unicode char");
806 }
807 } else {
808 p++;
809 }
810 break;
811 }
812 *pp = p;
813 return c;
814 }
815
re_emit_range(REParseState * s,const CharRange * cr)816 static int re_emit_range(REParseState *s, const CharRange *cr)
817 {
818 int len, i;
819 uint32_t high;
820
821 len = (unsigned)cr->len / 2;
822 if (len >= 65535)
823 return re_parse_error(s, "too many ranges");
824 if (len == 0) {
825 /* not sure it can really happen. Emit a match that is always
826 false */
827 re_emit_op_u32(s, REOP_char32, -1);
828 } else {
829 high = cr->points[cr->len - 1];
830 if (high == UINT32_MAX)
831 high = cr->points[cr->len - 2];
832 if (high <= 0xffff) {
833 /* can use 16 bit ranges with the conversion that 0xffff =
834 infinity */
835 re_emit_op_u16(s, REOP_range, len);
836 for(i = 0; i < cr->len; i += 2) {
837 dbuf_put_u16(&s->byte_code, cr->points[i]);
838 high = cr->points[i + 1] - 1;
839 if (high == UINT32_MAX - 1)
840 high = 0xffff;
841 dbuf_put_u16(&s->byte_code, high);
842 }
843 } else {
844 re_emit_op_u16(s, REOP_range32, len);
845 for(i = 0; i < cr->len; i += 2) {
846 dbuf_put_u32(&s->byte_code, cr->points[i]);
847 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
848 }
849 }
850 }
851 return 0;
852 }
853
re_parse_char_class(REParseState * s,const uint8_t ** pp)854 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
855 {
856 const uint8_t *p;
857 uint32_t c1, c2;
858 CharRange cr_s, *cr = &cr_s;
859 CharRange cr1_s, *cr1 = &cr1_s;
860 BOOL invert;
861
862 cr_init(cr, s->mem_opaque, lre_realloc);
863 p = *pp;
864 p++; /* skip '[' */
865 invert = FALSE;
866 if (*p == '^') {
867 p++;
868 invert = TRUE;
869 }
870 for(;;) {
871 if (*p == ']')
872 break;
873 c1 = get_class_atom(s, cr1, &p, TRUE);
874 if ((int)c1 < 0)
875 goto fail;
876 if (*p == '-' && p[1] != ']') {
877 const uint8_t *p0 = p + 1;
878 if (c1 >= CLASS_RANGE_BASE) {
879 if (s->is_utf16) {
880 cr_free(cr1);
881 goto invalid_class_range;
882 }
883 /* Annex B: match '-' character */
884 goto class_atom;
885 }
886 c2 = get_class_atom(s, cr1, &p0, TRUE);
887 if ((int)c2 < 0)
888 goto fail;
889 if (c2 >= CLASS_RANGE_BASE) {
890 cr_free(cr1);
891 if (s->is_utf16) {
892 goto invalid_class_range;
893 }
894 /* Annex B: match '-' character */
895 goto class_atom;
896 }
897 p = p0;
898 if (c2 < c1) {
899 invalid_class_range:
900 re_parse_error(s, "invalid class range");
901 goto fail;
902 }
903 if (cr_union_interval(cr, c1, c2))
904 goto memory_error;
905 } else {
906 class_atom:
907 if (c1 >= CLASS_RANGE_BASE) {
908 int ret;
909 ret = cr_union1(cr, cr1->points, cr1->len);
910 cr_free(cr1);
911 if (ret)
912 goto memory_error;
913 } else {
914 if (cr_union_interval(cr, c1, c1))
915 goto memory_error;
916 }
917 }
918 }
919 if (s->ignore_case) {
920 if (cr_canonicalize(cr))
921 goto memory_error;
922 }
923 if (invert) {
924 if (cr_invert(cr))
925 goto memory_error;
926 }
927 if (re_emit_range(s, cr))
928 goto fail;
929 cr_free(cr);
930 p++; /* skip ']' */
931 *pp = p;
932 return 0;
933 memory_error:
934 re_parse_error(s, "out of memory");
935 fail:
936 cr_free(cr);
937 return -1;
938 }
939
940 /* Return:
941 1 if the opcodes in bc_buf[] always advance the character pointer.
942 0 if the character pointer may not be advanced.
943 -1 if the code may depend on side effects of its previous execution (backreference)
944 */
re_check_advance(const uint8_t * bc_buf,int bc_buf_len)945 static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
946 {
947 int pos, opcode, ret, len, i;
948 uint32_t val, last;
949 BOOL has_back_reference;
950 uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
951
952 ret = -2; /* not known yet */
953 pos = 0;
954 has_back_reference = FALSE;
955 memset(capture_bitmap, 0, sizeof(capture_bitmap));
956
957 while (pos < bc_buf_len) {
958 opcode = bc_buf[pos];
959 len = reopcode_info[opcode].size;
960 switch(opcode) {
961 case REOP_range:
962 val = get_u16(bc_buf + pos + 1);
963 len += val * 4;
964 goto simple_char;
965 case REOP_range32:
966 val = get_u16(bc_buf + pos + 1);
967 len += val * 8;
968 goto simple_char;
969 case REOP_char:
970 case REOP_char32:
971 case REOP_dot:
972 case REOP_any:
973 simple_char:
974 if (ret == -2)
975 ret = 1;
976 break;
977 case REOP_line_start:
978 case REOP_line_end:
979 case REOP_push_i32:
980 case REOP_push_char_pos:
981 case REOP_drop:
982 case REOP_word_boundary:
983 case REOP_not_word_boundary:
984 case REOP_prev:
985 /* no effect */
986 break;
987 case REOP_save_start:
988 case REOP_save_end:
989 val = bc_buf[pos + 1];
990 capture_bitmap[val] |= 1;
991 break;
992 case REOP_save_reset:
993 {
994 val = bc_buf[pos + 1];
995 last = bc_buf[pos + 2];
996 while (val < last)
997 capture_bitmap[val++] |= 1;
998 }
999 break;
1000 case REOP_back_reference:
1001 case REOP_backward_back_reference:
1002 val = bc_buf[pos + 1];
1003 capture_bitmap[val] |= 2;
1004 has_back_reference = TRUE;
1005 break;
1006 default:
1007 /* safe behvior: we cannot predict the outcome */
1008 if (ret == -2)
1009 ret = 0;
1010 break;
1011 }
1012 pos += len;
1013 }
1014 if (has_back_reference) {
1015 /* check if there is back reference which references a capture
1016 made in the some code */
1017 for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
1018 if (capture_bitmap[i] == 3)
1019 return -1;
1020 }
1021 }
1022 if (ret == -2)
1023 ret = 0;
1024 return ret;
1025 }
1026
1027 /* return -1 if a simple quantifier cannot be used. Otherwise return
1028 the number of characters in the atom. */
re_is_simple_quantifier(const uint8_t * bc_buf,int bc_buf_len)1029 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1030 {
1031 int pos, opcode, len, count;
1032 uint32_t val;
1033
1034 count = 0;
1035 pos = 0;
1036 while (pos < bc_buf_len) {
1037 opcode = bc_buf[pos];
1038 len = reopcode_info[opcode].size;
1039 switch(opcode) {
1040 case REOP_range:
1041 val = get_u16(bc_buf + pos + 1);
1042 len += val * 4;
1043 goto simple_char;
1044 case REOP_range32:
1045 val = get_u16(bc_buf + pos + 1);
1046 len += val * 8;
1047 goto simple_char;
1048 case REOP_char:
1049 case REOP_char32:
1050 case REOP_dot:
1051 case REOP_any:
1052 simple_char:
1053 count++;
1054 break;
1055 case REOP_line_start:
1056 case REOP_line_end:
1057 case REOP_word_boundary:
1058 case REOP_not_word_boundary:
1059 break;
1060 default:
1061 return -1;
1062 }
1063 pos += len;
1064 }
1065 return count;
1066 }
1067
1068 /* '*pp' is the first char after '<' */
re_parse_group_name(char * buf,int buf_size,const uint8_t ** pp,BOOL is_utf16)1069 static int re_parse_group_name(char *buf, int buf_size,
1070 const uint8_t **pp, BOOL is_utf16)
1071 {
1072 const uint8_t *p;
1073 uint32_t c;
1074 char *q;
1075
1076 p = *pp;
1077 q = buf;
1078 for(;;) {
1079 c = *p;
1080 if (c == '\\') {
1081 p++;
1082 if (*p != 'u')
1083 return -1;
1084 c = lre_parse_escape(&p, is_utf16 * 2);
1085 } else if (c == '>') {
1086 break;
1087 } else if (c >= 128) {
1088 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1089 } else {
1090 p++;
1091 }
1092 if (c > 0x10FFFF)
1093 return -1;
1094 if (q == buf) {
1095 if (!lre_js_is_ident_first(c))
1096 return -1;
1097 } else {
1098 if (!lre_js_is_ident_next(c))
1099 return -1;
1100 }
1101 if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1102 return -1;
1103 if (c < 128) {
1104 *q++ = c;
1105 } else {
1106 q += unicode_to_utf8((uint8_t*)q, c);
1107 }
1108 }
1109 if (q == buf)
1110 return -1;
1111 *q = '\0';
1112 p++;
1113 *pp = p;
1114 return 0;
1115 }
1116
1117 /* if capture_name = NULL: return the number of captures + 1.
1118 Otherwise, return the capture index corresponding to capture_name
1119 or -1 if none */
re_parse_captures(REParseState * s,int * phas_named_captures,const char * capture_name)1120 static int re_parse_captures(REParseState *s, int *phas_named_captures,
1121 const char *capture_name)
1122 {
1123 const uint8_t *p;
1124 int capture_index;
1125 char name[TMP_BUF_SIZE];
1126
1127 capture_index = 1;
1128 *phas_named_captures = 0;
1129 for (p = s->buf_start; p < s->buf_end; p++) {
1130 switch (*p) {
1131 case '(':
1132 if (p[1] == '?') {
1133 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1134 *phas_named_captures = 1;
1135 /* potential named capture */
1136 if (capture_name) {
1137 p += 3;
1138 if (re_parse_group_name(name, sizeof(name), &p,
1139 s->is_utf16) == 0) {
1140 if (!strcmp(name, capture_name))
1141 return capture_index;
1142 }
1143 }
1144 capture_index++;
1145 }
1146 } else {
1147 capture_index++;
1148 }
1149 break;
1150 case '\\':
1151 p++;
1152 break;
1153 case '[':
1154 for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1155 if (*p == '\\')
1156 p++;
1157 }
1158 break;
1159 }
1160 }
1161 if (capture_name)
1162 return -1;
1163 else
1164 return capture_index;
1165 }
1166
re_count_captures(REParseState * s)1167 static int re_count_captures(REParseState *s)
1168 {
1169 if (s->total_capture_count < 0) {
1170 s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1171 NULL);
1172 }
1173 return s->total_capture_count;
1174 }
1175
re_has_named_captures(REParseState * s)1176 static BOOL re_has_named_captures(REParseState *s)
1177 {
1178 if (s->has_named_captures < 0)
1179 re_count_captures(s);
1180 return s->has_named_captures;
1181 }
1182
find_group_name(REParseState * s,const char * name)1183 static int find_group_name(REParseState *s, const char *name)
1184 {
1185 const char *p, *buf_end;
1186 size_t len, name_len;
1187 int capture_index;
1188
1189 name_len = strlen(name);
1190 p = (char *)s->group_names.buf;
1191 buf_end = (char *)s->group_names.buf + s->group_names.size;
1192 capture_index = 1;
1193 while (p < buf_end) {
1194 len = strlen(p);
1195 if (len == name_len && memcmp(name, p, name_len) == 0)
1196 return capture_index;
1197 p += len + 1;
1198 capture_index++;
1199 }
1200 return -1;
1201 }
1202
1203 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1204
re_parse_term(REParseState * s,BOOL is_backward_dir)1205 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1206 {
1207 const uint8_t *p;
1208 int c, last_atom_start, quant_min, quant_max, last_capture_count;
1209 BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1210 CharRange cr_s, *cr = &cr_s;
1211
1212 last_atom_start = -1;
1213 last_capture_count = 0;
1214 p = s->buf_ptr;
1215 c = *p;
1216 switch(c) {
1217 case '^':
1218 p++;
1219 re_emit_op(s, REOP_line_start);
1220 break;
1221 case '$':
1222 p++;
1223 re_emit_op(s, REOP_line_end);
1224 break;
1225 case '.':
1226 p++;
1227 last_atom_start = s->byte_code.size;
1228 last_capture_count = s->capture_count;
1229 if (is_backward_dir)
1230 re_emit_op(s, REOP_prev);
1231 re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1232 if (is_backward_dir)
1233 re_emit_op(s, REOP_prev);
1234 break;
1235 case '{':
1236 /* As an extension (see ES6 annex B), we accept '{' not
1237 followed by digits as a normal atom */
1238 if (!is_digit(p[1])) {
1239 if (s->is_utf16)
1240 goto invalid_quant_count;
1241 goto parse_class_atom;
1242 }
1243 /* fall tru */
1244 case '*':
1245 case '+':
1246 case '?':
1247 return re_parse_error(s, "nothing to repeat");
1248 case '(':
1249 if (p[1] == '?') {
1250 if (p[2] == ':') {
1251 p += 3;
1252 last_atom_start = s->byte_code.size;
1253 last_capture_count = s->capture_count;
1254 s->buf_ptr = p;
1255 if (re_parse_disjunction(s, is_backward_dir))
1256 return -1;
1257 p = s->buf_ptr;
1258 if (re_parse_expect(s, &p, ')'))
1259 return -1;
1260 } else if ((p[2] == '=' || p[2] == '!')) {
1261 is_neg = (p[2] == '!');
1262 is_backward_lookahead = FALSE;
1263 p += 3;
1264 goto lookahead;
1265 } else if (p[2] == '<' &&
1266 (p[3] == '=' || p[3] == '!')) {
1267 int pos;
1268 is_neg = (p[3] == '!');
1269 is_backward_lookahead = TRUE;
1270 p += 4;
1271 /* lookahead */
1272 lookahead:
1273 /* Annex B allows lookahead to be used as an atom for
1274 the quantifiers */
1275 if (!s->is_utf16 && !is_backward_lookahead) {
1276 last_atom_start = s->byte_code.size;
1277 last_capture_count = s->capture_count;
1278 }
1279 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1280 s->buf_ptr = p;
1281 if (re_parse_disjunction(s, is_backward_lookahead))
1282 return -1;
1283 p = s->buf_ptr;
1284 if (re_parse_expect(s, &p, ')'))
1285 return -1;
1286 re_emit_op(s, REOP_match);
1287 /* jump after the 'match' after the lookahead is successful */
1288 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1289 } else if (p[2] == '<') {
1290 p += 3;
1291 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1292 &p, s->is_utf16)) {
1293 return re_parse_error(s, "invalid group name");
1294 }
1295 if (find_group_name(s, s->u.tmp_buf) > 0) {
1296 return re_parse_error(s, "duplicate group name");
1297 }
1298 /* group name with a trailing zero */
1299 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1300 strlen(s->u.tmp_buf) + 1);
1301 s->has_named_captures = 1;
1302 goto parse_capture;
1303 } else {
1304 return re_parse_error(s, "invalid group");
1305 }
1306 } else {
1307 int capture_index;
1308 p++;
1309 /* capture without group name */
1310 dbuf_putc(&s->group_names, 0);
1311 parse_capture:
1312 if (s->capture_count >= CAPTURE_COUNT_MAX)
1313 return re_parse_error(s, "too many captures");
1314 last_atom_start = s->byte_code.size;
1315 last_capture_count = s->capture_count;
1316 capture_index = s->capture_count++;
1317 re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1318 capture_index);
1319
1320 s->buf_ptr = p;
1321 if (re_parse_disjunction(s, is_backward_dir))
1322 return -1;
1323 p = s->buf_ptr;
1324
1325 re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1326 capture_index);
1327
1328 if (re_parse_expect(s, &p, ')'))
1329 return -1;
1330 }
1331 break;
1332 case '\\':
1333 switch(p[1]) {
1334 case 'b':
1335 case 'B':
1336 re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1337 p += 2;
1338 break;
1339 case 'k':
1340 {
1341 const uint8_t *p1;
1342 int dummy_res;
1343
1344 p1 = p;
1345 if (p1[2] != '<') {
1346 /* annex B: we tolerate invalid group names in non
1347 unicode mode if there is no named capture
1348 definition */
1349 if (s->is_utf16 || re_has_named_captures(s))
1350 return re_parse_error(s, "expecting group name");
1351 else
1352 goto parse_class_atom;
1353 }
1354 p1 += 3;
1355 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1356 &p1, s->is_utf16)) {
1357 if (s->is_utf16 || re_has_named_captures(s))
1358 return re_parse_error(s, "invalid group name");
1359 else
1360 goto parse_class_atom;
1361 }
1362 c = find_group_name(s, s->u.tmp_buf);
1363 if (c < 0) {
1364 /* no capture name parsed before, try to look
1365 after (inefficient, but hopefully not common */
1366 c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1367 if (c < 0) {
1368 if (s->is_utf16 || re_has_named_captures(s))
1369 return re_parse_error(s, "group name not defined");
1370 else
1371 goto parse_class_atom;
1372 }
1373 }
1374 p = p1;
1375 }
1376 goto emit_back_reference;
1377 case '0':
1378 p += 2;
1379 c = 0;
1380 if (s->is_utf16) {
1381 if (is_digit(*p)) {
1382 return re_parse_error(s, "invalid decimal escape in regular expression");
1383 }
1384 } else {
1385 /* Annex B.1.4: accept legacy octal */
1386 if (*p >= '0' && *p <= '7') {
1387 c = *p++ - '0';
1388 if (*p >= '0' && *p <= '7') {
1389 c = (c << 3) + *p++ - '0';
1390 }
1391 }
1392 }
1393 goto normal_char;
1394 #if defined(_MSC_VER)
1395 case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1396 #else
1397 case '1' ... '9':
1398 #endif
1399 {
1400 const uint8_t *q = ++p;
1401
1402 c = parse_digits(&p);
1403 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1404 if (!s->is_utf16) {
1405 /* Annex B.1.4: accept legacy octal */
1406 p = q;
1407 if (*p <= '7') {
1408 c = 0;
1409 if (*p <= '3')
1410 c = *p++ - '0';
1411 if (*p >= '0' && *p <= '7') {
1412 c = (c << 3) + *p++ - '0';
1413 if (*p >= '0' && *p <= '7') {
1414 c = (c << 3) + *p++ - '0';
1415 }
1416 }
1417 } else {
1418 c = *p++;
1419 }
1420 goto normal_char;
1421 }
1422 return re_parse_error(s, "back reference out of range in reguar expression");
1423 }
1424 emit_back_reference:
1425 last_atom_start = s->byte_code.size;
1426 last_capture_count = s->capture_count;
1427 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1428 }
1429 break;
1430 default:
1431 goto parse_class_atom;
1432 }
1433 break;
1434 case '[':
1435 last_atom_start = s->byte_code.size;
1436 last_capture_count = s->capture_count;
1437 if (is_backward_dir)
1438 re_emit_op(s, REOP_prev);
1439 if (re_parse_char_class(s, &p))
1440 return -1;
1441 if (is_backward_dir)
1442 re_emit_op(s, REOP_prev);
1443 break;
1444 case ']':
1445 case '}':
1446 if (s->is_utf16)
1447 return re_parse_error(s, "syntax error");
1448 goto parse_class_atom;
1449 default:
1450 parse_class_atom:
1451 c = get_class_atom(s, cr, &p, FALSE);
1452 if ((int)c < 0)
1453 return -1;
1454 normal_char:
1455 last_atom_start = s->byte_code.size;
1456 last_capture_count = s->capture_count;
1457 if (is_backward_dir)
1458 re_emit_op(s, REOP_prev);
1459 if (c >= CLASS_RANGE_BASE) {
1460 int ret;
1461 /* Note: canonicalization is not needed */
1462 ret = re_emit_range(s, cr);
1463 cr_free(cr);
1464 if (ret)
1465 return -1;
1466 } else {
1467 if (s->ignore_case)
1468 c = lre_canonicalize(c, s->is_utf16);
1469 if (c <= 0xffff)
1470 re_emit_op_u16(s, REOP_char, c);
1471 else
1472 re_emit_op_u32(s, REOP_char32, c);
1473 }
1474 if (is_backward_dir)
1475 re_emit_op(s, REOP_prev);
1476 break;
1477 }
1478
1479 /* quantifier */
1480 if (last_atom_start >= 0) {
1481 c = *p;
1482 switch(c) {
1483 case '*':
1484 p++;
1485 quant_min = 0;
1486 quant_max = INT32_MAX;
1487 goto quantifier;
1488 case '+':
1489 p++;
1490 quant_min = 1;
1491 quant_max = INT32_MAX;
1492 goto quantifier;
1493 case '?':
1494 p++;
1495 quant_min = 0;
1496 quant_max = 1;
1497 goto quantifier;
1498 case '{':
1499 /* As an extension (see ES6 annex B), we accept '{' not
1500 followed by digits as a normal atom */
1501 if (!is_digit(p[1])) {
1502 if (s->is_utf16)
1503 goto invalid_quant_count;
1504 break;
1505 }
1506 p++;
1507 quant_min = parse_digits(&p);
1508 if (quant_min < 0) {
1509 invalid_quant_count:
1510 return re_parse_error(s, "invalid repetition count");
1511 }
1512 quant_max = quant_min;
1513 if (*p == ',') {
1514 p++;
1515 if (is_digit(*p)) {
1516 quant_max = parse_digits(&p);
1517 if (quant_max < 0 || quant_max < quant_min)
1518 goto invalid_quant_count;
1519 } else {
1520 quant_max = INT32_MAX; /* infinity */
1521 }
1522 }
1523 if (re_parse_expect(s, &p, '}'))
1524 return -1;
1525 quantifier:
1526 greedy = TRUE;
1527 if (*p == '?') {
1528 p++;
1529 greedy = FALSE;
1530 }
1531 if (last_atom_start < 0) {
1532 return re_parse_error(s, "nothing to repeat");
1533 }
1534 if (greedy) {
1535 int len, pos;
1536
1537 if (quant_max > 0) {
1538 /* specific optimization for simple quantifiers */
1539 len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1540 s->byte_code.size - last_atom_start);
1541 if (len > 0) {
1542 re_emit_op(s, REOP_match);
1543
1544 dbuf_insert(&s->byte_code, last_atom_start, 17);
1545 pos = last_atom_start;
1546 s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1547 put_u32(&s->byte_code.buf[pos],
1548 s->byte_code.size - last_atom_start - 17);
1549 pos += 4;
1550 put_u32(&s->byte_code.buf[pos], quant_min);
1551 pos += 4;
1552 put_u32(&s->byte_code.buf[pos], quant_max);
1553 pos += 4;
1554 put_u32(&s->byte_code.buf[pos], len);
1555 pos += 4;
1556 goto done;
1557 }
1558 }
1559
1560 add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
1561 s->byte_code.size - last_atom_start) == 0);
1562 } else {
1563 add_zero_advance_check = FALSE;
1564 }
1565
1566 {
1567 int len, pos;
1568 len = s->byte_code.size - last_atom_start;
1569 if (quant_min == 0) {
1570 /* need to reset the capture in case the atom is
1571 not executed */
1572 if (last_capture_count != s->capture_count) {
1573 dbuf_insert(&s->byte_code, last_atom_start, 3);
1574 s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1575 s->byte_code.buf[last_atom_start++] = last_capture_count;
1576 s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
1577 }
1578 if (quant_max == 0) {
1579 s->byte_code.size = last_atom_start;
1580 } else if (quant_max == 1) {
1581 dbuf_insert(&s->byte_code, last_atom_start, 5);
1582 s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1583 greedy;
1584 put_u32(s->byte_code.buf + last_atom_start + 1, len);
1585 } else if (quant_max == INT32_MAX) {
1586 dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check);
1587 s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1588 greedy;
1589 put_u32(s->byte_code.buf + last_atom_start + 1,
1590 len + 5 + add_zero_advance_check);
1591 if (add_zero_advance_check) {
1592 /* avoid infinite loop by stoping the
1593 recursion if no advance was made in the
1594 atom (only works if the atom has no
1595 side effect) */
1596 s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1597 re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
1598 } else {
1599 re_emit_goto(s, REOP_goto, last_atom_start);
1600 }
1601 } else {
1602 dbuf_insert(&s->byte_code, last_atom_start, 10);
1603 pos = last_atom_start;
1604 s->byte_code.buf[pos++] = REOP_push_i32;
1605 put_u32(s->byte_code.buf + pos, quant_max);
1606 pos += 4;
1607 s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1608 put_u32(s->byte_code.buf + pos, len + 5);
1609 re_emit_goto(s, REOP_loop, last_atom_start + 5);
1610 re_emit_op(s, REOP_drop);
1611 }
1612 } else if (quant_min == 1 && quant_max == INT32_MAX &&
1613 !add_zero_advance_check) {
1614 re_emit_goto(s, REOP_split_next_first - greedy,
1615 last_atom_start);
1616 } else {
1617 if (quant_min == 1) {
1618 /* nothing to add */
1619 } else {
1620 dbuf_insert(&s->byte_code, last_atom_start, 5);
1621 s->byte_code.buf[last_atom_start] = REOP_push_i32;
1622 put_u32(s->byte_code.buf + last_atom_start + 1,
1623 quant_min);
1624 last_atom_start += 5;
1625 re_emit_goto(s, REOP_loop, last_atom_start);
1626 re_emit_op(s, REOP_drop);
1627 }
1628 if (quant_max == INT32_MAX) {
1629 pos = s->byte_code.size;
1630 re_emit_op_u32(s, REOP_split_goto_first + greedy,
1631 len + 5 + add_zero_advance_check);
1632 if (add_zero_advance_check)
1633 re_emit_op(s, REOP_push_char_pos);
1634 /* copy the atom */
1635 dbuf_put_self(&s->byte_code, last_atom_start, len);
1636 if (add_zero_advance_check)
1637 re_emit_goto(s, REOP_bne_char_pos, pos);
1638 else
1639 re_emit_goto(s, REOP_goto, pos);
1640 } else if (quant_max > quant_min) {
1641 re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1642 pos = s->byte_code.size;
1643 re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
1644 /* copy the atom */
1645 dbuf_put_self(&s->byte_code, last_atom_start, len);
1646
1647 re_emit_goto(s, REOP_loop, pos);
1648 re_emit_op(s, REOP_drop);
1649 }
1650 }
1651 last_atom_start = -1;
1652 }
1653 break;
1654 default:
1655 break;
1656 }
1657 }
1658 done:
1659 s->buf_ptr = p;
1660 return 0;
1661 }
1662
re_parse_alternative(REParseState * s,BOOL is_backward_dir)1663 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1664 {
1665 const uint8_t *p;
1666 int ret;
1667 size_t start, term_start, end, term_size;
1668
1669 start = s->byte_code.size;
1670 for(;;) {
1671 p = s->buf_ptr;
1672 if (p >= s->buf_end)
1673 break;
1674 if (*p == '|' || *p == ')')
1675 break;
1676 term_start = s->byte_code.size;
1677 ret = re_parse_term(s, is_backward_dir);
1678 if (ret)
1679 return ret;
1680 if (is_backward_dir) {
1681 /* reverse the order of the terms (XXX: inefficient, but
1682 speed is not really critical here) */
1683 end = s->byte_code.size;
1684 term_size = end - term_start;
1685 if (dbuf_realloc(&s->byte_code, end + term_size))
1686 return -1;
1687 memmove(s->byte_code.buf + start + term_size,
1688 s->byte_code.buf + start,
1689 end - start);
1690 memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1691 term_size);
1692 }
1693 }
1694 return 0;
1695 }
1696
re_parse_disjunction(REParseState * s,BOOL is_backward_dir)1697 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1698 {
1699 int start, len, pos;
1700
1701 start = s->byte_code.size;
1702 if (re_parse_alternative(s, is_backward_dir))
1703 return -1;
1704 while (*s->buf_ptr == '|') {
1705 s->buf_ptr++;
1706
1707 len = s->byte_code.size - start;
1708
1709 /* insert a split before the first alternative */
1710 dbuf_insert(&s->byte_code, start, 5);
1711 s->byte_code.buf[start] = REOP_split_next_first;
1712 put_u32(s->byte_code.buf + start + 1, len + 5);
1713
1714 pos = re_emit_op_u32(s, REOP_goto, 0);
1715
1716 if (re_parse_alternative(s, is_backward_dir))
1717 return -1;
1718
1719 /* patch the goto */
1720 len = s->byte_code.size - (pos + 4);
1721 put_u32(s->byte_code.buf + pos, len);
1722 }
1723 return 0;
1724 }
1725
1726 /* the control flow is recursive so the analysis can be linear */
compute_stack_size(const uint8_t * bc_buf,int bc_buf_len)1727 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1728 {
1729 int stack_size, stack_size_max, pos, opcode, len;
1730 uint32_t val;
1731
1732 stack_size = 0;
1733 stack_size_max = 0;
1734 bc_buf += RE_HEADER_LEN;
1735 bc_buf_len -= RE_HEADER_LEN;
1736 pos = 0;
1737 while (pos < bc_buf_len) {
1738 opcode = bc_buf[pos];
1739 len = reopcode_info[opcode].size;
1740 assert(opcode < REOP_COUNT);
1741 assert((pos + len) <= bc_buf_len);
1742 switch(opcode) {
1743 case REOP_push_i32:
1744 case REOP_push_char_pos:
1745 stack_size++;
1746 if (stack_size > stack_size_max) {
1747 if (stack_size > STACK_SIZE_MAX)
1748 return -1;
1749 stack_size_max = stack_size;
1750 }
1751 break;
1752 case REOP_drop:
1753 case REOP_bne_char_pos:
1754 assert(stack_size > 0);
1755 stack_size--;
1756 break;
1757 case REOP_range:
1758 val = get_u16(bc_buf + pos + 1);
1759 len += val * 4;
1760 break;
1761 case REOP_range32:
1762 val = get_u16(bc_buf + pos + 1);
1763 len += val * 8;
1764 break;
1765 }
1766 pos += len;
1767 }
1768 return stack_size_max;
1769 }
1770
1771 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1772 Return NULL if error and allocate an error message in *perror_msg,
1773 otherwise the compiled bytecode and its length in plen.
1774 */
lre_compile(int * plen,char * error_msg,int error_msg_size,const char * buf,size_t buf_len,int re_flags,void * opaque)1775 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1776 const char *buf, size_t buf_len, int re_flags,
1777 void *opaque)
1778 {
1779 REParseState s_s, *s = &s_s;
1780 int stack_size;
1781 BOOL is_sticky;
1782
1783 memset(s, 0, sizeof(*s));
1784 s->mem_opaque = opaque;
1785 s->buf_ptr = (const uint8_t *)buf;
1786 s->buf_end = s->buf_ptr + buf_len;
1787 s->buf_start = s->buf_ptr;
1788 s->re_flags = re_flags;
1789 s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1790 is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1791 s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
1792 s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
1793 s->capture_count = 1;
1794 s->total_capture_count = -1;
1795 s->has_named_captures = -1;
1796
1797 dbuf_init2(&s->byte_code, opaque, lre_realloc);
1798 dbuf_init2(&s->group_names, opaque, lre_realloc);
1799
1800 dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
1801 dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1802 dbuf_putc(&s->byte_code, 0); /* stack size */
1803 dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
1804
1805 if (!is_sticky) {
1806 /* iterate thru all positions (about the same as .*?( ... ) )
1807 . We do it without an explicit loop so that lock step
1808 thread execution will be possible in an optimized
1809 implementation */
1810 re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
1811 re_emit_op(s, REOP_any);
1812 re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
1813 }
1814 re_emit_op_u8(s, REOP_save_start, 0);
1815
1816 if (re_parse_disjunction(s, FALSE)) {
1817 error:
1818 dbuf_free(&s->byte_code);
1819 dbuf_free(&s->group_names);
1820 pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1821 *plen = 0;
1822 return NULL;
1823 }
1824
1825 re_emit_op_u8(s, REOP_save_end, 0);
1826
1827 re_emit_op(s, REOP_match);
1828
1829 if (*s->buf_ptr != '\0') {
1830 re_parse_error(s, "extraneous characters at the end");
1831 goto error;
1832 }
1833
1834 if (dbuf_error(&s->byte_code)) {
1835 re_parse_error(s, "out of memory");
1836 goto error;
1837 }
1838
1839 stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1840 if (stack_size < 0) {
1841 re_parse_error(s, "too many imbricated quantifiers");
1842 goto error;
1843 }
1844
1845 s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1846 s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1847 put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
1848
1849 /* add the named groups if needed */
1850 if (s->group_names.size > (s->capture_count - 1)) {
1851 dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1852 s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
1853 }
1854 dbuf_free(&s->group_names);
1855
1856 #ifdef DUMP_REOP
1857 lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1858 #endif
1859
1860 error_msg[0] = '\0';
1861 *plen = s->byte_code.size;
1862 return s->byte_code.buf;
1863 }
1864
is_line_terminator(uint32_t c)1865 static BOOL is_line_terminator(uint32_t c)
1866 {
1867 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
1868 }
1869
is_word_char(uint32_t c)1870 static BOOL is_word_char(uint32_t c)
1871 {
1872 return ((c >= '0' && c <= '9') ||
1873 (c >= 'a' && c <= 'z') ||
1874 (c >= 'A' && c <= 'Z') ||
1875 (c == '_'));
1876 }
1877
1878 #define GET_CHAR(c, cptr, cbuf_end) \
1879 do { \
1880 if (cbuf_type == 0) { \
1881 c = *cptr++; \
1882 } else { \
1883 uint32_t __c1; \
1884 c = *(uint16_t *)cptr; \
1885 cptr += 2; \
1886 if (c >= 0xd800 && c < 0xdc00 && \
1887 cbuf_type == 2 && cptr < cbuf_end) { \
1888 __c1 = *(uint16_t *)cptr; \
1889 if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1890 c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1891 cptr += 2; \
1892 } \
1893 } \
1894 } \
1895 } while (0)
1896
1897 #define PEEK_CHAR(c, cptr, cbuf_end) \
1898 do { \
1899 if (cbuf_type == 0) { \
1900 c = cptr[0]; \
1901 } else { \
1902 uint32_t __c1; \
1903 c = ((uint16_t *)cptr)[0]; \
1904 if (c >= 0xd800 && c < 0xdc00 && \
1905 cbuf_type == 2 && (cptr + 2) < cbuf_end) { \
1906 __c1 = ((uint16_t *)cptr)[1]; \
1907 if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1908 c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1909 } \
1910 } \
1911 } \
1912 } while (0)
1913
1914 #define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
1915 do { \
1916 if (cbuf_type == 0) { \
1917 c = cptr[-1]; \
1918 } else { \
1919 uint32_t __c1; \
1920 c = ((uint16_t *)cptr)[-1]; \
1921 if (c >= 0xdc00 && c < 0xe000 && \
1922 cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
1923 __c1 = ((uint16_t *)cptr)[-2]; \
1924 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1925 c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1926 } \
1927 } \
1928 } \
1929 } while (0)
1930
1931 #define GET_PREV_CHAR(c, cptr, cbuf_start) \
1932 do { \
1933 if (cbuf_type == 0) { \
1934 cptr--; \
1935 c = cptr[0]; \
1936 } else { \
1937 uint32_t __c1; \
1938 cptr -= 2; \
1939 c = ((uint16_t *)cptr)[0]; \
1940 if (c >= 0xdc00 && c < 0xe000 && \
1941 cbuf_type == 2 && cptr > cbuf_start) { \
1942 __c1 = ((uint16_t *)cptr)[-1]; \
1943 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1944 cptr -= 2; \
1945 c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1946 } \
1947 } \
1948 } \
1949 } while (0)
1950
1951 #define PREV_CHAR(cptr, cbuf_start) \
1952 do { \
1953 if (cbuf_type == 0) { \
1954 cptr--; \
1955 } else { \
1956 cptr -= 2; \
1957 if (cbuf_type == 2) { \
1958 c = ((uint16_t *)cptr)[0]; \
1959 if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
1960 c = ((uint16_t *)cptr)[-1]; \
1961 if (c >= 0xd800 && c < 0xdc00) \
1962 cptr -= 2; \
1963 } \
1964 } \
1965 } \
1966 } while (0)
1967
1968 typedef uintptr_t StackInt;
1969
1970 typedef enum {
1971 RE_EXEC_STATE_SPLIT,
1972 RE_EXEC_STATE_LOOKAHEAD,
1973 RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
1974 RE_EXEC_STATE_GREEDY_QUANT,
1975 } REExecStateEnum;
1976
1977 typedef struct REExecState {
1978 REExecStateEnum type : 8;
1979 uint8_t stack_len;
1980 size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
1981 const uint8_t *cptr;
1982 const uint8_t *pc;
1983 void *buf[0];
1984 } REExecState;
1985
1986 typedef struct {
1987 const uint8_t *cbuf;
1988 const uint8_t *cbuf_end;
1989 /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
1990 int cbuf_type;
1991 int capture_count;
1992 int stack_size_max;
1993 BOOL multi_line;
1994 BOOL ignore_case;
1995 BOOL is_utf16;
1996 void *opaque; /* used for stack overflow check */
1997
1998 size_t state_size;
1999 uint8_t *state_stack;
2000 size_t state_stack_size;
2001 size_t state_stack_len;
2002 } REExecContext;
2003
push_state(REExecContext * s,uint8_t ** capture,StackInt * stack,size_t stack_len,const uint8_t * pc,const uint8_t * cptr,REExecStateEnum type,size_t count)2004 static int push_state(REExecContext *s,
2005 uint8_t **capture,
2006 StackInt *stack, size_t stack_len,
2007 const uint8_t *pc, const uint8_t *cptr,
2008 REExecStateEnum type, size_t count)
2009 {
2010 REExecState *rs;
2011 uint8_t *new_stack;
2012 size_t new_size, i, n;
2013 StackInt *stack_buf;
2014
2015 if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2016 /* reallocate the stack */
2017 new_size = s->state_stack_size * 3 / 2;
2018 if (new_size < 8)
2019 new_size = 8;
2020 new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2021 if (!new_stack)
2022 return -1;
2023 s->state_stack_size = new_size;
2024 s->state_stack = new_stack;
2025 }
2026 rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2027 s->state_stack_len++;
2028 rs->type = type;
2029 rs->count = count;
2030 rs->stack_len = stack_len;
2031 rs->cptr = cptr;
2032 rs->pc = pc;
2033 n = 2 * s->capture_count;
2034 for(i = 0; i < n; i++)
2035 rs->buf[i] = capture[i];
2036 stack_buf = (StackInt *)(rs->buf + n);
2037 for(i = 0; i < stack_len; i++)
2038 stack_buf[i] = stack[i];
2039 return 0;
2040 }
2041
2042 /* return 1 if match, 0 if not match or -1 if error. */
lre_exec_backtrack(REExecContext * s,uint8_t ** capture,StackInt * stack,int stack_len,const uint8_t * pc,const uint8_t * cptr,BOOL no_recurse)2043 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2044 StackInt *stack, int stack_len,
2045 const uint8_t *pc, const uint8_t *cptr,
2046 BOOL no_recurse)
2047 {
2048 int opcode, ret;
2049 int cbuf_type;
2050 uint32_t val, c;
2051 const uint8_t *cbuf_end;
2052
2053 cbuf_type = s->cbuf_type;
2054 cbuf_end = s->cbuf_end;
2055
2056 for(;;) {
2057 // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2058 opcode = *pc++;
2059 switch(opcode) {
2060 case REOP_match:
2061 {
2062 REExecState *rs;
2063 if (no_recurse)
2064 return (intptr_t)cptr;
2065 ret = 1;
2066 goto recurse;
2067 no_match:
2068 if (no_recurse)
2069 return 0;
2070 ret = 0;
2071 recurse:
2072 for(;;) {
2073 if (s->state_stack_len == 0)
2074 return ret;
2075 rs = (REExecState *)(s->state_stack +
2076 (s->state_stack_len - 1) * s->state_size);
2077 if (rs->type == RE_EXEC_STATE_SPLIT) {
2078 if (!ret) {
2079 pop_state:
2080 memcpy(capture, rs->buf,
2081 sizeof(capture[0]) * 2 * s->capture_count);
2082 pop_state1:
2083 pc = rs->pc;
2084 cptr = rs->cptr;
2085 stack_len = rs->stack_len;
2086 memcpy(stack, rs->buf + 2 * s->capture_count,
2087 stack_len * sizeof(stack[0]));
2088 s->state_stack_len--;
2089 break;
2090 }
2091 } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2092 if (!ret) {
2093 uint32_t char_count, i;
2094 memcpy(capture, rs->buf,
2095 sizeof(capture[0]) * 2 * s->capture_count);
2096 stack_len = rs->stack_len;
2097 memcpy(stack, rs->buf + 2 * s->capture_count,
2098 stack_len * sizeof(stack[0]));
2099 pc = rs->pc;
2100 cptr = rs->cptr;
2101 /* go backward */
2102 char_count = get_u32(pc + 12);
2103 for(i = 0; i < char_count; i++) {
2104 PREV_CHAR(cptr, s->cbuf);
2105 }
2106 pc = (pc + 16) + (int)get_u32(pc);
2107 rs->cptr = cptr;
2108 rs->count--;
2109 if (rs->count == 0) {
2110 s->state_stack_len--;
2111 }
2112 break;
2113 }
2114 } else {
2115 ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2116 (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2117 if (ret) {
2118 /* keep the capture in case of positive lookahead */
2119 if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2120 goto pop_state1;
2121 else
2122 goto pop_state;
2123 }
2124 }
2125 s->state_stack_len--;
2126 }
2127 }
2128 break;
2129 case REOP_char32:
2130 val = get_u32(pc);
2131 pc += 4;
2132 goto test_char;
2133 case REOP_char:
2134 val = get_u16(pc);
2135 pc += 2;
2136 test_char:
2137 if (cptr >= cbuf_end)
2138 goto no_match;
2139 GET_CHAR(c, cptr, cbuf_end);
2140 if (s->ignore_case) {
2141 c = lre_canonicalize(c, s->is_utf16);
2142 }
2143 if (val != c)
2144 goto no_match;
2145 break;
2146 case REOP_split_goto_first:
2147 case REOP_split_next_first:
2148 {
2149 const uint8_t *pc1;
2150
2151 val = get_u32(pc);
2152 pc += 4;
2153 if (opcode == REOP_split_next_first) {
2154 pc1 = pc + (int)val;
2155 } else {
2156 pc1 = pc;
2157 pc = pc + (int)val;
2158 }
2159 ret = push_state(s, capture, stack, stack_len,
2160 pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2161 if (ret < 0)
2162 return -1;
2163 break;
2164 }
2165 case REOP_lookahead:
2166 case REOP_negative_lookahead:
2167 val = get_u32(pc);
2168 pc += 4;
2169 ret = push_state(s, capture, stack, stack_len,
2170 pc + (int)val, cptr,
2171 RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2172 0);
2173 if (ret < 0)
2174 return -1;
2175 break;
2176
2177 case REOP_goto:
2178 val = get_u32(pc);
2179 pc += 4 + (int)val;
2180 break;
2181 case REOP_line_start:
2182 if (cptr == s->cbuf)
2183 break;
2184 if (!s->multi_line)
2185 goto no_match;
2186 PEEK_PREV_CHAR(c, cptr, s->cbuf);
2187 if (!is_line_terminator(c))
2188 goto no_match;
2189 break;
2190 case REOP_line_end:
2191 if (cptr == cbuf_end)
2192 break;
2193 if (!s->multi_line)
2194 goto no_match;
2195 PEEK_CHAR(c, cptr, cbuf_end);
2196 if (!is_line_terminator(c))
2197 goto no_match;
2198 break;
2199 case REOP_dot:
2200 if (cptr == cbuf_end)
2201 goto no_match;
2202 GET_CHAR(c, cptr, cbuf_end);
2203 if (is_line_terminator(c))
2204 goto no_match;
2205 break;
2206 case REOP_any:
2207 if (cptr == cbuf_end)
2208 goto no_match;
2209 GET_CHAR(c, cptr, cbuf_end);
2210 break;
2211 case REOP_save_start:
2212 case REOP_save_end:
2213 val = *pc++;
2214 assert(val < s->capture_count);
2215 capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2216 break;
2217 case REOP_save_reset:
2218 {
2219 uint32_t val2;
2220 val = pc[0];
2221 val2 = pc[1];
2222 pc += 2;
2223 assert(val2 < s->capture_count);
2224 while (val <= val2) {
2225 capture[2 * val] = NULL;
2226 capture[2 * val + 1] = NULL;
2227 val++;
2228 }
2229 }
2230 break;
2231 case REOP_push_i32:
2232 val = get_u32(pc);
2233 pc += 4;
2234 stack[stack_len++] = val;
2235 break;
2236 case REOP_drop:
2237 stack_len--;
2238 break;
2239 case REOP_loop:
2240 val = get_u32(pc);
2241 pc += 4;
2242 if (--stack[stack_len - 1] != 0) {
2243 pc += (int)val;
2244 }
2245 break;
2246 case REOP_push_char_pos:
2247 stack[stack_len++] = (uintptr_t)cptr;
2248 break;
2249 case REOP_bne_char_pos:
2250 val = get_u32(pc);
2251 pc += 4;
2252 if (stack[--stack_len] != (uintptr_t)cptr)
2253 pc += (int)val;
2254 break;
2255 case REOP_word_boundary:
2256 case REOP_not_word_boundary:
2257 {
2258 BOOL v1, v2;
2259 /* char before */
2260 if (cptr == s->cbuf) {
2261 v1 = FALSE;
2262 } else {
2263 PEEK_PREV_CHAR(c, cptr, s->cbuf);
2264 v1 = is_word_char(c);
2265 }
2266 /* current char */
2267 if (cptr >= cbuf_end) {
2268 v2 = FALSE;
2269 } else {
2270 PEEK_CHAR(c, cptr, cbuf_end);
2271 v2 = is_word_char(c);
2272 }
2273 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2274 goto no_match;
2275 }
2276 break;
2277 case REOP_back_reference:
2278 case REOP_backward_back_reference:
2279 {
2280 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2281 uint32_t c1, c2;
2282
2283 val = *pc++;
2284 if (val >= s->capture_count)
2285 goto no_match;
2286 cptr1_start = capture[2 * val];
2287 cptr1_end = capture[2 * val + 1];
2288 if (!cptr1_start || !cptr1_end)
2289 break;
2290 if (opcode == REOP_back_reference) {
2291 cptr1 = cptr1_start;
2292 while (cptr1 < cptr1_end) {
2293 if (cptr >= cbuf_end)
2294 goto no_match;
2295 GET_CHAR(c1, cptr1, cptr1_end);
2296 GET_CHAR(c2, cptr, cbuf_end);
2297 if (s->ignore_case) {
2298 c1 = lre_canonicalize(c1, s->is_utf16);
2299 c2 = lre_canonicalize(c2, s->is_utf16);
2300 }
2301 if (c1 != c2)
2302 goto no_match;
2303 }
2304 } else {
2305 cptr1 = cptr1_end;
2306 while (cptr1 > cptr1_start) {
2307 if (cptr == s->cbuf)
2308 goto no_match;
2309 GET_PREV_CHAR(c1, cptr1, cptr1_start);
2310 GET_PREV_CHAR(c2, cptr, s->cbuf);
2311 if (s->ignore_case) {
2312 c1 = lre_canonicalize(c1, s->is_utf16);
2313 c2 = lre_canonicalize(c2, s->is_utf16);
2314 }
2315 if (c1 != c2)
2316 goto no_match;
2317 }
2318 }
2319 }
2320 break;
2321 case REOP_range:
2322 {
2323 int n;
2324 uint32_t low, high, idx_min, idx_max, idx;
2325
2326 n = get_u16(pc); /* n must be >= 1 */
2327 pc += 2;
2328 if (cptr >= cbuf_end)
2329 goto no_match;
2330 GET_CHAR(c, cptr, cbuf_end);
2331 if (s->ignore_case) {
2332 c = lre_canonicalize(c, s->is_utf16);
2333 }
2334 idx_min = 0;
2335 low = get_u16(pc + 0 * 4);
2336 if (c < low)
2337 goto no_match;
2338 idx_max = n - 1;
2339 high = get_u16(pc + idx_max * 4 + 2);
2340 /* 0xffff in for last value means +infinity */
2341 if (unlikely(c >= 0xffff) && high == 0xffff)
2342 goto range_match;
2343 if (c > high)
2344 goto no_match;
2345 while (idx_min <= idx_max) {
2346 idx = (idx_min + idx_max) / 2;
2347 low = get_u16(pc + idx * 4);
2348 high = get_u16(pc + idx * 4 + 2);
2349 if (c < low)
2350 idx_max = idx - 1;
2351 else if (c > high)
2352 idx_min = idx + 1;
2353 else
2354 goto range_match;
2355 }
2356 goto no_match;
2357 range_match:
2358 pc += 4 * n;
2359 }
2360 break;
2361 case REOP_range32:
2362 {
2363 int n;
2364 uint32_t low, high, idx_min, idx_max, idx;
2365
2366 n = get_u16(pc); /* n must be >= 1 */
2367 pc += 2;
2368 if (cptr >= cbuf_end)
2369 goto no_match;
2370 GET_CHAR(c, cptr, cbuf_end);
2371 if (s->ignore_case) {
2372 c = lre_canonicalize(c, s->is_utf16);
2373 }
2374 idx_min = 0;
2375 low = get_u32(pc + 0 * 8);
2376 if (c < low)
2377 goto no_match;
2378 idx_max = n - 1;
2379 high = get_u32(pc + idx_max * 8 + 4);
2380 if (c > high)
2381 goto no_match;
2382 while (idx_min <= idx_max) {
2383 idx = (idx_min + idx_max) / 2;
2384 low = get_u32(pc + idx * 8);
2385 high = get_u32(pc + idx * 8 + 4);
2386 if (c < low)
2387 idx_max = idx - 1;
2388 else if (c > high)
2389 idx_min = idx + 1;
2390 else
2391 goto range32_match;
2392 }
2393 goto no_match;
2394 range32_match:
2395 pc += 8 * n;
2396 }
2397 break;
2398 case REOP_prev:
2399 /* go to the previous char */
2400 if (cptr == s->cbuf)
2401 goto no_match;
2402 PREV_CHAR(cptr, s->cbuf);
2403 break;
2404 case REOP_simple_greedy_quant:
2405 {
2406 uint32_t next_pos, quant_min, quant_max;
2407 size_t q;
2408 intptr_t res;
2409 const uint8_t *pc1;
2410
2411 next_pos = get_u32(pc);
2412 quant_min = get_u32(pc + 4);
2413 quant_max = get_u32(pc + 8);
2414 pc += 16;
2415 pc1 = pc;
2416 pc += (int)next_pos;
2417
2418 q = 0;
2419 for(;;) {
2420 res = lre_exec_backtrack(s, capture, stack, stack_len,
2421 pc1, cptr, TRUE);
2422 if (res == -1)
2423 return res;
2424 if (!res)
2425 break;
2426 cptr = (uint8_t *)res;
2427 q++;
2428 if (q >= quant_max && quant_max != INT32_MAX)
2429 break;
2430 }
2431 if (q < quant_min)
2432 goto no_match;
2433 if (q > quant_min) {
2434 /* will examine all matches down to quant_min */
2435 ret = push_state(s, capture, stack, stack_len,
2436 pc1 - 16, cptr,
2437 RE_EXEC_STATE_GREEDY_QUANT,
2438 q - quant_min);
2439 if (ret < 0)
2440 return -1;
2441 }
2442 }
2443 break;
2444 default:
2445 abort();
2446 }
2447 }
2448 }
2449
2450 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
2451 starting position of the match and must be such as 0 <= cindex <=
2452 clen. */
lre_exec(uint8_t ** capture,const uint8_t * bc_buf,const uint8_t * cbuf,int cindex,int clen,int cbuf_type,void * opaque)2453 int lre_exec(uint8_t **capture,
2454 const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
2455 int cbuf_type, void *opaque)
2456 {
2457 REExecContext s_s, *s = &s_s;
2458 int re_flags, i, alloca_size, ret;
2459 StackInt *stack_buf;
2460
2461 re_flags = bc_buf[RE_HEADER_FLAGS];
2462 s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2463 s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2464 s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2465 s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2466 s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2467 s->cbuf = cbuf;
2468 s->cbuf_end = cbuf + (clen << cbuf_type);
2469 s->cbuf_type = cbuf_type;
2470 if (s->cbuf_type == 1 && s->is_utf16)
2471 s->cbuf_type = 2;
2472 s->opaque = opaque;
2473
2474 s->state_size = sizeof(REExecState) +
2475 s->capture_count * sizeof(capture[0]) * 2 +
2476 s->stack_size_max * sizeof(stack_buf[0]);
2477 s->state_stack = NULL;
2478 s->state_stack_len = 0;
2479 s->state_stack_size = 0;
2480
2481 for(i = 0; i < s->capture_count * 2; i++)
2482 capture[i] = NULL;
2483 alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2484 stack_buf = alloca(alloca_size);
2485 ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2486 cbuf + (cindex << cbuf_type), FALSE);
2487 lre_realloc(s->opaque, s->state_stack, 0);
2488 return ret;
2489 }
2490
lre_get_capture_count(const uint8_t * bc_buf)2491 int lre_get_capture_count(const uint8_t *bc_buf)
2492 {
2493 return bc_buf[RE_HEADER_CAPTURE_COUNT];
2494 }
2495
lre_get_flags(const uint8_t * bc_buf)2496 int lre_get_flags(const uint8_t *bc_buf)
2497 {
2498 return bc_buf[RE_HEADER_FLAGS];
2499 }
2500
2501 #ifdef TEST
2502
lre_check_stack_overflow(void * opaque,size_t alloca_size)2503 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2504 {
2505 return FALSE;
2506 }
2507
lre_realloc(void * opaque,void * ptr,size_t size)2508 void *lre_realloc(void *opaque, void *ptr, size_t size)
2509 {
2510 return realloc(ptr, size);
2511 }
2512
main(int argc,char ** argv)2513 int main(int argc, char **argv)
2514 {
2515 int len, ret, i;
2516 uint8_t *bc;
2517 char error_msg[64];
2518 uint8_t *capture[CAPTURE_COUNT_MAX * 2];
2519 const char *input;
2520 int input_len, capture_count;
2521
2522 if (argc < 3) {
2523 printf("usage: %s regexp input\n", argv[0]);
2524 exit(1);
2525 }
2526 bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
2527 strlen(argv[1]), 0, NULL);
2528 if (!bc) {
2529 fprintf(stderr, "error: %s\n", error_msg);
2530 exit(1);
2531 }
2532
2533 input = argv[2];
2534 input_len = strlen(input);
2535
2536 ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
2537 printf("ret=%d\n", ret);
2538 if (ret == 1) {
2539 capture_count = lre_get_capture_count(bc);
2540 for(i = 0; i < 2 * capture_count; i++) {
2541 uint8_t *ptr;
2542 ptr = capture[i];
2543 printf("%d: ", i);
2544 if (!ptr)
2545 printf("<nil>");
2546 else
2547 printf("%u", (int)(ptr - (uint8_t *)input));
2548 printf("\n");
2549 }
2550 }
2551 return 0;
2552 }
2553 #endif
2554